benchbox-0.1.0-py3-none-any.whl → benchbox-0.1.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- benchbox/__init__.py +1 -1
- benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query72.tpl +1 -1
- benchbox/_binaries/tpc-ds/{darwin-x86_64/query_templates/ansi.tpl → templates/query_templates/sqlserver.tpl} +1 -1
- benchbox/_binaries/tpc-ds/templates/query_variants/README +6 -0
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query10.tpl → templates/query_variants/query10a.tpl} +13 -14
- benchbox/_binaries/tpc-ds/{darwin-x86_64/query_templates/query14.tpl → templates/query_variants/query14a.tpl} +30 -26
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query18.tpl → templates/query_variants/query18a.tpl} +40 -19
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query22.tpl → templates/query_variants/query22a.tpl} +31 -9
- benchbox/_binaries/tpc-ds/{darwin-x86_64/query_templates/query27.tpl → templates/query_variants/query27a.tpl} +23 -10
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query35.tpl → templates/query_variants/query35a.tpl} +9 -8
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query36.tpl → templates/query_variants/query36a.tpl} +24 -12
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query51.tpl → templates/query_variants/query51a.tpl} +37 -20
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query5.tpl → templates/query_variants/query5a.tpl} +15 -10
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query67.tpl → templates/query_variants/query67a.tpl} +46 -18
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query70.tpl → templates/query_variants/query70a.tpl} +31 -27
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query77.tpl → templates/query_variants/query77a.tpl} +22 -15
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query80.tpl → templates/query_variants/query80a.tpl} +22 -8
- benchbox/_binaries/tpc-ds/{linux-arm64/query_templates/query86.tpl → templates/query_variants/query86a.tpl} +22 -13
- benchbox/_binaries/tpc-h/templates/dists.dss +836 -0
- benchbox/_binaries/tpc-h/templates/queries/1.sql +28 -0
- benchbox/_binaries/tpc-h/templates/queries/10.sql +38 -0
- benchbox/_binaries/tpc-h/templates/queries/11.sql +34 -0
- benchbox/_binaries/tpc-h/templates/queries/12.sql +35 -0
- benchbox/_binaries/tpc-h/templates/queries/13.sql +27 -0
- benchbox/_binaries/tpc-h/templates/queries/14.sql +20 -0
- benchbox/_binaries/tpc-h/templates/queries/15.sql +40 -0
- benchbox/_binaries/tpc-h/templates/queries/16.sql +37 -0
- benchbox/_binaries/tpc-h/templates/queries/17.sql +24 -0
- benchbox/_binaries/tpc-h/templates/queries/18.sql +39 -0
- benchbox/_binaries/tpc-h/templates/queries/19.sql +42 -0
- benchbox/_binaries/tpc-h/templates/queries/2.sql +50 -0
- benchbox/_binaries/tpc-h/templates/queries/20.sql +44 -0
- benchbox/_binaries/tpc-h/templates/queries/21.sql +46 -0
- benchbox/_binaries/tpc-h/templates/queries/22.sql +44 -0
- benchbox/_binaries/tpc-h/templates/queries/3.sql +29 -0
- benchbox/_binaries/tpc-h/templates/queries/4.sql +28 -0
- benchbox/_binaries/tpc-h/templates/queries/5.sql +31 -0
- benchbox/_binaries/tpc-h/templates/queries/6.sql +16 -0
- benchbox/_binaries/tpc-h/templates/queries/7.sql +46 -0
- benchbox/_binaries/tpc-h/templates/queries/8.sql +44 -0
- benchbox/_binaries/tpc-h/templates/queries/9.sql +39 -0
- benchbox/_binaries/tpc-h/templates/variants/12a.sql +27 -0
- benchbox/_binaries/tpc-h/templates/variants/13a.sql +30 -0
- benchbox/_binaries/tpc-h/templates/variants/14a.sql +18 -0
- benchbox/_binaries/tpc-h/templates/variants/15a.sql +39 -0
- benchbox/_binaries/tpc-h/templates/variants/8a.sql +77 -0
- benchbox/base.py +88 -121
- benchbox/cli/benchmarks.py +3 -3
- benchbox/cli/commands/calculate_qphh.py +55 -14
- benchbox/cli/commands/checks.py +1 -4
- benchbox/cli/commands/convert.py +8 -3
- benchbox/cli/commands/metrics.py +55 -14
- benchbox/cli/commands/results.py +131 -3
- benchbox/cli/commands/run.py +157 -22
- benchbox/cli/commands/visualize.py +3 -3
- benchbox/cli/composite_params.py +1 -1
- benchbox/cli/config.py +13 -3
- benchbox/cli/database.py +3 -3
- benchbox/cli/dryrun.py +30 -4
- benchbox/cli/exceptions.py +2 -1
- benchbox/cli/execution_pipeline.py +2 -1
- benchbox/cli/orchestrator.py +25 -71
- benchbox/cli/tuning.py +1 -1
- benchbox/core/ai_primitives/benchmark.py +53 -0
- benchbox/core/ai_primitives/dataframe_operations.py +1217 -0
- benchbox/core/base_benchmark.py +90 -68
- benchbox/core/coffeeshop/queries.py +1 -1
- benchbox/core/coffeeshop/schema.py +1 -1
- benchbox/core/comparison/plotter.py +5 -4
- benchbox/core/dataframe/__init__.py +26 -0
- benchbox/core/dataframe/benchmark_suite.py +5 -4
- benchbox/core/dataframe/context.py +45 -0
- benchbox/core/dataframe/data_loader.py +180 -79
- benchbox/core/dataframe/maintenance_interface.py +866 -0
- benchbox/core/dryrun.py +152 -22
- benchbox/core/expected_results/registry.py +22 -5
- benchbox/core/manifest/io.py +4 -3
- benchbox/core/metadata_primitives/__init__.py +31 -0
- benchbox/core/metadata_primitives/benchmark.py +337 -0
- benchbox/core/metadata_primitives/dataframe_operations.py +1824 -0
- benchbox/core/platform_registry.py +134 -45
- benchbox/core/read_primitives/benchmark.py +56 -4
- benchbox/core/read_primitives/dataframe_queries.py +6547 -0
- benchbox/core/results/__init__.py +47 -6
- benchbox/core/results/builder.py +909 -0
- benchbox/core/results/database.py +5 -5
- benchbox/core/results/exporter.py +58 -96
- benchbox/core/results/filenames.py +102 -0
- benchbox/core/results/loader.py +10 -9
- benchbox/core/results/metrics.py +211 -0
- benchbox/core/results/models.py +3 -1
- benchbox/core/results/normalizer.py +346 -0
- benchbox/core/results/platform_info.py +235 -0
- benchbox/core/results/query_normalizer.py +200 -0
- benchbox/core/results/schema.py +368 -69
- benchbox/core/runner/conversion.py +2 -0
- benchbox/core/runner/dataframe_runner.py +135 -131
- benchbox/core/runner/runner.py +111 -18
- benchbox/core/schemas.py +145 -3
- benchbox/core/ssb/generator.py +14 -2
- benchbox/core/tpc_compliance.py +4 -4
- benchbox/core/tpc_metrics.py +9 -4
- benchbox/core/tpcdi/generator/manifest.py +15 -2
- benchbox/core/tpcds/benchmark/runner.py +3 -7
- benchbox/core/tpcds/c_tools.py +34 -28
- benchbox/core/tpcds/dataframe_queries/queries.py +44 -21
- benchbox/core/tpcds/generator/filesystem.py +23 -11
- benchbox/core/tpcds/generator/manager.py +3 -2
- benchbox/core/tpcds/maintenance_test.py +281 -0
- benchbox/core/tpcds/power_test.py +21 -11
- benchbox/core/tpcds/throughput_test.py +27 -9
- benchbox/core/tpcds_obt/etl/transformer.py +24 -5
- benchbox/core/tpch/dataframe_queries.py +46 -43
- benchbox/core/tpch/generator.py +21 -8
- benchbox/core/tpch/maintenance_test.py +87 -0
- benchbox/core/tpch/power_test.py +21 -5
- benchbox/core/tpch/queries.py +2 -7
- benchbox/core/tpch/streams.py +3 -19
- benchbox/core/transaction_primitives/benchmark.py +99 -0
- benchbox/core/transaction_primitives/dataframe_operations.py +1294 -0
- benchbox/core/transaction_primitives/generator.py +11 -4
- benchbox/core/visualization/__init__.py +2 -2
- benchbox/core/visualization/charts.py +4 -4
- benchbox/core/visualization/dependencies.py +1 -12
- benchbox/core/visualization/exporters.py +15 -26
- benchbox/core/visualization/result_plotter.py +90 -49
- benchbox/core/visualization/templates.py +6 -6
- benchbox/core/write_primitives/__init__.py +13 -0
- benchbox/core/write_primitives/benchmark.py +66 -0
- benchbox/core/write_primitives/dataframe_operations.py +912 -0
- benchbox/core/write_primitives/generator.py +11 -4
- benchbox/mcp/__init__.py +5 -1
- benchbox/mcp/errors.py +29 -0
- benchbox/mcp/resources/registry.py +12 -7
- benchbox/mcp/schemas.py +62 -0
- benchbox/mcp/server.py +17 -14
- benchbox/mcp/tools/__init__.py +3 -0
- benchbox/mcp/tools/analytics.py +550 -582
- benchbox/mcp/tools/benchmark.py +603 -611
- benchbox/mcp/tools/discovery.py +156 -205
- benchbox/mcp/tools/results.py +332 -533
- benchbox/mcp/tools/visualization.py +449 -0
- benchbox/platforms/__init__.py +740 -622
- benchbox/platforms/adapter_factory.py +6 -6
- benchbox/platforms/azure_synapse.py +3 -7
- benchbox/platforms/base/adapter.py +189 -49
- benchbox/platforms/base/cloud_spark/config.py +8 -0
- benchbox/platforms/base/cloud_spark/mixins.py +96 -0
- benchbox/platforms/base/cloud_spark/session.py +4 -2
- benchbox/platforms/base/cloud_spark/staging.py +15 -7
- benchbox/platforms/base/data_loading.py +315 -1
- benchbox/platforms/base/format_capabilities.py +37 -2
- benchbox/platforms/base/utils.py +6 -4
- benchbox/platforms/bigquery.py +5 -6
- benchbox/platforms/clickhouse_cloud.py +263 -0
- benchbox/platforms/databricks/adapter.py +16 -15
- benchbox/platforms/databricks/dataframe_adapter.py +4 -1
- benchbox/platforms/dataframe/__init__.py +31 -0
- benchbox/platforms/dataframe/benchmark_mixin.py +779 -0
- benchbox/platforms/dataframe/cudf_df.py +3 -3
- benchbox/platforms/dataframe/dask_df.py +3 -3
- benchbox/platforms/dataframe/datafusion_df.py +152 -15
- benchbox/platforms/dataframe/delta_lake_maintenance.py +341 -0
- benchbox/platforms/dataframe/ducklake_maintenance.py +402 -0
- benchbox/platforms/dataframe/expression_family.py +47 -8
- benchbox/platforms/dataframe/hudi_maintenance.py +437 -0
- benchbox/platforms/dataframe/iceberg_maintenance.py +605 -0
- benchbox/platforms/dataframe/modin_df.py +3 -3
- benchbox/platforms/dataframe/pandas_df.py +3 -3
- benchbox/platforms/dataframe/pandas_family.py +59 -8
- benchbox/platforms/dataframe/platform_checker.py +16 -49
- benchbox/platforms/dataframe/polars_df.py +14 -12
- benchbox/platforms/dataframe/polars_maintenance.py +630 -0
- benchbox/platforms/dataframe/pyspark_df.py +15 -0
- benchbox/platforms/dataframe/pyspark_maintenance.py +613 -0
- benchbox/platforms/datafusion.py +5 -6
- benchbox/platforms/duckdb.py +2 -1
- benchbox/platforms/fabric_warehouse.py +15 -15
- benchbox/platforms/firebolt.py +3 -2
- benchbox/platforms/influxdb/adapter.py +7 -3
- benchbox/platforms/motherduck.py +3 -2
- benchbox/platforms/onehouse/__init__.py +39 -0
- benchbox/platforms/onehouse/onehouse_client.py +509 -0
- benchbox/platforms/onehouse/quanton_adapter.py +646 -0
- benchbox/platforms/postgresql.py +5 -9
- benchbox/platforms/presto.py +2 -2
- benchbox/platforms/pyspark/session.py +3 -3
- benchbox/platforms/pyspark/sql_adapter.py +2 -3
- benchbox/platforms/redshift.py +7 -7
- benchbox/platforms/snowflake.py +4 -4
- benchbox/platforms/snowpark_connect.py +2 -1
- benchbox/platforms/trino.py +2 -2
- benchbox/release/__init__.py +17 -0
- benchbox/release/content_validation.py +745 -0
- benchbox/release/workflow.py +17 -0
- benchbox/utils/VERSION_MANAGEMENT.md +1 -1
- benchbox/utils/cloud_storage.py +7 -5
- benchbox/utils/compression.py +8 -8
- benchbox/utils/compression_mixin.py +2 -1
- benchbox/utils/data_validation.py +23 -14
- benchbox/utils/dependencies.py +47 -7
- benchbox/utils/file_format.py +407 -0
- benchbox/utils/format_converters/__init__.py +5 -1
- benchbox/utils/format_converters/ducklake_converter.py +227 -0
- benchbox/utils/format_converters/vortex_converter.py +168 -0
- benchbox/utils/tpc_compilation.py +43 -0
- benchbox/utils/version.py +14 -2
- {benchbox-0.1.0.dist-info → benchbox-0.1.1.dist-info}/METADATA +15 -15
- benchbox-0.1.1.dist-info/RECORD +839 -0
- {benchbox-0.1.0.dist-info → benchbox-0.1.1.dist-info}/WHEEL +1 -1
- benchbox/_binaries/tpc-ds/darwin-arm64/query_templates/sqlserver.tpl +0 -37
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/README +0 -4
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/db2.tpl +0 -38
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/netezza.tpl +0 -38
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/oracle.tpl +0 -38
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query1.tpl +0 -62
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query10.tpl +0 -98
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query11.tpl +0 -119
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query12.tpl +0 -72
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query13.tpl +0 -89
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query15.tpl +0 -56
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query16.tpl +0 -76
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query17.tpl +0 -80
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query18.tpl +0 -73
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query19.tpl +0 -64
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query2.tpl +0 -94
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query20.tpl +0 -67
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query21.tpl +0 -65
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query22.tpl +0 -54
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query23.tpl +0 -144
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query24.tpl +0 -147
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query25.tpl +0 -84
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query26.tpl +0 -61
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query28.tpl +0 -90
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query29.tpl +0 -85
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query3.tpl +0 -58
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query30.tpl +0 -66
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query31.tpl +0 -88
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query32.tpl +0 -65
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query33.tpl +0 -113
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query34.tpl +0 -77
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query35.tpl +0 -98
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query36.tpl +0 -74
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query37.tpl +0 -57
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query38.tpl +0 -58
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query39.tpl +0 -93
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query4.tpl +0 -154
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query40.tpl +0 -63
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query41.tpl +0 -90
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query42.tpl +0 -64
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query43.tpl +0 -55
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query44.tpl +0 -72
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query45.tpl +0 -56
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query46.tpl +0 -78
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query47.tpl +0 -89
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query48.tpl +0 -104
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query49.tpl +0 -164
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query5.tpl +0 -165
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query50.tpl +0 -96
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query51.tpl +0 -80
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query52.tpl +0 -59
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query53.tpl +0 -64
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query54.tpl +0 -95
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query55.tpl +0 -52
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query56.tpl +0 -108
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query57.tpl +0 -87
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query58.tpl +0 -101
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query59.tpl +0 -79
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query6.tpl +0 -62
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query60.tpl +0 -115
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query61.tpl +0 -83
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query62.tpl +0 -71
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query63.tpl +0 -64
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query64.tpl +0 -157
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query65.tpl +0 -62
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query66.tpl +0 -261
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query67.tpl +0 -81
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query68.tpl +0 -82
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query69.tpl +0 -85
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query7.tpl +0 -60
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query70.tpl +0 -73
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query71.tpl +0 -74
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query72.tpl +0 -67
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query73.tpl +0 -69
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query74.tpl +0 -99
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query75.tpl +0 -107
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query76.tpl +0 -64
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query77.tpl +0 -145
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query78.tpl +0 -94
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query79.tpl +0 -60
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query8.tpl +0 -144
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query80.tpl +0 -131
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query81.tpl +0 -68
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query82.tpl +0 -56
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query83.tpl +0 -104
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query84.tpl +0 -58
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query85.tpl +0 -121
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query86.tpl +0 -60
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query87.tpl +0 -56
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query88.tpl +0 -128
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query89.tpl +0 -75
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query9.tpl +0 -88
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query90.tpl +0 -58
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query91.tpl +0 -68
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query92.tpl +0 -68
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query93.tpl +0 -53
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query94.tpl +0 -67
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query95.tpl +0 -71
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query96.tpl +0 -52
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query97.tpl +0 -62
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query98.tpl +0 -70
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/query99.tpl +0 -69
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/sqlserver.tpl +0 -37
- benchbox/_binaries/tpc-ds/darwin-x86_64/query_templates/templates.lst +0 -99
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/README +0 -4
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/ansi.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/db2.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/netezza.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/oracle.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query1.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query11.tpl +0 -119
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query12.tpl +0 -72
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query13.tpl +0 -89
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query14.tpl +0 -247
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query15.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query16.tpl +0 -76
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query17.tpl +0 -80
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query19.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query2.tpl +0 -94
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query20.tpl +0 -67
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query21.tpl +0 -65
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query23.tpl +0 -144
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query24.tpl +0 -147
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query25.tpl +0 -84
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query26.tpl +0 -61
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query27.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query28.tpl +0 -90
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query29.tpl +0 -85
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query3.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query30.tpl +0 -66
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query31.tpl +0 -88
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query32.tpl +0 -65
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query33.tpl +0 -113
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query34.tpl +0 -77
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query37.tpl +0 -57
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query38.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query39.tpl +0 -93
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query4.tpl +0 -154
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query40.tpl +0 -63
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query41.tpl +0 -90
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query42.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query43.tpl +0 -55
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query44.tpl +0 -72
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query45.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query46.tpl +0 -78
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query47.tpl +0 -89
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query48.tpl +0 -104
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query49.tpl +0 -164
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query50.tpl +0 -96
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query52.tpl +0 -59
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query53.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query54.tpl +0 -95
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query55.tpl +0 -52
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query56.tpl +0 -108
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query57.tpl +0 -87
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query58.tpl +0 -101
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query59.tpl +0 -79
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query6.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query60.tpl +0 -115
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query61.tpl +0 -83
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query62.tpl +0 -71
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query63.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query64.tpl +0 -157
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query65.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query66.tpl +0 -261
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query68.tpl +0 -82
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query69.tpl +0 -85
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query7.tpl +0 -60
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query71.tpl +0 -74
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query72.tpl +0 -67
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query73.tpl +0 -69
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query74.tpl +0 -99
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query75.tpl +0 -107
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query76.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query78.tpl +0 -94
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query79.tpl +0 -60
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query8.tpl +0 -144
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query81.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query82.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query83.tpl +0 -104
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query84.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query85.tpl +0 -121
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query87.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query88.tpl +0 -128
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query89.tpl +0 -75
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query9.tpl +0 -88
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query90.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query91.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query92.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query93.tpl +0 -53
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query94.tpl +0 -67
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query95.tpl +0 -71
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query96.tpl +0 -52
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query97.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query98.tpl +0 -70
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/query99.tpl +0 -69
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/sqlserver.tpl +0 -37
- benchbox/_binaries/tpc-ds/linux-arm64/query_templates/templates.lst +0 -99
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/README +0 -4
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/ansi.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/db2.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/netezza.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/oracle.tpl +0 -38
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query1.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query10.tpl +0 -98
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query11.tpl +0 -119
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query12.tpl +0 -72
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query13.tpl +0 -89
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query14.tpl +0 -247
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query15.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query16.tpl +0 -76
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query17.tpl +0 -80
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query18.tpl +0 -73
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query19.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query2.tpl +0 -94
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query20.tpl +0 -67
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query21.tpl +0 -65
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query22.tpl +0 -54
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query23.tpl +0 -144
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query24.tpl +0 -147
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query25.tpl +0 -84
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query26.tpl +0 -61
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query27.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query28.tpl +0 -90
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query29.tpl +0 -85
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query3.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query30.tpl +0 -66
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query31.tpl +0 -88
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query32.tpl +0 -65
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query33.tpl +0 -113
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query34.tpl +0 -77
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query35.tpl +0 -98
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query36.tpl +0 -74
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query37.tpl +0 -57
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query38.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query39.tpl +0 -93
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query4.tpl +0 -154
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query40.tpl +0 -63
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query41.tpl +0 -90
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query42.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query43.tpl +0 -55
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query44.tpl +0 -72
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query45.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query46.tpl +0 -78
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query47.tpl +0 -89
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query48.tpl +0 -104
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query49.tpl +0 -164
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query5.tpl +0 -165
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query50.tpl +0 -96
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query51.tpl +0 -80
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query52.tpl +0 -59
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query53.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query54.tpl +0 -95
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query55.tpl +0 -52
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query56.tpl +0 -108
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query57.tpl +0 -87
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query58.tpl +0 -101
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query59.tpl +0 -79
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query6.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query60.tpl +0 -115
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query61.tpl +0 -83
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query62.tpl +0 -71
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query63.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query64.tpl +0 -157
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query65.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query66.tpl +0 -261
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query67.tpl +0 -81
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query68.tpl +0 -82
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query69.tpl +0 -85
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query7.tpl +0 -60
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query70.tpl +0 -73
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query71.tpl +0 -74
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query72.tpl +0 -67
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query73.tpl +0 -69
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query74.tpl +0 -99
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query75.tpl +0 -107
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query76.tpl +0 -64
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query77.tpl +0 -145
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query78.tpl +0 -94
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query79.tpl +0 -60
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query8.tpl +0 -144
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query80.tpl +0 -131
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query81.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query82.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query83.tpl +0 -104
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query84.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query85.tpl +0 -121
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query86.tpl +0 -60
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query87.tpl +0 -56
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query88.tpl +0 -128
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query89.tpl +0 -75
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query9.tpl +0 -88
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query90.tpl +0 -58
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query91.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query92.tpl +0 -68
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query93.tpl +0 -53
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query94.tpl +0 -67
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query95.tpl +0 -71
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query96.tpl +0 -52
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query97.tpl +0 -62
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query98.tpl +0 -70
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/query99.tpl +0 -69
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/sqlserver.tpl +0 -37
- benchbox/_binaries/tpc-ds/linux-x86_64/query_templates/templates.lst +0 -99
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/README +0 -4
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/ansi.tpl +0 -38
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/db2.tpl +0 -38
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/netezza.tpl +0 -38
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/oracle.tpl +0 -38
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query1.tpl +0 -62
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query10.tpl +0 -98
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query11.tpl +0 -119
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query12.tpl +0 -72
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query13.tpl +0 -89
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query14.tpl +0 -247
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query15.tpl +0 -56
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query16.tpl +0 -76
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query17.tpl +0 -80
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query18.tpl +0 -73
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query19.tpl +0 -64
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query2.tpl +0 -94
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query20.tpl +0 -67
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query21.tpl +0 -65
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query22.tpl +0 -54
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query23.tpl +0 -144
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query24.tpl +0 -147
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query25.tpl +0 -84
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query26.tpl +0 -61
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query27.tpl +0 -68
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query28.tpl +0 -90
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query29.tpl +0 -85
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query3.tpl +0 -58
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query30.tpl +0 -66
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query31.tpl +0 -88
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query32.tpl +0 -65
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query33.tpl +0 -113
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query34.tpl +0 -77
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query35.tpl +0 -98
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query36.tpl +0 -74
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query37.tpl +0 -57
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query38.tpl +0 -58
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query39.tpl +0 -93
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query4.tpl +0 -154
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query40.tpl +0 -63
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query41.tpl +0 -90
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query42.tpl +0 -64
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query43.tpl +0 -55
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query44.tpl +0 -72
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query45.tpl +0 -56
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query46.tpl +0 -78
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query47.tpl +0 -89
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query48.tpl +0 -104
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query49.tpl +0 -164
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query5.tpl +0 -165
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query50.tpl +0 -96
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query51.tpl +0 -80
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query52.tpl +0 -59
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query53.tpl +0 -64
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query54.tpl +0 -95
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query55.tpl +0 -52
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query56.tpl +0 -108
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query57.tpl +0 -87
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query58.tpl +0 -101
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query59.tpl +0 -79
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query6.tpl +0 -62
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query60.tpl +0 -115
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query61.tpl +0 -83
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query62.tpl +0 -71
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query63.tpl +0 -64
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query64.tpl +0 -157
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query65.tpl +0 -62
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query66.tpl +0 -261
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query67.tpl +0 -81
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query68.tpl +0 -82
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query69.tpl +0 -85
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query7.tpl +0 -60
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query70.tpl +0 -73
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query71.tpl +0 -74
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query72.tpl +0 -67
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query73.tpl +0 -69
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query74.tpl +0 -99
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query75.tpl +0 -107
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query76.tpl +0 -64
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query77.tpl +0 -145
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query78.tpl +0 -94
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query79.tpl +0 -60
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query8.tpl +0 -144
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query80.tpl +0 -131
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query81.tpl +0 -68
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query82.tpl +0 -56
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query83.tpl +0 -104
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query84.tpl +0 -58
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query85.tpl +0 -121
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query86.tpl +0 -60
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query87.tpl +0 -56
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query88.tpl +0 -128
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query89.tpl +0 -75
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query9.tpl +0 -88
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query90.tpl +0 -58
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query91.tpl +0 -68
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query92.tpl +0 -68
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query93.tpl +0 -53
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query94.tpl +0 -67
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query95.tpl +0 -71
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query96.tpl +0 -52
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query97.tpl +0 -62
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query98.tpl +0 -70
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/query99.tpl +0 -69
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/sqlserver.tpl +0 -37
- benchbox/_binaries/tpc-ds/windows-x86_64/query_templates/templates.lst +0 -99
- benchbox-0.1.0.dist-info/RECORD +0 -1192
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/README +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/ansi.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/db2.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/netezza.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/oracle.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query1.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query10.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query11.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query12.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query13.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query14.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query15.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query16.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query17.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query18.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query19.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query2.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query20.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query21.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query22.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query23.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query24.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query25.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query26.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query27.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query28.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query29.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query3.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query30.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query31.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query32.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query33.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query34.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query35.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query36.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query37.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query38.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query39.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query4.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query40.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query41.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query42.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query43.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query44.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query45.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query46.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query47.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query48.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query49.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query5.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query50.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query51.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query52.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query53.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query54.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query55.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query56.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query57.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query58.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query59.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query6.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query60.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query61.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query62.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query63.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query64.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query65.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query66.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query67.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query68.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query69.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query7.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query70.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query71.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query73.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query74.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query75.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query76.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query77.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query78.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query79.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query8.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query80.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query81.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query82.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query83.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query84.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query85.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query86.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query87.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query88.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query89.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query9.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query90.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query91.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query92.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query93.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query94.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query95.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query96.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query97.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query98.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/query99.tpl +0 -0
- /benchbox/_binaries/tpc-ds/{darwin-arm64 → templates}/query_templates/templates.lst +0 -0
- {benchbox-0.1.0.dist-info → benchbox-0.1.1.dist-info}/entry_points.txt +0 -0
- {benchbox-0.1.0.dist-info → benchbox-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {benchbox-0.1.0.dist-info → benchbox-0.1.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1824 @@
|
|
|
1
|
+
"""DataFrame operations for Metadata Primitives benchmark.
|
|
2
|
+
|
|
3
|
+
This module provides DataFrame implementations of metadata introspection operations,
|
|
4
|
+
enabling benchmarking of schema discovery, column introspection, and table statistics
|
|
5
|
+
on DataFrame platforms like Polars, PySpark, and Pandas.
|
|
6
|
+
|
|
7
|
+
Platform Support:
|
|
8
|
+
- Polars: Schema introspection (df.schema, df.dtypes, df.describe())
|
|
9
|
+
- Pandas: Schema introspection (df.dtypes, df.info(), df.describe())
|
|
10
|
+
- PySpark: Full catalog support via spark.catalog API + Delta Lake/Iceberg metadata

The operations are organized into categories based on capability level:
- Schema Introspection: Available on all platforms
- Catalog Operations: PySpark with configured catalog only
- Lakehouse Metadata: Delta Lake and Iceberg table formats

Copyright 2026 Joe Harris / BenchBox Project

Licensed under the MIT License. See LICENSE file in the project root for details.
"""

from __future__ import annotations

import logging
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

logger = logging.getLogger(__name__)


class MetadataOperationType(Enum):
    """Types of metadata operations supported by the benchmark.

    Operations are grouped by platform capability level:
    - Schema introspection: All DataFrame platforms
    - Catalog operations: PySpark with catalog
    - Lakehouse metadata: Delta Lake / Iceberg
    """

    # Schema introspection (all platforms)
    LIST_COLUMNS = "list_columns"
    GET_DTYPES = "get_dtypes"
    GET_SCHEMA = "get_schema"
    DESCRIBE_STATS = "describe_stats"
    ROW_COUNT = "row_count"
    COLUMN_COUNT = "column_count"

    # Catalog operations (PySpark with catalog)
    LIST_DATABASES = "list_databases"
    LIST_TABLES = "list_tables"
    LIST_TABLE_COLUMNS = "list_table_columns"
    TABLE_EXISTS = "table_exists"
    GET_TABLE_INFO = "get_table_info"

    # Lakehouse metadata (Delta Lake / Iceberg)
    TABLE_HISTORY = "table_history"
    TABLE_DETAIL = "table_detail"
    FILE_METADATA = "file_metadata"
    PARTITION_INFO = "partition_info"
    SNAPSHOT_INFO = "snapshot_info"

    # Complexity testing
    WIDE_TABLE_SCHEMA = "wide_table_schema"
    LARGE_CATALOG_LIST = "large_catalog_list"
    COMPLEX_TYPE_INTROSPECTION = "complex_type_introspection"


class MetadataOperationCategory(Enum):
    """Categories of metadata operations."""

    SCHEMA = "schema"  # Schema introspection (all platforms)
    CATALOG = "catalog"  # Catalog operations (PySpark)
    LAKEHOUSE = "lakehouse"  # Delta Lake / Iceberg metadata
    COMPLEXITY = "complexity"  # Complexity stress testing


# Mapping of operation types to categories
OPERATION_CATEGORIES: dict[MetadataOperationType, MetadataOperationCategory] = {
    # Schema operations
    MetadataOperationType.LIST_COLUMNS: MetadataOperationCategory.SCHEMA,
    MetadataOperationType.GET_DTYPES: MetadataOperationCategory.SCHEMA,
    MetadataOperationType.GET_SCHEMA: MetadataOperationCategory.SCHEMA,
    MetadataOperationType.DESCRIBE_STATS: MetadataOperationCategory.SCHEMA,
    MetadataOperationType.ROW_COUNT: MetadataOperationCategory.SCHEMA,
    MetadataOperationType.COLUMN_COUNT: MetadataOperationCategory.SCHEMA,
    # Catalog operations
    MetadataOperationType.LIST_DATABASES: MetadataOperationCategory.CATALOG,
    MetadataOperationType.LIST_TABLES: MetadataOperationCategory.CATALOG,
    MetadataOperationType.LIST_TABLE_COLUMNS: MetadataOperationCategory.CATALOG,
    MetadataOperationType.TABLE_EXISTS: MetadataOperationCategory.CATALOG,
    MetadataOperationType.GET_TABLE_INFO: MetadataOperationCategory.CATALOG,
    # Lakehouse operations
    MetadataOperationType.TABLE_HISTORY: MetadataOperationCategory.LAKEHOUSE,
    MetadataOperationType.TABLE_DETAIL: MetadataOperationCategory.LAKEHOUSE,
    MetadataOperationType.FILE_METADATA: MetadataOperationCategory.LAKEHOUSE,
    MetadataOperationType.PARTITION_INFO: MetadataOperationCategory.LAKEHOUSE,
    MetadataOperationType.SNAPSHOT_INFO: MetadataOperationCategory.LAKEHOUSE,
    # Complexity operations
    MetadataOperationType.WIDE_TABLE_SCHEMA: MetadataOperationCategory.COMPLEXITY,
    MetadataOperationType.LARGE_CATALOG_LIST: MetadataOperationCategory.COMPLEXITY,
    MetadataOperationType.COMPLEX_TYPE_INTROSPECTION: MetadataOperationCategory.COMPLEXITY,
}
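

# Illustrative sketch (editorial addition, not part of the released module): the
# category mapping above can be inverted to drive per-category reporting. This
# uses only names defined earlier in this file.
def _example_operations_by_category() -> dict[MetadataOperationCategory, list[MetadataOperationType]]:
    """Group every operation type under its category (hypothetical helper)."""
    grouped: dict[MetadataOperationCategory, list[MetadataOperationType]] = {}
    for op, cat in OPERATION_CATEGORIES.items():
        grouped.setdefault(cat, []).append(op)
    return grouped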


@dataclass
class DataFrameMetadataCapabilities:
    """Platform capabilities for DataFrame metadata operations.

    Different DataFrame platforms have varying levels of metadata introspection
    support. This dataclass captures what operations each platform can perform.

    Attributes:
        platform_name: Name of the platform (e.g., "polars-df", "pyspark-df")
        supports_schema_introspection: Can introspect DataFrame schema/dtypes
        supports_describe: Can compute summary statistics
        supports_catalog: Has catalog API (list databases, tables)
        supports_delta_lake: Has Delta Lake metadata capabilities
        supports_iceberg: Has Iceberg metadata capabilities
        supports_partitions: Can introspect partition information
        supports_complex_types: Can introspect nested/complex types
        notes: Platform-specific notes
    """

    platform_name: str
    supports_schema_introspection: bool = True  # All platforms support basic schema
    supports_describe: bool = True  # Most platforms support describe
    supports_catalog: bool = False  # Only PySpark with catalog
    supports_delta_lake: bool = False  # Requires delta-spark or deltalake
    supports_iceberg: bool = False  # Requires iceberg-spark or pyiceberg
    supports_partitions: bool = False  # PySpark, some lakehouse formats
    supports_complex_types: bool = True  # Most platforms handle complex types
    notes: str = ""

    def supports_operation(self, operation: MetadataOperationType) -> bool:
        """Check if an operation type is supported.

        Args:
            operation: The operation type to check

        Returns:
            True if the operation is supported
        """
        # Schema operations - all platforms
        if operation in (
            MetadataOperationType.LIST_COLUMNS,
            MetadataOperationType.GET_DTYPES,
            MetadataOperationType.GET_SCHEMA,
            MetadataOperationType.ROW_COUNT,
            MetadataOperationType.COLUMN_COUNT,
        ):
            return self.supports_schema_introspection

        if operation == MetadataOperationType.DESCRIBE_STATS:
            return self.supports_describe

        # Catalog operations - PySpark with catalog
        if operation in (
            MetadataOperationType.LIST_DATABASES,
            MetadataOperationType.LIST_TABLES,
            MetadataOperationType.LIST_TABLE_COLUMNS,
            MetadataOperationType.TABLE_EXISTS,
            MetadataOperationType.GET_TABLE_INFO,
        ):
            return self.supports_catalog

        # Lakehouse operations - Delta Lake / Iceberg
        if operation in (
            MetadataOperationType.TABLE_HISTORY,
            MetadataOperationType.TABLE_DETAIL,
        ):
            return self.supports_delta_lake

        if operation == MetadataOperationType.SNAPSHOT_INFO:
            return self.supports_iceberg

        if operation == MetadataOperationType.FILE_METADATA:
            return self.supports_delta_lake or self.supports_iceberg

        if operation == MetadataOperationType.PARTITION_INFO:
            return self.supports_partitions

        # Complexity operations
        if operation == MetadataOperationType.WIDE_TABLE_SCHEMA:
            return self.supports_schema_introspection

        if operation == MetadataOperationType.LARGE_CATALOG_LIST:
            return self.supports_catalog

        if operation == MetadataOperationType.COMPLEX_TYPE_INTROSPECTION:
            return self.supports_complex_types

        return False

    def get_supported_operations(self) -> list[MetadataOperationType]:
        """Get list of operations supported by this platform.

        Returns:
            List of supported MetadataOperationType values
        """
        return [op for op in MetadataOperationType if self.supports_operation(op)]

    def get_unsupported_operations(self) -> list[MetadataOperationType]:
        """Get list of operations not supported by this platform.

        Returns:
            List of unsupported MetadataOperationType values
        """
        return [op for op in MetadataOperationType if not self.supports_operation(op)]

    def get_supported_categories(self) -> list[MetadataOperationCategory]:
        """Get list of operation categories supported by this platform.

        Returns:
            List of supported MetadataOperationCategory values
        """
        categories = set()
        for op in self.get_supported_operations():
            if op in OPERATION_CATEGORIES:
                categories.add(OPERATION_CATEGORIES[op])
        return sorted(categories, key=lambda c: c.value)
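

# Illustrative sketch (editorial addition): constructing a capabilities record by
# hand and querying it. The platform name "duckdb-df" is hypothetical, chosen only
# to exercise the dataclass defaults.
def _example_capability_check() -> list[MetadataOperationCategory]:
    caps = DataFrameMetadataCapabilities(platform_name="duckdb-df")
    assert caps.supports_operation(MetadataOperationType.GET_SCHEMA)  # schema ops default to True
    assert not caps.supports_operation(MetadataOperationType.LIST_TABLES)  # catalog ops default to False
    return caps.get_supported_categories()  # -> [COMPLEXITY, SCHEMA] under the defaults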


# Pre-defined capabilities for common DataFrame platforms
POLARS_METADATA_CAPABILITIES = DataFrameMetadataCapabilities(
    platform_name="polars-df",
    supports_schema_introspection=True,
    supports_describe=True,
    supports_catalog=False,
    supports_delta_lake=False,  # Can read Delta via polars, limited metadata
    supports_iceberg=False,
    supports_partitions=False,
    supports_complex_types=True,
    notes="Schema introspection via df.schema, df.dtypes. No catalog support.",
)

PANDAS_METADATA_CAPABILITIES = DataFrameMetadataCapabilities(
    platform_name="pandas-df",
    supports_schema_introspection=True,
    supports_describe=True,
    supports_catalog=False,
    supports_delta_lake=False,
    supports_iceberg=False,
    supports_partitions=False,
    supports_complex_types=False,  # Pandas has limited nested type support
    notes="Schema introspection via df.dtypes, df.info(). No catalog support.",
)

PYSPARK_METADATA_CAPABILITIES = DataFrameMetadataCapabilities(
    platform_name="pyspark-df",
    supports_schema_introspection=True,
    supports_describe=True,
    supports_catalog=True,  # spark.catalog API
    supports_delta_lake=False,  # Set at runtime based on available packages
    supports_iceberg=False,  # Set at runtime based on available packages
    supports_partitions=True,
    supports_complex_types=True,
    notes="Full catalog support via spark.catalog. Delta Lake/Iceberg require packages.",
)

DATAFUSION_METADATA_CAPABILITIES = DataFrameMetadataCapabilities(
    platform_name="datafusion-df",
    supports_schema_introspection=True,
    supports_describe=True,
    supports_catalog=False,  # Limited catalog in standalone mode
    supports_delta_lake=False,
    supports_iceberg=False,
    supports_partitions=False,
    supports_complex_types=True,
    notes="Schema introspection via DataFrame schema. Limited catalog support.",
)


def get_platform_capabilities(platform_name: str, **kwargs: Any) -> DataFrameMetadataCapabilities:
    """Get metadata capabilities for a platform.

    Args:
        platform_name: Platform name (e.g., "polars-df", "pyspark-df")
        **kwargs: Optional overrides (e.g., supports_delta_lake=True)

    Returns:
        DataFrameMetadataCapabilities for the platform
    """
    platform_lower = platform_name.lower()

    # Select base capabilities
    if "polars" in platform_lower:
        base = POLARS_METADATA_CAPABILITIES
    elif "pandas" in platform_lower:
        base = PANDAS_METADATA_CAPABILITIES
    elif "pyspark" in platform_lower or "spark" in platform_lower:
        base = PYSPARK_METADATA_CAPABILITIES
    elif "datafusion" in platform_lower:
        base = DATAFUSION_METADATA_CAPABILITIES
    else:
        # Generic capabilities
        base = DataFrameMetadataCapabilities(
            platform_name=platform_name,
            supports_schema_introspection=True,
            supports_describe=True,
            notes="Unknown platform - basic schema introspection only.",
        )

    # Apply overrides if any
    if kwargs:
        return DataFrameMetadataCapabilities(
            platform_name=platform_name,
            supports_schema_introspection=kwargs.get(
                "supports_schema_introspection", base.supports_schema_introspection
            ),
            supports_describe=kwargs.get("supports_describe", base.supports_describe),
            supports_catalog=kwargs.get("supports_catalog", base.supports_catalog),
            supports_delta_lake=kwargs.get("supports_delta_lake", base.supports_delta_lake),
            supports_iceberg=kwargs.get("supports_iceberg", base.supports_iceberg),
            supports_partitions=kwargs.get("supports_partitions", base.supports_partitions),
            supports_complex_types=kwargs.get("supports_complex_types", base.supports_complex_types),
            notes=kwargs.get("notes", base.notes),
        )

    return base
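

# Illustrative sketch (editorial addition): the kwargs path above lets a caller
# record that delta-spark is on the classpath without redefining the whole
# PySpark profile.
def _example_platform_lookup() -> DataFrameMetadataCapabilities:
    caps = get_platform_capabilities("pyspark-df", supports_delta_lake=True)
    assert caps.supports_catalog  # inherited from PYSPARK_METADATA_CAPABILITIES
    assert caps.supports_delta_lake  # applied via the override
    return caps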


@dataclass
class DataFrameMetadataResult:
    """Result of a DataFrame metadata operation.

    Attributes:
        operation_type: Type of metadata operation
        success: Whether the operation completed successfully
        start_time: Operation start timestamp (Unix time)
        end_time: Operation end timestamp (Unix time)
        duration_ms: Operation duration in milliseconds
        result_count: Number of items returned (columns, tables, etc.)
        result_data: The actual metadata result (schema dict, column list, etc.)
        error_message: Error description if operation failed
        metrics: Additional platform-specific metrics
    """

    operation_type: MetadataOperationType
    success: bool
    start_time: float
    end_time: float
    duration_ms: float
    result_count: int = 0
    result_data: Any = None
    error_message: str | None = None
    metrics: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def success_result(
        cls,
        operation_type: MetadataOperationType,
        start_time: float,
        result_count: int,
        result_data: Any = None,
        metrics: dict[str, Any] | None = None,
    ) -> DataFrameMetadataResult:
        """Create a successful result.

        Args:
            operation_type: The operation that completed
            start_time: When the operation started
            result_count: Number of items in the result
            result_data: The actual result data
            metrics: Additional metrics

        Returns:
            DataFrameMetadataResult indicating success
        """
        end_time = time.time()
        return cls(
            operation_type=operation_type,
            success=True,
            start_time=start_time,
            end_time=end_time,
            duration_ms=(end_time - start_time) * 1000,
            result_count=result_count,
            result_data=result_data,
            metrics=metrics or {},
        )

    @classmethod
    def failure_result(
        cls,
        operation_type: MetadataOperationType,
        error_message: str,
        start_time: float | None = None,
    ) -> DataFrameMetadataResult:
        """Create a failure result.

        Args:
            operation_type: The operation that failed
            error_message: Description of the failure
            start_time: Optional start time (defaults to now)

        Returns:
            DataFrameMetadataResult indicating failure
        """
        now = time.time()
        return cls(
            operation_type=operation_type,
            success=False,
            start_time=start_time or now,
            end_time=now,
            duration_ms=0.0 if start_time is None else (now - start_time) * 1000,
            result_count=0,
            error_message=error_message,
        )
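

# Illustrative sketch (editorial addition): the timing pattern the two
# constructors are designed for. Capture time.time() once so duration_ms covers
# the whole call; fall back to failure_result with the same start timestamp.
def _example_timed_operation(columns: list[str]) -> DataFrameMetadataResult:
    start = time.time()
    try:
        return DataFrameMetadataResult.success_result(
            operation_type=MetadataOperationType.LIST_COLUMNS,
            start_time=start,
            result_count=len(columns),
            result_data=columns,
        )
    except Exception as exc:
        return DataFrameMetadataResult.failure_result(MetadataOperationType.LIST_COLUMNS, str(exc), start)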


class UnsupportedOperationError(Exception):
    """Raised when a metadata operation is not supported on the current platform.

    Provides a helpful error message with alternatives.
    """

    def __init__(
        self,
        operation: MetadataOperationType,
        platform_name: str,
        suggestion: str | None = None,
    ):
        self.operation = operation
        self.platform_name = platform_name
        self.suggestion = suggestion

        message = f"{platform_name} does not support {operation.value} operations."
        if suggestion:
            message += f"\n{suggestion}"

        super().__init__(message)
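

# Illustrative sketch (editorial addition): benchmark drivers that probe optional
# capabilities can treat this exception as a skip signal rather than a failure.
# DataFrameMetadataOperationsManager is defined later in this module.
def _example_guarded_call(manager: "DataFrameMetadataOperationsManager") -> "DataFrameMetadataResult | None":
    try:
        return manager.execute_list_databases()
    except UnsupportedOperationError as exc:
        logger.info("Skipping catalog benchmark: %s", exc)
        return None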


def get_unsupported_message(operation: MetadataOperationType, platform_name: str) -> str:
    """Get helpful error message for unsupported operations.

    Args:
        operation: The unsupported operation
        platform_name: The platform name

    Returns:
        Helpful error message with alternatives
    """
    category = OPERATION_CATEGORIES.get(operation, MetadataOperationCategory.SCHEMA)

    if category == MetadataOperationCategory.CATALOG:
        return (
            f"{platform_name} does not support catalog operations ({operation.value}).\n"
            f"Catalog operations require PySpark with a configured catalog.\n"
            f"Alternatives:\n"
            f"  - Use schema introspection operations (df.schema, df.dtypes)\n"
            f"  - Switch to pyspark-df platform with Hive metastore or Unity Catalog"
        )

    if category == MetadataOperationCategory.LAKEHOUSE:
        if "delta" in operation.value.lower() or operation in (
            MetadataOperationType.TABLE_HISTORY,
            MetadataOperationType.TABLE_DETAIL,
        ):
            return (
                f"{platform_name} does not support Delta Lake metadata operations ({operation.value}).\n"
                f"Delta Lake operations require:\n"
                f"  - pyspark-df with delta-spark package, or\n"
                f"  - polars with delta support (read-only)"
            )
        if "iceberg" in operation.value.lower() or operation == MetadataOperationType.SNAPSHOT_INFO:
            return (
                f"{platform_name} does not support Iceberg metadata operations ({operation.value}).\n"
                f"Iceberg operations require:\n"
                f"  - pyspark-df with iceberg-spark package, or\n"
                f"  - pyiceberg library"
            )
        return (
            f"{platform_name} does not support lakehouse metadata operations ({operation.value}).\n"
            f"Use pyspark-df with Delta Lake or Iceberg table format."
        )

    return f"{platform_name} does not support {operation.value} operations."


class DataFrameMetadataOperationsManager:
    """Manager for DataFrame metadata introspection operations.

    Provides a unified interface for metadata operations across DataFrame platforms.
    Handles platform capability detection and provides helpful error messages for
    unsupported operations.

    Example:
        manager = DataFrameMetadataOperationsManager("polars-df")

        # Check capabilities
        if manager.supports_operation(MetadataOperationType.GET_SCHEMA):
            result = manager.execute_get_schema(df)

        # Get all supported operations
        ops = manager.get_supported_operations()
    """

    def __init__(
        self,
        platform_name: str,
        spark_session: Any = None,
        delta_available: bool | None = None,
        iceberg_available: bool | None = None,
    ) -> None:
        """Initialize the metadata operations manager.

        Args:
            platform_name: Platform name (e.g., "polars-df", "pyspark-df")
            spark_session: SparkSession instance (required for pyspark-df catalog ops)
            delta_available: Override for Delta Lake availability detection
            iceberg_available: Override for Iceberg availability detection

        Raises:
            ValueError: If platform is not supported for DataFrame operations
        """
        self.platform_name = platform_name.lower()
        self.spark_session = spark_session
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Build capabilities with optional overrides
        overrides = {}

        if "pyspark" in self.platform_name or "spark" in self.platform_name:
            # Detect Delta Lake / Iceberg availability
            if delta_available is None:
                delta_available = self._detect_delta_lake()
            if iceberg_available is None:
                iceberg_available = self._detect_iceberg()

            overrides["supports_delta_lake"] = delta_available
            overrides["supports_iceberg"] = iceberg_available

        self._capabilities = get_platform_capabilities(self.platform_name, **overrides)

    def _detect_delta_lake(self) -> bool:
        """Detect if Delta Lake is available.

        Returns:
            True if delta-spark or deltalake is available
        """
        try:
            import delta  # noqa: F401

            return True
        except ImportError:
            pass

        try:
            import deltalake  # noqa: F401

            return True
        except ImportError:
            pass

        return False

    def _detect_iceberg(self) -> bool:
        """Detect if Iceberg is available.

        Returns:
            True if iceberg-spark or pyiceberg is available
        """
        try:
            import pyiceberg  # noqa: F401

            return True
        except ImportError:
            pass

        # Check for iceberg-spark via SparkSession
        if self.spark_session is not None:
            try:
                # Check if iceberg catalog is configured
                catalogs = self.spark_session.conf.get("spark.sql.catalog", "")
                if "iceberg" in catalogs.lower():
                    return True
            except Exception:
                pass

        return False

    def get_capabilities(self) -> DataFrameMetadataCapabilities:
        """Get platform metadata capabilities.

        Returns:
            DataFrameMetadataCapabilities for this platform
        """
        return self._capabilities

    def supports_operation(self, operation: MetadataOperationType) -> bool:
        """Check if an operation type is supported.

        Args:
            operation: The operation to check

        Returns:
            True if supported
        """
        return self._capabilities.supports_operation(operation)

    def get_supported_operations(self) -> list[MetadataOperationType]:
        """Get list of supported operations.

        Returns:
            List of supported MetadataOperationType values
        """
        return self._capabilities.get_supported_operations()

    def validate_operation(self, operation: MetadataOperationType) -> None:
        """Validate that an operation is supported.

        Args:
            operation: The operation to validate

        Raises:
            UnsupportedOperationError: If operation is not supported
        """
        if not self.supports_operation(operation):
            raise UnsupportedOperationError(
                operation=operation,
                platform_name=self.platform_name,
                suggestion=get_unsupported_message(operation, self.platform_name),
            )

    # =========================================================================
    # Schema Introspection Operations (All Platforms)
    # =========================================================================

    def execute_list_columns(self, dataframe: Any) -> DataFrameMetadataResult:
        """List column names from a DataFrame.

        Args:
            dataframe: The DataFrame to introspect

        Returns:
            DataFrameMetadataResult with column names
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.LIST_COLUMNS)

            if "polars" in self.platform_name:
                columns = dataframe.columns
            elif "pandas" in self.platform_name:
                columns = list(dataframe.columns)
            elif "pyspark" in self.platform_name or "spark" in self.platform_name:
                columns = dataframe.columns
            elif "datafusion" in self.platform_name:
                columns = [field.name for field in dataframe.schema()]
            else:
                # Generic fallback
                columns = list(getattr(dataframe, "columns", []))

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.LIST_COLUMNS,
                start_time=start_time,
                result_count=len(columns),
                result_data=columns,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"LIST_COLUMNS failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.LIST_COLUMNS,
                str(e),
                start_time,
            )
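
    # Illustrative usage sketch (editorial addition, commented out because it
    # needs the optional polars dependency):
    #
    #     import polars as pl
    #     df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    #     manager = DataFrameMetadataOperationsManager("polars-df")
    #     manager.execute_list_columns(df).result_data  # -> ["id", "name"]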

    def execute_get_dtypes(self, dataframe: Any) -> DataFrameMetadataResult:
        """Get data types for all columns.

        Args:
            dataframe: The DataFrame to introspect

        Returns:
            DataFrameMetadataResult with column name to dtype mapping
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.GET_DTYPES)

            if "polars" in self.platform_name:
                dtypes = {col: str(dtype) for col, dtype in zip(dataframe.columns, dataframe.dtypes)}
            elif "pandas" in self.platform_name:
                dtypes = {col: str(dtype) for col, dtype in dataframe.dtypes.items()}
            elif "pyspark" in self.platform_name or "spark" in self.platform_name:
                dtypes = {field.name: str(field.dataType) for field in dataframe.schema.fields}
            elif "datafusion" in self.platform_name:
                dtypes = {field.name: str(field.type) for field in dataframe.schema()}
            else:
                # Generic fallback
                dtypes = {}
                if hasattr(dataframe, "dtypes"):
                    dtypes = dict(dataframe.dtypes)

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.GET_DTYPES,
                start_time=start_time,
                result_count=len(dtypes),
                result_data=dtypes,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"GET_DTYPES failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.GET_DTYPES,
                str(e),
                start_time,
            )

    def execute_get_schema(self, dataframe: Any) -> DataFrameMetadataResult:
        """Get full schema information for a DataFrame.

        Args:
            dataframe: The DataFrame to introspect

        Returns:
            DataFrameMetadataResult with schema dict (name, dtype, nullable)
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.GET_SCHEMA)

            schema_info: list[dict[str, Any]] = []

            if "polars" in self.platform_name:
                for col, dtype in zip(dataframe.columns, dataframe.dtypes):
                    schema_info.append(
                        {
                            "name": col,
                            "dtype": str(dtype),
                            "nullable": True,  # Polars columns are nullable by default
                        }
                    )
            elif "pandas" in self.platform_name:
                for col, dtype in dataframe.dtypes.items():
                    schema_info.append(
                        {
                            "name": col,
                            "dtype": str(dtype),
                            "nullable": dataframe[col].isna().any(),
                        }
                    )
            elif "pyspark" in self.platform_name or "spark" in self.platform_name:
                for field in dataframe.schema.fields:
                    schema_info.append(
                        {
                            "name": field.name,
                            "dtype": str(field.dataType),
                            "nullable": field.nullable,
                        }
                    )
            elif "datafusion" in self.platform_name:
                for field in dataframe.schema():
                    schema_info.append(
                        {
                            "name": field.name,
                            "dtype": str(field.type),
                            "nullable": field.is_nullable,
                        }
                    )
            else:
                # Generic fallback
                if hasattr(dataframe, "columns") and hasattr(dataframe, "dtypes"):
                    for col, dtype in zip(dataframe.columns, dataframe.dtypes):
                        schema_info.append({"name": col, "dtype": str(dtype), "nullable": True})

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.GET_SCHEMA,
                start_time=start_time,
                result_count=len(schema_info),
                result_data=schema_info,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"GET_SCHEMA failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.GET_SCHEMA,
                str(e),
                start_time,
            )

    def execute_describe_stats(self, dataframe: Any) -> DataFrameMetadataResult:
        """Get summary statistics for a DataFrame.

        Args:
            dataframe: The DataFrame to introspect

        Returns:
            DataFrameMetadataResult with statistics DataFrame/dict
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.DESCRIBE_STATS)

            if "polars" in self.platform_name:
                stats_df = dataframe.describe()
                result_data = stats_df.to_dicts()
                result_count = stats_df.height
            elif "pandas" in self.platform_name:
                stats_df = dataframe.describe()
                result_data = stats_df.to_dict()
                result_count = len(stats_df)
            elif "pyspark" in self.platform_name or "spark" in self.platform_name:
                stats_df = dataframe.describe()
                result_data = stats_df.collect()
                result_count = stats_df.count()
            elif "datafusion" in self.platform_name:
                # DataFusion doesn't have describe() yet
                result_data = None
                result_count = 0
            else:
                result_data = None
                result_count = 0

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.DESCRIBE_STATS,
                start_time=start_time,
                result_count=result_count,
                result_data=result_data,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"DESCRIBE_STATS failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.DESCRIBE_STATS,
                str(e),
                start_time,
            )

    def execute_row_count(self, dataframe: Any) -> DataFrameMetadataResult:
        """Get row count for a DataFrame.

        Note: This may require a full scan on lazy/distributed platforms.

        Args:
            dataframe: The DataFrame to count

        Returns:
            DataFrameMetadataResult with row count
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.ROW_COUNT)

            if "polars" in self.platform_name:
                count = dataframe.height
            elif "pandas" in self.platform_name:
                count = len(dataframe)
            elif "pyspark" in self.platform_name or "spark" in self.platform_name or "datafusion" in self.platform_name:
                count = dataframe.count()
            else:
                count = len(dataframe) if hasattr(dataframe, "__len__") else 0

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.ROW_COUNT,
                start_time=start_time,
                result_count=1,
                result_data=count,
                metrics={"row_count": count},
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"ROW_COUNT failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.ROW_COUNT,
                str(e),
                start_time,
            )

    def execute_column_count(self, dataframe: Any) -> DataFrameMetadataResult:
        """Get column count for a DataFrame.

        Args:
            dataframe: The DataFrame to count

        Returns:
            DataFrameMetadataResult with column count
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.COLUMN_COUNT)

            if "polars" in self.platform_name:
                count = dataframe.width
            elif "pandas" in self.platform_name or "pyspark" in self.platform_name or "spark" in self.platform_name:
                count = len(dataframe.columns)
            elif "datafusion" in self.platform_name:
                count = len(dataframe.schema())
            else:
                count = len(getattr(dataframe, "columns", []))

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.COLUMN_COUNT,
                start_time=start_time,
                result_count=1,
                result_data=count,
                metrics={"column_count": count},
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"COLUMN_COUNT failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.COLUMN_COUNT,
                str(e),
                start_time,
            )

    # =========================================================================
    # Catalog Operations (PySpark with Catalog)
    # =========================================================================

    def execute_list_databases(self) -> DataFrameMetadataResult:
        """List all databases in the catalog.

        Requires PySpark with configured catalog.

        Returns:
            DataFrameMetadataResult with database names
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.LIST_DATABASES)

            if self.spark_session is None:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.LIST_DATABASES,
                    "SparkSession is required for catalog operations. "
                    "Pass spark_session to DataFrameMetadataOperationsManager.",
                    start_time,
                )

            databases = self.spark_session.catalog.listDatabases()
            db_names = [db.name for db in databases]

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.LIST_DATABASES,
                start_time=start_time,
                result_count=len(db_names),
                result_data=db_names,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"LIST_DATABASES failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.LIST_DATABASES,
                str(e),
                start_time,
            )

    def execute_list_tables(self, database: str | None = None) -> DataFrameMetadataResult:
        """List all tables in a database.

        Requires PySpark with configured catalog.

        Args:
            database: Database name (uses current database if None)

        Returns:
            DataFrameMetadataResult with table names
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.LIST_TABLES)

            if self.spark_session is None:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.LIST_TABLES,
                    "SparkSession is required for catalog operations.",
                    start_time,
                )

            if database:
                tables = self.spark_session.catalog.listTables(database)
            else:
                tables = self.spark_session.catalog.listTables()

            table_info = [{"name": t.name, "database": t.database, "tableType": t.tableType} for t in tables]

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.LIST_TABLES,
                start_time=start_time,
                result_count=len(table_info),
                result_data=table_info,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"LIST_TABLES failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.LIST_TABLES,
                str(e),
                start_time,
            )

    def execute_list_table_columns(self, table_name: str, database: str | None = None) -> DataFrameMetadataResult:
        """List columns for a specific table in the catalog.

        Requires PySpark with configured catalog.

        Args:
            table_name: Name of the table
            database: Database name (uses current database if None)

        Returns:
            DataFrameMetadataResult with column information
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.LIST_TABLE_COLUMNS)

            if self.spark_session is None:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.LIST_TABLE_COLUMNS,
                    "SparkSession is required for catalog operations.",
                    start_time,
                )

            if database:
                columns = self.spark_session.catalog.listColumns(table_name, database)
            else:
                columns = self.spark_session.catalog.listColumns(table_name)

            column_info = [
                {
                    "name": c.name,
                    "dataType": c.dataType,
                    "nullable": c.nullable,
                    "description": c.description,
                }
                for c in columns
            ]

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.LIST_TABLE_COLUMNS,
                start_time=start_time,
                result_count=len(column_info),
                result_data=column_info,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"LIST_TABLE_COLUMNS failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.LIST_TABLE_COLUMNS,
                str(e),
                start_time,
            )

    def execute_table_exists(self, table_name: str, database: str | None = None) -> DataFrameMetadataResult:
        """Check if a table exists in the catalog.

        Requires PySpark with configured catalog.

        Args:
            table_name: Name of the table
            database: Database name (uses current database if None)

        Returns:
            DataFrameMetadataResult with exists boolean
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.TABLE_EXISTS)

            if self.spark_session is None:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.TABLE_EXISTS,
                    "SparkSession is required for catalog operations.",
                    start_time,
                )

            if database:
                full_name = f"{database}.{table_name}"
            else:
                full_name = table_name

            exists = self.spark_session.catalog.tableExists(full_name)

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.TABLE_EXISTS,
                start_time=start_time,
                result_count=1 if exists else 0,
                result_data=exists,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"TABLE_EXISTS failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.TABLE_EXISTS,
                str(e),
                start_time,
            )

    def execute_get_table_info(self, table_name: str, database: str | None = None) -> DataFrameMetadataResult:
        """Get detailed information about a table.

        Requires PySpark with configured catalog.

        Args:
            table_name: Name of the table
            database: Database name (uses current database if None)

        Returns:
            DataFrameMetadataResult with table metadata
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.GET_TABLE_INFO)

            if self.spark_session is None:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.GET_TABLE_INFO,
                    "SparkSession is required for catalog operations.",
                    start_time,
                )

            if database:
                full_name = f"{database}.{table_name}"
            else:
                full_name = table_name

            table = self.spark_session.catalog.getTable(full_name)

            table_info = {
                "name": table.name,
                "database": table.database,
                "tableType": table.tableType,
                "description": table.description,
                "isTemporary": table.isTemporary,
            }

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.GET_TABLE_INFO,
                start_time=start_time,
                result_count=1,
                result_data=table_info,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"GET_TABLE_INFO failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.GET_TABLE_INFO,
                str(e),
                start_time,
            )

    # =========================================================================
    # Lakehouse Metadata Operations (Delta Lake / Iceberg)
    # =========================================================================

    def execute_table_history(self, table_path: str) -> DataFrameMetadataResult:
        """Get transaction history for a Delta Lake table.

        Requires Delta Lake support.

        Args:
            table_path: Path to the Delta table

        Returns:
            DataFrameMetadataResult with transaction history
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.TABLE_HISTORY)

            if self.spark_session is not None:
                # Use DeltaTable API
                from delta.tables import DeltaTable

                delta_table = DeltaTable.forPath(self.spark_session, table_path)
                history_df = delta_table.history()
                history = history_df.collect()

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.TABLE_HISTORY,
                    start_time=start_time,
                    result_count=len(history),
                    result_data=[row.asDict() for row in history],
                )
            else:
                # Use deltalake Python library
                from deltalake import DeltaTable as PyDeltaTable

                dt = PyDeltaTable(table_path)
                history = list(dt.history())

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.TABLE_HISTORY,
                    start_time=start_time,
                    result_count=len(history),
                    result_data=history,
                )

        except UnsupportedOperationError:
            raise
        except ImportError as e:
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.TABLE_HISTORY,
                f"Delta Lake library not available: {e}",
                start_time,
            )
        except Exception as e:
            self.logger.error(f"TABLE_HISTORY failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.TABLE_HISTORY,
                str(e),
                start_time,
            )
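
    # Illustrative sketch (editorial addition, commented out; assumes the optional
    # standalone `deltalake` package and a hypothetical table path): the non-Spark
    # branch above boils down to:
    #
    #     from deltalake import DeltaTable
    #     dt = DeltaTable("/data/events")   # hypothetical path
    #     entries = list(dt.history())      # commit log entries, newest first
    #     entries[0].get("operation")       # e.g. "WRITE"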

    def execute_table_detail(self, table_path: str) -> DataFrameMetadataResult:
        """Get detailed metadata for a Delta Lake table.

        Requires Delta Lake support.

        Args:
            table_path: Path to the Delta table

        Returns:
            DataFrameMetadataResult with table detail
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.TABLE_DETAIL)

            if self.spark_session is not None:
                # Use DeltaTable API
                from delta.tables import DeltaTable

                delta_table = DeltaTable.forPath(self.spark_session, table_path)
                detail_df = delta_table.detail()
                detail = detail_df.collect()[0].asDict()

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.TABLE_DETAIL,
                    start_time=start_time,
                    result_count=1,
                    result_data=detail,
                )
            else:
                # Use deltalake Python library
                from deltalake import DeltaTable as PyDeltaTable

                dt = PyDeltaTable(table_path)
                metadata = dt.metadata()

                detail = {
                    "id": metadata.id,
                    "name": metadata.name,
                    "description": metadata.description,
                    "partitionColumns": metadata.partition_columns,
                    "createdTime": metadata.created_time,
                }

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.TABLE_DETAIL,
                    start_time=start_time,
                    result_count=1,
                    result_data=detail,
                )

        except UnsupportedOperationError:
            raise
        except ImportError as e:
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.TABLE_DETAIL,
                f"Delta Lake library not available: {e}",
                start_time,
            )
        except Exception as e:
            self.logger.error(f"TABLE_DETAIL failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.TABLE_DETAIL,
                str(e),
                start_time,
            )

    def execute_file_metadata(self, table_path: str) -> DataFrameMetadataResult:
        """Get file-level metadata for a lakehouse table.

        Requires Delta Lake or Iceberg support.

        Args:
            table_path: Path to the table

        Returns:
            DataFrameMetadataResult with file metadata
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.FILE_METADATA)

            if self._capabilities.supports_delta_lake:
                # Delta Lake file metadata
                if self.spark_session is not None:
                    from delta.tables import DeltaTable

                    delta_table = DeltaTable.forPath(self.spark_session, table_path)
                    # Get files via detail or internal API
                    detail = delta_table.detail().collect()[0]
                    files = {
                        "numFiles": detail.numFiles if hasattr(detail, "numFiles") else None,
                        "sizeInBytes": detail.sizeInBytes if hasattr(detail, "sizeInBytes") else None,
                    }
                else:
                    from deltalake import DeltaTable as PyDeltaTable

                    dt = PyDeltaTable(table_path)
                    file_uris = dt.file_uris()
                    files = {
                        "numFiles": len(file_uris),
                        "files": file_uris[:100],  # Limit for large tables
                    }

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.FILE_METADATA,
                    start_time=start_time,
                    result_count=files.get("numFiles", 0),
                    result_data=files,
                )

            elif self._capabilities.supports_iceberg:
                # Iceberg file metadata - requires catalog configuration
                # Note: pyiceberg.catalog and pyiceberg.table would be used here
                # but require proper catalog setup which is outside scope of basic introspection
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.FILE_METADATA,
                    "Iceberg file metadata requires catalog configuration",
                    start_time,
                )

            else:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.FILE_METADATA,
                    "No lakehouse format available for file metadata",
                    start_time,
                )

        except UnsupportedOperationError:
            raise
        except ImportError as e:
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.FILE_METADATA,
                f"Lakehouse library not available: {e}",
                start_time,
            )
        except Exception as e:
            self.logger.error(f"FILE_METADATA failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.FILE_METADATA,
                str(e),
                start_time,
            )

    def execute_partition_info(self, table_path: str) -> DataFrameMetadataResult:
        """Get partition information for a table.

        Requires PySpark or lakehouse table format.

        Args:
            table_path: Path to the table

        Returns:
            DataFrameMetadataResult with partition info
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.PARTITION_INFO)

            if self.spark_session is not None and self._capabilities.supports_delta_lake:
                from delta.tables import DeltaTable

                delta_table = DeltaTable.forPath(self.spark_session, table_path)
                detail = delta_table.detail().collect()[0]
                partitions = {
                    "partitionColumns": list(detail.partitionColumns) if detail.partitionColumns else [],
                    "numPartitions": len(detail.partitionColumns) if detail.partitionColumns else 0,
                }

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.PARTITION_INFO,
                    start_time=start_time,
                    result_count=partitions["numPartitions"],
                    result_data=partitions,
                )

            elif self._capabilities.supports_delta_lake:
                # Use standalone deltalake Python library (no SparkSession)
                from deltalake import DeltaTable as PyDeltaTable

                dt = PyDeltaTable(table_path)
                metadata = dt.metadata()
                partitions = {
                    "partitionColumns": metadata.partition_columns,
                    "numPartitions": len(metadata.partition_columns),
                }

                return DataFrameMetadataResult.success_result(
                    operation_type=MetadataOperationType.PARTITION_INFO,
                    start_time=start_time,
                    result_count=partitions["numPartitions"],
                    result_data=partitions,
                )

            else:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.PARTITION_INFO,
                    "Partition info requires Delta Lake or Iceberg table format",
                    start_time,
                )

        except UnsupportedOperationError:
            raise
        except ImportError as e:
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.PARTITION_INFO,
                f"Lakehouse library not available: {e}",
                start_time,
            )
        except Exception as e:
            self.logger.error(f"PARTITION_INFO failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.PARTITION_INFO,
                str(e),
                start_time,
            )

    def execute_snapshot_info(self, table_path: str) -> DataFrameMetadataResult:
        """Get snapshot information for an Iceberg table.

        Requires Iceberg support.

        Args:
            table_path: Path or identifier for the Iceberg table

        Returns:
            DataFrameMetadataResult with snapshot info
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.SNAPSHOT_INFO)

            # This requires proper Iceberg catalog configuration
            # Simplified implementation for now
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.SNAPSHOT_INFO,
                "Iceberg snapshot info requires catalog configuration. "
                "Configure Iceberg catalog in your Spark session or use pyiceberg with proper catalog.",
                start_time,
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"SNAPSHOT_INFO failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.SNAPSHOT_INFO,
                str(e),
                start_time,
            )
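
    # Illustrative sketch (editorial addition, commented out; assumes the optional
    # `pyiceberg` package and a configured catalog - all names hypothetical). A
    # fuller implementation might resolve snapshots roughly like this:
    #
    #     from pyiceberg.catalog import load_catalog
    #     catalog = load_catalog("default")        # reads .pyiceberg.yaml
    #     table = catalog.load_table("db.events")
    #     snapshots = table.metadata.snapshots     # list of Snapshot records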
|
|
1499
|
+
|
|
1500
|
+
# =========================================================================
|
|
1501
|
+
# Complexity Testing Operations
|
|
1502
|
+
# =========================================================================
|
|
1503
|
+
|
|
1504
|
+
    def execute_wide_table_schema(self, dataframe: Any) -> DataFrameMetadataResult:
        """Introspect schema of a wide DataFrame (100+ columns).

        Tests metadata introspection performance on DataFrames with many columns.
        This is useful for benchmarking schema discovery performance.

        Args:
            dataframe: A wide DataFrame with many columns

        Returns:
            DataFrameMetadataResult with schema info and metrics
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.WIDE_TABLE_SCHEMA)

            # Get column count
            if "polars" in self.platform_name:
                column_count = dataframe.width
                schema_info = [
                    {"name": col, "dtype": str(dtype)} for col, dtype in zip(dataframe.columns, dataframe.dtypes)
                ]
            elif "pandas" in self.platform_name:
                column_count = len(dataframe.columns)
                schema_info = [{"name": col, "dtype": str(dtype)} for col, dtype in dataframe.dtypes.items()]
            elif "pyspark" in self.platform_name or "spark" in self.platform_name:
                column_count = len(dataframe.columns)
                schema_info = [{"name": f.name, "dtype": str(f.dataType)} for f in dataframe.schema.fields]
            else:
                column_count = len(getattr(dataframe, "columns", []))
                schema_info = []

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.WIDE_TABLE_SCHEMA,
                start_time=start_time,
                result_count=column_count,
                result_data=schema_info,
                metrics={
                    "column_count": column_count,
                    "is_wide_table": column_count >= 100,
                },
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"WIDE_TABLE_SCHEMA failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.WIDE_TABLE_SCHEMA,
                str(e),
                start_time,
            )

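    # Usage sketch (hypothetical; assumes polars is installed and that the
    # result object exposes the metrics mapping passed above):
    #
    #     import polars as pl
    #
    #     df = pl.DataFrame({f"col_{i}": [1, 2, 3] for i in range(150)})
    #     manager = get_dataframe_metadata_manager("polars-df")
    #     result = manager.execute_wide_table_schema(df)
    #     # result_count == 150; metrics["is_wide_table"] is True (150 >= 100)
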
    def execute_large_catalog_list(self) -> DataFrameMetadataResult:
        """List tables in a large catalog (100+ tables).

        Tests catalog introspection performance with many tables.
        Requires PySpark with configured catalog.

        Returns:
            DataFrameMetadataResult with table list and metrics
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.LARGE_CATALOG_LIST)

            if self.spark_session is None:
                return DataFrameMetadataResult.failure_result(
                    MetadataOperationType.LARGE_CATALOG_LIST,
                    "SparkSession is required for catalog operations.",
                    start_time,
                )

            # Get all tables across all databases
            tables_result = []
            databases = self.spark_session.catalog.listDatabases()

            for db in databases:
                try:
                    tables = self.spark_session.catalog.listTables(db.name)
                    for t in tables:
                        tables_result.append({"name": t.name, "database": db.name, "tableType": t.tableType})
                except Exception as e:
                    self.logger.debug(f"Skipping database {db.name}: {e}")

            table_count = len(tables_result)

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.LARGE_CATALOG_LIST,
                start_time=start_time,
                result_count=table_count,
                result_data=tables_result,
                metrics={
                    "table_count": table_count,
                    "database_count": len(databases),
                    "is_large_catalog": table_count >= 100,
                },
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"LARGE_CATALOG_LIST failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.LARGE_CATALOG_LIST,
                str(e),
                start_time,
            )

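    # Usage sketch (hypothetical; assumes an active SparkSession named `spark`
    # with a populated catalog):
    #
    #     manager = get_dataframe_metadata_manager("pyspark-df", spark_session=spark)
    #     result = manager.execute_large_catalog_list()
    #     # result_data is a list of {"name", "database", "tableType"} dicts;
    #     # metrics["is_large_catalog"] flips to True at 100+ tables.
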
    def execute_complex_type_introspection(self, dataframe: Any) -> DataFrameMetadataResult:
        """Introspect complex/nested types in a DataFrame.

        Tests metadata introspection for DataFrames containing complex types
        like ARRAY, STRUCT, MAP, and nested structures.

        Args:
            dataframe: DataFrame with complex nested types

        Returns:
            DataFrameMetadataResult with type analysis
        """
        start_time = time.time()

        try:
            self.validate_operation(MetadataOperationType.COMPLEX_TYPE_INTROSPECTION)

            complex_types = []
            nested_depth = 0

            if "polars" in self.platform_name:
                import polars as pl

                for col, dtype in zip(dataframe.columns, dataframe.dtypes):
                    type_info = self._analyze_polars_type(col, dtype, pl)
                    if type_info.get("is_complex"):
                        complex_types.append(type_info)
                        nested_depth = max(nested_depth, type_info.get("nested_depth", 0))

            elif "pyspark" in self.platform_name or "spark" in self.platform_name:
                from pyspark.sql.types import ArrayType, MapType, StructType

                for field in dataframe.schema.fields:
                    type_info = self._analyze_spark_type(field, ArrayType, MapType, StructType)
                    if type_info.get("is_complex"):
                        complex_types.append(type_info)
                        nested_depth = max(nested_depth, type_info.get("nested_depth", 0))

            elif "pandas" in self.platform_name:
                # Pandas has limited complex type support
                for col, dtype in dataframe.dtypes.items():
                    if str(dtype) == "object":
                        # Could contain nested structures
                        complex_types.append(
                            {
                                "name": col,
                                "dtype": str(dtype),
                                "is_complex": True,
                                "complex_type": "object",
                                "nested_depth": 1,
                            }
                        )

            return DataFrameMetadataResult.success_result(
                operation_type=MetadataOperationType.COMPLEX_TYPE_INTROSPECTION,
                start_time=start_time,
                result_count=len(complex_types),
                result_data=complex_types,
                metrics={
                    "complex_column_count": len(complex_types),
                    "max_nested_depth": nested_depth,
                    "has_arrays": any(t.get("complex_type") == "array" for t in complex_types),
                    "has_structs": any(t.get("complex_type") == "struct" for t in complex_types),
                    "has_maps": any(t.get("complex_type") == "map" for t in complex_types),
                },
            )

        except UnsupportedOperationError:
            raise
        except Exception as e:
            self.logger.error(f"COMPLEX_TYPE_INTROSPECTION failed: {e}")
            return DataFrameMetadataResult.failure_result(
                MetadataOperationType.COMPLEX_TYPE_INTROSPECTION,
                str(e),
                start_time,
            )

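    # Usage sketch (hypothetical, PySpark; `spark` is a placeholder session).
    # An array-of-structs column recurses to nested_depth 2, a flat map column
    # to depth 1 (see _analyze_spark_type below):
    #
    #     df = spark.createDataFrame(
    #         [([(1,)], {"a": "b"})],
    #         "arr: array<struct<x:int>>, m: map<string,string>",
    #     )
    #     manager = get_dataframe_metadata_manager("pyspark-df", spark_session=spark)
    #     result = manager.execute_complex_type_introspection(df)
    #     # metrics: complex_column_count == 2, max_nested_depth == 2,
    #     # has_arrays and has_maps are True, has_structs is False
    #     # (neither top-level column is a struct)
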
    def _analyze_polars_type(self, col_name: str, dtype: Any, pl: Any) -> dict[str, Any]:
        """Analyze a Polars data type for complexity.

        Args:
            col_name: Column name
            dtype: Polars data type
            pl: Polars module

        Returns:
            Type analysis dict
        """
        dtype_str = str(dtype)
        is_complex = False
        complex_type = None
        nested_depth = 0

        if dtype_str.startswith("List"):
            is_complex = True
            complex_type = "array"
            nested_depth = dtype_str.count("List") + dtype_str.count("Struct")
        elif dtype_str.startswith("Struct"):
            is_complex = True
            complex_type = "struct"
            nested_depth = dtype_str.count("Struct")

        return {
            "name": col_name,
            "dtype": dtype_str,
            "is_complex": is_complex,
            "complex_type": complex_type,
            "nested_depth": nested_depth,
        }

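    # Worked example of the string-based heuristic above: for a column typed
    # pl.List(pl.Struct({"x": pl.Int64})), str(dtype) renders roughly as
    # "List(Struct({'x': Int64}))", so count("List") + count("Struct") == 2 and
    # the column is reported as an array with nested_depth 2. Note the heuristic
    # reads the rendered type string rather than walking the dtype object.
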
    def _analyze_spark_type(self, field: Any, ArrayType: type, MapType: type, StructType: type) -> dict[str, Any]:
        """Analyze a Spark schema field for complexity.

        Args:
            field: Spark StructField
            ArrayType: Spark ArrayType class
            MapType: Spark MapType class
            StructType: Spark StructType class

        Returns:
            Type analysis dict
        """
        dtype = field.dataType
        is_complex = False
        complex_type = None
        nested_depth = 0

        def count_depth(t: Any, depth: int = 0) -> int:
            if isinstance(t, ArrayType):
                return count_depth(t.elementType, depth + 1)
            elif isinstance(t, MapType):
                return max(count_depth(t.keyType, depth + 1), count_depth(t.valueType, depth + 1))
            elif isinstance(t, StructType):
                if t.fields:
                    return max(count_depth(f.dataType, depth + 1) for f in t.fields)
                return depth + 1
            return depth

        if isinstance(dtype, ArrayType):
            is_complex = True
            complex_type = "array"
            nested_depth = count_depth(dtype)
        elif isinstance(dtype, MapType):
            is_complex = True
            complex_type = "map"
            nested_depth = count_depth(dtype)
        elif isinstance(dtype, StructType):
            is_complex = True
            complex_type = "struct"
            nested_depth = count_depth(dtype)

        return {
            "name": field.name,
            "dtype": str(dtype),
            "is_complex": is_complex,
            "complex_type": complex_type,
            "nested_depth": nested_depth,
        }

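    # Worked example for the count_depth helper in _analyze_spark_type above
    # (hypothetical types): ArrayType(StructType([StructField("x", IntegerType())]))
    # recurses Array -> Struct -> Integer and yields nested_depth == 2, while a
    # flat MapType(StringType(), StringType()) yields nested_depth == 1.
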
def get_dataframe_metadata_manager(
    platform_name: str,
    spark_session: Any = None,
) -> DataFrameMetadataOperationsManager | None:
    """Get a DataFrame metadata operations manager for a platform.

    Args:
        platform_name: Platform name (e.g., "polars-df", "pandas-df", "pyspark-df")
        spark_session: SparkSession instance (required for pyspark-df catalog ops)

    Returns:
        DataFrameMetadataOperationsManager if platform supports DataFrame operations,
        None if platform is not a DataFrame platform.
    """
    platform_lower = platform_name.lower()

    # Check if this is a DataFrame platform
    df_platforms = ("polars-df", "polars", "pandas-df", "pandas", "pyspark-df", "pyspark", "datafusion")
    if not any(p in platform_lower for p in df_platforms):
        logger.debug(f"Platform {platform_name} is not a DataFrame platform")
        return None

    try:
        return DataFrameMetadataOperationsManager(platform_name, spark_session=spark_session)
    except Exception as e:
        logger.warning(f"Failed to create metadata manager for {platform_name}: {e}")
        return None

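# Usage sketch (hypothetical; `df` and `spark` are placeholders, not benchbox
# names). The factory returns None for non-DataFrame platforms, so callers
# should guard:
#
#     manager = get_dataframe_metadata_manager("polars-df")
#     if manager is not None:
#         result = manager.execute_wide_table_schema(df)
#
# For PySpark catalog operations, pass the session explicitly:
#
#     manager = get_dataframe_metadata_manager("pyspark-df", spark_session=spark)
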
__all__ = [
    # Enums
    "MetadataOperationType",
    "MetadataOperationCategory",
    "OPERATION_CATEGORIES",
    # Capabilities
    "DataFrameMetadataCapabilities",
    "get_platform_capabilities",
    "POLARS_METADATA_CAPABILITIES",
    "PANDAS_METADATA_CAPABILITIES",
    "PYSPARK_METADATA_CAPABILITIES",
    "DATAFUSION_METADATA_CAPABILITIES",
    # Results
    "DataFrameMetadataResult",
    # Errors
    "UnsupportedOperationError",
    "get_unsupported_message",
    # Manager
    "DataFrameMetadataOperationsManager",
    "get_dataframe_metadata_manager",
]