deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,229 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import Tuple, Optional
|
4
|
+
|
5
|
+
from deltacat.catalog.model.catalog import Catalog as DCCatalog
|
6
|
+
from deltacat.catalog.model.table_definition import TableDefinition
|
7
|
+
|
8
|
+
from daft.catalog import Catalog, Identifier, Table
|
9
|
+
from daft.dataframe import DataFrame
|
10
|
+
from daft.logical.schema import Schema
|
11
|
+
from deltacat.constants import DEFAULT_NAMESPACE
|
12
|
+
|
13
|
+
|
14
|
+
class DaftCatalog(Catalog):
    """
    Wrapper class to create a Daft catalog from a DeltaCAT catalog.

    The initialization of DeltaCAT and Daft catalogs is managed in `deltacat.catalog.catalog.py`. The user
    is just expected to initialize catalogs through the DeltaCAT public interface (init / put_catalog).

    TODO (mccember) in follow up PR we need to consider how to keep the DeltaCAT Catalogs class and Daft session in sync,
     and the user-facing entrypoint to get a Daft catalog

    This class itself expects a `Catalog` and will invoke the underlying implementation
    similar to `deltacat.catalog.delegate.py`, like:
        catalog.impl.create_namespace(namespace, inner=catalog.inner)

    We cannot route calls through the higher level catalog registry / delegate since this wrapper class is at a lower
     layer and does not manage registering catalogs.
    """

    def __init__(self, catalog: DCCatalog, name: str):
        """
        Wrap an already-constructed DeltaCAT catalog.

        This constructor only stores its arguments; it does not look up or
        register anything with DeltaCAT itself.

        :param catalog: DeltaCAT Catalog object. Its `impl` and `inner`
            attributes are used to service every call made on this wrapper.

        :param name: Name reported by the `name` property of this Daft catalog.
        """
        self.dc_catalog = catalog
        self._name = name

    @property
    def name(self) -> str:
        """Return the catalog name given at construction time."""
        return self._name

    ###
    # create_*
    ###
    def create_namespace(self, identifier: Identifier | str):
        """Create a new namespace in the catalog."""
        if isinstance(identifier, Identifier):
            # The DeltaCAT implementation is handed a plain string.
            identifier = str(identifier)
        self.dc_catalog.impl.create_namespace(identifier, inner=self.dc_catalog.inner)

    def create_table(
        self, identifier: Identifier | str, source: Schema | DataFrame, **kwargs
    ) -> Table:
        """
        Create a DeltaCAT table via Daft catalog API

        End users calling create_table through the daft table API may provide kwargs which will be plumbed through
        to the DeltaCAT catalog implementation's create_table.

        Note: as of 4/22, Daft create_table does not yet support kwargs. Tracked at: https://github.com/Eventual-Inc/Daft/issues/4195

        :param identifier: Daft table identifier. Sequence of strings of the format (table) or (namespace, table)
         or (namespace, table, table version). If this is a string, it is a dot delimited string of the same format.
         Identifiers can be created either like Identifier("namespace", "table", "version") OR
         Identifier.from_str("namespace.table.version")

        :param source: a TableSource, either a Daft DataFrame or a Daft Schema

        :param kwargs: Additional keyword arguments forwarded to the DeltaCAT create_table call.

        :raises TypeError: if `source` is neither a DataFrame nor a Schema.
        """
        if isinstance(source, DataFrame):
            return self._create_table_from_df(identifier, source, **kwargs)
        if isinstance(source, Schema):
            return self._create_table_from_schema(identifier, source, **kwargs)
        raise TypeError(
            f"Expected table source to be Schema or DataFrame. Found: {type(source)}"
        )

    def _create_table_from_df(
        self, ident: Identifier | str, source: DataFrame, **kwargs
    ) -> Table:
        """
        Create a table from a DataFrame.

        Only the DataFrame's schema is used; no rows are written.
        """
        t = self._create_table_from_schema(ident, source.schema(), **kwargs)
        # TODO (mccember) append data upon creation
        return t

    def _create_table_from_schema(
        self, ident: Identifier | str, source: Schema, **kwargs
    ) -> Table:
        """
        Create a table from a schema.
        """
        namespace, name, version = self._extract_namespace_name_version(ident)

        # Convert the Daft schema to a DeltaCAT schema
        # This is a simplified version, would need to be enhanced for production
        deltacat_schema = self._convert_schema_to_deltacat(source)

        # Create the table in DeltaCAT
        # NOTE(review): get_table below passes the version as `table_version=`
        # while this call uses `version=` - confirm which keyword the catalog
        # implementation's create_table actually expects.
        table_def = self.dc_catalog.impl.create_table(
            name,
            namespace=namespace,
            version=version,
            schema=deltacat_schema,
            inner=self.dc_catalog.inner,
            **kwargs,
        )

        return DaftTable._from_obj(table_def)

    ###
    # drop_*
    ###

    def drop_namespace(self, identifier: Identifier | str):
        """Not supported yet; always raises NotImplementedError."""
        raise NotImplementedError()

    def drop_table(self, identifier: Identifier | str):
        """Not supported yet; always raises NotImplementedError."""
        raise NotImplementedError()

    ###
    # get_*
    ###

    def get_table(self, identifier: Identifier | str, **kwargs) -> Table:
        """
        Look up a table and wrap it as a DaftTable.

        :param identifier: Table identifier, parsed the same way as in create_table.
        :param kwargs: Additional keyword arguments forwarded to the DeltaCAT get_table call.
        :raises ValueError: if the DeltaCAT implementation returns a falsy result.
        """
        namespace, table, version = self._extract_namespace_name_version(identifier)

        table_def = self.dc_catalog.impl.get_table(
            table,
            namespace=namespace,
            table_version=version,
            inner=self.dc_catalog.inner,
            **kwargs,
        )

        if not table_def:
            raise ValueError(f"Table {identifier} not found")

        return DaftTable._from_obj(table_def)

    ###
    # list_*
    ###

    def list_namespaces(self, pattern: str | None = None) -> list[Identifier]:
        """Not supported yet; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented")

    def list_tables(self, pattern: str | None = None) -> list[str]:
        """Not supported yet; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented")

    def _extract_namespace_name_version(
        self, ident: Identifier | str
    ) -> Tuple[str, str, Optional[str]]:
        """
        Extract namespace, name, version from identifier

        Returns a 3-tuple. If no namespace is provided, uses DeltaCAT default namespace.
        The version is None unless the identifier has three parts.

        :raises TypeError: if `ident` is neither a str nor an Identifier.
        :raises ValueError: if the identifier does not have 1, 2 or 3 parts.
        """
        if isinstance(ident, str):
            ident = Identifier.from_str(ident)

        if not isinstance(ident, Identifier):
            # Fail here with a clear message instead of implicitly returning
            # None and letting the caller crash while unpacking the result.
            raise TypeError(
                f"Expected table identifier to be a str or Identifier. Found: {type(ident)}"
            )

        part_count = len(ident)
        if part_count == 1:
            return (DEFAULT_NAMESPACE, ident[0], None)
        if part_count == 2:
            return (ident[0], ident[1], None)
        if part_count == 3:
            return (ident[0], ident[1], ident[2])
        raise ValueError(
            "Expected table identifier to be in format (table) or (namespace, table) "
            f"or (namespace, table, version). Found: {ident}"
        )

    def _convert_schema_to_deltacat(self, schema: Schema):
        """Convert Daft schema to DeltaCAT schema.
        For now, just use PyArrow schema as intermediary
        TODO look into how enhancements on schema can be propagated between Daft<=>DeltaCAT
        """
        # Imported locally under an alias so it does not clash with the Daft
        # `Schema` imported at module level.
        from deltacat.storage.model.schema import Schema as DeltaCATSchema

        return DeltaCATSchema.of(schema=schema.to_pyarrow_schema())
|
196
|
+
|
197
|
+
|
198
|
+
class DaftTable(Table):
    """
    Daft `Table` adapter that holds a DeltaCAT `TableDefinition`.

    Reading and writing are not implemented yet; only the table name is exposed.
    """

    # The wrapped DeltaCAT table definition.
    _inner: TableDefinition

    # Both option sets are empty.
    # NOTE(review): presumably consulted by the Daft `Table` base class - confirm.
    _read_options = set()
    _write_options = set()

    def __init__(self, inner: TableDefinition):
        self._inner = inner

    @property
    def name(self) -> str:
        """Table name taken from the wrapped definition's table version."""
        table_version = self._inner.table_version
        return table_version.table_name

    @staticmethod
    def _from_obj(obj: object) -> DaftTable:
        """
        Adapt `obj` into a DaftTable.

        :raises ValueError: if `obj` is not a TableDefinition.
        """
        if not isinstance(obj, TableDefinition):
            raise ValueError(f"Unsupported DeltaCAT table type: {type(obj)}")
        # Allocate without running __init__ and attach the definition directly.
        wrapped = DaftTable.__new__(DaftTable)
        wrapped._inner = obj
        return wrapped

    def read(self, **options) -> DataFrame:
        """Not supported yet; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented")

    def write(self, df: DataFrame | object, mode: str = "append", **options):
        """Not supported yet; always raises NotImplementedError."""
        raise NotImplementedError("Not implemented")
|
File without changes
|
File without changes
|
@@ -0,0 +1,129 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional, Any, Set
|
3
|
+
|
4
|
+
from pyiceberg.catalog import Catalog
|
5
|
+
from pyiceberg.table import Table
|
6
|
+
import deltacat.logs as logs
|
7
|
+
|
8
|
+
from deltacat.storage.model.scan.push_down import Pushdown, PartitionFilter
|
9
|
+
from deltacat.storage.model.scan.scan_plan import ScanPlan
|
10
|
+
from deltacat.storage.model.scan.scan_task import FileScanTask, DataFile
|
11
|
+
from deltacat.storage.util.scan_planner import ScanPlanner
|
12
|
+
from deltacat.experimental.storage.iceberg.impl import _try_load_iceberg_table
|
13
|
+
from deltacat.experimental.storage.iceberg.visitor import IcebergExpressionVisitor
|
14
|
+
|
15
|
+
# Initialize DeltaCAT logger
|
16
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
17
|
+
|
18
|
+
|
19
|
+
class IcebergScanPlanner(ScanPlanner):
    """Scan planner that builds a DeltaCAT `ScanPlan` from an Iceberg table's planned files."""

    def __init__(self, catalog: Catalog):
        self.catalog = catalog
        self.expression_visitor = IcebergExpressionVisitor()

    @classmethod
    def _collect_filter_fields(cls, expr: Any) -> Set[str]:
        """
        Gather every field referenced anywhere inside a filter expression.

        The expression is walked by attribute: `field` is recorded, while
        `left`, `right`, `expr` and each item of `values` are recursed into.

        Args:
            expr: The expression to analyze

        Returns:
            Set of field names referenced in the expression
        """
        referenced = set()
        if hasattr(expr, "field"):
            referenced.add(expr.field)
        # Single-child attributes, visited in a fixed order.
        for child_attr in ("left", "right", "expr"):
            if hasattr(expr, child_attr):
                referenced.update(cls._collect_filter_fields(getattr(expr, child_attr)))
        if hasattr(expr, "values"):
            for item in expr.values:
                referenced.update(cls._collect_filter_fields(item))
        return referenced

    def create_scan_plan(
        self,
        table_name: str,
        namespace: Optional[str] = None,
        pushdown: Optional[Pushdown] = None,
    ) -> ScanPlan:
        """
        Plan a scan of the given table, with one `FileScanTask` per planned file.

        Args:
            table_name: Name of the table to load from the catalog
            namespace: Namespace passed through when loading the table
            pushdown: Optional pushdown; only its `partition_filter` is used,
                and only when the table's partition spec has fields

        Returns:
            ScanPlan wrapping the collected file scan tasks
        """
        iceberg_table = _try_load_iceberg_table(
            self.catalog, namespace=namespace, table_name=table_name
        )

        # TODO: implement row, column predicate pushdown to Iceberg

        # A table counts as partitioned when its partition spec has fields.
        has_partition_fields = len(iceberg_table.spec().fields) > 0

        scan = iceberg_table.scan()
        if has_partition_fields and pushdown and pushdown.partition_filter:
            partition_filter = pushdown.partition_filter
            filter_fields = self._collect_filter_fields(partition_filter)
            logger.info(
                f"Pushdown partition filter is enabled, converting to Iceberg. Fields discovered in filter: {', '.join(sorted(filter_fields))}"
            )
            # Handle partition filter if present, DeltaCAT only supports partition-level filters right now
            scan = scan.filter(
                self._convert_partition_filter(iceberg_table, partition_filter)
            )

        return ScanPlan(
            [
                FileScanTask([DataFile(planned.file.file_path)])
                for planned in scan.plan_files()
            ]
        )

    @classmethod
    def _validate_partition_references(
        cls, expr: Any, partition_cols: Set[str]
    ) -> None:
        """
        Check that an expression references nothing but partition columns.

        Args:
            expr: The expression to validate
            partition_cols: Set of valid partition column names

        Raises:
            ValueError: If the expression references a non-partition column
        """
        # Reference type expression
        if hasattr(expr, "field") and expr.field not in partition_cols:
            raise ValueError(
                f"Filter references non-partition column: {expr.field}. "
                f"Partition columns are: {partition_cols}"
            )
        # Recursively validate nested expressions
        for child_attr in ("left", "right", "expr"):
            if hasattr(expr, child_attr):
                cls._validate_partition_references(
                    getattr(expr, child_attr), partition_cols
                )
        if hasattr(expr, "values"):
            for item in expr.values:
                cls._validate_partition_references(item, partition_cols)

    def _convert_partition_filter(
        self, table: Table, partition_filter: PartitionFilter
    ):
        """
        Validate a DeltaCAT partition filter against the table's partition
        spec field names, then convert it with `self.expression_visitor`.
        """
        partition_cols = {spec_field.name for spec_field in table.spec().fields}

        # Validate before converting
        self._validate_partition_references(partition_filter, partition_cols)

        # Convert to PyIceberg expression
        return self.expression_visitor.visit(partition_filter)
|