deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,83 +1,385 @@
|
|
1
1
|
# Allow self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
|
4
|
+
import logging
|
5
|
+
from types import ModuleType
|
5
6
|
|
7
|
+
from typing import Any, Dict, List, Optional, Union
|
8
|
+
from functools import partial
|
6
9
|
import ray
|
7
10
|
|
8
|
-
from deltacat
|
11
|
+
from deltacat import logs
|
12
|
+
from deltacat.catalog.main import impl as dcat
|
13
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
14
|
+
from deltacat.constants import DEFAULT_CATALOG
|
9
15
|
|
10
|
-
all_catalogs: Optional[
|
16
|
+
all_catalogs: Optional[ray.actor.ActorHandle] = None
|
17
|
+
|
18
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
11
19
|
|
12
20
|
|
13
21
|
class Catalog:
|
14
|
-
def __init__(
|
22
|
+
def __init__(
|
23
|
+
self,
|
24
|
+
config: Optional[Union[CatalogProperties, Any]] = None,
|
25
|
+
impl: ModuleType = dcat,
|
26
|
+
*args,
|
27
|
+
**kwargs,
|
28
|
+
):
|
29
|
+
"""
|
30
|
+
Constructor for a Catalog.
|
31
|
+
|
32
|
+
Invokes `impl.initialize(config, *args, **kwargs)` and stores its
|
33
|
+
return value in the `inner` property. This captures all state required
|
34
|
+
to deterministically reconstruct this Catalog instance on any node, and
|
35
|
+
must be pickleable by Ray cloudpickle.
|
36
|
+
"""
|
37
|
+
if not isinstance(self, Catalog):
|
38
|
+
# self may contain the tuple returned from __reduce__ (ray pickle bug?)
|
39
|
+
if callable(self[0]) and isinstance(self[1], tuple):
|
40
|
+
logger.info(f"Invoking {self[0]} with positional args: {self[1]}")
|
41
|
+
return self[0](*self[1])
|
42
|
+
else:
|
43
|
+
err_msg = f"Expected `self` to be {Catalog}, but found: {self}"
|
44
|
+
raise RuntimeError(err_msg)
|
45
|
+
|
46
|
+
self._config = config
|
15
47
|
self._impl = impl
|
16
|
-
self._impl.initialize(*args, **kwargs)
|
48
|
+
self._inner = self._impl.initialize(config=config, *args, **kwargs)
|
49
|
+
self._args = args
|
50
|
+
self._kwargs = kwargs
|
51
|
+
|
52
|
+
@property
|
53
|
+
def config(self):
|
54
|
+
return self._config
|
17
55
|
|
18
56
|
@property
|
19
57
|
def impl(self):
|
20
58
|
return self._impl
|
21
59
|
|
60
|
+
@property
|
61
|
+
def inner(self) -> Optional[Any]:
|
62
|
+
return self._inner
|
63
|
+
|
64
|
+
# support pickle, copy, deepcopy, etc.
|
65
|
+
def __reduce__(self):
|
66
|
+
# instantiated catalogs may fail to pickle, so exclude _inner
|
67
|
+
# (e.g. Iceberg catalog w/ unserializable SSLContext from boto3 client)
|
68
|
+
return partial(self.__class__, **self._kwargs), (
|
69
|
+
self._config,
|
70
|
+
self._impl,
|
71
|
+
*self._args,
|
72
|
+
)
|
73
|
+
|
74
|
+
def __str__(self):
|
75
|
+
string_rep = f"{self.__class__.__name__}("
|
76
|
+
if self._args:
|
77
|
+
string_rep += f"args={self._args}, "
|
78
|
+
if self._kwargs:
|
79
|
+
string_rep += f"kwargs={self._kwargs}, "
|
80
|
+
if self._inner:
|
81
|
+
string_rep += f"inner={self._inner})"
|
82
|
+
return string_rep
|
83
|
+
|
84
|
+
def __repr__(self):
|
85
|
+
return self.__str__()
|
86
|
+
|
22
87
|
|
23
88
|
@ray.remote
|
24
89
|
class Catalogs:
|
25
90
|
def __init__(
|
26
91
|
self,
|
27
|
-
catalogs: Dict[str, Catalog],
|
28
|
-
|
29
|
-
*args,
|
30
|
-
**kwargs,
|
92
|
+
catalogs: Union[Catalog, Dict[str, Catalog]],
|
93
|
+
default: Optional[str] = None,
|
31
94
|
):
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
)
|
37
|
-
if not catalogs:
|
38
|
-
raise ValueError(
|
39
|
-
f"No catalogs given to register. "
|
40
|
-
f"Please specify one or more catalogs."
|
41
|
-
)
|
42
|
-
self.catalogs: Dict[str, Catalog] = catalogs
|
43
|
-
if default_catalog_name:
|
44
|
-
self.default_catalog = self.catalogs[default_catalog_name]
|
45
|
-
elif len(catalogs) == 1:
|
46
|
-
self.default_catalog = list(self.catalogs.values())[0]
|
47
|
-
else:
|
48
|
-
self.default_catalog = None
|
95
|
+
self._catalogs = {}
|
96
|
+
self._default_catalog_name = None
|
97
|
+
self._default_catalog = None
|
98
|
+
self.update(catalogs, default)
|
49
99
|
|
50
100
|
def all(self) -> Dict[str, Catalog]:
|
51
|
-
return self.
|
101
|
+
return self._catalogs
|
102
|
+
|
103
|
+
def update(
|
104
|
+
self,
|
105
|
+
catalogs: Union[Catalog, Dict[str, Catalog]],
|
106
|
+
default: Optional[str] = None,
|
107
|
+
) -> None:
|
108
|
+
if isinstance(catalogs, Catalog):
|
109
|
+
catalogs = {DEFAULT_CATALOG: catalogs}
|
110
|
+
elif not isinstance(catalogs, dict):
|
111
|
+
raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
|
112
|
+
self._catalogs.update(catalogs)
|
113
|
+
if default:
|
114
|
+
if default not in catalogs:
|
115
|
+
raise ValueError(
|
116
|
+
f"Default catalog `{default}` not found in: {catalogs}"
|
117
|
+
)
|
118
|
+
self._default_catalog = self._catalogs[default]
|
119
|
+
self._default_catalog_name = default
|
120
|
+
elif len(catalogs) == 1:
|
121
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
122
|
+
else:
|
123
|
+
self._default_catalog = None
|
52
124
|
|
53
125
|
def names(self) -> List[str]:
|
54
|
-
return list(self.
|
126
|
+
return list(self._catalogs.keys())
|
127
|
+
|
128
|
+
def put(self, name: str, catalog: Catalog, set_default: bool = False) -> None:
|
129
|
+
self._catalogs[name] = catalog
|
130
|
+
if set_default or len(self._catalogs) == 1:
|
131
|
+
self._default_catalog = catalog
|
132
|
+
|
133
|
+
def get(self, name) -> Optional[Catalog]:
|
134
|
+
return self._catalogs.get(name)
|
55
135
|
|
56
|
-
def
|
57
|
-
self.
|
136
|
+
def pop(self, name) -> Optional[Catalog]:
|
137
|
+
catalog = self._catalogs.pop(name, None)
|
138
|
+
if catalog and self._default_catalog_name == name:
|
139
|
+
if len(self._catalogs) == 1:
|
140
|
+
self._default_catalog = list(self._catalogs.values())[0]
|
141
|
+
else:
|
142
|
+
self._default_catalog = None
|
143
|
+
return catalog
|
58
144
|
|
59
|
-
def
|
60
|
-
|
145
|
+
def clear(self) -> None:
|
146
|
+
self._catalogs.clear()
|
147
|
+
self._default_catalog = None
|
61
148
|
|
62
149
|
def default(self) -> Optional[Catalog]:
|
63
|
-
return self.
|
150
|
+
return self._default_catalog
|
64
151
|
|
65
152
|
|
66
|
-
def
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
**kwargs,
|
72
|
-
) -> None:
|
153
|
+
def is_initialized(*args, **kwargs) -> bool:
|
154
|
+
"""
|
155
|
+
Check if DeltaCAT is initialized.
|
156
|
+
"""
|
157
|
+
global all_catalogs
|
73
158
|
|
74
159
|
if not ray.is_initialized():
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
160
|
+
# Any existing Catalogs actor reference must be stale - reset it
|
161
|
+
all_catalogs = None
|
162
|
+
return all_catalogs is not None
|
163
|
+
|
164
|
+
|
165
|
+
def raise_if_not_initialized(
|
166
|
+
err_msg: str = "DeltaCAT is not initialized. Please call `deltacat.init()` and try again.",
|
167
|
+
) -> None:
|
168
|
+
"""
|
169
|
+
Raises a RuntimeError with the given error message if DeltaCAT is not
|
170
|
+
initialized.
|
171
|
+
|
172
|
+
:param err_msg: Custom error message to raise if DeltaCAT is not
|
173
|
+
initialized. If unspecified, the default error message is used.
|
174
|
+
"""
|
175
|
+
if not is_initialized():
|
176
|
+
raise RuntimeError(err_msg)
|
177
|
+
|
178
|
+
|
179
|
+
def init(
|
180
|
+
catalogs: Union[Dict[str, Catalog], Catalog] = {},
|
181
|
+
default: Optional[str] = None,
|
182
|
+
ray_init_args: Dict[str, Any] = {},
|
183
|
+
*,
|
184
|
+
force=False,
|
185
|
+
) -> Optional[ray.runtime.BaseContext]:
|
186
|
+
"""
|
187
|
+
Initialize DeltaCAT catalogs.
|
79
188
|
|
189
|
+
:param catalogs: A single Catalog instance or a map of catalog names to
|
190
|
+
Catalog instances.
|
191
|
+
:param default: The name of the default Catalog. If only one Catalog is
|
192
|
+
provided, it will always be the default.
|
193
|
+
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
194
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
195
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
196
|
+
catalogs.
|
197
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
198
|
+
"""
|
80
199
|
global all_catalogs
|
81
|
-
|
82
|
-
|
200
|
+
|
201
|
+
if is_initialized() and not force:
|
202
|
+
logger.warning("DeltaCAT already initialized.")
|
203
|
+
return None
|
204
|
+
|
205
|
+
# initialize ray (and ignore reinitialization errors)
|
206
|
+
ray_init_args["ignore_reinit_error"] = True
|
207
|
+
context = ray.init(**ray_init_args)
|
208
|
+
|
209
|
+
# register custom serializer for catalogs since these may contain
|
210
|
+
# unserializable objects like boto3 clients with SSLContext
|
211
|
+
ray.util.register_serializer(
|
212
|
+
Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
|
213
|
+
)
|
214
|
+
# TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
|
215
|
+
# with all catalogs from the last session
|
216
|
+
all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)
|
217
|
+
return context
|
218
|
+
|
219
|
+
|
220
|
+
def init_local(
|
221
|
+
path: Optional[str] = None,
|
222
|
+
ray_init_args: Dict[str, Any] = {},
|
223
|
+
*,
|
224
|
+
force=False,
|
225
|
+
) -> Optional[ray.runtime.BaseContext]:
|
226
|
+
"""
|
227
|
+
Initialize DeltaCAT with a default local catalog.
|
228
|
+
|
229
|
+
This is a convenience function that creates a default catalog for local usage.
|
230
|
+
Equivalent to calling init(catalogs={"default": Catalog()}).
|
231
|
+
|
232
|
+
:param path: Optional path for catalog root directory. If not provided, uses
|
233
|
+
the default behavior of CatalogProperties (DELTACAT_ROOT env var or
|
234
|
+
"./.deltacat/").
|
235
|
+
:param ray_init_args: Keyword arguments to pass to `ray.init()`.
|
236
|
+
:param force: Whether to force DeltaCAT reinitialization. If True, reruns
|
237
|
+
ray.init(**ray_init_args) and overwrites all previously registered
|
238
|
+
catalogs.
|
239
|
+
:returns: The Ray context object if Ray was initialized, otherwise None.
|
240
|
+
"""
|
241
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
242
|
+
|
243
|
+
config = CatalogProperties(root=path) if path is not None else None
|
244
|
+
return init(
|
245
|
+
catalogs={"default": Catalog(config=config)},
|
246
|
+
default="default",
|
247
|
+
ray_init_args=ray_init_args,
|
248
|
+
force=force,
|
83
249
|
)
|
250
|
+
|
251
|
+
|
252
|
+
def get_catalog(name: Optional[str] = None) -> Catalog:
    """
    Get a catalog by name, or the default catalog if no name is provided.

    Args:
        name: Name of catalog to retrieve (optional, uses default if not provided)

    Returns:
        The requested Catalog.

    Raises:
        ValueError: If no catalogs have been registered, if `name` is given but
            no catalog is registered under it, or if `name` is omitted and no
            default catalog is set.
    """
    global all_catalogs

    if not all_catalogs:
        raise ValueError(
            "No catalogs available! Call "
            "`deltacat.init(catalogs={...})` to register one or more "
            "catalogs then retry."
        )
    if name is not None:
        catalog = ray.get(all_catalogs.get.remote(name))
        if not catalog:
            # Report catalog *names* (the mapping keys), matching the
            # default-catalog branch below, instead of dumping the Catalog
            # objects themselves into the message.
            available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
            # NOTE: put_catalog() matches on the "not found" wording of this
            # message, so keep that phrase intact.
            raise ValueError(
                f"Catalog '{name}' not found. Available catalogs: "
                f"{available_catalogs}."
            )
    else:
        catalog = ray.get(all_catalogs.default.remote())
        if not catalog:
            available_catalogs = list(ray.get(all_catalogs.all.remote()).keys())
            raise ValueError(
                f"Call to get_catalog without name set failed because there "
                f"is no default Catalog set. Available catalogs: "
                f"{available_catalogs}."
            )
    return catalog
|
288
|
+
|
289
|
+
|
290
|
+
def clear_catalogs() -> None:
    """
    Remove every catalog from the global map of named catalogs.

    Does nothing when the global catalog registry is unset.
    """
    if not all_catalogs:
        return
    # Block until the registry has finished clearing itself.
    pending = all_catalogs.clear.remote()
    ray.get(pending)
|
296
|
+
|
297
|
+
|
298
|
+
def pop_catalog(name: str) -> Optional[Catalog]:
    """
    Detach a named catalog from the global map of named catalogs.

    Args:
        name: Name of the catalog to remove.

    Returns:
        The removed catalog, or None if not found (including when the global
        catalog registry is unset).
    """
    global all_catalogs

    if all_catalogs:
        # Wait for the registry to hand back whatever it removed.
        return ray.get(all_catalogs.pop.remote(name))
    return None
|
314
|
+
|
315
|
+
|
316
|
+
def put_catalog(
    name: str,
    catalog: Optional[Catalog] = None,
    *,
    default: bool = False,
    ray_init_args: Dict[str, Any] = {},
    fail_if_exists: bool = False,
    **kwargs,
) -> Catalog:
    """
    Add a named catalog to the global map of named catalogs. Initializes
    DeltaCAT if not already initialized.

    Args:
        name: Name of the catalog.
        catalog: Catalog instance to use. If none is provided, then all
            additional keyword arguments will be forwarded to
            `CatalogProperties` for a default DeltaCAT native Catalog.
        default: Make this the default catalog if multiple catalogs are
            available. If only one catalog is available, it will always be the
            default.
        ray_init_args: Ray initialization args (used only if ray is not already
            initialized). The mapping is copied before it is forwarded, so it
            is never modified.
        fail_if_exists: if True, raises an error if a catalog with the given
            name already exists. If False, inserts or replaces the given
            catalog name.
        kwargs: Additional keyword arguments to forward to `CatalogProperties`
            for a default DeltaCAT native Catalog.

    Returns:
        The catalog put in the named catalog map.

    Raises:
        ValueError: If `name` is None, or if `fail_if_exists` is True and a
            catalog is already registered under `name`.
    """
    global all_catalogs

    # Validate the name before doing any work so that no Catalog is
    # constructed for a call that is going to be rejected anyway.
    if name is None:
        raise ValueError("Catalog name cannot be None")
    if not catalog:
        catalog = Catalog(**kwargs)

    # Initialize, if necessary
    if not is_initialized():
        # We are initializing a single catalog - make it the default
        if not default:
            logger.info(
                f"Calling put_catalog with default=False, "
                f"but still setting Catalog {catalog} as default since it is "
                f"the only catalog."
            )
        # Forward a copy: the initializer assigns keys on this mapping, which
        # would otherwise leak into the shared default dict.
        init({name: catalog}, ray_init_args=dict(ray_init_args))
        return catalog

    # Fail if fail_if_exists and catalog already exists
    if fail_if_exists:
        try:
            get_catalog(name)
        except ValueError as e:
            if "not found" not in str(e):
                # Re-raise if it's not a "catalog not found" error
                raise
            # Catalog doesn't exist - fall through and register it.
        else:
            # Raised outside the try body so the handler above can never
            # swallow it (e.g. when the catalog name itself contains the
            # text "not found").
            raise ValueError(
                f"Failed to put catalog {name} because it already exists and "
                f"fail_if_exists={fail_if_exists}"
            )

    # Add the catalog (which may overwrite existing if fail_if_exists=False)
    ray.get(all_catalogs.put.remote(name, catalog, default))
    return catalog
|
@@ -0,0 +1,155 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import Optional, Any
|
4
|
+
import urllib.parse
|
5
|
+
|
6
|
+
import os
|
7
|
+
|
8
|
+
import pyarrow
|
9
|
+
from deltacat.constants import DELTACAT_ROOT
|
10
|
+
|
11
|
+
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
12
|
+
|
13
|
+
|
14
|
+
def get_catalog_properties(
    *,
    catalog: Optional[CatalogProperties] = None,
    inner: Optional[CatalogProperties] = None,
    **kwargs,
) -> CatalogProperties:
    """
    Resolve the CatalogProperties instance to work with.

    The "catalog" argument is consulted first and "inner" second. When neither
    is supplied, all remaining keyword arguments are passed to the
    CatalogProperties constructor.

    Args:
        catalog: Preferred CatalogProperties instance, if any.
        inner: Fallback CatalogProperties instance used when "catalog" is None.
        kwargs: Constructor arguments used only when both of the above are None.

    Returns:
        The supplied CatalogProperties, or a newly constructed one.

    Raises:
        ValueError: If the selected argument is not a CatalogProperties.
    """
    candidate = inner if catalog is None else catalog

    # Nothing was handed in: build a fresh instance from the keyword args.
    if candidate is None:
        return CatalogProperties(**kwargs)

    if isinstance(candidate, CatalogProperties):
        return candidate

    raise ValueError(
        f"Expected catalog properties of type {CatalogProperties.__name__} "
        f"but found {type(candidate)}."
    )
|
37
|
+
|
38
|
+
|
39
|
+
class CatalogProperties:
    """
    DeltaCAT catalog properties used to deterministically resolve a durable
    DeltaCAT catalog instance. Explicit constructor arguments take precedence;
    otherwise values fall back to defaults as described below.

    Catalog and storage APIs rely on the property catalog to retrieve durable
    state about the catalog they're working against.

    Attributes:
        root: The root path for catalog metadata and data storage. Resolved by
            searching for the root path in the following order:
            1. "root" constructor input argument
            2. the DELTACAT_ROOT constant from deltacat.constants (presumably
               populated from the "DELTACAT_ROOT" environment variable)
            3. a ".deltacat" directory under the current working directory

        filesystem: The filesystem implementation that should be used for
            reading/writing files. If None, a filesystem will be inferred from
            the catalog root path.

        storage: Storage class implementation (overrides default filesystem
            storage impl)
    """

    def __init__(
        self,
        root: Optional[str] = None,
        filesystem: Optional[pyarrow.fs.FileSystem] = None,
        storage=None,
    ):
        """
        Initialize a CatalogProperties instance.

        Args:
            root: Catalog root directory path. Falls back to DELTACAT_ROOT when
                not given, and to "<current working directory>/.deltacat" when
                that is empty as well.
            filesystem: The filesystem implementation that should be used for
                reading these files. Passed, together with the root, to
                resolve_path_and_filesystem(), which returns the root and
                filesystem actually stored on this instance.
            storage: DeltaCAT storage implementation override.
        """
        if root is None:
            # No explicit root: prefer the configured constant, and only touch
            # the working directory when that constant is empty.
            root = DELTACAT_ROOT or os.path.join(os.getcwd(), ".deltacat")

        # Remember the root exactly as given, plus its URI scheme, so that
        # scheme-qualified paths can be rebuilt later.
        self._original_root = root
        self._original_scheme = urllib.parse.urlparse(root).scheme

        self._root, self._filesystem = resolve_path_and_filesystem(
            path=root,
            filesystem=filesystem,
        )
        self._storage = storage

    @property
    def root(self) -> str:
        """Catalog root as returned by resolve_path_and_filesystem()."""
        return self._root

    @property
    def filesystem(self) -> Optional[pyarrow.fs.FileSystem]:
        """Filesystem as returned by resolve_path_and_filesystem()."""
        return self._filesystem

    @property
    def storage(self) -> Optional[Any]:
        """
        Return overridden storage impl, if any
        """
        return self._storage

    def reconstruct_full_path(self, path: str) -> str:
        """
        Rebuild a scheme-qualified path for external readers.

        This addresses GitHub issue #567 by ensuring that cloud storage URIs
        include the relevant scheme prefix (e.g., s3://) that some file readers
        require regardless of the filesystem being used to read the file
        (e.g., Daft).

        Args:
            path: Path to qualify. Only the scheme of the original catalog root
                is prepended; the root path itself is not joined onto it.

        Returns:
            `path` unchanged when it already carries a scheme or when the
            catalog root had none; otherwise `path` prefixed with the root's
            original scheme.
        """
        # Already qualified, or a scheme-less (local) catalog: nothing to add.
        if urllib.parse.urlparse(path).scheme or not self._original_scheme:
            return path

        # A leading "/" already supplies one of the two slashes that follow
        # the scheme, so only ":/" is inserted in that case.
        separator = ":/" if path.startswith("/") else "://"
        return f"{self._original_scheme}{separator}{path}"

    def __str__(self):
        class_name = self.__class__.__name__
        return f"{class_name}(root={self.root}, filesystem={self.filesystem})"

    def __repr__(self):
        return self.__str__()
|
@@ -1,19 +1,30 @@
|
|
1
1
|
# Allow self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
+
from typing import Optional, Any
|
5
|
+
|
4
6
|
from deltacat.storage import Stream, Table, TableVersion
|
7
|
+
from deltacat.storage.model.scan.push_down import Pushdown
|
8
|
+
from deltacat.storage.model.scan.scan_plan import ScanPlan
|
9
|
+
from deltacat.storage.util.scan_planner import ScanPlanner
|
5
10
|
|
6
11
|
|
7
12
|
class TableDefinition(dict):
|
8
13
|
@staticmethod
|
9
14
|
def of(
|
10
|
-
table: Table,
|
15
|
+
table: Table,
|
16
|
+
table_version: TableVersion,
|
17
|
+
stream: Stream,
|
18
|
+
native_object: Optional[Any] = None,
|
19
|
+
scan_planner: Optional[ScanPlanner] = None,
|
11
20
|
) -> TableDefinition:
|
12
21
|
return TableDefinition(
|
13
22
|
{
|
14
23
|
"table": table,
|
15
24
|
"tableVersion": table_version,
|
16
25
|
"stream": stream,
|
26
|
+
"nativeObject": native_object,
|
27
|
+
"scan_planner": scan_planner,
|
17
28
|
}
|
18
29
|
)
|
19
30
|
|
@@ -28,3 +39,23 @@ class TableDefinition(dict):
|
|
28
39
|
@property
|
29
40
|
def stream(self) -> Stream:
|
30
41
|
return self["stream"]
|
42
|
+
|
43
|
+
@property
|
44
|
+
def native_object(self) -> Optional[Any]:
|
45
|
+
return self.get("nativeObject")
|
46
|
+
|
47
|
+
@property
|
48
|
+
def scan_planner(self) -> Optional[ScanPlanner]:
|
49
|
+
return self.get("scan_planner")
|
50
|
+
|
51
|
+
def create_scan_plan(self, pushdown: Optional[Pushdown] = None) -> ScanPlan:
|
52
|
+
if not self.scan_planner:
|
53
|
+
raise RuntimeError(
|
54
|
+
f"ScanPlanner is not initialized for table '{self.table.table_name}' "
|
55
|
+
f"of namespace '{self.table.namespace}'"
|
56
|
+
)
|
57
|
+
return self.scan_planner.create_scan_plan(
|
58
|
+
table_name=self.table.table_name,
|
59
|
+
namespace=self.table.namespace,
|
60
|
+
pushdown=pushdown,
|
61
|
+
)
|
deltacat/compute/__init__.py
CHANGED