deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/storage/main/impl.py
@@ -0,0 +1,3030 @@
import logging
import uuid
import posixpath
import pyarrow

from typing import Any, Callable, Dict, List, Optional, Union, Tuple

from deltacat.catalog.model.properties import get_catalog_properties
from deltacat.constants import (
    DEFAULT_TABLE_VERSION,
    DATA_FILE_DIR_NAME,
)
from deltacat.exceptions import (
    TableNotFoundError,
    TableVersionNotFoundError,
    DeltaCatError,
    UnclassifiedDeltaCatError,
    SchemaValidationError,
    StreamNotFoundError,
    PartitionNotFoundError,
    DeltaNotFoundError,
    NamespaceNotFoundError,
    TableValidationError,
    ConcurrentModificationError,
    ObjectAlreadyExistsError,
    NamespaceAlreadyExistsError,
    TableAlreadyExistsError,
    TableVersionAlreadyExistsError,
    ObjectNotFoundError,
)
from deltacat.storage.model.manifest import (
    EntryParams,
    EntryType,
    ManifestAuthor,
    ManifestEntryList,
    ManifestEntry,
)
from deltacat.storage.model.delta import (
    Delta,
    DeltaLocator,
    DeltaProperties,
    DeltaType,
)
from deltacat.storage.model.transaction import setup_transaction
from deltacat.storage.model.types import (
    CommitState,
    DistributedDataset,
    LifecycleState,
    LocalDataset,
    LocalTable,
    TransactionOperationType,
    StreamFormat,
)
from deltacat.storage.model.list_result import ListResult
from deltacat.storage.model.namespace import (
    Namespace,
    NamespaceLocator,
    NamespaceProperties,
)
from deltacat.storage.model.partition import (
    Partition,
    PartitionLocator,
    PartitionScheme,
    PartitionValues,
    UNPARTITIONED_SCHEME,
    UNPARTITIONED_SCHEME_ID,
)
from deltacat.storage.model.schema import Schema
from deltacat.storage.model.sort_key import (
    SortScheme,
    UNSORTED_SCHEME,
)
from deltacat.storage.model.stream import (
    Stream,
    StreamLocator,
)
from deltacat.storage.model.table import (
    Table,
    TableProperties,
    TableLocator,
)
from deltacat.storage.model.table_version import (
    TableVersion,
    TableVersionProperties,
    TableVersionLocator,
)
from deltacat.storage.model.metafile import (
    Metafile,
)
from deltacat.storage.model.transaction import (
    TransactionOperation,
    Transaction,
)
from deltacat.storage.model.manifest import Manifest
from deltacat.types.media import (
    ContentType,
    DatasetType,
    DistributedDatasetType,
    StorageType,
    ContentEncoding,
)
from deltacat.utils.common import ReadKwargsProvider
import pyarrow as pa

from deltacat.types.tables import (
    TableProperty,
    get_table_writer,
    get_table_slicer,
    write_sliced_table,
    download_manifest_entries,
    download_manifest_entries_distributed,
    download_manifest_entry,
)
from deltacat import logs

logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


def _normalize_partition_values(
    partition_values: Optional[PartitionValues],
) -> Optional[PartitionValues]:
    """
    Normalize partition values to ensure consistent representation of
    unpartitioned data.

    Both None and the empty list [] represent unpartitioned data, but they
    should be normalized to None for consistent lookup and validation.

    Args:
        partition_values: The partition values to normalize

    Returns:
        None for unpartitioned data (both None and [] inputs),
        the original value for partitioned data
    """
    if partition_values is None or (
        isinstance(partition_values, list) and len(partition_values) == 0
    ):
        return None
    return partition_values

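An aside on the helper above: both unpartitioned spellings collapse to None, which is what the unpartitioned-table paths below rely on. A minimal sketch of the contract:

assert _normalize_partition_values(None) is None  # unpartitioned
assert _normalize_partition_values([]) is None  # also unpartitioned
assert _normalize_partition_values(["2024-01-01"]) == ["2024-01-01"]
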
def _list(
    metafile: Metafile,
    txn_op_type: TransactionOperationType,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> ListResult[Metafile]:
    catalog_properties = get_catalog_properties(**kwargs)
    limit = kwargs.get("limit") or None

    operation = TransactionOperation.of(
        operation_type=txn_op_type,
        dest_metafile=metafile,
        read_limit=limit,
    )

    if transaction is not None:
        # Add the read operation to the existing transaction and return the result
        return transaction.step(operation)
    else:
        # Create and commit a new transaction (legacy behavior)
        new_transaction = Transaction.of([operation])
        list_results_per_op = new_transaction.commit(
            catalog_root_dir=catalog_properties.root,
            filesystem=catalog_properties.filesystem,
        )
        return list_results_per_op[0]


def _latest(
    metafile: Metafile,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[Metafile]:
    list_results = _list(
        metafile=metafile,
        txn_op_type=TransactionOperationType.READ_LATEST,
        transaction=transaction,
        *args,
        **kwargs,
    )
    results = list_results.all_items()
    return results[0] if results else None


def _exists(
    metafile: Metafile,
    *args,
    **kwargs,
) -> Optional[bool]:
    list_results = _list(
        metafile=metafile,
        txn_op_type=TransactionOperationType.READ_EXISTS,
        *args,
        **kwargs,
    )
    results = list_results.all_items()
    return True if results else False

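An aside on the three read helpers above: every read funnels through a single TransactionOperation, which is either appended to a caller-supplied interactive transaction or committed as a fresh single-operation transaction. A sketch of both paths using only names defined in this file (the catalog root and filesystem are hypothetical placeholders for values normally resolved from catalog properties):

# A read operation targeting a placeholder namespace metafile, as in
# list_namespaces() below.
op = TransactionOperation.of(
    operation_type=TransactionOperationType.READ_LATEST,
    dest_metafile=Namespace.of(NamespaceLocator.of("placeholder")),
    read_limit=None,
)

# Legacy path: commit a new single-operation transaction immediately.
txn = Transaction.of([op])
list_result = txn.commit(
    catalog_root_dir="/tmp/deltacat",  # hypothetical catalog root
    filesystem=None,  # normally catalog_properties.filesystem
)[0]

# Interactive path: add the read to an in-flight transaction instead.
# list_result = existing_transaction.step(op)
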
def _resolve_latest_active_table_version_id(
    namespace: str,
    table_name: str,
    *args,
    fail_if_no_active_table_version: bool = True,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[str]:
    table = get_table(
        namespace=namespace,
        table_name=table_name,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not table:
        raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
    if fail_if_no_active_table_version and not table.latest_active_table_version:
        raise TableVersionNotFoundError(
            f"Table has no active table version: {namespace}.{table_name}"
        )
    return table.latest_active_table_version


def _resolve_latest_table_version_id(
    namespace: str,
    table_name: str,
    fail_if_no_active_table_version: bool = True,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[str]:
    table = get_table(
        namespace=namespace,
        table_name=table_name,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not table:
        raise TableNotFoundError(f"Table does not exist: {namespace}.{table_name}")
    if fail_if_no_active_table_version and not table.latest_table_version:
        raise TableVersionNotFoundError(
            f"Table has no table version: {namespace}.{table_name}"
        )
    return table.latest_table_version

def _validate_schemes_against_schema(
    schema: Optional[Schema],
    partition_scheme: Optional[PartitionScheme],
    sort_scheme: Optional[SortScheme],
) -> None:
    """
    Validates partition and sort schemes against a schema, ensuring all
    referenced fields exist. If schema is None, validation is skipped.
    """
    if schema is None:
        return

    schema_fields = set(field.name for field in schema.arrow)

    # Validate partition scheme
    if partition_scheme is not None and partition_scheme.keys is not None:
        for key in partition_scheme.keys:
            if key.key[0] not in schema_fields:
                raise SchemaValidationError(
                    f"Partition key field '{key.key[0]}' not found in schema"
                )

    # Validate sort scheme
    if sort_scheme is not None and sort_scheme.keys is not None:
        for key in sort_scheme.keys:
            if key.key[0] not in schema_fields:
                raise SchemaValidationError(
                    f"Sort key field '{key.key[0]}' not found in schema"
                )

def _validate_partition_values_against_scheme(
    partition_values: Optional[PartitionValues],
    partition_scheme: PartitionScheme,
    schema: Optional[Schema],
) -> None:
    """
    Validates that partition values match the data types of the partition key
    fields in the schema.

    Args:
        partition_values: List of partition values to validate
        partition_scheme: The partition scheme containing the keys to validate against
        schema: The schema containing the field types to validate against

    Raises:
        TableValidationError: If validation fails
    """
    if not partition_values:
        raise TableValidationError("Partition values cannot be empty")

    if not schema:
        raise TableValidationError(
            "Table version must have a schema to validate partition values"
        )

    if len(partition_values) != len(partition_scheme.keys):
        raise TableValidationError(
            f"Number of partition values ({len(partition_values)}) does not match "
            f"number of partition keys ({len(partition_scheme.keys)})"
        )

    # Validate each partition value against its corresponding field type
    for i in range(len(partition_scheme.keys)):
        field_type = partition_scheme.keys[i].transform.return_type
        partition_value = partition_values[i]
        if field_type is None:
            # the transform returns the same type as the source schema type
            # (which also implies that it is a single-key transform)
            field_type = schema.field(partition_scheme.keys[i].key[0]).arrow.type
        try:
            # Try to convert the value to PyArrow to validate its type
            pa.array([partition_value], type=field_type)
            # If successful, the type is valid
        except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
            raise TableValidationError(
                f"Partition value {partition_value} (type {type(partition_value)}) "
                f"incompatible with partition transform return type {field_type}"
            ) from e

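An aside on the validation above: the pa.array probe is the entire type check, so a partition value is accepted exactly when PyArrow can coerce it to the transform's return type. A standalone illustration in plain PyArrow:

import pyarrow as pa

pa.array([20240101], type=pa.int64())  # succeeds: value matches the field type
try:
    pa.array(["not-a-number"], type=pa.int64())  # wrong type for the field
except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError):
    pass  # the same failure surfaced above as TableValidationError
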
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """
    Lists a page of table namespaces. Namespaces are returned as list result
    items.
    """
    return _list(
        metafile=Namespace.of(NamespaceLocator.of("placeholder")),
        txn_op_type=TransactionOperationType.READ_SIBLINGS,
        *args,
        **kwargs,
    )


def list_tables(namespace: str, *args, **kwargs) -> ListResult[Table]:
    """
    Lists a page of tables for the given table namespace. Tables are returned
    as list result items. Raises an error if the given namespace does not
    exist.
    """
    locator = TableLocator.at(namespace=namespace, table_name="placeholder")
    try:
        return _list(
            metafile=Table.of(locator=locator),
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise NamespaceNotFoundError(f"Namespace {namespace} not found") from e


def list_table_versions(
    namespace: str,
    table_name: str,
    *args,
    **kwargs,
) -> ListResult[TableVersion]:
    """
    Lists a page of table versions for the given table. Table versions are
    returned as list result items. Raises an error if the given table does not
    exist.
    """
    locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version="placeholder.0",
    )
    table_version = TableVersion.of(
        locator=locator,
        schema=None,
    )
    try:
        return _list(
            metafile=table_version,
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise TableNotFoundError(f"Table {namespace}.{table_name} not found") from e


def list_streams(
    namespace: str,
    table_name: str,
    table_version: str,
    *args,
    **kwargs,
) -> ListResult[Stream]:
    """
    Lists a page of streams for the given table version.
    Raises an error if the table version does not exist.
    """
    # TODO(pdames): Support listing uncommitted streams.
    locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id="placeholder",
        stream_format=None,
    )
    stream = Stream.of(
        locator=locator,
        partition_scheme=None,
    )
    try:
        return _list(
            stream,
            TransactionOperationType.READ_SIBLINGS,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise TableVersionNotFoundError(
            f"Table version {namespace}.{table_name}.{table_version} not found"
        ) from e

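An aside on the listing functions above: each returns a ListResult page whose items can be drained with all_items(). A hedged usage sketch (the namespace and table names are hypothetical, and any catalog connection kwargs are whatever get_catalog_properties accepts in a given deployment):

from deltacat.storage.main import impl as storage

namespaces = storage.list_namespaces().all_items()  # page of Namespace
tables = storage.list_tables("my_namespace").all_items()  # page of Table
table_versions = storage.list_table_versions(
    "my_namespace",
    "my_table",
).all_items()  # page of TableVersion
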
def list_partitions(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> ListResult[Partition]:
    """
    Lists a page of partitions for the given table version. Partitions are
    returned as list result items. Table version resolves to the latest active
    table version if not specified. Raises an error if the table version does
    not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    if not namespace:
        raise ValueError("Namespace cannot be empty.")
    if not table_name:
        raise ValueError("Table name cannot be empty.")
    # resolve default deltacat stream for the given namespace, table name, and table version
    # TODO(pdames): debug why this doesn't work when only the table_version is provided
    #  and PartitionLocator.stream_format is hard-coded to deltacat (we should be able
    #  to resolve the default deltacat stream automatically)
    stream = get_stream(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not stream:
        raise StreamNotFoundError(
            f"Default stream for {namespace}.{table_name}.{table_version} not found."
        )
    locator = PartitionLocator.of(
        stream_locator=stream.locator,
        partition_values=["placeholder"],
        partition_id="placeholder",
    )
    partition = Partition.of(
        locator=locator,
        content_types=None,
    )
    try:
        result = _list(
            metafile=partition,
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            transaction=transaction,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise StreamNotFoundError(f"Stream {stream.locator} not found") from e

    if commit_transaction:
        transaction.seal()
    return result


def list_stream_partitions(stream: Stream, *args, **kwargs) -> ListResult[Partition]:
    """
    Lists all partitions committed to the given stream.
    """
    # TODO(pdames): Support listing uncommitted partitions.
    if stream.stream_format != StreamFormat.DELTACAT:
        raise ValueError(
            f"Unsupported stream format: {stream.stream_format}. "
            f"Expected stream format: {StreamFormat.DELTACAT}"
        )
    locator = PartitionLocator.of(
        stream_locator=stream.locator,
        partition_values=["placeholder"],
        partition_id="placeholder",
    )
    partition = Partition.of(
        locator=locator,
        content_types=None,
    )
    try:
        return _list(
            metafile=partition,
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise StreamNotFoundError(f"Stream {stream.locator} not found") from e

def list_deltas(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: Optional[bool] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> ListResult[Delta]:
    """
    Lists a page of deltas for the given table version and committed partition.
    Deltas are returned as list result items. Deltas returned can optionally be
    limited to inclusive first and last stream positions. Deltas are returned
    in descending stream position order by default. Table version resolves to
    the latest active table version if not specified. Partition values should
    not be specified for unpartitioned tables. Partition scheme ID resolves to
    the table version's current partition scheme by default. Raises an error if
    the given table version or partition does not exist.

    To conserve memory, the deltas returned do not include manifests by
    default. The manifests can either be optionally retrieved as part of this
    call or lazily loaded via subsequent calls to `get_delta_manifest`.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    # TODO(pdames): Delta listing should ideally either use an efficient
    #  range-limited dir listing of partition children between start and end
    #  positions, or should traverse using Partition.stream_position (to
    #  resolve last stream position) and Delta.previous_stream_position
    #  (down to first stream position).

    # First get the stream to resolve the proper table version and stream locator
    stream = get_stream(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not stream:
        raise StreamNotFoundError(
            f"Failed to resolve stream for "
            f"`{namespace}.{table_name}` at table version "
            f"`{table_version or 'latest'}` (no stream found)."
        )

    # Then get the actual partition to ensure we have the real partition locator with ID
    partition = get_partition(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not partition:
        raise PartitionNotFoundError(
            f"Failed to find partition for stream {stream.locator} "
            f"with partition_values={partition_values} and "
            f"partition_scheme_id={partition_scheme_id}"
        )

    # Use the actual partition locator (with partition ID) for listing deltas
    locator = DeltaLocator.of(partition_locator=partition.locator)
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    try:
        all_deltas_list_result: ListResult[Delta] = _list(
            metafile=delta,
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            transaction=transaction,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise PartitionNotFoundError(f"Partition {partition.locator} not found") from e
    all_deltas = all_deltas_list_result.all_items()
    filtered_deltas = [
        delta
        for delta in all_deltas
        if (
            first_stream_position is None
            or first_stream_position <= delta.stream_position
        )
        and (
            last_stream_position is None
            or delta.stream_position <= last_stream_position
        )
    ]
    # Sort deltas by stream position in the requested order
    filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)

    if commit_transaction:
        transaction.seal()
    # Wrap the filtered deltas to match the declared ListResult return type
    return ListResult.of(
        items=filtered_deltas,
        pagination_key=None,
        next_page_provider=None,
    )

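An aside on list_deltas above: the stream-position bounds are applied in memory after the READ_SIBLINGS listing (see the TODO about range-limited listing). A sketch of an inclusive range query against an unpartitioned table (identifiers hypothetical):

deltas = list_deltas(
    namespace="my_namespace",
    table_name="my_table",
    partition_values=None,  # unpartitioned table
    first_stream_position=100,  # inclusive lower bound
    last_stream_position=200,  # inclusive upper bound
    ascending_order=True,
).all_items()
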
def list_partition_deltas(
    partition_like: Union[Partition, PartitionLocator],
    first_stream_position: Optional[int] = None,
    last_stream_position: Optional[int] = None,
    ascending_order: bool = False,
    include_manifest: bool = False,
    *args,
    **kwargs,
) -> ListResult[Delta]:
    """
    Lists a page of deltas committed to the given partition.

    To conserve memory, the deltas returned do not include manifests by
    default. The manifests can either be optionally retrieved as part of this
    call or lazily loaded via subsequent calls to `get_delta_manifest`.
    """
    # TODO(pdames): Delta listing should ideally either use an efficient
    #  range-limited dir listing of partition children between start and end
    #  positions, or should traverse using Partition.stream_position (to
    #  resolve last stream position) and Delta.previous_stream_position
    #  (down to first stream position).
    locator = DeltaLocator.of(
        partition_locator=partition_like
        if isinstance(partition_like, PartitionLocator)
        else partition_like.locator,
        stream_position=None,
    )
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    try:
        all_deltas_list_result: ListResult[Delta] = _list(
            metafile=delta,
            txn_op_type=TransactionOperationType.READ_SIBLINGS,
            *args,
            **kwargs,
        )
    except ObjectNotFoundError as e:
        raise PartitionNotFoundError(
            f"Partition {partition_like.locator} not found"
        ) from e
    all_deltas = all_deltas_list_result.all_items()
    filtered_deltas = [
        delta
        for delta in all_deltas
        if (
            first_stream_position is None
            or first_stream_position <= delta.stream_position
        )
        and (
            last_stream_position is None
            or delta.stream_position <= last_stream_position
        )
    ]
    # Sort deltas by stream position in the requested order
    filtered_deltas.sort(reverse=(not ascending_order), key=lambda d: d.stream_position)
    return ListResult.of(
        items=filtered_deltas,
        pagination_key=None,
        next_page_provider=None,
    )

def get_delta(
    namespace: str,
    table_name: str,
    stream_position: int,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[Delta]:
    """
    Gets the delta for the given table version, partition, and stream position.
    Table version resolves to the latest active table version if not specified.
    Partition values should not be specified for unpartitioned tables. Partition
    scheme ID resolves to the table version's current partition scheme by
    default. Raises an error if the given table version or partition does not
    exist.

    To conserve memory, the delta returned does not include a manifest by
    default. The manifest can either be optionally retrieved as part of this
    call or lazily loaded via a subsequent call to `get_delta_manifest`.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    # TODO(pdames): Honor `include_manifest` param.

    # First get the stream to resolve the proper table version and stream locator
    stream = get_stream(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not stream:
        raise StreamNotFoundError(
            f"Failed to resolve stream for "
            f"`{namespace}.{table_name}` at table version "
            f"`{table_version or 'latest'}` (no stream found)."
        )

    # Then get the actual partition to ensure we have the real partition locator with ID
    partition = get_partition(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not partition:
        raise PartitionNotFoundError(
            f"Failed to find partition for stream {stream.locator} "
            f"with partition_values={partition_values} and "
            f"partition_scheme_id={partition_scheme_id}"
        )

    # Use the actual partition locator (with partition ID) for getting the delta
    locator = DeltaLocator.of(
        partition_locator=partition.locator,
        stream_position=stream_position,
    )
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    result = _latest(
        metafile=delta,
        transaction=transaction,
        *args,
        **kwargs,
    )

    # TODO(pdames): Honor the include_manifest parameter during retrieval from
    #  _latest, since the point is to avoid loading the manifest into memory if
    #  it's not needed.
    if result and not include_manifest:
        result.manifest = None

    if commit_transaction:
        transaction.seal()
    return result

def get_latest_delta(
    namespace: str,
    table_name: str,
    partition_values: Optional[PartitionValues] = None,
    table_version: Optional[str] = None,
    include_manifest: bool = False,
    partition_scheme_id: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[Delta]:
    """
    Gets the latest delta (i.e. the delta with the greatest stream position) for
    the given table version and partition. Table version resolves to the latest
    active table version if not specified. Partition values should not be
    specified for unpartitioned tables. Partition scheme ID resolves to the
    table version's current partition scheme by default. Raises an error if the
    given table version or partition does not exist.

    To conserve memory, the delta returned does not include a manifest by
    default. The manifest can either be optionally retrieved as part of this
    call or lazily loaded via a subsequent call to `get_delta_manifest`.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    stream = get_stream(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    partition = get_partition(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        transaction=transaction,
        *args,
        **kwargs,
    )
    locator = DeltaLocator.of(
        partition_locator=partition.locator,
        stream_position=partition.stream_position,
    )
    delta = Delta.of(
        locator=locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    result = _latest(
        metafile=delta,
        transaction=transaction,
        *args,
        **kwargs,
    )

    # TODO(pdames): Honor the include_manifest parameter during retrieval from
    #  _latest, since the point is to avoid loading the manifest into memory if
    #  it's not needed.
    if result and not include_manifest:
        result.manifest = None

    if commit_transaction:
        transaction.seal()
    return result

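An aside on the two delta getters above: both default to include_manifest=False and strip the manifest before returning, so manifests stay out of memory unless explicitly requested. A sketch (identifiers hypothetical):

latest = get_latest_delta(
    namespace="my_namespace",
    table_name="my_table",
)
assert latest is None or latest.manifest is None  # stripped by default

latest_with_manifest = get_latest_delta(
    namespace="my_namespace",
    table_name="my_table",
    include_manifest=True,  # keep the manifest inline (see TODO above)
)
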
def _download_delta_distributed(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    max_parallelism: Optional[int] = None,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: Optional[
        DistributedDatasetType
    ] = DistributedDatasetType.RAY_DATASET,
    **kwargs,
) -> DistributedDataset:
    distributed_dataset: DistributedDataset = download_manifest_entries_distributed(
        manifest=manifest,
        table_type=table_type,
        max_parallelism=max_parallelism,
        column_names=column_names,
        include_columns=include_columns,
        file_reader_kwargs_provider=file_reader_kwargs_provider,
        ray_options_provider=ray_options_provider,
        distributed_dataset_type=distributed_dataset_type,
        *args,
        **kwargs,
    )
    return distributed_dataset


def _download_delta_local(
    manifest: Manifest,
    table_type: DatasetType = DatasetType.PYARROW,
    max_parallelism: Optional[int] = None,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    **kwargs,
) -> LocalDataset:
    tables: LocalDataset = download_manifest_entries(
        manifest,
        table_type,
        max_parallelism if max_parallelism else 1,
        column_names,
        include_columns,
        file_reader_kwargs_provider,
        **kwargs,
    )
    return tables



def download_delta(
    delta_like: Union[Delta, DeltaLocator],
    table_type: DatasetType = DatasetType.PYARROW,
    storage_type: StorageType = StorageType.DISTRIBUTED,
    max_parallelism: Optional[int] = None,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
    file_path_column: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    all_column_names: Optional[List[str]] = None,
    **kwargs,
) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
    """
    Read the given delta or delta locator into either a list of tables
    resident in the local node's memory, or into a dataset distributed
    across this Ray cluster's object store memory. Ordered table N of a local
    table list, or ordered block N of a distributed dataset, always contains
    the contents of ordered delta manifest entry N.
    """
    # TODO (pdames): Cast delimited text types to the table's schema types
    # TODO (pdames): Deprecate this method and replace with `read_delta`
    # TODO (pdames): Replace dependence on TableType, StorageType, and
    #  DistributedDatasetType with DatasetType

    # if all column names are provided, then this is a pure manifest entry
    # download (no transaction needed)
    commit_transaction = False
    if not all_column_names:
        transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    storage_type_to_download_func = {
        StorageType.LOCAL: _download_delta_local,
        StorageType.DISTRIBUTED: _download_delta_distributed,
    }

    is_delta = isinstance(delta_like, Delta)
    is_delta_locator = isinstance(delta_like, DeltaLocator)

    delta_locator: Optional[DeltaLocator] = None
    if is_delta_locator:
        delta_locator = delta_like
    elif is_delta:
        delta_locator = Delta(delta_like).locator
    if not delta_locator:
        raise ValueError(
            f"Expected delta_like to be a Delta or DeltaLocator, but found "
            f"{type(delta_like)}."
        )

    # Get the manifest - if delta_like is a Delta with an inline manifest,
    # use it; otherwise fetch it from storage
    if is_delta and delta_like.manifest:
        manifest = delta_like.manifest
    elif all_column_names:
        raise ValueError(
            "All column names can only be specified with a delta with an inline manifest."
        )
    else:
        manifest = get_delta_manifest(
            delta_locator,
            transaction=transaction,
            *args,
            **kwargs,
        )
    all_column_names = all_column_names or None
    if not all_column_names:
        table_version_schema = get_table_version_schema(
            delta_locator.namespace,
            delta_locator.table_name,
            delta_locator.table_version,
            transaction=transaction,
            *args,
            **kwargs,
        )
        if table_version_schema and table_version_schema.arrow:
            all_column_names = [field.name for field in table_version_schema.arrow]
            if distributed_dataset_type == DatasetType.DAFT:
                # Daft needs the latest table version schema to properly
                # handle schema evolution
                kwargs["table_version_schema"] = table_version_schema.arrow
    elif distributed_dataset_type == DatasetType.DAFT:
        raise ValueError("All column names cannot be specified with Daft.")
    if columns:
        # Extract file_path_column since it's appended after reading each file
        columns_to_validate = (
            [col for col in columns if col != file_path_column]
            if file_path_column
            else columns
        )

        # Only validate columns if we have schema information
        # (all_column_names is not None)
        if all_column_names is not None:
            if not all(
                col in [col_name.lower() for col_name in all_column_names]
                for col in columns_to_validate
            ):
                raise SchemaValidationError(
                    f"One or more columns in {columns_to_validate} are not present in table "
                    f"version columns {all_column_names}"
                )
        columns = [column.lower() for column in columns]
    logger.debug(
        f"Reading {columns or 'all'} columns from table version column "
        f"names: {all_column_names}. "
    )

    # Filter out parameters that are already passed as positional/keyword
    # arguments to avoid "multiple values for argument" errors
    filtered_kwargs = {
        k: v
        for k, v in kwargs.items()
        if k
        not in [
            "manifest",
            "table_type",
            "max_parallelism",
            "column_names",
            "include_columns",
            "file_reader_kwargs_provider",
            "ray_options_provider",
            "distributed_dataset_type",
        ]
    }

    dataset = storage_type_to_download_func[storage_type](
        manifest,
        table_type,
        max_parallelism,
        all_column_names,
        columns,
        file_reader_kwargs_provider,
        ray_options_provider=ray_options_provider,
        distributed_dataset_type=distributed_dataset_type,
        file_path_column=file_path_column,
        **filtered_kwargs,
    )
    if commit_transaction:
        transaction.seal()
    return dataset
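

# Illustrative usage sketch (not part of the module): read one committed
# delta into local PyArrow tables (one per manifest entry), or into a
# distributed Daft dataframe. `delta` is assumed to be a previously committed
# Delta or DeltaLocator for a schema'd table, with Ray already initialized
# for the distributed path.
def _example_read_delta(delta: Delta):
    local_tables: LocalDataset = download_delta(
        delta,
        table_type=DatasetType.PYARROW,
        storage_type=StorageType.LOCAL,
        max_parallelism=4,
    )
    daft_dataframe = download_delta(
        delta,
        storage_type=StorageType.DISTRIBUTED,
        distributed_dataset_type=DistributedDatasetType.DAFT,
    )
    return local_tables, daft_dataframe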


def _download_manifest_entry(
    manifest_entry: ManifestEntry,
    table_type: DatasetType = DatasetType.PYARROW,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    content_type: Optional[ContentType] = None,
    content_encoding: Optional[ContentEncoding] = None,
    filesystem: Optional[pyarrow.fs.FileSystem] = None,
) -> LocalTable:
    return download_manifest_entry(
        manifest_entry,
        table_type,
        column_names,
        include_columns,
        file_reader_kwargs_provider,
        content_type,
        content_encoding,
        filesystem,
    )


def download_delta_manifest_entry(
    delta_like: Union[Delta, DeltaLocator],
    entry_index: int,
    table_type: DatasetType = DatasetType.PYARROW,
    columns: Optional[List[str]] = None,
    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
    *args,
    transaction: Optional[Transaction] = None,
    all_column_names: Optional[List[str]] = None,
    **kwargs,
) -> LocalTable:
    """
    Reads a single manifest entry into the specified table type for the
    given delta or delta locator. If a delta is provided with a non-empty
    manifest, then the entry is read from this manifest. Otherwise, the
    manifest is first retrieved, then the entry at the given index is read.

    NOTE: The entry will be read into the current node's memory.
    """
    # if all column names are provided, then this is a pure manifest entry
    # download (no transaction needed)
    commit_transaction = False
    if not all_column_names:
        transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    is_delta = isinstance(delta_like, Delta)
    is_delta_locator = isinstance(delta_like, DeltaLocator)

    delta_locator: Optional[DeltaLocator] = None
    if is_delta_locator:
        delta_locator = delta_like
    elif is_delta:
        delta_locator = Delta(delta_like).locator
    if not delta_locator:
        raise ValueError(
            f"Expected delta_like to be a Delta or DeltaLocator, but found "
            f"{type(delta_like)}."
        )

    if is_delta and delta_like.manifest:
        manifest = delta_like.manifest
    elif all_column_names:
        raise ValueError(
            "All column names can only be specified with a delta with an inline manifest."
        )
    else:
        manifest = get_delta_manifest(
            delta_locator,
            transaction=transaction,
            *args,
            **kwargs,
        )
    # TODO(pdames): Cache table version column names and only invoke when
    #  needed.
    all_column_names = all_column_names or get_table_version_column_names(
        delta_locator.namespace,
        delta_locator.table_name,
        delta_locator.table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if columns:
        if not all(
            col in [col_name.lower() for col_name in all_column_names]
            for col in columns
        ):
            raise SchemaValidationError(
                f"One or more columns in {columns} are not present in table "
                f"version columns {all_column_names}"
            )
        columns = [column.lower() for column in columns]
    logger.debug(
        f"Reading {columns or 'all'} columns from table version column "
        f"names: {all_column_names}. "
    )
    catalog_properties = get_catalog_properties(**kwargs)
    manifest_entry = _download_manifest_entry(
        manifest.entries[entry_index],
        table_type,
        all_column_names,
        columns,
        file_reader_kwargs_provider,
        filesystem=catalog_properties.filesystem,
    )
    if commit_transaction:
        transaction.seal()
    return manifest_entry
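

# Illustrative usage sketch (not part of the module): read only the first
# manifest entry of a delta, restricted to two hypothetical columns. Column
# names are lowercased and validated against the table version schema before
# the read runs.
def _example_read_first_entry(delta: Delta) -> LocalTable:
    return download_delta_manifest_entry(
        delta,
        entry_index=0,
        table_type=DatasetType.PYARROW,
        columns=["id", "value"],
    )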


def get_delta_manifest(
    delta_like: Union[Delta, DeltaLocator],
    *args,
    **kwargs,
) -> Manifest:
    """
    Get the manifest associated with the given delta or delta locator. This
    always retrieves the authoritative durable copy of the delta manifest, and
    never the local manifest defined for any input delta. Raises an error if
    the delta can't be found, or if it doesn't contain a manifest.
    """
    if isinstance(delta_like, Delta):
        delta_locator = delta_like.locator
    elif isinstance(delta_like, DeltaLocator):
        delta_locator = delta_like
    else:
        raise ValueError(
            f"Expected delta or delta locator, but got: {type(delta_like)}"
        )
    delta = Delta.of(
        locator=delta_locator,
        delta_type=None,
        meta=None,
        properties=None,
        manifest=None,
    )
    latest_delta: Delta = _latest(
        metafile=delta,
        *args,
        **kwargs,
    )
    if not latest_delta:
        raise DeltaNotFoundError(f"No delta found for locator: {delta_locator}")
    elif not latest_delta.manifest:
        raise DeltaNotFoundError(f"No manifest found for delta: {latest_delta}")
    return latest_delta.manifest
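

# Illustrative usage sketch (not part of the module): fetch the durable
# manifest for a delta locator and count its entries. Unlike reading
# `Delta.manifest` directly, this always resolves the committed copy.
def _example_count_manifest_entries(delta_locator: DeltaLocator) -> int:
    manifest = get_delta_manifest(delta_locator)
    return len(manifest.entries)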


def create_namespace(
    namespace: str,
    properties: Optional[NamespaceProperties] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Namespace:
    """
    Creates a table namespace with the given name and properties. Returns
    the created namespace.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    namespace = Namespace.of(
        locator=NamespaceLocator.of(namespace=namespace),
        properties=properties,
    )

    # Add the operation to the transaction
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=namespace,
        ),
    )

    if commit_transaction:
        transaction.seal()
    return namespace


def update_namespace(
    namespace: str,
    properties: Optional[NamespaceProperties] = None,
    new_namespace: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> None:
    """
    Updates a table namespace's name and/or properties. Raises an error if the
    given namespace does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    # Check if the namespace exists
    old_namespace_meta = get_namespace(
        namespace=namespace,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not old_namespace_meta:
        raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")

    # Create new namespace metadata
    new_namespace_meta: Namespace = Metafile.update_for(old_namespace_meta)
    if new_namespace:
        new_namespace_meta.locator.namespace = new_namespace
    if properties is not None:
        new_namespace_meta.properties = properties

    # Add the update operation to the transaction
    try:
        transaction.step(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=new_namespace_meta,
                src_metafile=old_namespace_meta,
            ),
        )
    except ObjectAlreadyExistsError as e:
        raise NamespaceAlreadyExistsError(
            f"Namespace {namespace} already exists"
        ) from e

    if commit_transaction:
        transaction.seal()
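

# Illustrative usage sketch (not part of the module): create a namespace and
# then rename it. Each call runs in its own transaction here since no shared
# transaction is passed; the namespace names are hypothetical.
def _example_namespace_lifecycle():
    create_namespace(namespace="analytics")
    update_namespace(namespace="analytics", new_namespace="analytics_v2")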


def create_table_version(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
    schema: Optional[Schema] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    table_version_description: Optional[str] = None,
    table_version_properties: Optional[TableVersionProperties] = None,
    table_description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    supported_content_types: Optional[List[ContentType]] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Tuple[Table, TableVersion, Stream]:
    """
    Create a table version with the given or CREATED lifecycle state and an
    empty delta stream. Table versions may be schemaless and unpartitioned to
    improve write performance, or have their writes governed by a schema and
    partition scheme to improve data consistency and read performance.

    Returns a tuple containing the created/updated table, table version, and
    stream (respectively).

    Raises an error if the given namespace does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    if not namespace_exists(
        namespace=namespace,
        transaction=transaction,
        *args,
        **kwargs,
    ):
        raise NamespaceNotFoundError(f"Namespace {namespace} does not exist")

    # Validate schemes against schema
    _validate_schemes_against_schema(schema, partition_scheme, sort_keys)

    # coerce unspecified partition schemes to the unpartitioned scheme
    partition_scheme = partition_scheme or UNPARTITIONED_SCHEME
    # coerce unspecified sort schemes to the unsorted scheme
    sort_keys = sort_keys or UNSORTED_SCHEME
    # check if a parent table and/or previous table version already exist
    prev_table_version = None
    prev_table = get_table(
        namespace=namespace,
        table_name=table_name,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not prev_table:
        # no parent table exists, so we'll create it in this transaction
        table_txn_op_type = TransactionOperationType.CREATE
        prev_table = None
        new_table = Table.of(
            locator=TableLocator.at(namespace=namespace, table_name=table_name),
        )
        table_version = table_version or DEFAULT_TABLE_VERSION
    else:
        # the parent table exists, so we'll update it in this transaction
        table_txn_op_type = TransactionOperationType.UPDATE
        new_table: Table = Metafile.update_for(prev_table)
        prev_table_version = prev_table.latest_table_version
        if not table_version:
            # generate the next table version ID
            table_version = TableVersion.next_version(prev_table_version)
        else:
            # ensure that the given table version number matches expectations
            expected_table_version = TableVersion.next_version(prev_table_version)
            _, version_number = TableVersion.parse_table_version(
                table_version,
            )
            _, expected_version_number = TableVersion.parse_table_version(
                expected_table_version,
            )
            if version_number != expected_version_number:
                raise TableValidationError(
                    f"Expected to create table version "
                    f"{expected_version_number} but found {version_number}.",
                )
    if table_description is not None:
        new_table.description = table_description
    if table_properties is not None:
        new_table.properties = table_properties
    new_table.latest_table_version = table_version
    new_table.latest_active_table_version = (
        table_version if lifecycle_state == LifecycleState.ACTIVE else None
    )
    locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
    )
    table_version = TableVersion.of(
        locator=locator,
        schema=schema,
        partition_scheme=partition_scheme,
        description=table_version_description,
        properties=table_version_properties,
        content_types=supported_content_types,
        sort_scheme=sort_keys,
        watermark=None,
        lifecycle_state=lifecycle_state,
        schemas=[schema] if schema else None,
        partition_schemes=[partition_scheme],
        sort_schemes=[sort_keys],
        previous_table_version=prev_table_version,
    )
    # create the table version's default deltacat stream in this transaction
    stream_locator = StreamLocator.of(
        table_version_locator=locator,
        stream_id=str(uuid.uuid4()),
        stream_format=StreamFormat.DELTACAT,
    )
    stream = Stream.of(
        locator=stream_locator,
        partition_scheme=partition_scheme,
        state=CommitState.COMMITTED,
        previous_stream_id=None,
        watermark=None,
    )
    # Add operations to the transaction
    transaction.step(
        TransactionOperation.of(
            operation_type=table_txn_op_type,
            dest_metafile=new_table,
            src_metafile=prev_table,
        ),
    )
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=table_version,
        ),
    )
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=stream,
        ),
    )

    if commit_transaction:
        transaction.seal()
    return new_table, table_version, stream
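

# Illustrative usage sketch (not part of the module): create the first
# version of a hypothetical `analytics.events` table governed by a prebuilt
# `Schema` (schema construction is out of scope here), activating it
# immediately. The returned stream is the version's default DeltaCAT stream.
def _example_create_first_table_version(schema: Schema):
    table, table_version, stream = create_table_version(
        namespace="analytics",
        table_name="events",
        schema=schema,
        lifecycle_state=LifecycleState.ACTIVE,
    )
    return table, table_version, stream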


def create_table(
    namespace: str,
    table_name: str,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Table:
    """
    Create a new table. Raises an error if the given table already exists.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    new_table: Table = Table.of(
        locator=TableLocator.at(namespace=namespace, table_name=table_name),
        description=description,
        properties=properties,
    )
    try:
        transaction.step(
            TransactionOperation.of(
                operation_type=TransactionOperationType.CREATE,
                dest_metafile=new_table,
            ),
        )
    except ObjectAlreadyExistsError as e:
        raise TableAlreadyExistsError(
            f"Table {namespace}.{table_name} already exists"
        ) from e

    if commit_transaction:
        transaction.seal()
    return new_table


def update_table(
    namespace: str,
    table_name: str,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    new_table_name: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Table:
    """
    Update table metadata describing the table versions it contains. By
    default, a table's properties are empty, and its description is equal to
    that given when its first table version was created. Raises an error if
    the given table does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    old_table = get_table(
        namespace=namespace,
        table_name=table_name,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not old_table:
        raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")
    new_table: Table = Metafile.update_for(old_table)
    new_table.description = description or old_table.description
    new_table.properties = properties or old_table.properties
    new_table.table_name = new_table_name or old_table.table_name

    try:
        transaction.step(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=new_table,
                src_metafile=old_table,
            ),
        )
    except ObjectAlreadyExistsError as e:
        raise TableAlreadyExistsError(
            f"Table {namespace}.{table_name} already exists"
        ) from e

    if commit_transaction:
        transaction.seal()
    return new_table
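

# Illustrative usage sketch (not part of the module): register a table shell
# and then update its description. The update is applied against the latest
# revision of the table metafile; names are hypothetical.
def _example_create_then_describe_table() -> Table:
    create_table(namespace="analytics", table_name="events_raw")
    return update_table(
        namespace="analytics",
        table_name="events_raw",
        description="Raw event ingest table.",
    )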


def update_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Schema] = None,
    description: Optional[str] = None,
    properties: Optional[TableVersionProperties] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
    """
    Update a table version. Notably, updating an unreleased table version's
    lifecycle state to 'ACTIVE' telegraphs that it is ready for external
    consumption, and causes all calls made to consume/produce streams,
    partitions, or deltas from/to its parent table to automatically resolve to
    this table version by default (i.e., when the client does not explicitly
    specify a different table version). Raises an error if the given table
    version does not exist.

    Note that, to transition a table version from partitioned to unpartitioned,
    partition_scheme must be explicitly set to UNPARTITIONED_SCHEME. Similarly,
    to transition a table version from sorted to unsorted, sort_keys must be
    explicitly set to UNSORTED_SCHEME.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
    old_table_version = get_table_version(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not old_table_version:
        raise TableVersionNotFoundError(
            f"Table version `{table_version}` does not exist for "
            f"table `{namespace}.{table_name}`."
        )

    # If schema is not provided but partition_scheme or sort_keys are,
    # validate against the existing schema
    schema_to_validate = schema or old_table_version.schema
    _validate_schemes_against_schema(schema_to_validate, partition_scheme, sort_keys)

    new_table_version: TableVersion = Metafile.update_for(old_table_version)
    new_table_version.state = lifecycle_state or old_table_version.state

    # Caller is expected to do all necessary backwards compatibility schema checks
    update_schema = schema and not schema.equivalent_to(
        old_table_version.schema,
        True,
    )
    if update_schema and schema.id in [s.id for s in old_table_version.schemas]:
        raise TableValidationError(
            f"Schema ID `{schema.id}` already exists in "
            f"table version `{table_version}`."
        )
    new_table_version.schema = schema if update_schema else old_table_version.schema
    new_table_version.schemas = (
        old_table_version.schemas + [schema]
        if update_schema
        else old_table_version.schemas
    )
    new_table_version.description = (
        description if description is not None else old_table_version.description
    )
    new_table_version.properties = (
        properties if properties is not None else old_table_version.properties
    )
    new_supported_reader_types = new_table_version.read_table_property(
        TableProperty.SUPPORTED_READER_TYPES
    )
    if new_supported_reader_types:
        old_supported_reader_types = (
            old_table_version.read_table_property(TableProperty.SUPPORTED_READER_TYPES)
            or {}
        )
        added_supported_reader_types = set(new_supported_reader_types) - set(
            old_supported_reader_types
        )
        if added_supported_reader_types:
            raise TableValidationError(
                f"Cannot add new supported reader types: {added_supported_reader_types}"
            )
    new_table_version.partition_scheme = (
        partition_scheme or old_table_version.partition_scheme
    )
    # TODO(pdames): Check for backwards incompatible partition scheme changes.
    update_partition_scheme = partition_scheme and not partition_scheme.equivalent_to(
        old_table_version.partition_scheme,
        True,
    )
    if update_partition_scheme and partition_scheme.id in [
        ps.id for ps in old_table_version.partition_schemes
    ]:
        raise TableValidationError(
            f"Partition scheme ID `{partition_scheme.id}` already exists in "
            f"table version `{table_version}`."
        )
    new_table_version.partition_schemes = (
        old_table_version.partition_schemes + [partition_scheme]
        if update_partition_scheme
        else old_table_version.partition_schemes
    )
    # TODO(pdames): Check for backwards incompatible sort scheme changes.
    update_sort_scheme = sort_keys and not sort_keys.equivalent_to(
        old_table_version.sort_scheme,
        True,
    )
    if update_sort_scheme and sort_keys.id in [
        sk.id for sk in old_table_version.sort_schemes
    ]:
        raise TableValidationError(
            f"Sort scheme ID `{sort_keys.id}` already exists in "
            f"table version `{table_version}`."
        )
    new_table_version.sort_scheme = sort_keys or old_table_version.sort_scheme
    new_table_version.sort_schemes = (
        old_table_version.sort_schemes + [sort_keys]
        if update_sort_scheme
        else old_table_version.sort_schemes
    )
    old_table = get_table(
        namespace=namespace,
        table_name=table_name,
        transaction=transaction,
        *args,
        **kwargs,
    )
    new_table: Optional[Table] = None
    if (
        lifecycle_state == LifecycleState.ACTIVE
        and old_table_version.state != LifecycleState.ACTIVE
    ):
        _, old_version_number = (
            TableVersion.parse_table_version(
                old_table.latest_active_table_version,
            )
            if old_table.latest_active_table_version
            else (None, None)
        )
        _, new_version_number = TableVersion.parse_table_version(table_version)
        if old_version_number is None or old_version_number < new_version_number:
            # update the table's latest table version
            new_table = Metafile.update_for(old_table)
            new_table.latest_active_table_version = table_version
            transaction.step(
                TransactionOperation.of(
                    operation_type=TransactionOperationType.UPDATE,
                    dest_metafile=new_table,
                    src_metafile=old_table,
                ),
            )
    try:
        transaction.step(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=new_table_version,
                src_metafile=old_table_version,
            ),
        )
    except ObjectAlreadyExistsError as e:
        raise TableVersionAlreadyExistsError(
            f"Table version {namespace}.{table_name}.{table_version} already exists"
        ) from e

    # TODO(pdames): Push changes down to non-deltacat streams via sync module.
    #  Also copy sort scheme changes down to deltacat child stream?
    new_stream: Optional[Stream] = None
    if partition_scheme:
        old_stream = get_stream(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            transaction=transaction,
            *args,
            **kwargs,
        )
        new_stream = Metafile.update_for(old_stream)
        new_stream.partition_scheme = partition_scheme
        transaction.step(
            TransactionOperation.of(
                operation_type=TransactionOperationType.UPDATE,
                dest_metafile=new_stream,
                src_metafile=old_stream,
            ),
        )
    if commit_transaction:
        transaction.seal()
    return new_table, new_table_version, new_stream
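

# Illustrative usage sketch (not part of the module): promote a previously
# created table version to ACTIVE so that subsequent reads and writes against
# the parent table resolve to it by default. `version_id` is hypothetical.
def _example_activate_table_version(version_id: str):
    return update_table_version(
        namespace="analytics",
        table_name="events",
        table_version=version_id,
        lifecycle_state=LifecycleState.ACTIVE,
    )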


def stage_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Stream:
    """
    Stages a new delta stream for the given table version. Resolves to the
    latest active table version if no table version is given. Resolves to the
    DeltaCAT stream format if no stream format is given. If this stream
    will replace another stream with the same format and scheme, then it will
    have its previous stream ID set to the ID of the stream being replaced.
    Returns the staged stream. Raises an error if the table version does not
    exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            namespace=namespace,
            table_name=table_name,
            transaction=transaction,
            *args,
            **kwargs,
        )
    table_version_meta = get_table_version(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not table_version_meta:
        raise TableVersionNotFoundError(
            f"Table version not found: {namespace}.{table_name}.{table_version}."
        )
    locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id=str(uuid.uuid4()),
        stream_format=stream_format or StreamFormat.DELTACAT,
    )
    stream = Stream.of(
        locator=locator,
        partition_scheme=table_version_meta.partition_scheme,
        state=CommitState.STAGED,
        previous_stream_id=None,
        watermark=None,
    )
    prev_stream = get_stream(
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        stream_format=stream.stream_format,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if prev_stream:
        if prev_stream.stream_id == stream.stream_id:
            raise TableValidationError(
                f"Stream to stage has the same ID as existing stream: {prev_stream}."
            )
        stream.previous_stream_id = prev_stream.stream_id

    # Add the operation to the transaction
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=stream,
        ),
    )

    if commit_transaction:
        transaction.seal()
    return stream


def commit_stream(
    stream: Stream,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Stream:
    """
    Registers a staged delta stream with a target table version, replacing any
    previous stream registered for the same table version. Returns the
    committed stream.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    if not stream.stream_id:
        raise ValueError("Stream ID to commit must be set to a staged stream ID.")
    if not stream.table_version_locator:
        raise ValueError(
            "Stream to commit must have its table version locator "
            "set to the parent of its staged stream ID."
        )
    prev_staged_stream = get_stream_by_id(
        table_version_locator=stream.table_version_locator,
        stream_id=stream.stream_id,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not prev_staged_stream:
        raise StreamNotFoundError(
            f"Stream at table version {stream.table_version_locator} with ID "
            f"{stream.stream_id} not found."
        )
    if prev_staged_stream.state != CommitState.STAGED:
        raise TableValidationError(
            f"Expected to find a `{CommitState.STAGED}` stream at table version "
            f"{stream.table_version_locator} with ID {stream.stream_id}, "
            f"but found a `{prev_staged_stream.state}` stream."
        )
    stream: Stream = Metafile.update_for(prev_staged_stream)
    stream.state = CommitState.COMMITTED
    prev_committed_stream = get_stream(
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        stream_format=stream.stream_format,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if prev_committed_stream:
        # there's a previously committed stream, so update the transaction
        # type to overwrite the previously committed stream
        txn_op_type = TransactionOperationType.REPLACE
    else:
        txn_op_type = TransactionOperationType.UPDATE

    # the first transaction operation updates the staged stream commit state
    transaction.step(
        TransactionOperation.of(
            operation_type=txn_op_type,
            dest_metafile=stream,
            src_metafile=prev_staged_stream,
        ),
    )
    if prev_committed_stream:
        if prev_committed_stream.stream_id != stream.previous_stream_id:
            raise ConcurrentModificationError(
                f"Previous stream ID mismatch. Expected "
                f"{stream.previous_stream_id} but found "
                f"{prev_committed_stream.stream_id}."
            )
        if prev_committed_stream.stream_id == stream.stream_id:
            raise TableValidationError(
                f"Stream to commit has the same ID as existing stream: {prev_committed_stream}."
            )
        # add another transaction operation to replace the previously
        # committed stream with the staged stream
        transaction.step(
            TransactionOperation.of(
                operation_type=txn_op_type,
                dest_metafile=stream,
                src_metafile=prev_committed_stream,
            ),
        )
    if commit_transaction:
        transaction.seal()
    return stream
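

# Illustrative usage sketch (not part of the module): the two-phase stream
# swap. A stream is first staged (addressable only by its ID), then
# committed, atomically replacing the previously committed stream of the same
# format for the resolved table version.
def _example_replace_stream() -> Stream:
    staged = stage_stream(namespace="analytics", table_name="events")
    return commit_stream(staged)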


def delete_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> None:
    """
    Deletes the delta stream currently registered with the given table version.
    Resolves to the latest active table version if no table version is given.
    Resolves to the DeltaCAT stream format if no stream format is given.
    Raises an error if the stream does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            namespace=namespace,
            table_name=table_name,
            transaction=transaction,
            *args,
            **kwargs,
        )
    stream_to_delete = get_stream(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_format=stream_format,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not stream_to_delete:
        raise StreamNotFoundError(
            f"Stream to delete not found: {namespace}.{table_name}"
            f".{table_version}.{stream_format}."
        )
    else:
        stream_to_delete.state = CommitState.DEPRECATED

    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.DELETE,
            dest_metafile=stream_to_delete,
        ),
    )

    if commit_transaction:
        transaction.seal()


def delete_table(
    namespace: str,
    table_name: str,
    purge: bool = False,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> None:
    """
    Drops the given table from the catalog. If purge is True, also removes
    all data files associated with the table. Raises an error if the given
    table does not exist.
    """
    if purge:
        raise NotImplementedError("Purge flag is not currently supported.")
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    table: Optional[Table] = get_table(
        namespace=namespace,
        table_name=table_name,
        transaction=transaction,
        *args,
        **kwargs,
    )

    if not table:
        # TODO(pdames): Refactor this so that it doesn't initialize Ray
        raise TableNotFoundError(f"Table `{namespace}.{table_name}` does not exist.")

    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.DELETE,
            dest_metafile=table,
        ),
    )

    if commit_transaction:
        transaction.seal()


def delete_namespace(
    namespace: str,
    purge: bool = False,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> None:
    """
    Drops the given namespace from the catalog. If purge is True, also removes
    all data files associated with the namespace. Raises an error if the given
    namespace does not exist.
    """
    if purge:
        raise NotImplementedError("Purge flag is not currently supported.")
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    namespace_obj: Optional[Namespace] = get_namespace(
        namespace=namespace,
        transaction=transaction,
        *args,
        **kwargs,
    )

    if not namespace_obj:
        raise NamespaceNotFoundError(f"Namespace `{namespace}` does not exist.")

    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.DELETE,
            dest_metafile=namespace_obj,
        ),
    )

    if commit_transaction:
        transaction.seal()
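

# Illustrative usage sketch (not part of the module): one possible tear-down
# order for the hypothetical metadata created above. These deletes are
# metadata-only; physical data removal (`purge=True`) is not yet supported.
def _example_teardown():
    delete_stream(namespace="analytics", table_name="events")
    delete_table(namespace="analytics", table_name="events")
    delete_namespace(namespace="analytics")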


def get_stream_by_id(
    table_version_locator: TableVersionLocator,
    stream_id: str,
    *args,
    **kwargs,
) -> Optional[Stream]:
    """
    Gets the stream for the given table version locator and stream ID.
    Returns None if the stream does not exist. Raises an error if the given
    table version locator does not exist.
    """
    locator = StreamLocator.of(
        table_version_locator=table_version_locator,
        stream_id=stream_id,
        stream_format=None,
    )
    return _latest(
        metafile=Stream.of(locator=locator, partition_scheme=None),
        *args,
        **kwargs,
    )


def get_stream(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[Stream]:
    """
    Gets the most recently committed stream for the given table version.
    Resolves to the latest active table version if no table version is given.
    Resolves to the DeltaCAT stream format if no stream format is given.
    Returns None if the table version or stream format does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            namespace=namespace,
            table_name=table_name,
            fail_if_no_active_table_version=False,
            transaction=transaction,
            *args,
            **kwargs,
        )
    locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id=None,
        stream_format=stream_format,
    )
    stream = _latest(
        metafile=Stream.of(
            locator=locator,
            partition_scheme=None,
            state=CommitState.COMMITTED,
        ),
        transaction=transaction,
        *args,
        **kwargs,
    )
    if commit_transaction:
        transaction.seal()
    return stream


def stream_exists(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[bool]:
    """
    Returns True if the given Stream exists, False if not.
    Resolves to the latest active table version if no table version is given.
    Resolves to the DeltaCAT stream format if no stream format is given.
    Returns None if the table version or stream format does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
    if not table_version:
        table_version = _resolve_latest_active_table_version_id(
            namespace=namespace,
            table_name=table_name,
            fail_if_no_active_table_version=False,
            transaction=transaction,
            *args,
            **kwargs,
        )

    # Try with the provided table name first
    locator = StreamLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        stream_id=None,
        stream_format=stream_format,
    )
    exists = _exists(
        metafile=Stream.of(
            locator=locator,
            partition_scheme=None,
            state=CommitState.COMMITTED,
        ),
        transaction=transaction,
        *args,
        **kwargs,
    )
    if commit_transaction:
        transaction.seal()
    return exists
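

# Illustrative usage sketch (not part of the module): probe for a committed
# stream before fetching it. Both calls fall back to the latest active table
# version since no explicit table version is given.
def _example_fetch_committed_stream() -> Optional[Stream]:
    if stream_exists(namespace="analytics", table_name="events"):
        return get_stream(namespace="analytics", table_name="events")
    return None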


def stage_partition(
    stream: Stream,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Partition:
    """
    Stages a new partition for the given stream and partition values. Returns
    the staged partition. If this partition will replace another partition
    with the same partition values and scheme, then it will have its previous
    partition ID set to the ID of the partition being replaced. Partition
    values should not be specified for unpartitioned tables.

    The partition_values must represent the results of transforms in a
    partition spec specified in the stream.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    # TODO(pdames): Cache last retrieved metafile revisions in memory to
    #  resolve the potentially high cost of staging many partitions.
    table_version = get_table_version(
        namespace=stream.namespace,
        table_name=stream.table_name,
        table_version=stream.table_version,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not table_version:
        raise TableVersionNotFoundError(
            f"Table version not found: {stream.namespace}.{stream.table_name}."
            f"{stream.table_version}."
        )
    # Set partition_scheme_id to UNPARTITIONED_SCHEME_ID when partition_values
    # is None or empty
    if not partition_values:
        partition_scheme_id = UNPARTITIONED_SCHEME_ID
    # Use the stream's partition scheme ID if none is provided and
    # partition_values are specified
    elif partition_scheme_id is None:
        partition_scheme_id = stream.partition_scheme.id
    if not table_version.partition_schemes or partition_scheme_id not in [
        ps.id for ps in table_version.partition_schemes
    ]:
        raise TableValidationError(
            f"Invalid partition scheme ID `{partition_scheme_id}` (not found "
            f"in parent table version `{stream.namespace}.{stream.table_name}"
            f".{table_version.table_version}` partition scheme IDs)."
        )
    if stream.partition_scheme.id not in [
        ps.id for ps in table_version.partition_schemes
    ]:
        # this should never happen, but just in case
        raise TableValidationError(
            f"Invalid stream partition scheme ID `{stream.partition_scheme.id}`"
            f" (not found in parent table version "
            f"`{stream.namespace}.{stream.table_name}"
            f".{table_version.table_version}` partition scheme IDs)."
        )

    if partition_values:
        if partition_scheme_id == UNPARTITIONED_SCHEME_ID:
            raise TableValidationError(
                "Partition values cannot be specified for unpartitioned tables"
            )
        # Validate partition values against the partition scheme
        partition_scheme = next(
            ps for ps in table_version.partition_schemes if ps.id == partition_scheme_id
        )
        _validate_partition_values_against_scheme(
            partition_values=partition_values,
            partition_scheme=partition_scheme,
            schema=table_version.schema,
        )

    locator = PartitionLocator.of(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_id=str(uuid.uuid4()),
    )
    partition = Partition.of(
        locator=locator,
        content_types=table_version.content_types,
        state=CommitState.STAGED,
        previous_stream_position=None,
        previous_partition_id=None,
        stream_position=None,
        partition_scheme_id=partition_scheme_id,
    )
    prev_partition = get_partition(
        stream_locator=stream.locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        transaction=transaction,
        *args,
        **kwargs,
    )
    prev_partition_id = prev_partition.partition_id if prev_partition else None

    # TODO(pdames): Check all historic partitions for the same partition ID
    if prev_partition_id == partition.partition_id:
        raise TableValidationError(
            f"Partition to stage has the same ID as previous partition: {prev_partition_id}."
        )
    partition.previous_partition_id = prev_partition_id

    # Add the operation to the transaction
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=partition,
        ),
    )

    if commit_transaction:
        transaction.seal()
    return partition


def commit_partition(
    partition: Partition,
    previous_partition: Optional[Partition] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Partition:
    """
    Commits the staged partition to its associated table version stream,
    replacing any previous partition registered for the same stream and
    partition values. All values set on the input partition except compaction
    round completion info will be overwritten with the values stored in the
    staged partition.

    If a previous partition is given, then it will be replaced with its deltas
    prepended to the new partition being committed. Otherwise, the latest
    committed partition with the same keys and partition scheme ID will be
    retrieved.

    Returns the registered partition. If the partition's previous delta
    stream position is specified, then the commit will be rejected if it does
    not match the actual previous stream position of the partition being
    replaced. If the partition's previous partition ID is specified, then the
    commit will be rejected if it does not match the actual ID of the
    partition being replaced.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    if previous_partition:
        raise NotImplementedError(
            f"delta prepending from previous partition {previous_partition} "
            f"is not yet implemented"
        )
    if not partition.partition_id:
        raise ValueError("Partition ID to commit must be set to a staged partition ID.")
    if not partition.stream_locator:
        raise ValueError(
            "Partition to commit must have its stream locator "
            "set to the parent of its staged partition ID."
        )

    # Start a single multi-step transaction for all operations (both read and write)
    # Step 1: Get the staged partition using the transaction
    prev_staged_partition = get_partition_by_id(
        stream_locator=partition.stream_locator,
        partition_id=partition.partition_id,
        transaction=transaction,
        *args,
        **kwargs,
    )

    # Validate staged partition
    if not prev_staged_partition:
        raise PartitionNotFoundError(
            f"Partition at stream {partition.stream_locator} with ID "
            f"{partition.partition_id} not found."
        )
    if prev_staged_partition.state != CommitState.STAGED:
        raise TableValidationError(
            f"Expected to find a `{CommitState.STAGED}` partition at stream "
            f"{partition.stream_locator} with ID {partition.partition_id}, "
            f"but found a `{prev_staged_partition.state}` partition."
        )

    # Step 2: Check for an existing committed partition
    prev_committed_partition = None
    if partition.previous_partition_id is not None:
        prev_committed_partition = get_partition(
            stream_locator=partition.stream_locator,
            partition_values=partition.partition_values,
            partition_scheme_id=partition.partition_scheme_id,
            transaction=transaction,
            *args,
            **kwargs,
        )

    # Validate expected previous partition ID for race condition detection
    if prev_committed_partition:
        logger.info(
            f"Checking previous committed partition for conflicts: {prev_committed_partition}"
        )
        if prev_committed_partition.partition_id != partition.previous_partition_id:
            raise ConcurrentModificationError(
                f"Concurrent modification detected: Expected committed partition "
                f"{partition.previous_partition_id} but found "
                f"{prev_committed_partition.partition_id}."
            )

    if prev_committed_partition:
        # Update transaction type based on what we found
        txn_op_type = TransactionOperationType.REPLACE
        if prev_committed_partition.partition_id == partition.partition_id:
            raise TableValidationError(
                f"Partition to commit has the same ID as existing partition: "
                f"{prev_committed_partition}."
            )
    else:
        txn_op_type = TransactionOperationType.UPDATE

    # Prepare the committed partition based on the staged partition.
    # Compaction round completion info (if any) is not set on the staged partition,
    # so we need to save it from the input partition to commit.
    input_partition_rci = partition.compaction_round_completion_info
    partition: Partition = Metafile.update_for(prev_staged_partition)
    partition.state = CommitState.COMMITTED
    # Restore compaction round completion info (if any) from the input partition.
    if input_partition_rci is not None:
        partition.compaction_round_completion_info = input_partition_rci

    # Step 4: Add write operations to the same transaction
    # Always UPDATE the staged partition to the committed state
    transaction.step(
        TransactionOperation.of(
            operation_type=txn_op_type,
            dest_metafile=partition,
            src_metafile=prev_staged_partition,
        ),
    )

    # If there's a previously committed partition, we need to replace it too
    if prev_committed_partition:
        transaction.step(
            TransactionOperation.of(
                operation_type=txn_op_type,
                dest_metafile=partition,
                src_metafile=prev_committed_partition,
            ),
        )

    if commit_transaction:
        transaction.seal()

    return partition
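

# Illustrative usage sketch (not part of the module): the partition analog of
# the stream swap above. A partition is staged against a committed stream,
# then committed, replacing any previously committed partition with the same
# values and scheme. No partition values are passed, as for an unpartitioned
# table.
def _example_replace_unpartitioned_partition(stream: Stream) -> Partition:
    staged = stage_partition(stream)
    return commit_partition(staged)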
def delete_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> None:
    """
    Deletes the given partition from the specified stream. Partition
    values should not be specified for unpartitioned tables. Raises an error
    if the partition does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    partition_to_delete = get_partition(
        stream_locator=stream_locator,
        partition_values=partition_values,
        partition_scheme_id=partition_scheme_id,
        transaction=transaction,
        *args,
        **kwargs,
    )
    if not partition_to_delete:
        raise PartitionNotFoundError(
            f"Partition with values {partition_values} and scheme "
            f"{partition_scheme_id} not found in stream: {stream_locator}"
        )
    else:
        partition_to_delete.state = CommitState.DEPRECATED

    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.DELETE,
            dest_metafile=partition_to_delete,
        ),
    )

    if commit_transaction:
        transaction.seal()

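# --- Illustration only (not part of this diff): deleting a partition by its
# key values. `delete_partition` performs a soft delete: the partition's
# state transitions to DEPRECATED inside a single transaction. Assumes
# `stream` is a committed Stream returned by `get_stream(...)`; the
# partition values are hypothetical.
def _example_delete_partition(stream: Stream) -> None:
    delete_partition(
        stream_locator=stream.locator,
        partition_values=["2024-01-01"],
    )
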
def get_partition_by_id(
    stream_locator: StreamLocator,
    partition_id: str,
    *args,
    **kwargs,
) -> Optional[Partition]:
    """
    Gets the partition for the given stream locator and partition ID.
    Returns None if the partition does not exist. Raises an error if the
    given stream locator does not exist.
    """
    locator = PartitionLocator.of(
        stream_locator=stream_locator,
        partition_values=None,
        partition_id=partition_id,
    )
    return _latest(
        metafile=Partition.of(
            locator=locator,
            content_types=None,
        ),
        *args,
        **kwargs,
    )

def get_partition(
    stream_locator: StreamLocator,
    partition_values: Optional[PartitionValues] = None,
    partition_scheme_id: Optional[str] = None,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[Partition]:
    """
    Gets the most recently committed partition for the given stream locator and
    partition key values. Returns None if no partition has been committed for
    the given table version and/or partition key values. Partition values
    should not be specified for unpartitioned tables. Partition scheme ID
    resolves to the table version's current partition scheme by default.
    Raises an error if the given stream locator does not exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
    if not partition_scheme_id or not stream_locator.stream_id:
        # resolve the latest partition scheme from the current
        # revision of its `deltacat` stream
        stream = get_stream(
            namespace=stream_locator.namespace,
            table_name=stream_locator.table_name,
            table_version=stream_locator.table_version,
            transaction=transaction,
            *args,
            **kwargs,
        )
        if not stream:
            raise StreamNotFoundError(f"Stream {stream_locator} not found.")
        partition_scheme_id = stream.partition_scheme.id
        # ensure that we always use a fully qualified stream locator
        stream_locator = stream.locator
    locator = PartitionLocator.of(
        stream_locator=stream_locator,
        partition_values=partition_values,
        partition_id=None,
    )
    partition = _latest(
        metafile=Partition.of(
            locator=locator,
            content_types=None,
            state=CommitState.COMMITTED,
            partition_scheme_id=partition_scheme_id,
        ),
        transaction=transaction,
        *args,
        **kwargs,
    )
    if commit_transaction:
        transaction.seal()
    return partition

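# --- Illustration only (not part of this diff): fetching the latest
# committed partition. When no partition_scheme_id is given, `get_partition`
# resolves it from the stream's current revision, so the minimal call only
# needs the stream locator and (for partitioned tables) the key values.
# Assumes `stream` is a committed Stream returned by `get_stream(...)`.
def _example_get_partition(stream: Stream) -> Optional[Partition]:
    partition = get_partition(
        stream_locator=stream.locator,
        partition_values=["2024-01-01"],
    )
    # None means no partition has been committed for these values yet.
    return partition
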
def _write_table_slices(
    table: Union[LocalTable, LocalDataset, DistributedDataset],
    partition_id: str,
    max_records_per_entry: Optional[int],
    table_writer_fn: Callable,
    table_slicer_fn: Callable,
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    entry_type: Optional[EntryType] = EntryType.DATA,
    table_writer_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ManifestEntryList:
    catalog_properties = get_catalog_properties(**kwargs)
    manifest_entries = ManifestEntryList()
    # LocalDataset is a special case to upload iteratively
    tables = [t for t in table] if isinstance(table, list) else [table]
    filesystem = catalog_properties.filesystem
    data_dir_path = posixpath.join(
        catalog_properties.root,
        DATA_FILE_DIR_NAME,
        partition_id,
    )
    filesystem.create_dir(data_dir_path, recursive=True)
    for t in tables:
        manifest_entries.extend(
            write_sliced_table(
                t,
                data_dir_path,
                filesystem,
                max_records_per_entry,
                table_writer_fn,
                table_slicer_fn,
                table_writer_kwargs,
                content_type,
                entry_params,
                entry_type,
            )
        )
    return manifest_entries

def _write_table(
    partition_id: str,
    table: Union[LocalTable, LocalDataset, DistributedDataset],
    max_records_per_entry: Optional[int] = None,
    author: Optional[ManifestAuthor] = None,
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    entry_type: Optional[EntryType] = EntryType.DATA,
    write_table_slices_fn: Optional[Callable] = _write_table_slices,
    table_writer_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Manifest:
    """
    Writes the given table to 1 or more files and returns a
    Redshift manifest pointing to the uploaded files.
    """
    table_writer_fn = get_table_writer(table)
    table_slicer_fn = get_table_slicer(table)

    manifest_entries = write_table_slices_fn(
        table,
        partition_id,
        max_records_per_entry,
        table_writer_fn,
        table_slicer_fn,
        content_type,
        entry_params,
        entry_type,
        table_writer_kwargs,
        **kwargs,
    )
    manifest = Manifest.of(
        entries=manifest_entries,
        author=author,
        uuid=str(uuid.uuid4()),
        entry_type=entry_type,
        entry_params=entry_params,
    )
    return manifest

def stage_delta(
    data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
    partition: Partition,
    delta_type: DeltaType = DeltaType.UPSERT,
    max_records_per_entry: Optional[int] = None,
    author: Optional[ManifestAuthor] = None,
    properties: Optional[DeltaProperties] = None,
    table_writer_kwargs: Optional[Dict[str, Any]] = None,
    content_type: ContentType = ContentType.PARQUET,
    entry_params: Optional[EntryParams] = None,
    entry_type: Optional[EntryType] = EntryType.DATA,
    write_table_slices_fn: Optional[Callable] = _write_table_slices,
    schema: Optional[Schema] = None,
    sort_scheme_id: Optional[str] = None,
    *args,
    **kwargs,
) -> Delta:
    """
    Writes the given dataset to 1 or more files. Returns an unregistered
    delta whose manifest entries point to the uploaded files. Applies any
    schema consistency policies configured for the parent table version.
    """
    # TODO(pdames): Validate that equality delete entry types either have
    #  entry params specified, or are being added to a table with merge keys.
    if not partition.is_supported_content_type(content_type):
        raise TableValidationError(
            f"Content type {content_type} is not supported by "
            f"partition: {partition}"
        )
    if partition.state == CommitState.DEPRECATED:
        raise TableValidationError(
            f"Cannot stage delta to {partition.state} partition: {partition}",
        )
    previous_stream_position: Optional[int] = partition.stream_position

    # Handle the schema parameter and add it to table_writer_kwargs if available.
    table_writer_kwargs = table_writer_kwargs or {}

    # Extract schema_id from the schema if it's a DeltaCAT Schema.
    schema_id = None
    if isinstance(schema, Schema):
        schema_id = schema.id
        table_writer_kwargs["schema_id"] = schema_id
        # Add the PyArrow schema to table_writer_kwargs if not already present.
        if "schema" not in table_writer_kwargs:
            table_writer_kwargs["schema"] = schema.arrow
    elif schema is not None and "schema" not in table_writer_kwargs:
        # For PyArrow schemas or other types, add the schema directly.
        table_writer_kwargs["schema"] = schema

    # Add sort_scheme_id to table_writer_kwargs for manifest entry creation.
    if sort_scheme_id is not None:
        table_writer_kwargs["sort_scheme_id"] = sort_scheme_id

    manifest: Manifest = _write_table(
        partition.partition_id,
        data,
        max_records_per_entry,
        author,
        content_type,
        entry_params,
        entry_type,
        write_table_slices_fn,
        table_writer_kwargs,
        **kwargs,
    )
    staged_delta: Delta = Delta.of(
        locator=DeltaLocator.of(partition.locator, None),
        delta_type=delta_type,
        meta=manifest.meta,
        properties=properties,
        manifest=manifest,
        previous_stream_position=previous_stream_position,
    )
    return staged_delta

def commit_delta(
    delta: Delta,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Delta:
    """
    Registers a new delta with its associated target table version and
    partition. Returns the registered delta. If the delta's previous stream
    position is specified, then the commit will be rejected if it does not match
    the target partition's actual previous stream position. If the delta's
    stream position is specified, it must be greater than the latest stream
    position in the target partition.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)

    delta: Delta = Metafile.update_for(delta)
    delta_type: Optional[DeltaType] = delta.type
    resolved_delta_type = delta_type if delta_type is not None else DeltaType.UPSERT
    delta.type = resolved_delta_type
    delta.properties = kwargs.get("properties") or delta.properties

    if delta.partition_id:
        parent_partition = get_partition_by_id(
            stream_locator=delta.stream_locator,
            partition_id=delta.partition_id,
            transaction=transaction,
            *args,
            **kwargs,
        )
    else:
        parent_partition = get_partition(
            stream_locator=delta.stream_locator,
            partition_values=delta.partition_values,
            transaction=transaction,
            *args,
            **kwargs,
        )
    if not parent_partition:
        raise PartitionNotFoundError(f"Partition not found: {delta.locator}")
    # ensure that we always use a fully qualified partition locator
    delta.locator.partition_locator = parent_partition.locator
    # resolve the delta's stream position
    delta.previous_stream_position = parent_partition.stream_position or 0
    if delta.stream_position is not None:
        if delta.stream_position <= delta.previous_stream_position:
            # manually specified delta stream positions must be greater than
            # the previous stream position
            raise TableValidationError(
                f"Delta stream position {delta.stream_position} must be "
                f"greater than previous stream position "
                f"{delta.previous_stream_position}"
            )
    else:
        delta.locator.stream_position = delta.previous_stream_position + 1

    # update the parent partition's stream position
    new_parent_partition: Partition = Metafile.update_for(parent_partition)
    new_parent_partition.stream_position = delta.locator.stream_position

    # Add operations to the transaction:
    # the 1st operation creates the delta
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.CREATE,
            dest_metafile=delta,
        ),
    )
    # the 2nd operation alters the stream position of the partition
    transaction.step(
        TransactionOperation.of(
            operation_type=TransactionOperationType.UPDATE,
            dest_metafile=new_parent_partition,
            src_metafile=parent_partition,
        ),
    )

    if commit_transaction:
        transaction.seal()
    return delta

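# --- Illustration only (not part of this diff): the end-to-end delta write
# path. `stage_delta` writes the data files and returns an unregistered
# delta; `commit_delta` then assigns the next stream position and advances
# the parent partition in one transaction. Assumes `partition` is a
# committed Partition returned by `get_partition(...)`; the table contents
# are hypothetical.
def _example_stage_and_commit_delta(partition: Partition) -> Delta:
    import pyarrow as pa

    # Build a small in-memory PyArrow table to write.
    data = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})
    staged = stage_delta(data=data, partition=partition)
    return commit_delta(delta=staged)
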
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """
    Gets table namespace metadata for the specified table namespace. Returns
    None if the given namespace does not exist.
    """
    return _latest(
        metafile=Namespace.of(NamespaceLocator.of(namespace)),
        *args,
        **kwargs,
    )

def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """
    Returns True if the given table namespace exists, False if not.
    """
    return _exists(
        metafile=Namespace.of(NamespaceLocator.of(namespace)),
        *args,
        **kwargs,
    )

def get_table(
    namespace: str,
    table_name: str,
    *args,
    **kwargs,
) -> Optional[Table]:
    """
    Gets table metadata for the specified table. Returns None if the given
    table does not exist.
    """
    locator = TableLocator.at(namespace=namespace, table_name=table_name)
    return _latest(
        metafile=Table.of(locator=locator),
        *args,
        **kwargs,
    )

def table_exists(
    namespace: str,
    table_name: str,
    *args,
    **kwargs,
) -> bool:
    """
    Returns True if the given table exists, False if not.
    """
    locator = TableLocator.at(namespace=namespace, table_name=table_name)
    return _exists(
        metafile=Table.of(locator=locator),
        *args,
        **kwargs,
    )

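# --- Illustration only (not part of this diff): cheap existence checks
# before fetching full metadata. Both helpers are read-only wrappers over
# `_exists` and accept the same catalog **kwargs as the getters above; the
# namespace and table names below are hypothetical.
def _example_lookup_table() -> Optional[Table]:
    if namespace_exists("analytics") and table_exists("analytics", "events"):
        return get_table("analytics", "events")
    return None
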
def get_table_version(
    namespace: str,
    table_name: str,
    table_version: str,
    *args,
    **kwargs,
) -> Optional[TableVersion]:
    """
    Gets table version metadata for the specified table version. Returns None
    if the given table version does not exist.
    """
    locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
    )
    table_version = TableVersion.of(
        locator=locator,
        schema=None,
    )
    return _latest(
        metafile=table_version,
        *args,
        **kwargs,
    )

def get_latest_table_version(
    namespace: str,
    table_name: str,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[TableVersion]:
    """
    Gets table version metadata for the latest version of the specified table.
    Returns None if no table version exists for the given table. Raises
    an error if the given table doesn't exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
    table_version_id = _resolve_latest_table_version_id(
        namespace=namespace,
        table_name=table_name,
        fail_if_no_active_table_version=False,
        transaction=transaction,
        *args,
        **kwargs,
    )

    table_version = (
        get_table_version(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version_id,
            transaction=transaction,
            *args,
            **kwargs,
        )
        if table_version_id
        else None
    )
    if commit_transaction:
        transaction.seal()
    return table_version

def get_latest_active_table_version(
    namespace: str,
    table_name: str,
    *args,
    transaction: Optional[Transaction] = None,
    **kwargs,
) -> Optional[TableVersion]:
    """
    Gets table version metadata for the latest active version of the specified
    table. Returns None if no active table version exists for the given table.
    Raises an error if the given table doesn't exist.
    """
    transaction, commit_transaction = setup_transaction(transaction, **kwargs)
    table_version_id = _resolve_latest_active_table_version_id(
        namespace=namespace,
        table_name=table_name,
        fail_if_no_active_table_version=False,
        transaction=transaction,
        *args,
        **kwargs,
    )
    table_version = (
        get_table_version(
            namespace=namespace,
            table_name=table_name,
            table_version=table_version_id,
            transaction=transaction,
            *args,
            **kwargs,
        )
        if table_version_id
        else None
    )
    if commit_transaction:
        transaction.seal()
    return table_version

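# --- Illustration only (not part of this diff): the two resolvers above
# differ only in which versions they consider. `get_latest_table_version`
# returns the newest version regardless of state, while
# `get_latest_active_table_version` restricts the search to active versions;
# both return None instead of raising when nothing matches. The names below
# are hypothetical.
def _example_resolve_table_version() -> Optional[TableVersion]:
    active = get_latest_active_table_version("analytics", "events")
    # Fall back to the newest version if none have been activated yet.
    return active or get_latest_table_version("analytics", "events")
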
def get_table_version_column_names(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[List[str]]:
    """
    Gets a list of column names for the specified table version, or for the
    latest active table version if none is specified. The index of each
    column name returned represents its ordinal position in delimited text
    files or other row-oriented content types appended to the table.
    Returns None for schemaless tables. Raises an error if the table version
    does not exist.
    """
    schema = get_table_version_schema(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        *args,
        **kwargs,
    )
    return schema.arrow.names if schema else None

def get_table_version_schema(
    namespace: str,
    table_name: str,
    table_version: Optional[str] = None,
    *args,
    **kwargs,
) -> Optional[Schema]:
    """
    Gets the schema for the specified table version, or for the latest active
    table version if none is specified. Returns None if the table version is
    schemaless. Raises an error if the table version does not exist.
    """
    table_version_meta = (
        get_table_version(
            *args,
            namespace=namespace,
            table_name=table_name,
            table_version=table_version,
            **kwargs,
        )
        if table_version
        else get_latest_active_table_version(
            *args,
            namespace=namespace,
            table_name=table_name,
            **kwargs,
        )
    )
    return table_version_meta.schema

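# --- Illustration only (not part of this diff): schema introspection.
# Column names are derived from the DeltaCAT schema's underlying PyArrow
# schema, so both calls below agree for schemaful tables and return None
# for schemaless ones. The names used are hypothetical.
def _example_inspect_schema() -> Optional[List[str]]:
    schema = get_table_version_schema("analytics", "events")
    names = get_table_version_column_names("analytics", "events")
    assert names == (schema.arrow.names if schema else None)
    return names
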
def table_version_exists(
    namespace: str,
    table_name: str,
    table_version: str,
    *args,
    **kwargs,
) -> bool:
    """
    Returns True if the given table version exists, False if not.
    """
    locator = TableVersionLocator.at(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
    )
    table_version = TableVersion.of(
        locator=locator,
        schema=None,
    )
    return _exists(
        *args,
        metafile=table_version,
        **kwargs,
    )

def can_categorize(e: BaseException, *args, **kwargs) -> bool:
    """
    Returns True if the input error originated from the storage
    implementation layer and can be categorized under an existing
    DeltaCatError. The `categorize_errors` decorator uses this to determine
    whether an unknown error from the storage implementation can be
    categorized prior to casting it to the equivalent DeltaCatError via
    `raise_categorized_error`.
    """
    # DeltaCAT native storage can only categorize DeltaCatError
    # (i.e., this is effectively a no-op for native storage).
    return isinstance(e, DeltaCatError)

def raise_categorized_error(e: BaseException, *args, **kwargs):
    """
    Casts a categorizable error that originated from the storage
    implementation layer to its equivalent DeltaCatError for uniform
    handling (e.g., determining whether an error is retryable or not) via
    the `categorize_errors` decorator. Raises an UnclassifiedDeltaCatError
    from the input exception if the error cannot be categorized.
    """
    # DeltaCAT native storage can only categorize DeltaCatError
    # (i.e., this is effectively a no-op for native storage).
    logger.info(f"Categorizing exception: {e}")
    categorized = None
    if isinstance(categorized, DeltaCatError):
        raise categorized from e

    logger.warning(f"Could not classify {type(e).__name__}: {e}")
    raise UnclassifiedDeltaCatError(
        f"Failed to classify error {type(e).__name__}: {e}"
    ) from e
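# --- Illustration only (not part of this diff): how the two hooks above
# compose inside a `categorize_errors`-style decorator. `can_categorize`
# probes the error, and `raise_categorized_error` re-raises it so callers
# only ever see DeltaCatError subclasses (or UnclassifiedDeltaCatError for
# anything that cannot be classified). `storage_operation` is hypothetical.
def _example_categorize(storage_operation: Callable) -> None:
    try:
        storage_operation()
    except BaseException as e:
        if can_categorize(e):
            raise_categorized_error(e)
        raise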