deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/storage/interface.py
CHANGED
@@ -1,38 +1,42 @@
|
|
1
|
-
from typing import Any, Callable, Dict, List, Optional,
|
2
|
-
|
3
|
-
import pyarrow as pa
|
1
|
+
from typing import Any, Callable, Dict, List, Optional, Union, Tuple
|
4
2
|
|
5
3
|
from deltacat.storage import (
|
6
|
-
|
4
|
+
EntryParams,
|
5
|
+
EntryType,
|
7
6
|
Delta,
|
8
7
|
DeltaLocator,
|
8
|
+
DeltaProperties,
|
9
9
|
DeltaType,
|
10
10
|
DistributedDataset,
|
11
11
|
LifecycleState,
|
12
12
|
ListResult,
|
13
13
|
LocalDataset,
|
14
14
|
LocalTable,
|
15
|
-
Manifest,
|
16
15
|
ManifestAuthor,
|
17
16
|
Namespace,
|
17
|
+
NamespaceProperties,
|
18
18
|
Partition,
|
19
|
-
|
19
|
+
PartitionLocator,
|
20
|
+
PartitionScheme,
|
21
|
+
PartitionValues,
|
22
|
+
Schema,
|
23
|
+
SortScheme,
|
20
24
|
Stream,
|
25
|
+
StreamFormat,
|
21
26
|
StreamLocator,
|
22
27
|
Table,
|
28
|
+
TableProperties,
|
23
29
|
TableVersion,
|
24
|
-
|
25
|
-
|
26
|
-
PartitionFilter,
|
27
|
-
PartitionValues,
|
28
|
-
DeltaPartitionSpec,
|
29
|
-
StreamPartitionSpec,
|
30
|
+
TableVersionLocator,
|
31
|
+
TableVersionProperties,
|
30
32
|
)
|
33
|
+
from deltacat.storage.model.manifest import Manifest
|
34
|
+
from deltacat.storage.model.partition import UNKNOWN_PARTITION_ID
|
31
35
|
from deltacat.types.media import (
|
32
36
|
ContentType,
|
33
|
-
StorageType,
|
34
|
-
TableType,
|
35
37
|
DistributedDatasetType,
|
38
|
+
StorageType,
|
39
|
+
DatasetType,
|
36
40
|
)
|
37
41
|
from deltacat.utils.common import ReadKwargsProvider
|
38
42
|
|
@@ -64,12 +68,26 @@ def list_table_versions(
|
|
64
68
|
raise NotImplementedError("list_table_versions not implemented")
|
65
69
|
|
66
70
|
|
71
|
+
def list_streams(
|
72
|
+
namespace: str,
|
73
|
+
table_name: str,
|
74
|
+
table_version: str,
|
75
|
+
*args,
|
76
|
+
**kwargs,
|
77
|
+
) -> ListResult[Stream]:
|
78
|
+
"""
|
79
|
+
Lists a page of streams for the given table version.
|
80
|
+
Raises an error if the table version does not exist.
|
81
|
+
"""
|
82
|
+
raise NotImplementedError("list_streams not implemented")
|
83
|
+
|
84
|
+
|
67
85
|
def list_partitions(
|
68
86
|
namespace: str,
|
69
87
|
table_name: str,
|
70
88
|
table_version: Optional[str] = None,
|
71
89
|
*args,
|
72
|
-
**kwargs
|
90
|
+
**kwargs,
|
73
91
|
) -> ListResult[Partition]:
|
74
92
|
"""
|
75
93
|
Lists a page of partitions for the given table version. Partitions are
|
@@ -96,9 +114,9 @@ def list_deltas(
|
|
96
114
|
last_stream_position: Optional[int] = None,
|
97
115
|
ascending_order: Optional[bool] = None,
|
98
116
|
include_manifest: bool = False,
|
99
|
-
|
117
|
+
partition_scheme_id: Optional[str] = None,
|
100
118
|
*args,
|
101
|
-
**kwargs
|
119
|
+
**kwargs,
|
102
120
|
) -> ListResult[Delta]:
|
103
121
|
"""
|
104
122
|
Lists a page of deltas for the given table version and committed partition.
|
@@ -106,15 +124,13 @@ def list_deltas(
|
|
106
124
|
limited to inclusive first and last stream positions. Deltas are returned by
|
107
125
|
descending stream position by default. Table version resolves to the latest
|
108
126
|
active table version if not specified. Partition values should not be
|
109
|
-
specified for unpartitioned tables.
|
110
|
-
version
|
127
|
+
specified for unpartitioned tables. Partition scheme ID resolves to the
|
128
|
+
table version's current partition scheme by default. Raises an error if the
|
129
|
+
given table version or partition does not exist.
|
111
130
|
|
112
131
|
To conserve memory, the deltas returned do not include manifests by
|
113
132
|
default. The manifests can either be optionally retrieved as part of this
|
114
133
|
call or lazily loaded via subsequent calls to `get_delta_manifest`.
|
115
|
-
|
116
|
-
Note: partition_values is deprecated and will be removed in future releases.
|
117
|
-
Use partition_filter instead.
|
118
134
|
"""
|
119
135
|
raise NotImplementedError("list_deltas not implemented")
|
120
136
|
|
@@ -126,7 +142,7 @@ def list_partition_deltas(
|
|
126
142
|
ascending_order: bool = False,
|
127
143
|
include_manifest: bool = False,
|
128
144
|
*args,
|
129
|
-
**kwargs
|
145
|
+
**kwargs,
|
130
146
|
) -> ListResult[Delta]:
|
131
147
|
"""
|
132
148
|
Lists a page of deltas committed to the given partition.
|
@@ -145,22 +161,21 @@ def get_delta(
|
|
145
161
|
partition_values: Optional[PartitionValues] = None,
|
146
162
|
table_version: Optional[str] = None,
|
147
163
|
include_manifest: bool = False,
|
148
|
-
|
164
|
+
partition_scheme_id: Optional[str] = None,
|
149
165
|
*args,
|
150
|
-
**kwargs
|
166
|
+
**kwargs,
|
151
167
|
) -> Optional[Delta]:
|
152
168
|
"""
|
153
169
|
Gets the delta for the given table version, partition, and stream position.
|
154
170
|
Table version resolves to the latest active table version if not specified.
|
155
|
-
Partition values should not be specified for unpartitioned tables.
|
156
|
-
|
171
|
+
Partition values should not be specified for unpartitioned tables. Partition
|
172
|
+
scheme ID resolves to the table version's current partition scheme by
|
173
|
+
default. Raises an error if the given table version or partition does not
|
174
|
+
exist.
|
157
175
|
|
158
176
|
To conserve memory, the delta returned does not include a manifest by
|
159
177
|
default. The manifest can either be optionally retrieved as part of this
|
160
178
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
161
|
-
|
162
|
-
Note: partition_values is deprecated and will be removed in future releases.
|
163
|
-
Use partition_filter instead.
|
164
179
|
"""
|
165
180
|
raise NotImplementedError("get_delta not implemented")
|
166
181
|
|
@@ -171,50 +186,43 @@ def get_latest_delta(
|
|
171
186
|
partition_values: Optional[PartitionValues] = None,
|
172
187
|
table_version: Optional[str] = None,
|
173
188
|
include_manifest: bool = False,
|
174
|
-
|
189
|
+
partition_scheme_id: Optional[str] = None,
|
175
190
|
*args,
|
176
|
-
**kwargs
|
191
|
+
**kwargs,
|
177
192
|
) -> Optional[Delta]:
|
178
193
|
"""
|
179
194
|
Gets the latest delta (i.e. the delta with the greatest stream position) for
|
180
195
|
the given table version and partition. Table version resolves to the latest
|
181
196
|
active table version if not specified. Partition values should not be
|
182
|
-
specified for unpartitioned tables.
|
183
|
-
version
|
197
|
+
specified for unpartitioned tables. Partition scheme ID resolves to the
|
198
|
+
table version's current partition scheme by default. Raises an error if the
|
199
|
+
given table version or partition does not exist.
|
184
200
|
|
185
201
|
To conserve memory, the delta returned does not include a manifest by
|
186
202
|
default. The manifest can either be optionally retrieved as part of this
|
187
203
|
call or lazily loaded via a subsequent call to `get_delta_manifest`.
|
188
|
-
|
189
|
-
Note: partition_values is deprecated and will be removed in future releases.
|
190
|
-
Use partition_filter instead.
|
191
204
|
"""
|
192
205
|
raise NotImplementedError("get_latest_delta not implemented")
|
193
206
|
|
194
207
|
|
195
208
|
def download_delta(
|
196
209
|
delta_like: Union[Delta, DeltaLocator],
|
197
|
-
table_type:
|
210
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
198
211
|
storage_type: StorageType = StorageType.DISTRIBUTED,
|
199
212
|
max_parallelism: Optional[int] = None,
|
200
213
|
columns: Optional[List[str]] = None,
|
201
214
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
202
215
|
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
203
216
|
distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
|
204
|
-
partition_filter: Optional[PartitionFilter] = None,
|
205
217
|
*args,
|
206
|
-
**kwargs
|
218
|
+
**kwargs,
|
207
219
|
) -> Union[LocalDataset, DistributedDataset]: # type: ignore
|
208
220
|
"""
|
209
|
-
|
221
|
+
Reads the given delta or delta locator into either a list of
|
210
222
|
tables resident in the local node's memory, or into a dataset distributed
|
211
223
|
across this Ray cluster's object store memory. Ordered table N of a local
|
212
224
|
table list, or ordered block N of a distributed dataset, always contain
|
213
225
|
the contents of ordered delta manifest entry N.
|
214
|
-
|
215
|
-
partition_filter is an optional parameter which determines which files to
|
216
|
-
download from the delta manifest. A delta manifest contains all the data files
|
217
|
-
for a given delta.
|
218
226
|
"""
|
219
227
|
raise NotImplementedError("download_delta not implemented")
|
220
228
|
|
@@ -222,19 +230,19 @@ def download_delta(
|
|
222
230
|
def download_delta_manifest_entry(
|
223
231
|
delta_like: Union[Delta, DeltaLocator],
|
224
232
|
entry_index: int,
|
225
|
-
table_type:
|
233
|
+
table_type: DatasetType = DatasetType.PYARROW,
|
226
234
|
columns: Optional[List[str]] = None,
|
227
235
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
228
236
|
*args,
|
229
|
-
**kwargs
|
237
|
+
**kwargs,
|
230
238
|
) -> LocalTable:
|
231
239
|
"""
|
232
|
-
|
240
|
+
Reads a single manifest entry into the specified table type for the
|
233
241
|
given delta or delta locator. If a delta is provided with a non-empty
|
234
|
-
manifest, then the entry is
|
235
|
-
manifest is first retrieved then the given entry index
|
242
|
+
manifest, then the entry is read from this manifest. Otherwise, the
|
243
|
+
manifest is first retrieved then the given entry index read.
|
236
244
|
|
237
|
-
NOTE: The entry will be
|
245
|
+
NOTE: The entry will be read in the current node's memory.
|
238
246
|
"""
|
239
247
|
raise NotImplementedError("download_delta_manifest_entry not implemented")
|
240
248
|
|
@@ -244,17 +252,21 @@ def get_delta_manifest(
|
|
244
252
|
) -> Manifest:
|
245
253
|
"""
|
246
254
|
Get the manifest associated with the given delta or delta locator. This
|
247
|
-
always retrieves the authoritative
|
248
|
-
never the local manifest defined for any input delta.
|
255
|
+
always retrieves the authoritative durable copy of the delta manifest, and
|
256
|
+
never the local manifest defined for any input delta. Raises an error if
|
257
|
+
the delta can't be found, or if it doesn't contain a manifest.
|
249
258
|
"""
|
250
259
|
raise NotImplementedError("get_delta_manifest not implemented")
|
251
260
|
|
252
261
|
|
253
262
|
def create_namespace(
|
254
|
-
namespace: str,
|
263
|
+
namespace: str,
|
264
|
+
properties: Optional[NamespaceProperties] = None,
|
265
|
+
*args,
|
266
|
+
**kwargs,
|
255
267
|
) -> Namespace:
|
256
268
|
"""
|
257
|
-
Creates a table namespace with the given name and
|
269
|
+
Creates a table namespace with the given name and properties. Returns
|
258
270
|
the created namespace.
|
259
271
|
"""
|
260
272
|
raise NotImplementedError("create_namespace not implemented")
|
@@ -262,13 +274,13 @@ def create_namespace(
|
|
262
274
|
|
263
275
|
def update_namespace(
|
264
276
|
namespace: str,
|
265
|
-
|
277
|
+
properties: Optional[NamespaceProperties] = None,
|
266
278
|
new_namespace: Optional[str] = None,
|
267
279
|
*args,
|
268
|
-
**kwargs
|
280
|
+
**kwargs,
|
269
281
|
) -> None:
|
270
282
|
"""
|
271
|
-
Updates a table namespace's name and/or
|
283
|
+
Updates a table namespace's name and/or properties. Raises an error if the
|
272
284
|
given namespace does not exist.
|
273
285
|
"""
|
274
286
|
raise NotImplementedError("update_namespace not implemented")
|
@@ -278,71 +290,60 @@ def create_table_version(
|
|
278
290
|
namespace: str,
|
279
291
|
table_name: str,
|
280
292
|
table_version: Optional[str] = None,
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
sort_keys: Optional[List[SortKey]] = None,
|
293
|
+
lifecycle_state: Optional[LifecycleState] = LifecycleState.CREATED,
|
294
|
+
schema: Optional[Schema] = None,
|
295
|
+
partition_scheme: Optional[PartitionScheme] = None,
|
296
|
+
sort_keys: Optional[SortScheme] = None,
|
286
297
|
table_version_description: Optional[str] = None,
|
287
|
-
table_version_properties: Optional[
|
288
|
-
table_permissions: Optional[Dict[str, Any]] = None,
|
298
|
+
table_version_properties: Optional[TableVersionProperties] = None,
|
289
299
|
table_description: Optional[str] = None,
|
290
|
-
table_properties: Optional[
|
300
|
+
table_properties: Optional[TableProperties] = None,
|
291
301
|
supported_content_types: Optional[List[ContentType]] = None,
|
292
|
-
partition_spec: Optional[StreamPartitionSpec] = None,
|
293
302
|
*args,
|
294
|
-
**kwargs
|
295
|
-
) -> Stream:
|
303
|
+
**kwargs,
|
304
|
+
) -> Tuple[Table, TableVersion, Stream]:
|
296
305
|
"""
|
297
|
-
Create a table version with
|
298
|
-
stream. Table versions may be schemaless and unpartitioned
|
299
|
-
|
300
|
-
|
301
|
-
used with schemaless tables. This can be useful for creating logical shards
|
302
|
-
of a delta stream where partition keys are known but not projected onto each
|
303
|
-
row of the table (e.g. all rows of a customer orders table are known to
|
304
|
-
correspond to a given order day, even if this column doesn't exist in the
|
305
|
-
table). Primary and sort keys must exist within the table's schema.
|
306
|
-
Permissions specified at the table level override any conflicting
|
307
|
-
permissions specified at the table namespace level. Returns the stream
|
308
|
-
for the created table version. Raises an error if the given namespace does
|
309
|
-
not exist.
|
310
|
-
|
311
|
-
Schemas are optional for DeltaCAT tables and can be used to inform the data
|
312
|
-
consistency checks run for each field. If a schema is present, it can be
|
313
|
-
used to enforce the following column-level data consistency policies at
|
314
|
-
table load time:
|
306
|
+
Create a table version with the given or CREATED lifecycle state and an empty delta
|
307
|
+
stream. Table versions may be schemaless and unpartitioned to improve write
|
308
|
+
performance, or have their writes governed by a schema and partition scheme
|
309
|
+
to improve data consistency and read performance.
|
315
310
|
|
316
|
-
|
317
|
-
|
318
|
-
column names to coerce/validate.
|
311
|
+
Returns a tuple containing the created/updated table, table version, and
|
312
|
+
stream (respectively).
|
319
313
|
|
320
|
-
|
321
|
-
|
314
|
+
Raises an error if the given namespace does not exist.
|
315
|
+
"""
|
316
|
+
raise NotImplementedError("create_table_version not implemented")
|
322
317
|
|
323
|
-
Validate: Raise an error for any fields that don't fit the schema. An
|
324
|
-
explicit subset of column names to validate may optionally be specified.
|
325
318
|
|
326
|
-
|
319
|
+
def create_table(
|
320
|
+
namespace: str,
|
321
|
+
table_name: str,
|
322
|
+
description: Optional[str] = None,
|
323
|
+
properties: Optional[TableProperties] = None,
|
324
|
+
*args,
|
325
|
+
**kwargs,
|
326
|
+
) -> Table:
|
327
327
|
"""
|
328
|
-
|
328
|
+
Create a new table. Raises an error if the given table already exists.
|
329
|
+
"""
|
330
|
+
raise NotImplementedError("create_table not implemented")
|
329
331
|
|
330
332
|
|
331
333
|
def update_table(
|
332
334
|
namespace: str,
|
333
335
|
table_name: str,
|
334
|
-
permissions: Optional[Dict[str, Any]] = None,
|
335
336
|
description: Optional[str] = None,
|
336
|
-
properties: Optional[
|
337
|
+
properties: Optional[TableProperties] = None,
|
337
338
|
new_table_name: Optional[str] = None,
|
338
339
|
*args,
|
339
|
-
**kwargs
|
340
|
-
) ->
|
340
|
+
**kwargs,
|
341
|
+
) -> Table:
|
341
342
|
"""
|
342
343
|
Update table metadata describing the table versions it contains. By default,
|
343
|
-
a table's properties are empty, and its description
|
344
|
-
|
345
|
-
|
344
|
+
a table's properties are empty, and its description is equal to that given
|
345
|
+
when its first table version was created. Raises an error if the given
|
346
|
+
table does not exist.
|
346
347
|
"""
|
347
348
|
raise NotImplementedError("update_table not implemented")
|
348
349
|
|
@@ -352,13 +353,15 @@ def update_table_version(
|
|
352
353
|
table_name: str,
|
353
354
|
table_version: str,
|
354
355
|
lifecycle_state: Optional[LifecycleState] = None,
|
355
|
-
schema: Optional[
|
356
|
-
schema_consistency: Optional[Dict[str, SchemaConsistencyType]] = None,
|
356
|
+
schema: Optional[Schema] = None,
|
357
357
|
description: Optional[str] = None,
|
358
|
-
properties: Optional[
|
358
|
+
properties: Optional[TableVersionProperties] = None,
|
359
|
+
partition_scheme: Optional[PartitionScheme] = None,
|
360
|
+
# TODO(pdames): rename to `sort_scheme`
|
361
|
+
sort_keys: Optional[SortScheme] = None,
|
359
362
|
*args,
|
360
|
-
**kwargs
|
361
|
-
) ->
|
363
|
+
**kwargs,
|
364
|
+
) -> Tuple[Optional[Table], TableVersion, Optional[Stream]]:
|
362
365
|
"""
|
363
366
|
Update a table version. Notably, updating an unreleased table version's
|
364
367
|
lifecycle state to 'active' telegraphs that it is ready for external
|
@@ -375,18 +378,27 @@ def stage_stream(
|
|
375
378
|
namespace: str,
|
376
379
|
table_name: str,
|
377
380
|
table_version: Optional[str] = None,
|
381
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
378
382
|
*args,
|
379
|
-
**kwargs
|
383
|
+
**kwargs,
|
380
384
|
) -> Stream:
|
381
385
|
"""
|
382
386
|
Stages a new delta stream for the given table version. Resolves to the
|
383
|
-
latest active table version if no table version is given.
|
384
|
-
|
387
|
+
latest active table version if no table version is given. Resolves to the
|
388
|
+
DeltaCAT stream format if no stream format is given. If this stream
|
389
|
+
will replace another stream with the same format and scheme, then it will
|
390
|
+
have its previous stream ID set to the ID of the stream being replaced.
|
391
|
+
Returns the staged stream. Raises an error if the table version does not
|
392
|
+
exist.
|
385
393
|
"""
|
386
394
|
raise NotImplementedError("stage_stream not implemented")
|
387
395
|
|
388
396
|
|
389
|
-
def commit_stream(
|
397
|
+
def commit_stream(
|
398
|
+
stream: Stream,
|
399
|
+
*args,
|
400
|
+
**kwargs,
|
401
|
+
) -> Stream:
|
390
402
|
"""
|
391
403
|
Registers a delta stream with a target table version, replacing any
|
392
404
|
previous stream registered for the same table version. Returns the
|
@@ -399,43 +411,111 @@ def delete_stream(
|
|
399
411
|
namespace: str,
|
400
412
|
table_name: str,
|
401
413
|
table_version: Optional[str] = None,
|
414
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
402
415
|
*args,
|
403
|
-
**kwargs
|
416
|
+
**kwargs,
|
404
417
|
) -> None:
|
405
418
|
"""
|
406
419
|
Deletes the delta stream currently registered with the given table version.
|
407
420
|
Resolves to the latest active table version if no table version is given.
|
408
|
-
|
421
|
+
Resolves to the DeltaCAT stream format if no stream format is given.
|
422
|
+
Raises an error if the stream does not exist.
|
409
423
|
"""
|
410
424
|
raise NotImplementedError("delete_stream not implemented")
|
411
425
|
|
412
426
|
|
427
|
+
def delete_table(
|
428
|
+
namespace: str,
|
429
|
+
table_name: str,
|
430
|
+
purge: bool = False,
|
431
|
+
*args,
|
432
|
+
**kwargs,
|
433
|
+
) -> None:
|
434
|
+
"""
|
435
|
+
Drops the given table from the catalog. If purge is True, also removes
|
436
|
+
all data files associated with the table. Raises an error if the given table
|
437
|
+
does not exist.
|
438
|
+
"""
|
439
|
+
raise NotImplementedError("delete_table not implemented")
|
440
|
+
|
441
|
+
|
442
|
+
def delete_namespace(
|
443
|
+
namespace: str,
|
444
|
+
purge: bool = False,
|
445
|
+
*args,
|
446
|
+
**kwargs,
|
447
|
+
) -> None:
|
448
|
+
"""
|
449
|
+
Drops the given namespace from the catalog. If purge is True, also removes
|
450
|
+
all data files associated with the namespace. Raises an error if the given
|
451
|
+
namespace does not exist.
|
452
|
+
"""
|
453
|
+
raise NotImplementedError("drop_namespace not implemented")
|
454
|
+
|
455
|
+
|
456
|
+
def get_stream_by_id(
|
457
|
+
table_version_locator: TableVersionLocator,
|
458
|
+
stream_id: str,
|
459
|
+
*args,
|
460
|
+
**kwargs,
|
461
|
+
) -> Optional[Partition]:
|
462
|
+
"""
|
463
|
+
Gets the stream for the given table version locator and stream ID.
|
464
|
+
Returns None if the stream does not exist. Raises an error if the given
|
465
|
+
table version locator does not exist.
|
466
|
+
"""
|
467
|
+
raise NotImplementedError("get_stream_by_id not implemented")
|
468
|
+
|
469
|
+
|
413
470
|
def get_stream(
|
414
471
|
namespace: str,
|
415
472
|
table_name: str,
|
416
473
|
table_version: Optional[str] = None,
|
474
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
417
475
|
*args,
|
418
|
-
**kwargs
|
476
|
+
**kwargs,
|
419
477
|
) -> Optional[Stream]:
|
420
478
|
"""
|
421
|
-
Gets the most recently committed stream for the given table version
|
422
|
-
|
423
|
-
|
479
|
+
Gets the most recently committed stream for the given table version.
|
480
|
+
Resolves to the latest active table version if no table version is given.
|
481
|
+
Resolves to the DeltaCAT stream format if no stream format is given.
|
482
|
+
Returns None if the table version or stream format does not exist.
|
424
483
|
"""
|
425
484
|
raise NotImplementedError("get_stream not implemented")
|
426
485
|
|
427
486
|
|
487
|
+
def stream_exists(
|
488
|
+
namespace: str,
|
489
|
+
table_name: str,
|
490
|
+
table_version: Optional[str] = None,
|
491
|
+
stream_format: StreamFormat = StreamFormat.DELTACAT,
|
492
|
+
*args,
|
493
|
+
**kwargs,
|
494
|
+
) -> bool:
|
495
|
+
"""
|
496
|
+
Returns True if the given Stream exists, False if not.
|
497
|
+
Resolves to the latest active table version if no table version is given.
|
498
|
+
Resolves to the DeltaCAT stream format if no stream format is given.
|
499
|
+
Returns False if the table version or stream format does not exist.
|
500
|
+
"""
|
501
|
+
raise NotImplementedError("stream_exists not implemented")
|
502
|
+
|
503
|
+
|
428
504
|
def stage_partition(
|
429
|
-
stream: Stream,
|
505
|
+
stream: Stream,
|
506
|
+
partition_values: Optional[PartitionValues] = None,
|
507
|
+
partition_scheme_id: Optional[str] = None,
|
508
|
+
*args,
|
509
|
+
**kwargs,
|
430
510
|
) -> Partition:
|
431
511
|
"""
|
432
512
|
Stages a new partition for the given stream and partition values. Returns
|
433
513
|
the staged partition. If this partition will replace another partition
|
434
|
-
with the same partition values, then it will have its previous
|
435
|
-
set to the ID of the partition being replaced. Partition
|
436
|
-
specified for unpartitioned tables.
|
514
|
+
with the same partition values and scheme, then it will have its previous
|
515
|
+
partition ID set to the ID of the partition being replaced. Partition values
|
516
|
+
should not be specified for unpartitioned tables.
|
437
517
|
|
438
|
-
The partition_values must
|
518
|
+
The partition_values must represent the results of transforms in a partition
|
439
519
|
spec specified in the stream.
|
440
520
|
"""
|
441
521
|
raise NotImplementedError("stage_partition not implemented")
|
@@ -444,14 +524,20 @@ def stage_partition(
|
|
444
524
|
def commit_partition(
|
445
525
|
partition: Partition,
|
446
526
|
previous_partition: Optional[Partition] = None,
|
527
|
+
expected_previous_partition_id: Optional[str] = UNKNOWN_PARTITION_ID,
|
447
528
|
*args,
|
448
|
-
**kwargs
|
529
|
+
**kwargs,
|
449
530
|
) -> Partition:
|
450
531
|
"""
|
451
|
-
Commits the
|
452
|
-
replacing any previous partition
|
532
|
+
Commits the staged partition to its associated table version stream,
|
533
|
+
replacing any previous partition registered for the same stream and
|
453
534
|
partition values.
|
454
|
-
|
535
|
+
|
536
|
+
If previous partition is given then it will be replaced with its deltas
|
537
|
+
prepended to the new partition being committed. Otherwise the latest
|
538
|
+
committed partition with the same keys and partition scheme ID will be
|
539
|
+
retrieved.
|
540
|
+
|
455
541
|
Returns the registered partition. If the partition's
|
456
542
|
previous delta stream position is specified, then the commit will
|
457
543
|
be rejected if it does not match the actual previous stream position of
|
@@ -463,33 +549,48 @@ def commit_partition(
|
|
463
549
|
|
464
550
|
|
465
551
|
def delete_partition(
|
466
|
-
|
467
|
-
table_name: str,
|
468
|
-
table_version: Optional[str] = None,
|
552
|
+
stream_locator: StreamLocator,
|
469
553
|
partition_values: Optional[PartitionValues] = None,
|
554
|
+
partition_scheme_id: Optional[str] = None,
|
470
555
|
*args,
|
471
|
-
**kwargs
|
556
|
+
**kwargs,
|
472
557
|
) -> None:
|
473
558
|
"""
|
474
|
-
Deletes the given partition from the specified
|
475
|
-
the latest active table version if no table version is given. Partition
|
559
|
+
Deletes the given partition from the specified stream. Partition
|
476
560
|
values should not be specified for unpartitioned tables. Raises an error
|
477
|
-
if the
|
561
|
+
if the partition does not exist.
|
478
562
|
"""
|
479
563
|
raise NotImplementedError("delete_partition not implemented")
|
480
564
|
|
481
565
|
|
566
|
+
def get_partition_by_id(
|
567
|
+
stream_locator: StreamLocator,
|
568
|
+
partition_id: str,
|
569
|
+
*args,
|
570
|
+
**kwargs,
|
571
|
+
) -> Optional[Partition]:
|
572
|
+
"""
|
573
|
+
Gets the partition for the given stream locator and partition ID.
|
574
|
+
Returns None if the partition does not exist. Raises an error if the
|
575
|
+
given stream locator does not exist.
|
576
|
+
"""
|
577
|
+
raise NotImplementedError("get_partition_by_id not implemented")
|
578
|
+
|
579
|
+
|
482
580
|
def get_partition(
|
483
581
|
stream_locator: StreamLocator,
|
484
582
|
partition_values: Optional[PartitionValues] = None,
|
583
|
+
partition_scheme_id: Optional[str] = None,
|
485
584
|
*args,
|
486
|
-
**kwargs
|
585
|
+
**kwargs,
|
487
586
|
) -> Optional[Partition]:
|
488
587
|
"""
|
489
588
|
Gets the most recently committed partition for the given stream locator and
|
490
589
|
partition key values. Returns None if no partition has been committed for
|
491
590
|
the given table version and/or partition key values. Partition values
|
492
|
-
should not be specified for unpartitioned tables.
|
591
|
+
should not be specified for unpartitioned tables. Partition scheme ID
|
592
|
+
resolves to the table version's current partition scheme by default.
|
593
|
+
Raises an error if the given stream locator does not exist.
|
493
594
|
"""
|
494
595
|
raise NotImplementedError("get_partition not implemented")
|
495
596
|
|
@@ -500,26 +601,20 @@ def stage_delta(
|
|
500
601
|
delta_type: DeltaType = DeltaType.UPSERT,
|
501
602
|
max_records_per_entry: Optional[int] = None,
|
502
603
|
author: Optional[ManifestAuthor] = None,
|
503
|
-
properties: Optional[
|
504
|
-
|
604
|
+
properties: Optional[DeltaProperties] = None,
|
605
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
505
606
|
content_type: ContentType = ContentType.PARQUET,
|
506
|
-
|
507
|
-
|
508
|
-
|
607
|
+
entry_params: Optional[EntryParams] = None,
|
608
|
+
entry_type: Optional[EntryType] = EntryType.DATA,
|
609
|
+
schema: Optional[Schema] = None,
|
610
|
+
sort_scheme_id: Optional[str] = None,
|
509
611
|
*args,
|
510
|
-
**kwargs
|
612
|
+
**kwargs,
|
511
613
|
) -> Delta:
|
512
614
|
"""
|
513
|
-
Writes the given
|
615
|
+
Writes the given dataset to 1 or more files. Returns an unregistered
|
514
616
|
delta whose manifest entries point to the uploaded files. Applies any
|
515
617
|
schema consistency policies configured for the parent table version.
|
516
|
-
|
517
|
-
The partition spec will be used to split the input table into
|
518
|
-
multiple files. Optionally, partition_values can be provided to avoid
|
519
|
-
this method to recompute partition_values from the provided data.
|
520
|
-
|
521
|
-
Raises an error if the provided data does not conform to a unique ordered
|
522
|
-
list of partition_values
|
523
618
|
"""
|
524
619
|
raise NotImplementedError("stage_delta not implemented")
|
525
620
|
|
@@ -601,7 +696,7 @@ def get_table_version_column_names(
|
|
601
696
|
table_name: str,
|
602
697
|
table_version: Optional[str] = None,
|
603
698
|
*args,
|
604
|
-
**kwargs
|
699
|
+
**kwargs,
|
605
700
|
) -> Optional[List[str]]:
|
606
701
|
"""
|
607
702
|
Gets a list of column names for the specified table version, or for the
|
@@ -619,8 +714,8 @@ def get_table_version_schema(
|
|
619
714
|
table_name: str,
|
620
715
|
table_version: Optional[str] = None,
|
621
716
|
*args,
|
622
|
-
**kwargs
|
623
|
-
) -> Optional[
|
717
|
+
**kwargs,
|
718
|
+
) -> Optional[Schema]:
|
624
719
|
"""
|
625
720
|
Gets the schema for the specified table version, or for the latest active
|
626
721
|
table version if none is specified. Returns None if the table version is
|
@@ -640,13 +735,23 @@ def table_version_exists(
|
|
640
735
|
|
641
736
|
def can_categorize(e: BaseException, *args, **kwargs) -> bool:
|
642
737
|
"""
|
643
|
-
|
738
|
+
True if the input error originated from the storage
|
739
|
+
implementation layer and can be categorized under an
|
740
|
+
existing DeltaCatError. The "categorize_errors" decorator
|
741
|
+
uses this to determine if an unknown error from the storage
|
742
|
+
implementation can be categorized prior to casting it to
|
743
|
+
the equivalent DeltaCatError via `raise_categorized_error`
|
644
744
|
"""
|
645
745
|
raise NotImplementedError
|
646
746
|
|
647
747
|
|
648
748
|
def raise_categorized_error(e: BaseException, *args, **kwargs):
|
649
749
|
"""
|
650
|
-
|
750
|
+
Casts a categorizable error that originated from the storage
|
751
|
+
implementation layer to its equivalent DeltaCatError
|
752
|
+
for uniform handling (e.g., determining whether an error
|
753
|
+
is retryable or not) via the "categorize_errors" decorator.
|
754
|
+
Raises an UnclassifiedDeltaCatError from the input exception
|
755
|
+
if the error cannot be categorized.
|
651
756
|
"""
|
652
757
|
raise NotImplementedError
|