deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -37,7 +37,9 @@ class DeltaFileEnvelope(dict):
|
|
37
37
|
pointing to a file from the uncompacted source table, False if
|
38
38
|
this Locator is pointing to a file in the compacted destination
|
39
39
|
table.
|
40
|
-
|
40
|
+
file_record_count: Record count in the delta file table.
|
41
|
+
table_storage_strategy: The way the table object is stored in the
|
42
|
+
delta file envelope. If None just stores the table normally
|
41
43
|
Returns:
|
42
44
|
A delta file envelope.
|
43
45
|
|
@@ -31,9 +31,11 @@ class DeltaFileLocator(Locator, tuple):
|
|
31
31
|
|
32
32
|
file_index: Index of the file in the Delta Manifest.
|
33
33
|
|
34
|
+
file_record_count: Count of records in the Delta File.
|
35
|
+
|
34
36
|
Returns:
|
35
37
|
delta_file_locator: The Delta File Locator Tuple as
|
36
|
-
(
|
38
|
+
(is_src_delta, stream_position, file_index, file_record_count).
|
37
39
|
"""
|
38
40
|
return DeltaFileLocator(
|
39
41
|
(is_src_delta, stream_position, file_index, file_record_count)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
-
from typing import Tuple
|
4
|
+
from typing import Tuple, Union
|
5
5
|
from deltacat.storage import DeltaLocator, PartitionLocator
|
6
6
|
from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
|
7
7
|
from typing import Any, Dict, Optional
|
@@ -10,7 +10,7 @@ from typing import Any, Dict, Optional
|
|
10
10
|
class HighWatermark(dict):
|
11
11
|
"""
|
12
12
|
Inherit from dict to make it easy for serialization/deserialization.
|
13
|
-
Keep both partition locator and high watermark as a tuple to be persisted in the
|
13
|
+
Keep both partition locator and high watermark as a tuple to be persisted in the rci
|
14
14
|
"""
|
15
15
|
|
16
16
|
def set(self, partition_locator: PartitionLocator, delta_stream_position: int):
|
@@ -34,7 +34,7 @@ class RoundCompletionInfo(dict):
|
|
34
34
|
|
35
35
|
@staticmethod
|
36
36
|
def of(
|
37
|
-
high_watermark: HighWatermark,
|
37
|
+
high_watermark: Union[HighWatermark, int],
|
38
38
|
compacted_delta_locator: DeltaLocator,
|
39
39
|
compacted_pyarrow_write_result: PyArrowWriteResult,
|
40
40
|
sort_keys_bit_width: int,
|
@@ -46,6 +46,7 @@ class RoundCompletionInfo(dict):
|
|
46
46
|
compactor_version: Optional[str] = None,
|
47
47
|
input_inflation: Optional[float] = None,
|
48
48
|
input_average_record_size_bytes: Optional[float] = None,
|
49
|
+
prev_source_partition_locator: Optional[PartitionLocator] = None,
|
49
50
|
) -> RoundCompletionInfo:
|
50
51
|
|
51
52
|
rci = RoundCompletionInfo()
|
@@ -63,10 +64,11 @@ class RoundCompletionInfo(dict):
|
|
63
64
|
rci["compactorVersion"] = compactor_version
|
64
65
|
rci["inputInflation"] = input_inflation
|
65
66
|
rci["inputAverageRecordSizeBytes"] = input_average_record_size_bytes
|
67
|
+
rci["prevSourcePartitionLocator"] = prev_source_partition_locator
|
66
68
|
return rci
|
67
69
|
|
68
70
|
@property
|
69
|
-
def high_watermark(self) -> HighWatermark:
|
71
|
+
def high_watermark(self) -> Union[HighWatermark, int]:
|
70
72
|
val: Dict[str, Any] = self.get("highWatermark")
|
71
73
|
if (
|
72
74
|
val is not None
|
@@ -100,7 +102,11 @@ class RoundCompletionInfo(dict):
|
|
100
102
|
|
101
103
|
@property
|
102
104
|
def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
|
103
|
-
|
105
|
+
val = self.get("rebaseSourcePartitionLocator")
|
106
|
+
if val is not None and not isinstance(val, PartitionLocator):
|
107
|
+
val = PartitionLocator(val)
|
108
|
+
self["rebaseSourcePartitionLocator"] = val # Cache the converted value
|
109
|
+
return val
|
104
110
|
|
105
111
|
@property
|
106
112
|
def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
|
@@ -111,7 +117,7 @@ class RoundCompletionInfo(dict):
|
|
111
117
|
return self["hashBucketCount"]
|
112
118
|
|
113
119
|
@property
|
114
|
-
def hb_index_to_entry_range(self) -> Optional[Dict[
|
120
|
+
def hb_index_to_entry_range(self) -> Optional[Dict[str, Tuple[int, int]]]:
|
115
121
|
"""
|
116
122
|
The start index is inclusive and end index is exclusive by default.
|
117
123
|
"""
|
@@ -129,6 +135,10 @@ class RoundCompletionInfo(dict):
|
|
129
135
|
def input_average_record_size_bytes(self) -> Optional[float]:
|
130
136
|
return self.get("inputAverageRecordSizeBytes")
|
131
137
|
|
132
|
-
@
|
133
|
-
def
|
134
|
-
|
138
|
+
@property
|
139
|
+
def prev_source_partition_locator(self) -> Optional[PartitionLocator]:
|
140
|
+
val = self.get("prevSourcePartitionLocator")
|
141
|
+
if val is not None and not isinstance(val, PartitionLocator):
|
142
|
+
val = PartitionLocator(val)
|
143
|
+
self["prevSourcePartitionLocator"] = val # Cache the converted value
|
144
|
+
return val
|
@@ -4,7 +4,7 @@ from ray.types import ObjectRef
|
|
4
4
|
|
5
5
|
from typing import Any, Union
|
6
6
|
|
7
|
-
from abc import ABC, abstractmethod
|
7
|
+
from abc import ABC, abstractmethod
|
8
8
|
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
|
9
9
|
from deltacat.storage import (
|
10
10
|
LocalTable,
|
@@ -15,7 +15,8 @@ LocalTableReference = Union[ObjectRef, LocalTable]
|
|
15
15
|
|
16
16
|
|
17
17
|
class LocalTableStorageStrategy(ABC):
|
18
|
-
@
|
18
|
+
@property
|
19
|
+
@abstractmethod
|
19
20
|
def object_store(cls) -> IObjectStore:
|
20
21
|
pass
|
21
22
|
|
@@ -21,14 +21,13 @@ from deltacat.utils.placement import PlacementGroupConfig
|
|
21
21
|
from typing import List, Optional, Dict, Any
|
22
22
|
from deltacat.utils.ray_utils.runtime import live_node_resource_keys
|
23
23
|
from deltacat.compute.compactor.utils import io
|
24
|
-
from deltacat.compute.compactor.utils import round_completion_file as rcf
|
25
24
|
from deltacat.compute.compactor.steps import repartition as repar
|
26
25
|
from deltacat.compute.compactor.steps.repartition import RepartitionType
|
27
26
|
from deltacat.storage import (
|
28
27
|
Delta,
|
29
28
|
DeltaLocator,
|
30
29
|
PartitionLocator,
|
31
|
-
|
30
|
+
metastore,
|
32
31
|
)
|
33
32
|
from deltacat.utils.metrics import MetricsConfig
|
34
33
|
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
|
@@ -41,7 +40,6 @@ def repartition(
|
|
41
40
|
source_partition_locator: PartitionLocator,
|
42
41
|
destination_partition_locator: PartitionLocator,
|
43
42
|
repartition_args: Any,
|
44
|
-
repartition_completion_file_s3_url: str,
|
45
43
|
last_stream_position_to_compact: int,
|
46
44
|
repartition_type: RepartitionType = RepartitionType.RANGE,
|
47
45
|
sort_keys: List[SortKey] = None,
|
@@ -54,9 +52,8 @@ def repartition(
|
|
54
52
|
pg_config: Optional[PlacementGroupConfig] = None,
|
55
53
|
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
|
56
54
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
57
|
-
|
58
|
-
|
59
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
55
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
56
|
+
deltacat_storage=metastore,
|
60
57
|
**kwargs,
|
61
58
|
) -> Optional[str]:
|
62
59
|
|
@@ -132,13 +129,13 @@ def repartition(
|
|
132
129
|
enable_profiler=enable_profiler,
|
133
130
|
metrics_config=metrics_config,
|
134
131
|
read_kwargs_provider=read_kwargs_provider,
|
135
|
-
|
132
|
+
table_writer_kwargs=table_writer_kwargs,
|
136
133
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
137
134
|
deltacat_storage=deltacat_storage,
|
138
135
|
)
|
139
136
|
logger.info(f"Getting {len(repar_tasks_pending)} task results...")
|
140
137
|
repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
|
141
|
-
repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
|
138
|
+
repar_results: List[List[Delta]] = [rp.range_deltas for rp in repar_results]
|
142
139
|
transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
|
143
140
|
ordered_deltas: List[Delta] = [
|
144
141
|
i for sublist in transposed for i in sublist if i is not None
|
@@ -153,9 +150,6 @@ def repartition(
|
|
153
150
|
compacted_delta = deltacat_storage.commit_delta(
|
154
151
|
merged_delta, properties=kwargs.get("properties", {})
|
155
152
|
)
|
156
|
-
deltacat_storage.commit_partition(partition)
|
157
|
-
logger.info(f"Committed final delta: {compacted_delta}")
|
158
|
-
logger.info(f"Job run completed successfully!")
|
159
153
|
new_compacted_delta_locator = DeltaLocator.of(
|
160
154
|
new_compacted_partition_locator,
|
161
155
|
compacted_delta.stream_position,
|
@@ -173,14 +167,7 @@ def repartition(
|
|
173
167
|
bit_width_of_sort_keys,
|
174
168
|
None,
|
175
169
|
)
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
None,
|
181
|
-
None,
|
182
|
-
None,
|
183
|
-
repartition_completion_info,
|
184
|
-
repartition_completion_file_s3_url,
|
185
|
-
**s3_client_kwargs,
|
186
|
-
)
|
170
|
+
partition.compaction_round_completion_info = repartition_completion_info
|
171
|
+
deltacat_storage.commit_partition(partition)
|
172
|
+
logger.info(f"Committed final delta: {compacted_delta}")
|
173
|
+
logger.info(f"Job run completed successfully!")
|
@@ -15,7 +15,8 @@ from deltacat.compute.compactor import (
|
|
15
15
|
DeltaFileEnvelope,
|
16
16
|
DeltaFileLocator,
|
17
17
|
)
|
18
|
-
from deltacat.storage.model.sort_key import SortKey
|
18
|
+
from deltacat.storage.model.sort_key import SortKey
|
19
|
+
from deltacat.storage import SortOrder
|
19
20
|
from deltacat.compute.compactor.model.dedupe_result import DedupeResult
|
20
21
|
from deltacat.compute.compactor.utils import system_columns as sc
|
21
22
|
from deltacat.utils.ray_utils.runtime import (
|
@@ -155,15 +156,21 @@ def _timed_dedupe(
|
|
155
156
|
sort_keys.extend(
|
156
157
|
[
|
157
158
|
SortKey.of(
|
158
|
-
sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
|
159
|
+
[sc._PARTITION_STREAM_POSITION_COLUMN_NAME],
|
159
160
|
SortOrder.ASCENDING,
|
160
161
|
),
|
161
162
|
SortKey.of(
|
162
|
-
sc._ORDERED_FILE_IDX_COLUMN_NAME,
|
163
|
+
[sc._ORDERED_FILE_IDX_COLUMN_NAME],
|
164
|
+
SortOrder.ASCENDING,
|
163
165
|
),
|
164
166
|
]
|
165
167
|
)
|
166
|
-
table = table.take(
|
168
|
+
table = table.take(
|
169
|
+
pc.sort_indices(
|
170
|
+
table,
|
171
|
+
sort_keys=[pa_key for key in sort_keys for pa_key in key.arrow],
|
172
|
+
)
|
173
|
+
)
|
167
174
|
|
168
175
|
# drop duplicates by primary key hash column
|
169
176
|
logger.info(
|
@@ -21,7 +21,7 @@ from deltacat.compute.compactor.utils.primary_key_index import (
|
|
21
21
|
group_hash_bucket_indices,
|
22
22
|
group_record_indices_by_hash_bucket,
|
23
23
|
)
|
24
|
-
from deltacat.storage import
|
24
|
+
from deltacat.storage import metastore
|
25
25
|
from deltacat.types.media import StorageType
|
26
26
|
from deltacat.utils.common import sha1_digest
|
27
27
|
from deltacat.utils.ray_utils.runtime import (
|
@@ -90,7 +90,7 @@ def _group_file_records_by_pk_hash_bucket(
|
|
90
90
|
sort_key_names: List[str],
|
91
91
|
is_src_delta: np.bool_ = True,
|
92
92
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
93
|
-
deltacat_storage=
|
93
|
+
deltacat_storage=metastore,
|
94
94
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
95
95
|
**kwargs,
|
96
96
|
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
|
@@ -139,7 +139,7 @@ def _read_delta_file_envelopes(
|
|
139
139
|
primary_keys: List[str],
|
140
140
|
sort_key_names: List[str],
|
141
141
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
142
|
-
deltacat_storage=
|
142
|
+
deltacat_storage=metastore,
|
143
143
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
144
144
|
**kwargs,
|
145
145
|
) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
|
@@ -190,7 +190,7 @@ def _timed_hash_bucket(
|
|
190
190
|
enable_profiler: bool,
|
191
191
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
192
192
|
object_store: Optional[IObjectStore] = None,
|
193
|
-
deltacat_storage=
|
193
|
+
deltacat_storage=metastore,
|
194
194
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
195
195
|
**kwargs,
|
196
196
|
):
|
@@ -201,7 +201,7 @@ def _timed_hash_bucket(
|
|
201
201
|
with memray.Tracker(
|
202
202
|
f"hash_bucket_{worker_id}_{task_id}.bin"
|
203
203
|
) if enable_profiler else nullcontext():
|
204
|
-
sort_key_names = [key.
|
204
|
+
sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
|
205
205
|
if not round_completion_info:
|
206
206
|
is_src_delta = True
|
207
207
|
else:
|
@@ -249,7 +249,7 @@ def hash_bucket(
|
|
249
249
|
metrics_config: MetricsConfig,
|
250
250
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
251
251
|
object_store: Optional[IObjectStore],
|
252
|
-
deltacat_storage=
|
252
|
+
deltacat_storage=metastore,
|
253
253
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
254
254
|
**kwargs,
|
255
255
|
) -> HashBucketResult:
|
@@ -25,10 +25,11 @@ from deltacat.storage import (
|
|
25
25
|
DeltaType,
|
26
26
|
Partition,
|
27
27
|
PartitionLocator,
|
28
|
-
Manifest,
|
29
28
|
ManifestEntry,
|
29
|
+
ManifestEntryList,
|
30
30
|
)
|
31
|
-
from deltacat.storage import
|
31
|
+
from deltacat.storage.model.manifest import Manifest
|
32
|
+
|
32
33
|
from deltacat.utils.common import ReadKwargsProvider
|
33
34
|
from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
|
34
35
|
from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
|
@@ -45,6 +46,7 @@ from deltacat.utils.ray_utils.runtime import (
|
|
45
46
|
)
|
46
47
|
from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
|
47
48
|
from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
|
49
|
+
from deltacat.storage import metastore
|
48
50
|
|
49
51
|
if importlib.util.find_spec("memray"):
|
50
52
|
import memray
|
@@ -66,9 +68,9 @@ def materialize(
|
|
66
68
|
metrics_config: MetricsConfig,
|
67
69
|
schema: Optional[pa.Schema] = None,
|
68
70
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
69
|
-
|
71
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
70
72
|
object_store: Optional[IObjectStore] = None,
|
71
|
-
deltacat_storage=
|
73
|
+
deltacat_storage=metastore,
|
72
74
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
73
75
|
):
|
74
76
|
if deltacat_storage_kwargs is None:
|
@@ -77,12 +79,15 @@ def materialize(
|
|
77
79
|
def _stage_delta_from_manifest_entry_reference_list(
|
78
80
|
manifest_entry_list_reference: List[ManifestEntry],
|
79
81
|
partition: Partition,
|
80
|
-
delta_type: DeltaType = DeltaType.
|
82
|
+
delta_type: DeltaType = DeltaType.APPEND,
|
81
83
|
) -> Delta:
|
82
84
|
assert (
|
83
|
-
delta_type == DeltaType.
|
84
|
-
), "
|
85
|
-
manifest = Manifest.of(
|
85
|
+
delta_type == DeltaType.APPEND
|
86
|
+
), "Compaction should always produce APPEND deltas for consistent read operations!"
|
87
|
+
manifest = Manifest.of(
|
88
|
+
entries=ManifestEntryList.of(manifest_entry_list_reference),
|
89
|
+
uuid=str(uuid4()),
|
90
|
+
)
|
86
91
|
delta = Delta.of(
|
87
92
|
locator=DeltaLocator.of(partition.locator),
|
88
93
|
delta_type=delta_type,
|
@@ -106,9 +111,10 @@ def materialize(
|
|
106
111
|
deltacat_storage.stage_delta,
|
107
112
|
compacted_table,
|
108
113
|
partition,
|
114
|
+
delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
|
109
115
|
max_records_per_entry=max_records_per_output_file,
|
110
116
|
content_type=compacted_file_content_type,
|
111
|
-
|
117
|
+
table_writer_kwargs=table_writer_kwargs,
|
112
118
|
**deltacat_storage_kwargs,
|
113
119
|
)
|
114
120
|
compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
|
@@ -10,7 +10,7 @@ import ray
|
|
10
10
|
from deltacat import logs
|
11
11
|
from deltacat.compute.compactor import DeltaAnnotated
|
12
12
|
from deltacat.compute.compactor.model.repartition_result import RepartitionResult
|
13
|
-
from deltacat.storage import
|
13
|
+
from deltacat.storage import metastore
|
14
14
|
from deltacat.storage import Partition
|
15
15
|
from deltacat.utils.ray_utils.runtime import (
|
16
16
|
get_current_ray_task_id,
|
@@ -19,7 +19,7 @@ from deltacat.utils.ray_utils.runtime import (
|
|
19
19
|
from deltacat.utils.common import ReadKwargsProvider
|
20
20
|
from deltacat.utils.performance import timed_invocation
|
21
21
|
from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
|
22
|
-
from deltacat.storage import Delta
|
22
|
+
from deltacat.storage import Delta, DeltaType
|
23
23
|
from enum import Enum
|
24
24
|
|
25
25
|
if importlib.util.find_spec("memray"):
|
@@ -56,9 +56,9 @@ def repartition_range(
|
|
56
56
|
destination_partition: Partition,
|
57
57
|
repartition_args: dict,
|
58
58
|
max_records_per_output_file: int,
|
59
|
-
|
59
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
60
60
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
61
|
-
deltacat_storage=
|
61
|
+
deltacat_storage=metastore,
|
62
62
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
63
63
|
**kwargs,
|
64
64
|
):
|
@@ -144,9 +144,10 @@ def repartition_range(
|
|
144
144
|
partition_delta: Delta = deltacat_storage.stage_delta(
|
145
145
|
partition_table,
|
146
146
|
destination_partition,
|
147
|
+
delta_type=DeltaType.APPEND, # Repartition always produces APPEND deltas
|
147
148
|
max_records_per_entry=max_records_per_output_file,
|
148
149
|
content_type=repartitioned_file_content_type,
|
149
|
-
|
150
|
+
table_writer_kwargs=table_writer_kwargs,
|
150
151
|
**deltacat_storage_kwargs,
|
151
152
|
)
|
152
153
|
partition_deltas.append(partition_delta)
|
@@ -168,9 +169,9 @@ def _timed_repartition(
|
|
168
169
|
max_records_per_output_file: int,
|
169
170
|
enable_profiler: bool,
|
170
171
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
171
|
-
|
172
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
172
173
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
173
|
-
deltacat_storage=
|
174
|
+
deltacat_storage=metastore,
|
174
175
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
175
176
|
**kwargs,
|
176
177
|
) -> RepartitionResult:
|
@@ -192,7 +193,7 @@ def _timed_repartition(
|
|
192
193
|
destination_partition=destination_partition,
|
193
194
|
repartition_args=repartition_args,
|
194
195
|
max_records_per_output_file=max_records_per_output_file,
|
195
|
-
|
196
|
+
table_writer_kwargs=table_writer_kwargs,
|
196
197
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
197
198
|
deltacat_storage=deltacat_storage,
|
198
199
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -213,9 +214,9 @@ def repartition(
|
|
213
214
|
enable_profiler: bool,
|
214
215
|
metrics_config: Optional[MetricsConfig],
|
215
216
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
216
|
-
|
217
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
217
218
|
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
|
218
|
-
deltacat_storage=
|
219
|
+
deltacat_storage=metastore,
|
219
220
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
220
221
|
**kwargs,
|
221
222
|
) -> RepartitionResult:
|
@@ -231,7 +232,7 @@ def repartition(
|
|
231
232
|
max_records_per_output_file=max_records_per_output_file,
|
232
233
|
enable_profiler=enable_profiler,
|
233
234
|
read_kwargs_provider=read_kwargs_provider,
|
234
|
-
|
235
|
+
table_writer_kwargs=table_writer_kwargs,
|
235
236
|
repartitioned_file_content_type=repartitioned_file_content_type,
|
236
237
|
deltacat_storage=deltacat_storage,
|
237
238
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
@@ -11,7 +11,7 @@ from deltacat.storage import (
|
|
11
11
|
PartitionLocator,
|
12
12
|
Delta,
|
13
13
|
ManifestEntry,
|
14
|
-
|
14
|
+
metastore,
|
15
15
|
)
|
16
16
|
from deltacat import logs
|
17
17
|
from deltacat.compute.compactor import DeltaAnnotated
|
@@ -31,12 +31,13 @@ def discover_deltas(
|
|
31
31
|
compacted_partition_locator: Optional[PartitionLocator],
|
32
32
|
rebase_source_partition_locator: Optional[PartitionLocator],
|
33
33
|
rebase_source_partition_high_watermark: Optional[int],
|
34
|
-
deltacat_storage=
|
34
|
+
deltacat_storage=metastore,
|
35
35
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
36
36
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
37
37
|
) -> Tuple[List[Delta], int]:
|
38
38
|
if deltacat_storage_kwargs is None:
|
39
39
|
deltacat_storage_kwargs = {}
|
40
|
+
|
40
41
|
# Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
|
41
42
|
start_position_exclusive = (
|
42
43
|
high_watermark.get(source_partition_locator)
|
@@ -109,7 +110,7 @@ def limit_input_deltas(
|
|
109
110
|
user_hash_bucket_chunk_size: int,
|
110
111
|
input_deltas_stats: Dict[int, DeltaStats],
|
111
112
|
compaction_audit: CompactionSessionAuditInfo,
|
112
|
-
deltacat_storage=
|
113
|
+
deltacat_storage=metastore,
|
113
114
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
114
115
|
**kwargs,
|
115
116
|
) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
|
@@ -272,7 +273,7 @@ def fit_input_deltas(
|
|
272
273
|
cluster_resources: Dict[str, float],
|
273
274
|
compaction_audit: CompactionSessionAuditInfo,
|
274
275
|
hash_bucket_count: Optional[int],
|
275
|
-
deltacat_storage=
|
276
|
+
deltacat_storage=metastore,
|
276
277
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
277
278
|
**kwargs,
|
278
279
|
) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
|
@@ -358,8 +359,8 @@ def fit_input_deltas(
|
|
358
359
|
def _discover_deltas(
|
359
360
|
source_partition_locator: PartitionLocator,
|
360
361
|
start_position_exclusive: Optional[int],
|
361
|
-
end_position_inclusive: int,
|
362
|
-
deltacat_storage=
|
362
|
+
end_position_inclusive: Optional[int],
|
363
|
+
deltacat_storage=metastore,
|
363
364
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
364
365
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
365
366
|
) -> List[Delta]:
|
@@ -0,0 +1,117 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional
|
3
|
+
from deltacat import logs
|
4
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
5
|
+
from deltacat.storage import PartitionLocator
|
6
|
+
from deltacat.storage.model.partition import Partition
|
7
|
+
from deltacat.utils.metrics import metrics
|
8
|
+
from deltacat.exceptions import PartitionNotFoundError
|
9
|
+
|
10
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
11
|
+
|
12
|
+
|
13
|
+
@metrics
|
14
|
+
def read_round_completion_info(
|
15
|
+
source_partition_locator: PartitionLocator,
|
16
|
+
destination_partition_locator: PartitionLocator,
|
17
|
+
deltacat_storage,
|
18
|
+
deltacat_storage_kwargs: Optional[dict] = None,
|
19
|
+
destination_partition: Optional[Partition] = None,
|
20
|
+
) -> Optional[RoundCompletionInfo]:
|
21
|
+
"""
|
22
|
+
Read round completion info from the partition metafile.
|
23
|
+
|
24
|
+
Args:
|
25
|
+
source_partition_locator: Source partition locator for validation
|
26
|
+
destination_partition_locator: Destination partition locator
|
27
|
+
deltacat_storage: Storage implementation
|
28
|
+
deltacat_storage_kwargs: Optional storage kwargs
|
29
|
+
destination_partition: Optional destination partition to avoid redundant get_partition calls
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
RoundCompletionInfo if found in partition, None otherwise
|
33
|
+
"""
|
34
|
+
if not destination_partition_locator:
|
35
|
+
return None
|
36
|
+
|
37
|
+
if deltacat_storage_kwargs is None:
|
38
|
+
deltacat_storage_kwargs = {}
|
39
|
+
|
40
|
+
try:
|
41
|
+
# Use provided partition or get it from storage
|
42
|
+
if destination_partition:
|
43
|
+
partition = destination_partition
|
44
|
+
else:
|
45
|
+
# First get the current partition to access its previous_partition_id
|
46
|
+
current_partition: Partition = deltacat_storage.get_partition(
|
47
|
+
destination_partition_locator.stream_locator,
|
48
|
+
destination_partition_locator.partition_values,
|
49
|
+
**deltacat_storage_kwargs,
|
50
|
+
)
|
51
|
+
|
52
|
+
# If current partition has round completion info, use it
|
53
|
+
if current_partition.compaction_round_completion_info:
|
54
|
+
partition = current_partition
|
55
|
+
elif current_partition.previous_partition_id is not None:
|
56
|
+
# For incremental compaction, we need to get the previous committed partition
|
57
|
+
# that contains the round completion info.
|
58
|
+
# Get the previous partition by ID - this is where the round completion info should be
|
59
|
+
logger.info(
|
60
|
+
f"Current partition {destination_partition_locator} does not have round completion info, "
|
61
|
+
f"getting previous partition with ID: {current_partition.previous_partition_id}"
|
62
|
+
)
|
63
|
+
previous_partition = deltacat_storage.get_partition_by_id(
|
64
|
+
destination_partition_locator.stream_locator,
|
65
|
+
current_partition.previous_partition_id,
|
66
|
+
**deltacat_storage_kwargs,
|
67
|
+
)
|
68
|
+
if previous_partition is not None:
|
69
|
+
logger.info(
|
70
|
+
f"Found previous partition: {previous_partition.locator}"
|
71
|
+
)
|
72
|
+
partition = previous_partition
|
73
|
+
else:
|
74
|
+
raise PartitionNotFoundError(
|
75
|
+
f"Previous partition with ID {current_partition.previous_partition_id} not found"
|
76
|
+
)
|
77
|
+
else:
|
78
|
+
logger.info(f"No previous partition ID found, using current partition")
|
79
|
+
partition = current_partition
|
80
|
+
|
81
|
+
if partition:
|
82
|
+
round_completion_info = partition.compaction_round_completion_info
|
83
|
+
if round_completion_info:
|
84
|
+
# Validate that prev_source_partition_locator matches current source
|
85
|
+
if (
|
86
|
+
not source_partition_locator
|
87
|
+
or not round_completion_info.prev_source_partition_locator
|
88
|
+
):
|
89
|
+
raise ValueError(
|
90
|
+
f"Source partition locator ({source_partition_locator}) and "
|
91
|
+
f"prev_source_partition_locator ({round_completion_info.prev_source_partition_locator}) "
|
92
|
+
f"must both be provided."
|
93
|
+
)
|
94
|
+
|
95
|
+
if (
|
96
|
+
round_completion_info.prev_source_partition_locator.canonical_string()
|
97
|
+
!= source_partition_locator.canonical_string()
|
98
|
+
):
|
99
|
+
logger.warning(
|
100
|
+
f"Previous source partition locator mismatch: "
|
101
|
+
f"expected {source_partition_locator.canonical_string()}, "
|
102
|
+
f"but found {round_completion_info.prev_source_partition_locator.canonical_string()} "
|
103
|
+
f"in round completion info. Ignoring cached round completion info."
|
104
|
+
)
|
105
|
+
return None
|
106
|
+
|
107
|
+
logger.info(
|
108
|
+
f"Read round completion info from partition metafile: {round_completion_info}"
|
109
|
+
)
|
110
|
+
return round_completion_info
|
111
|
+
|
112
|
+
except Exception as e:
|
113
|
+
logger.debug(
|
114
|
+
f"Failed to read round completion info from partition metafile: {e}"
|
115
|
+
)
|
116
|
+
|
117
|
+
return None
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import pyarrow as pa
|
2
2
|
from typing import List
|
3
|
-
from
|
3
|
+
from itertools import chain
|
4
|
+
from deltacat.storage import PartitionLocator, SortKey, TransformName
|
4
5
|
|
5
6
|
MAX_SORT_KEYS_BIT_WIDTH = 256
|
6
7
|
|
@@ -22,7 +23,13 @@ def validate_sort_keys(
|
|
22
23
|
deltacat_storage_kwargs = {}
|
23
24
|
total_sort_keys_bit_width = 0
|
24
25
|
if sort_keys:
|
25
|
-
sort_key_names = [key.
|
26
|
+
sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
|
27
|
+
assert all(
|
28
|
+
[
|
29
|
+
key.transform is None or key.transform.name == TransformName.IDENTITY
|
30
|
+
for key in sort_keys
|
31
|
+
]
|
32
|
+
), f"Sort key transforms are not supported: {sort_keys}"
|
26
33
|
assert len(sort_key_names) == len(
|
27
34
|
set(sort_key_names)
|
28
35
|
), f"Sort key names must be unique: {sort_key_names}"
|
@@ -294,7 +294,9 @@ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table
|
|
294
294
|
|
295
295
|
|
296
296
|
def delta_type_to_field(delta_type: DeltaType) -> bool:
|
297
|
-
|
297
|
+
# For deduplication purposes, treat both UPSERT and APPEND as UPSERT (True)
|
298
|
+
# Only DELETE should be treated as DELETE (False)
|
299
|
+
return delta_type is not DeltaType.DELETE
|
298
300
|
|
299
301
|
|
300
302
|
def delta_type_from_field(delta_type_field: bool) -> DeltaType:
|