deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -5,13 +5,15 @@ import ray
|
|
5
5
|
import time
|
6
6
|
import json
|
7
7
|
from math import ceil
|
8
|
+
from urllib.parse import urlparse
|
9
|
+
import pyarrow
|
8
10
|
|
9
11
|
from deltacat.compute.compactor import (
|
10
12
|
PyArrowWriteResult,
|
11
13
|
HighWatermark,
|
12
14
|
RoundCompletionInfo,
|
13
15
|
)
|
14
|
-
from deltacat.
|
16
|
+
from deltacat.utils.filesystem import resolve_path_and_filesystem
|
15
17
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
16
18
|
from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
|
17
19
|
ExecutionCompactionResult,
|
@@ -32,7 +34,7 @@ from deltacat.compute.compactor_v2.utils.merge import (
|
|
32
34
|
from deltacat.compute.compactor_v2.utils.task_options import (
|
33
35
|
hash_bucket_resource_options_provider,
|
34
36
|
)
|
35
|
-
from deltacat.compute.compactor.utils import
|
37
|
+
from deltacat.compute.compactor.utils import round_completion_reader as rci
|
36
38
|
from deltacat.compute.compactor import DeltaAnnotated
|
37
39
|
from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
|
38
40
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
@@ -48,6 +50,7 @@ from deltacat.storage import (
|
|
48
50
|
DeltaType,
|
49
51
|
DeltaLocator,
|
50
52
|
Partition,
|
53
|
+
PartitionLocator,
|
51
54
|
Manifest,
|
52
55
|
Stream,
|
53
56
|
StreamLocator,
|
@@ -63,7 +66,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
|
|
63
66
|
from deltacat.compute.compactor_v2.steps import hash_bucket as hb
|
64
67
|
from deltacat.compute.compactor_v2.utils import io
|
65
68
|
|
66
|
-
from typing import List, Optional
|
69
|
+
from typing import List, Optional, Union
|
67
70
|
from collections import defaultdict
|
68
71
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
69
72
|
CompactionSessionAuditInfo,
|
@@ -77,21 +80,39 @@ from deltacat.compute.compactor_v2.utils.task_options import (
|
|
77
80
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
78
81
|
|
79
82
|
|
83
|
+
def _get_rci_source_partition_locator(
|
84
|
+
params: CompactPartitionParams,
|
85
|
+
) -> PartitionLocator:
|
86
|
+
return params.rebase_source_partition_locator or params.source_partition_locator
|
87
|
+
|
88
|
+
|
89
|
+
def _is_inplace_compacted(
|
90
|
+
rci_source_partition_locator: PartitionLocator,
|
91
|
+
destination_partition_locator: PartitionLocator,
|
92
|
+
) -> bool:
|
93
|
+
return (
|
94
|
+
rci_source_partition_locator.partition_values
|
95
|
+
== destination_partition_locator.partition_values
|
96
|
+
and rci_source_partition_locator.stream_id
|
97
|
+
== destination_partition_locator.stream_id
|
98
|
+
)
|
99
|
+
|
100
|
+
|
80
101
|
def _fetch_compaction_metadata(
|
81
102
|
params: CompactPartitionParams,
|
82
103
|
) -> tuple[Optional[Manifest], Optional[RoundCompletionInfo]]:
|
83
104
|
|
84
105
|
# read the results from any previously completed compaction round
|
85
106
|
round_completion_info: Optional[RoundCompletionInfo] = None
|
86
|
-
high_watermark: Optional[HighWatermark] = None
|
107
|
+
high_watermark: Optional[Union[HighWatermark, int]] = None
|
87
108
|
previous_compacted_delta_manifest: Optional[Manifest] = None
|
88
109
|
|
89
110
|
if not params.rebase_source_partition_locator:
|
90
|
-
round_completion_info =
|
91
|
-
params.
|
92
|
-
params.
|
93
|
-
params.
|
94
|
-
|
111
|
+
round_completion_info = rci.read_round_completion_info(
|
112
|
+
source_partition_locator=params.source_partition_locator,
|
113
|
+
destination_partition_locator=params.destination_partition_locator,
|
114
|
+
deltacat_storage=params.deltacat_storage,
|
115
|
+
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
95
116
|
)
|
96
117
|
if not round_completion_info:
|
97
118
|
logger.info(
|
@@ -111,10 +132,10 @@ def _fetch_compaction_metadata(
|
|
111
132
|
assert (
|
112
133
|
params.hash_bucket_count == round_completion_info.hash_bucket_count
|
113
134
|
), (
|
114
|
-
"
|
115
|
-
"
|
116
|
-
f"Hash
|
117
|
-
f"
|
135
|
+
"Partition hash bucket count for compaction has changed. "
|
136
|
+
"Rebase compaction with the desired hash bucket count before running another incremental compaction. "
|
137
|
+
f"Hash bucket count in RCI={round_completion_info.hash_bucket_count} "
|
138
|
+
f"!= hash bucket count in params={params.hash_bucket_count}."
|
118
139
|
)
|
119
140
|
|
120
141
|
logger.info(f"Round completion file: {round_completion_info}")
|
@@ -129,7 +150,7 @@ def _build_uniform_deltas(
|
|
129
150
|
mutable_compaction_audit: CompactionSessionAuditInfo,
|
130
151
|
input_deltas: List[Delta],
|
131
152
|
delta_discovery_start: float,
|
132
|
-
) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
|
153
|
+
) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]]:
|
133
154
|
|
134
155
|
delete_strategy: Optional[DeleteStrategy] = None
|
135
156
|
delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
|
@@ -149,6 +170,7 @@ def _build_uniform_deltas(
|
|
149
170
|
hash_bucket_count=params.hash_bucket_count,
|
150
171
|
compaction_audit=mutable_compaction_audit,
|
151
172
|
compact_partition_params=params,
|
173
|
+
all_column_names=params.all_column_names,
|
152
174
|
deltacat_storage=params.deltacat_storage,
|
153
175
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
154
176
|
)
|
@@ -159,10 +181,9 @@ def _build_uniform_deltas(
|
|
159
181
|
delta_discovery_end - delta_discovery_start
|
160
182
|
)
|
161
183
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
**params.s3_client_kwargs,
|
184
|
+
_upload_audit_data(
|
185
|
+
params,
|
186
|
+
mutable_compaction_audit,
|
166
187
|
)
|
167
188
|
|
168
189
|
return (
|
@@ -222,7 +243,7 @@ def _run_hash_and_merge(
|
|
222
243
|
uniform_deltas: List[DeltaAnnotated],
|
223
244
|
round_completion_info: RoundCompletionInfo,
|
224
245
|
delete_strategy: Optional[DeleteStrategy],
|
225
|
-
delete_file_envelopes: Optional[DeleteFileEnvelope],
|
246
|
+
delete_file_envelopes: Optional[List[DeleteFileEnvelope]],
|
226
247
|
mutable_compaction_audit: CompactionSessionAuditInfo,
|
227
248
|
previous_compacted_delta_manifest: Optional[Manifest],
|
228
249
|
compacted_partition: Partition,
|
@@ -267,10 +288,9 @@ def _run_hash_and_merge(
|
|
267
288
|
hb_end - hb_start,
|
268
289
|
)
|
269
290
|
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
**params.s3_client_kwargs,
|
291
|
+
_upload_audit_data(
|
292
|
+
params,
|
293
|
+
mutable_compaction_audit,
|
274
294
|
)
|
275
295
|
|
276
296
|
hb_data_processed_size_bytes = np.int64(0)
|
@@ -389,7 +409,7 @@ def _merge(
|
|
389
409
|
all_hash_group_idx_to_obj_id: dict,
|
390
410
|
compacted_partition: Partition,
|
391
411
|
delete_strategy: DeleteStrategy,
|
392
|
-
delete_file_envelopes: DeleteFileEnvelope,
|
412
|
+
delete_file_envelopes: List[DeleteFileEnvelope],
|
393
413
|
) -> tuple[List[MergeResult], float]:
|
394
414
|
merge_options_provider = functools.partial(
|
395
415
|
task_resource_options_provider,
|
@@ -402,13 +422,24 @@ def _merge(
|
|
402
422
|
round_completion_info=round_completion_info,
|
403
423
|
compacted_delta_manifest=previous_compacted_delta_manifest,
|
404
424
|
primary_keys=params.primary_keys,
|
405
|
-
deltacat_storage=params.deltacat_storage,
|
406
|
-
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
407
425
|
ray_custom_resources=params.ray_custom_resources,
|
408
426
|
memory_logs_enabled=params.memory_logs_enabled,
|
409
427
|
estimate_resources_params=params.estimate_resources_params,
|
410
428
|
)
|
411
429
|
|
430
|
+
# set previous compacted delta manifest on input so that we don't need a transaction to retrieve it
|
431
|
+
if round_completion_info:
|
432
|
+
previous_compacted_delta_manifest = params.deltacat_storage.get_delta_manifest(
|
433
|
+
round_completion_info.compacted_delta_locator,
|
434
|
+
**params.deltacat_storage_kwargs,
|
435
|
+
)
|
436
|
+
|
437
|
+
# create a copy of deltacat storage kwargs without any parent transaction context
|
438
|
+
# (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
|
439
|
+
deltacat_storage_kwargs_copy = {
|
440
|
+
k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
|
441
|
+
}
|
442
|
+
|
412
443
|
def merge_input_provider(index, item) -> dict[str, MergeInput]:
|
413
444
|
return {
|
414
445
|
"input": MergeInput.of(
|
@@ -422,23 +453,26 @@ def _merge(
|
|
422
453
|
write_to_partition=compacted_partition,
|
423
454
|
compacted_file_content_type=params.compacted_file_content_type,
|
424
455
|
primary_keys=params.primary_keys,
|
456
|
+
all_column_names=params.all_column_names,
|
425
457
|
sort_keys=params.sort_keys,
|
426
458
|
merge_task_index=index,
|
427
459
|
drop_duplicates=params.drop_duplicates,
|
428
460
|
max_records_per_output_file=params.records_per_compacted_file,
|
429
461
|
enable_profiler=params.enable_profiler,
|
430
462
|
metrics_config=params.metrics_config,
|
431
|
-
|
463
|
+
table_writer_kwargs=params.table_writer_kwargs,
|
432
464
|
read_kwargs_provider=params.read_kwargs_provider,
|
433
465
|
round_completion_info=round_completion_info,
|
434
466
|
object_store=params.object_store,
|
435
467
|
deltacat_storage=params.deltacat_storage,
|
436
|
-
deltacat_storage_kwargs=
|
468
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
|
437
469
|
delete_strategy=delete_strategy,
|
438
470
|
delete_file_envelopes=delete_file_envelopes,
|
439
471
|
memory_logs_enabled=params.memory_logs_enabled,
|
440
472
|
disable_copy_by_reference=params.disable_copy_by_reference,
|
441
473
|
hash_bucket_count=params.hash_bucket_count,
|
474
|
+
original_fields=params.original_fields,
|
475
|
+
compacted_manifest=previous_compacted_delta_manifest,
|
442
476
|
)
|
443
477
|
}
|
444
478
|
|
@@ -474,6 +508,12 @@ def _hash_bucket(
|
|
474
508
|
estimate_resources_params=params.estimate_resources_params,
|
475
509
|
)
|
476
510
|
|
511
|
+
# create a copy of deltacat storage kwargs without any parent transaction context
|
512
|
+
# (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
|
513
|
+
deltacat_storage_kwargs_copy = {
|
514
|
+
k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
|
515
|
+
}
|
516
|
+
|
477
517
|
def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
|
478
518
|
return {
|
479
519
|
"input": HashBucketInput.of(
|
@@ -482,12 +522,13 @@ def _hash_bucket(
|
|
482
522
|
hb_task_index=index,
|
483
523
|
num_hash_buckets=params.hash_bucket_count,
|
484
524
|
num_hash_groups=params.hash_group_count,
|
525
|
+
all_column_names=params.all_column_names,
|
485
526
|
enable_profiler=params.enable_profiler,
|
486
527
|
metrics_config=params.metrics_config,
|
487
528
|
read_kwargs_provider=params.read_kwargs_provider,
|
488
529
|
object_store=params.object_store,
|
489
530
|
deltacat_storage=params.deltacat_storage,
|
490
|
-
deltacat_storage_kwargs=
|
531
|
+
deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
|
491
532
|
memory_logs_enabled=params.memory_logs_enabled,
|
492
533
|
)
|
493
534
|
}
|
@@ -596,10 +637,9 @@ def _process_merge_results(
|
|
596
637
|
file_index += mat_result.pyarrow_write_result.files
|
597
638
|
previous_task_index = mat_result.task_index
|
598
639
|
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
**params.s3_client_kwargs,
|
640
|
+
_upload_audit_data(
|
641
|
+
params,
|
642
|
+
mutable_compaction_audit,
|
603
643
|
)
|
604
644
|
deltas: List[Delta] = [m.delta for m in mat_results]
|
605
645
|
# Note: An appropriate last stream position must be set
|
@@ -634,21 +674,20 @@ def _update_and_upload_compaction_audit(
|
|
634
674
|
+ round_completion_info.compacted_pyarrow_write_result.records
|
635
675
|
)
|
636
676
|
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
**params.s3_client_kwargs,
|
677
|
+
_upload_audit_data(
|
678
|
+
params,
|
679
|
+
mutable_compaction_audit,
|
641
680
|
)
|
642
681
|
return
|
643
682
|
|
644
683
|
|
645
|
-
def
|
684
|
+
def _create_round_completion_info(
|
646
685
|
params: CompactPartitionParams,
|
647
686
|
mutable_compaction_audit: CompactionSessionAuditInfo,
|
648
687
|
compacted_partition: Partition,
|
649
688
|
audit_url: str,
|
650
689
|
hb_id_to_entry_indices_range: dict,
|
651
|
-
|
690
|
+
rci_source_partition_locator: PartitionLocator,
|
652
691
|
new_compacted_delta_locator: DeltaLocator,
|
653
692
|
pyarrow_write_result: PyArrowWriteResult,
|
654
693
|
prev_round_completion_info: Optional[RoundCompletionInfo] = None,
|
@@ -690,6 +729,27 @@ def _write_new_round_completion_file(
|
|
690
729
|
prev_round_completion_info,
|
691
730
|
)
|
692
731
|
|
732
|
+
# Check if this is an in-place compaction before creating RoundCompletionInfo
|
733
|
+
logger.info(
|
734
|
+
f"Checking if partition {rci_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
|
735
|
+
)
|
736
|
+
is_inplace_compacted: bool = _is_inplace_compacted(
|
737
|
+
rci_source_partition_locator, params.destination_partition_locator
|
738
|
+
)
|
739
|
+
|
740
|
+
# Determine the prev_source_partition_locator based on compaction type
|
741
|
+
if is_inplace_compacted:
|
742
|
+
logger.info(
|
743
|
+
"In-place compaction detected. Using compacted partition locator as prev_source_partition_locator. "
|
744
|
+
+ f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
|
745
|
+
f"and rci source partition_id of {rci_source_partition_locator.partition_id}."
|
746
|
+
)
|
747
|
+
prev_source_partition_locator = compacted_partition.locator
|
748
|
+
# Update rci_source_partition_locator for backward compatibility
|
749
|
+
rci_source_partition_locator = compacted_partition.locator
|
750
|
+
else:
|
751
|
+
prev_source_partition_locator = rci_source_partition_locator
|
752
|
+
|
693
753
|
new_round_completion_info = RoundCompletionInfo.of(
|
694
754
|
high_watermark=params.last_stream_position_to_compact,
|
695
755
|
compacted_delta_locator=new_compacted_delta_locator,
|
@@ -702,41 +762,17 @@ def _write_new_round_completion_file(
|
|
702
762
|
compactor_version=CompactorVersion.V2.value,
|
703
763
|
input_inflation=input_inflation,
|
704
764
|
input_average_record_size_bytes=input_average_record_size_bytes,
|
765
|
+
prev_source_partition_locator=prev_source_partition_locator,
|
705
766
|
)
|
706
767
|
|
707
768
|
logger.info(
|
708
769
|
f"Partition-{params.source_partition_locator.partition_values},"
|
709
770
|
f"compacted at: {params.last_stream_position_to_compact},"
|
710
771
|
)
|
711
|
-
logger.info(
|
712
|
-
f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
|
713
|
-
)
|
714
|
-
is_inplace_compacted: bool = (
|
715
|
-
rcf_source_partition_locator.partition_values
|
716
|
-
== params.destination_partition_locator.partition_values
|
717
|
-
and rcf_source_partition_locator.stream_id
|
718
|
-
== params.destination_partition_locator.stream_id
|
719
|
-
)
|
720
|
-
if is_inplace_compacted:
|
721
|
-
logger.info(
|
722
|
-
"Overriding round completion file source partition locator as in-place compacted. "
|
723
|
-
+ f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
|
724
|
-
f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
|
725
|
-
)
|
726
|
-
rcf_source_partition_locator = compacted_partition.locator
|
727
|
-
|
728
|
-
round_completion_file_s3_url = rcf.write_round_completion_file(
|
729
|
-
params.compaction_artifact_s3_bucket,
|
730
|
-
rcf_source_partition_locator,
|
731
|
-
compacted_partition.locator,
|
732
|
-
new_round_completion_info,
|
733
|
-
**params.s3_client_kwargs,
|
734
|
-
)
|
735
772
|
|
736
773
|
return ExecutionCompactionResult(
|
737
774
|
compacted_partition,
|
738
775
|
new_round_completion_info,
|
739
|
-
round_completion_file_s3_url,
|
740
776
|
is_inplace_compacted,
|
741
777
|
)
|
742
778
|
|
@@ -752,21 +788,29 @@ def _commit_compaction_result(
|
|
752
788
|
f"Partition-{params.source_partition_locator} -> "
|
753
789
|
f"{compaction_session_type} Compaction session data processing completed"
|
754
790
|
)
|
791
|
+
# TODO(pdames): Uncomment this once we support concurrent writes to the same
|
792
|
+
# partition (via write_to_table). This requires updating the commit_partition
|
793
|
+
# method to support previous partition as input. Right now, a concurrent write
|
794
|
+
# to the same partition will cause the commit_partition method to fail.
|
755
795
|
if execute_compaction_result.new_compacted_partition:
|
756
796
|
previous_partition: Optional[Partition] = None
|
757
|
-
if execute_compaction_result.is_inplace_compacted:
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
797
|
+
# if execute_compaction_result.is_inplace_compacted:
|
798
|
+
# previous_partition: Optional[
|
799
|
+
# Partition
|
800
|
+
# ] = params.deltacat_storage.get_partition(
|
801
|
+
# params.source_partition_locator.stream_locator,
|
802
|
+
# params.source_partition_locator.partition_values,
|
803
|
+
# **params.deltacat_storage_kwargs,
|
804
|
+
# )
|
805
|
+
# # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
|
766
806
|
logger.info(
|
767
807
|
f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
|
768
808
|
f"using previous partition: {previous_partition.locator if previous_partition else None}"
|
769
809
|
)
|
810
|
+
# Set the round completion info on the partition before committing
|
811
|
+
execute_compaction_result.new_compacted_partition.compaction_round_completion_info = (
|
812
|
+
execute_compaction_result.new_round_completion_info
|
813
|
+
)
|
770
814
|
committed_partition: Partition = params.deltacat_storage.commit_partition(
|
771
815
|
execute_compaction_result.new_compacted_partition,
|
772
816
|
previous_partition,
|
@@ -777,3 +821,57 @@ def _commit_compaction_result(
|
|
777
821
|
logger.warning("No new partition was committed during compaction.")
|
778
822
|
|
779
823
|
logger.info(f"Completed compaction session for: {params.source_partition_locator}")
|
824
|
+
|
825
|
+
|
826
|
+
def _upload_audit_data(
|
827
|
+
params: CompactPartitionParams,
|
828
|
+
audit_info: CompactionSessionAuditInfo,
|
829
|
+
) -> None:
|
830
|
+
"""
|
831
|
+
Upload audit data to the specified URL using the filesystem from catalog properties.
|
832
|
+
"""
|
833
|
+
audit_url = audit_info.audit_url
|
834
|
+
audit_data = json.dumps(audit_info.to_serializable(params.catalog.root))
|
835
|
+
if params.catalog and params.catalog.filesystem:
|
836
|
+
# Use the filesystem from catalog properties
|
837
|
+
filesystem = params.catalog.filesystem
|
838
|
+
parsed_url = urlparse(audit_url)
|
839
|
+
# For filesystem paths, use the path component
|
840
|
+
path = parsed_url.path if parsed_url.scheme else audit_url
|
841
|
+
|
842
|
+
# Ensure parent directories exist
|
843
|
+
import os
|
844
|
+
|
845
|
+
parent_dir = os.path.dirname(path)
|
846
|
+
if (
|
847
|
+
parent_dir
|
848
|
+
and not filesystem.get_file_info(parent_dir).type
|
849
|
+
== pyarrow.fs.FileType.Directory
|
850
|
+
):
|
851
|
+
try:
|
852
|
+
filesystem.create_dir(parent_dir, recursive=True)
|
853
|
+
except Exception as e:
|
854
|
+
logger.warning(f"Failed to create directory {parent_dir}: {e}")
|
855
|
+
|
856
|
+
with filesystem.open_output_stream(path) as output_stream:
|
857
|
+
output_stream.write(audit_data.encode("utf-8"))
|
858
|
+
else:
|
859
|
+
# Fallback: resolve filesystem from the URL
|
860
|
+
path, filesystem = resolve_path_and_filesystem(audit_url)
|
861
|
+
|
862
|
+
# Ensure parent directories exist
|
863
|
+
import os
|
864
|
+
|
865
|
+
parent_dir = os.path.dirname(path)
|
866
|
+
if (
|
867
|
+
parent_dir
|
868
|
+
and not filesystem.get_file_info(parent_dir).type
|
869
|
+
== pyarrow.fs.FileType.Directory
|
870
|
+
):
|
871
|
+
try:
|
872
|
+
filesystem.create_dir(parent_dir, recursive=True)
|
873
|
+
except Exception as e:
|
874
|
+
logger.warning(f"Failed to create directory {parent_dir}: {e}")
|
875
|
+
|
876
|
+
with filesystem.open_output_stream(path) as output_stream:
|
877
|
+
output_stream.write(audit_data.encode("utf-8"))
|
@@ -18,7 +18,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
|
18
18
|
group_hash_bucket_indices,
|
19
19
|
group_by_pk_hash_bucket,
|
20
20
|
)
|
21
|
-
from deltacat.storage import
|
21
|
+
from deltacat.storage import metastore
|
22
22
|
from deltacat.utils.ray_utils.runtime import (
|
23
23
|
get_current_ray_task_id,
|
24
24
|
get_current_ray_worker_id,
|
@@ -50,8 +50,9 @@ def _group_file_records_by_pk_hash_bucket(
|
|
50
50
|
annotated_delta: DeltaAnnotated,
|
51
51
|
num_hash_buckets: int,
|
52
52
|
primary_keys: List[str],
|
53
|
+
all_column_names: List[str],
|
53
54
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
54
|
-
deltacat_storage=
|
55
|
+
deltacat_storage=metastore,
|
55
56
|
deltacat_storage_kwargs: Optional[dict] = None,
|
56
57
|
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
|
57
58
|
# read input parquet s3 objects into a list of delta file envelopes
|
@@ -61,6 +62,7 @@ def _group_file_records_by_pk_hash_bucket(
|
|
61
62
|
total_size_bytes,
|
62
63
|
) = read_delta_file_envelopes(
|
63
64
|
annotated_delta,
|
65
|
+
all_column_names,
|
64
66
|
read_kwargs_provider,
|
65
67
|
deltacat_storage,
|
66
68
|
deltacat_storage_kwargs,
|
@@ -116,6 +118,7 @@ def _timed_hash_bucket(input: HashBucketInput):
|
|
116
118
|
annotated_delta=input.annotated_delta,
|
117
119
|
num_hash_buckets=input.num_hash_buckets,
|
118
120
|
primary_keys=input.primary_keys,
|
121
|
+
all_column_names=input.all_column_names,
|
119
122
|
read_kwargs_provider=input.read_kwargs_provider,
|
120
123
|
deltacat_storage=input.deltacat_storage,
|
121
124
|
deltacat_storage_kwargs=input.deltacat_storage_kwargs,
|