deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from deltacat.utils.pyarrow import MAX_INT_BYTES
|
|
11
11
|
import deltacat.compute.compactor_v2.utils.merge as merge_utils
|
12
12
|
from uuid import uuid4
|
13
13
|
from deltacat import logs
|
14
|
-
from typing import Callable, Iterator, List, Optional, Tuple
|
14
|
+
from typing import Callable, Iterator, List, Optional, Tuple, Set
|
15
15
|
from deltacat.compute.compactor_v2.model.merge_result import MergeResult
|
16
16
|
from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
|
17
17
|
from deltacat.compute.compactor.model.materialize_result import MaterializeResult
|
@@ -38,10 +38,10 @@ from deltacat.storage import (
|
|
38
38
|
Delta,
|
39
39
|
DeltaLocator,
|
40
40
|
DeltaType,
|
41
|
-
Manifest,
|
42
41
|
Partition,
|
43
|
-
|
42
|
+
metastore,
|
44
43
|
)
|
44
|
+
from deltacat.storage.model.manifest import Manifest
|
45
45
|
from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
|
46
46
|
from deltacat.constants import BYTES_PER_GIBIBYTE
|
47
47
|
from deltacat.compute.compactor_v2.constants import (
|
@@ -94,9 +94,12 @@ def _build_incremental_table(
|
|
94
94
|
# sort by delta file stream position now instead of sorting every row later
|
95
95
|
is_delete = False
|
96
96
|
for df_envelope in df_envelopes:
|
97
|
-
|
98
|
-
|
99
|
-
|
97
|
+
# Allow APPEND, UPSERT, and DELETE delta types
|
98
|
+
assert df_envelope.delta_type in (
|
99
|
+
DeltaType.APPEND,
|
100
|
+
DeltaType.UPSERT,
|
101
|
+
DeltaType.DELETE,
|
102
|
+
), "Only APPEND, UPSERT, and DELETE delta types are supported"
|
100
103
|
if df_envelope.delta_type == DeltaType.DELETE:
|
101
104
|
is_delete = True
|
102
105
|
|
@@ -108,16 +111,35 @@ def _build_incremental_table(
|
|
108
111
|
)
|
109
112
|
|
110
113
|
hb_tables.append(table)
|
111
|
-
result =
|
114
|
+
result = _concat_or_coerce_tables(hb_tables)
|
112
115
|
return result
|
113
116
|
|
114
117
|
|
118
|
+
def _concat_or_coerce_tables(all_tables: List[pa.Table]) -> pa.Table:
    """
    Concatenate tables, retrying with permissive schema promotion on failure.

    A plain ``pa.concat_tables`` call is attempted first. If it raises
    ``pa.ArrowInvalid``, the concatenation is retried once with
    ``promote_options="permissive"``.

    Args:
        all_tables: Tables to concatenate into one.

    Returns:
        The single table produced by ``pa.concat_tables``.

    Raises:
        RuntimeError: If ``all_tables`` is empty.
        Exception: Any error raised by the permissive retry propagates
            unchanged; the ``pa.ArrowInvalid`` from the first attempt stays
            attached to it as its implicit ``__context__``.
    """
    # Empty table list - should not happen, but fail with a clear message
    # rather than relying on how PyArrow reacts to an empty input.
    if not all_tables:
        raise RuntimeError("Expected at least one table to merge, but found none.")

    try:
        return pa.concat_tables(all_tables)
    except pa.ArrowInvalid:
        # Fallback path: schema evolution needed - try PyArrow's built-in
        # unification. A failure here is deliberately not caught: it is the
        # more informative error for the caller.
        # NOTE(review): `unify_schemas` is not a documented `concat_tables`
        # parameter - confirm it has any effect before relying on it.
        return pa.concat_tables(
            all_tables, promote_options="permissive", unify_schemas=True
        )
|
134
|
+
|
135
|
+
|
115
136
|
def _merge_tables(
|
116
137
|
table: pa.Table,
|
117
138
|
primary_keys: List[str],
|
118
139
|
can_drop_duplicates: bool,
|
119
140
|
hb_index: int,
|
120
141
|
num_buckets: int,
|
142
|
+
original_fields: Set[str],
|
121
143
|
compacted_table: Optional[pa.Table] = None,
|
122
144
|
) -> pa.Table:
|
123
145
|
"""
|
@@ -159,7 +181,7 @@ def _merge_tables(
|
|
159
181
|
all_tables[incremental_idx], DeltaType.DELETE
|
160
182
|
)
|
161
183
|
# we need not drop duplicates
|
162
|
-
return
|
184
|
+
return _concat_or_coerce_tables(all_tables)
|
163
185
|
|
164
186
|
all_tables = generate_pk_hash_column(all_tables, primary_keys=primary_keys)
|
165
187
|
|
@@ -169,6 +191,12 @@ def _merge_tables(
|
|
169
191
|
all_tables[incremental_idx], on=sc._PK_HASH_STRING_COLUMN_NAME
|
170
192
|
)
|
171
193
|
|
194
|
+
# Always drop DELETE rows from incremental table
|
195
|
+
incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
|
196
|
+
|
197
|
+
# Default to using incremental records as-is, override only if merging is needed
|
198
|
+
incremental_data = incremental_table
|
199
|
+
|
172
200
|
if compacted_table:
|
173
201
|
compacted_table = all_tables[0]
|
174
202
|
|
@@ -194,34 +222,100 @@ def _merge_tables(
|
|
194
222
|
incremental_pk_hash_str, pa.large_string()
|
195
223
|
)
|
196
224
|
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
incremental_pk_hash_str,
|
201
|
-
)
|
225
|
+
records_to_update = pc.is_in(
|
226
|
+
compacted_pk_hash_str,
|
227
|
+
incremental_pk_hash_str,
|
202
228
|
)
|
203
229
|
|
230
|
+
records_to_keep = pc.invert(records_to_update)
|
231
|
+
|
232
|
+
# Keep records that don't have updates
|
204
233
|
result_table_list.append(compacted_table.filter(records_to_keep))
|
205
234
|
|
206
|
-
|
207
|
-
|
235
|
+
# Override default if merging is needed
|
236
|
+
if pc.sum(records_to_update).as_py() > 0: # There are records to update
|
237
|
+
old_records_to_update = compacted_table.filter(records_to_update)
|
238
|
+
# Perform partial UPSERT: merge old and new records field by field
|
239
|
+
incremental_data = _merge_records_partially(
|
240
|
+
old_records=old_records_to_update,
|
241
|
+
new_records=incremental_table,
|
242
|
+
original_fields=original_fields,
|
243
|
+
)
|
244
|
+
|
245
|
+
# Add the determined incremental data
|
246
|
+
result_table_list.append(incremental_data)
|
208
247
|
|
209
|
-
final_table =
|
248
|
+
final_table = _concat_or_coerce_tables(result_table_list)
|
210
249
|
final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])
|
211
250
|
|
212
251
|
return final_table
|
213
252
|
|
214
253
|
|
254
|
+
def _merge_records_partially(
    old_records: pa.Table, new_records: pa.Table, original_fields: Set[str]
) -> pa.Table:
    """
    Merge records field by field for partial UPSERT behavior. Fills missing
    fields in new_records with values from old_records.

    A column is taken from ``old_records`` when either:
      * it exists in ``old_records`` but not in ``new_records``, or
      * it exists in both tables but its name is not in ``original_fields``
        (i.e. it was auto-added to ``new_records`` by schema coercion).

    All other columns, including the PK hash column, are taken from
    ``new_records`` as-is.

    Args:
        old_records: Records from the compacted table that need updates
        new_records: New records with potential partial field updates
        original_fields: Names of the fields that were originally provided
            in the user data, before any schema coercion

    Returns:
        Table with merged records where missing fields preserve old values.
        Columns of ``new_records`` keep their order; columns that only exist
        in ``old_records`` follow, in ``old_records`` column order.
    """
    # The hash column is only used for joining and is never back-filled.
    hash_column = sc._PK_HASH_STRING_COLUMN_NAME
    new_fields = set(new_records.column_names) - {hash_column}

    # Start with every existing column of new_records, in its original order.
    result_columns = {name: new_records[name] for name in new_records.column_names}

    # Walk old_records in column order (not via a set) so the position of
    # back-filled columns in the output is deterministic across runs.
    # NOTE(review): columns are copied whole and positionally; this assumes
    # old_records and new_records hold the same rows in the same order -
    # confirm against the caller.
    for field_name in old_records.column_names:
        if field_name == hash_column:
            continue
        is_missing = field_name not in new_fields
        is_auto_added = field_name not in original_fields
        if is_missing or is_auto_added:
            # Use the old values entirely for this field.
            result_columns[field_name] = old_records[field_name]

    # Fields the user explicitly provided (even as null) remain untouched.
    return pa.table(result_columns)
|
307
|
+
|
308
|
+
|
215
309
|
def _validate_bucketing_spec_compliance(
|
216
310
|
table: pa.Table,
|
217
311
|
num_buckets: int,
|
218
312
|
hb_index: int,
|
219
313
|
primary_keys: List[str],
|
220
|
-
|
314
|
+
rci: Optional[RoundCompletionInfo] = None,
|
221
315
|
log_prefix=None,
|
222
316
|
) -> None:
|
223
|
-
if
|
224
|
-
message_prefix = f"{log_prefix}{
|
317
|
+
if rci is not None:
|
318
|
+
message_prefix = f"{log_prefix}{rci.compacted_delta_locator.namespace}.{rci.compacted_delta_locator.table_name}.{rci.compacted_delta_locator.table_version}.{rci.compacted_delta_locator.partition_id}.{rci.compacted_delta_locator.partition_values}"
|
225
319
|
else:
|
226
320
|
message_prefix = f"{log_prefix}"
|
227
321
|
pki_table = generate_pk_hash_column(
|
@@ -251,14 +345,16 @@ def _validate_bucketing_spec_compliance(
|
|
251
345
|
|
252
346
|
def _download_compacted_table(
|
253
347
|
hb_index: int,
|
254
|
-
|
348
|
+
rci: RoundCompletionInfo,
|
255
349
|
primary_keys: List[str],
|
350
|
+
all_column_names: List[str],
|
351
|
+
compacted_delta_manifest: Optional[Manifest] = None,
|
256
352
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
257
|
-
deltacat_storage=
|
353
|
+
deltacat_storage: metastore = metastore,
|
258
354
|
deltacat_storage_kwargs: Optional[dict] = None,
|
259
355
|
) -> pa.Table:
|
260
356
|
tables = []
|
261
|
-
hb_index_to_indices =
|
357
|
+
hb_index_to_indices = rci.hb_index_to_entry_range
|
262
358
|
|
263
359
|
if str(hb_index) not in hb_index_to_indices:
|
264
360
|
return None
|
@@ -268,9 +364,16 @@ def _download_compacted_table(
|
|
268
364
|
), "indices should not be none and contains exactly two elements"
|
269
365
|
for offset in range(indices[1] - indices[0]):
|
270
366
|
table = deltacat_storage.download_delta_manifest_entry(
|
271
|
-
|
367
|
+
Delta.of(
|
368
|
+
rci.compacted_delta_locator,
|
369
|
+
DeltaType.APPEND,
|
370
|
+
compacted_delta_manifest.meta,
|
371
|
+
None,
|
372
|
+
compacted_delta_manifest,
|
373
|
+
),
|
272
374
|
entry_index=(indices[0] + offset),
|
273
375
|
file_reader_kwargs_provider=read_kwargs_provider,
|
376
|
+
all_column_names=all_column_names,
|
274
377
|
**deltacat_storage_kwargs,
|
275
378
|
)
|
276
379
|
|
@@ -291,10 +394,10 @@ def _download_compacted_table(
|
|
291
394
|
if primary_keys and check_bucketing_spec:
|
292
395
|
_validate_bucketing_spec_compliance(
|
293
396
|
compacted_table,
|
294
|
-
|
397
|
+
rci.hash_bucket_count,
|
295
398
|
hb_index,
|
296
399
|
primary_keys,
|
297
|
-
|
400
|
+
rci=rci,
|
298
401
|
log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
|
299
402
|
)
|
300
403
|
return compacted_table
|
@@ -304,15 +407,9 @@ def _copy_all_manifest_files_from_old_hash_buckets(
|
|
304
407
|
hb_index_copy_by_reference: List[int],
|
305
408
|
round_completion_info: RoundCompletionInfo,
|
306
409
|
write_to_partition: Partition,
|
307
|
-
|
308
|
-
deltacat_storage_kwargs: Optional[dict] = None,
|
410
|
+
compacted_manifest: Optional[Manifest] = None,
|
309
411
|
) -> List[MaterializeResult]:
|
310
412
|
|
311
|
-
compacted_delta_locator = round_completion_info.compacted_delta_locator
|
312
|
-
manifest = deltacat_storage.get_delta_manifest(
|
313
|
-
compacted_delta_locator, **deltacat_storage_kwargs
|
314
|
-
)
|
315
|
-
|
316
413
|
manifest_entry_referenced_list = []
|
317
414
|
materialize_result_list = []
|
318
415
|
hb_index_to_indices = round_completion_info.hb_index_to_entry_range
|
@@ -329,27 +426,27 @@ def _copy_all_manifest_files_from_old_hash_buckets(
|
|
329
426
|
for offset in range(indices[1] - indices[0]):
|
330
427
|
entry_index = indices[0] + offset
|
331
428
|
assert entry_index < len(
|
332
|
-
|
333
|
-
), f"entry index: {entry_index} >= {len(
|
334
|
-
manifest_entry =
|
429
|
+
compacted_manifest.entries
|
430
|
+
), f"entry index: {entry_index} >= {len(compacted_manifest.entries)}"
|
431
|
+
manifest_entry = compacted_manifest.entries[entry_index]
|
335
432
|
manifest_entry_referenced_list.append(manifest_entry)
|
336
433
|
|
337
|
-
|
434
|
+
compacted_manifest = Manifest.of(
|
338
435
|
entries=manifest_entry_referenced_list, uuid=str(uuid4())
|
339
436
|
)
|
340
437
|
delta = Delta.of(
|
341
438
|
locator=DeltaLocator.of(write_to_partition.locator),
|
342
|
-
delta_type=DeltaType.
|
343
|
-
meta=
|
344
|
-
manifest=
|
439
|
+
delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
|
440
|
+
meta=compacted_manifest.meta,
|
441
|
+
manifest=compacted_manifest,
|
345
442
|
previous_stream_position=write_to_partition.stream_position,
|
346
443
|
properties={},
|
347
444
|
)
|
348
445
|
referenced_pyarrow_write_result = PyArrowWriteResult.of(
|
349
446
|
len(manifest_entry_referenced_list),
|
350
|
-
|
351
|
-
|
352
|
-
|
447
|
+
compacted_manifest.meta.source_content_length,
|
448
|
+
compacted_manifest.meta.content_length,
|
449
|
+
compacted_manifest.meta.record_count,
|
353
450
|
)
|
354
451
|
materialize_result = MaterializeResult.of(
|
355
452
|
delta=delta,
|
@@ -374,6 +471,7 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
|
|
374
471
|
"""
|
375
472
|
return (
|
376
473
|
input.round_completion_info
|
474
|
+
and input.compacted_manifest is not None
|
377
475
|
and input.round_completion_info.hb_index_to_entry_range
|
378
476
|
and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
|
379
477
|
is not None
|
@@ -391,6 +489,7 @@ def _can_copy_by_reference(
|
|
391
489
|
not has_delete
|
392
490
|
and not merge_file_group.dfe_groups
|
393
491
|
and input.round_completion_info is not None
|
492
|
+
and input.compacted_manifest is not None
|
394
493
|
)
|
395
494
|
|
396
495
|
if input.disable_copy_by_reference:
|
@@ -489,9 +588,9 @@ def _compact_tables(
|
|
489
588
|
delete_file_envelopes + df_envelopes
|
490
589
|
)
|
491
590
|
assert all(
|
492
|
-
dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
|
591
|
+
dfe.delta_type in (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE)
|
493
592
|
for dfe in reordered_all_dfes
|
494
|
-
), "All reordered delta file envelopes must be of the UPSERT or DELETE"
|
593
|
+
), "All reordered delta file envelopes must be of the APPEND, UPSERT or DELETE"
|
495
594
|
table = compacted_table
|
496
595
|
aggregated_incremental_len = 0
|
497
596
|
aggregated_deduped_records = 0
|
@@ -499,7 +598,7 @@ def _compact_tables(
|
|
499
598
|
for i, (delta_type, delta_type_sequence) in enumerate(
|
500
599
|
_group_sequence_by_delta_type(reordered_all_dfes)
|
501
600
|
):
|
502
|
-
if delta_type is DeltaType.UPSERT:
|
601
|
+
if delta_type is DeltaType.UPSERT or delta_type is DeltaType.APPEND:
|
503
602
|
(table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
|
504
603
|
input=input,
|
505
604
|
dfe_list=delta_type_sequence,
|
@@ -540,8 +639,9 @@ def _apply_upserts(
|
|
540
639
|
prev_table=None,
|
541
640
|
) -> Tuple[pa.Table, int, int, int]:
|
542
641
|
assert all(
|
543
|
-
dfe.delta_type is DeltaType.UPSERT
|
544
|
-
|
642
|
+
dfe.delta_type is DeltaType.UPSERT or dfe.delta_type is DeltaType.APPEND
|
643
|
+
for dfe in dfe_list
|
644
|
+
), "All incoming delta file envelopes must of the DeltaType.UPSERT or DeltaType.APPEND"
|
545
645
|
logger.info(
|
546
646
|
f"[Hash bucket index {hb_idx}] Reading dedupe input for "
|
547
647
|
f"{len(dfe_list)} delta file envelope lists..."
|
@@ -556,16 +656,19 @@ def _apply_upserts(
|
|
556
656
|
# on non event based sort key does not produce consistent
|
557
657
|
# compaction results. E.g., compaction(delta1, delta2, delta3)
|
558
658
|
# will not be equal to compaction(compaction(delta1, delta2), delta3).
|
559
|
-
table = table.sort_by(
|
659
|
+
table = table.sort_by(
|
660
|
+
[pa_key for key in input.sort_keys for pa_key in key.arrow]
|
661
|
+
)
|
560
662
|
hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
|
561
663
|
table, merge_time = timed_invocation(
|
562
664
|
func=_merge_tables,
|
563
665
|
table=table,
|
564
666
|
primary_keys=input.primary_keys,
|
565
667
|
can_drop_duplicates=input.drop_duplicates,
|
566
|
-
compacted_table=prev_table,
|
567
668
|
hb_index=hb_idx,
|
568
669
|
num_buckets=input.hash_bucket_count,
|
670
|
+
original_fields=input.original_fields,
|
671
|
+
compacted_table=prev_table,
|
569
672
|
)
|
570
673
|
deduped_records = hb_table_record_count - len(table)
|
571
674
|
return table, incremental_len, deduped_records, merge_time
|
@@ -582,8 +685,7 @@ def _copy_manifests_from_hash_bucketing(
|
|
582
685
|
hb_index_copy_by_reference_ids,
|
583
686
|
input.round_completion_info,
|
584
687
|
input.write_to_partition,
|
585
|
-
input.
|
586
|
-
input.deltacat_storage_kwargs,
|
688
|
+
input.compacted_manifest,
|
587
689
|
)
|
588
690
|
)
|
589
691
|
logger.info(
|
@@ -623,12 +725,13 @@ def _timed_merge(input: MergeInput) -> MergeResult:
|
|
623
725
|
):
|
624
726
|
hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
|
625
727
|
continue
|
626
|
-
|
627
728
|
if _has_previous_compacted_table(input, merge_file_group.hb_index):
|
628
729
|
compacted_table = _download_compacted_table(
|
629
730
|
hb_index=merge_file_group.hb_index,
|
630
|
-
|
731
|
+
rci=input.round_completion_info,
|
631
732
|
primary_keys=input.primary_keys,
|
733
|
+
all_column_names=input.all_column_names,
|
734
|
+
compacted_delta_manifest=input.compacted_manifest,
|
632
735
|
read_kwargs_provider=input.read_kwargs_provider,
|
633
736
|
deltacat_storage=input.deltacat_storage,
|
634
737
|
deltacat_storage_kwargs=input.deltacat_storage_kwargs,
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
import ray
|
3
3
|
import functools
|
4
|
+
from typing import List
|
4
5
|
from deltacat.compute.compactor_v2.constants import (
|
5
6
|
TASK_MAX_PARALLELISM,
|
6
7
|
MAX_PARQUET_METADATA_SIZE,
|
@@ -11,10 +12,10 @@ from deltacat import logs
|
|
11
12
|
from deltacat.storage import (
|
12
13
|
Delta,
|
13
14
|
ManifestEntry,
|
14
|
-
|
15
|
+
metastore,
|
15
16
|
)
|
16
17
|
from typing import Dict, Optional, Any
|
17
|
-
from deltacat.types.media import
|
18
|
+
from deltacat.types.media import DatasetType
|
18
19
|
from deltacat.types.media import ContentType
|
19
20
|
from deltacat.types.partial_download import PartialParquetParameters
|
20
21
|
from deltacat.exceptions import RetryableError
|
@@ -74,7 +75,8 @@ class AppendContentTypeParamsCache:
|
|
74
75
|
def _download_parquet_metadata_for_manifest_entry(
|
75
76
|
delta: Delta,
|
76
77
|
entry_index: int,
|
77
|
-
|
78
|
+
all_column_names: List[str],
|
79
|
+
deltacat_storage: metastore,
|
78
80
|
deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
|
79
81
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
80
82
|
) -> Dict[str, Any]:
|
@@ -86,11 +88,13 @@ def _download_parquet_metadata_for_manifest_entry(
|
|
86
88
|
"'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
|
87
89
|
)
|
88
90
|
deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
|
91
|
+
|
89
92
|
pq_file = deltacat_storage.download_delta_manifest_entry(
|
90
93
|
delta,
|
91
94
|
entry_index=entry_index,
|
92
|
-
table_type=
|
95
|
+
table_type=DatasetType.PYARROW_PARQUET,
|
93
96
|
file_reader_kwargs_provider=file_reader_kwargs_provider,
|
97
|
+
all_column_names=all_column_names,
|
94
98
|
**deltacat_storage_kwargs,
|
95
99
|
)
|
96
100
|
|
@@ -104,9 +108,10 @@ def _download_parquet_metadata_for_manifest_entry(
|
|
104
108
|
|
105
109
|
def append_content_type_params(
|
106
110
|
delta: Delta,
|
111
|
+
all_column_names: List[str],
|
107
112
|
task_max_parallelism: int = TASK_MAX_PARALLELISM,
|
108
113
|
max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
|
109
|
-
deltacat_storage=
|
114
|
+
deltacat_storage: metastore = metastore,
|
110
115
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
111
116
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
112
117
|
) -> bool:
|
@@ -172,13 +177,19 @@ def append_content_type_params(
|
|
172
177
|
max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
|
173
178
|
)
|
174
179
|
|
180
|
+
# create a copy of deltacat_storage_kwargs without transaction key
|
181
|
+
deltacat_storage_kwargs_copy = {
|
182
|
+
k: v for k, v in deltacat_storage_kwargs.items() if k != "transaction"
|
183
|
+
}
|
184
|
+
|
175
185
|
def input_provider(index, item) -> Dict:
|
176
186
|
return {
|
177
187
|
"file_reader_kwargs_provider": file_reader_kwargs_provider,
|
178
|
-
"deltacat_storage_kwargs":
|
188
|
+
"deltacat_storage_kwargs": deltacat_storage_kwargs_copy,
|
179
189
|
"deltacat_storage": deltacat_storage,
|
180
190
|
"delta": delta,
|
181
191
|
"entry_index": item,
|
192
|
+
"all_column_names": all_column_names,
|
182
193
|
}
|
183
194
|
|
184
195
|
logger.info(
|
@@ -9,7 +9,7 @@ from deltacat.storage import (
|
|
9
9
|
Delta,
|
10
10
|
)
|
11
11
|
from deltacat.storage.model.delta import DeltaType
|
12
|
-
from deltacat.storage import
|
12
|
+
from deltacat.storage import metastore
|
13
13
|
from deltacat.types.media import StorageType
|
14
14
|
from deltacat.utils.common import ReadKwargsProvider
|
15
15
|
from deltacat import logs
|
@@ -30,8 +30,9 @@ def contains_delete_deltas(deltas: List[Delta]) -> bool:
|
|
30
30
|
|
31
31
|
def read_delta_file_envelopes(
|
32
32
|
annotated_delta: DeltaAnnotated,
|
33
|
+
all_column_names: List[str],
|
33
34
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
34
|
-
deltacat_storage=
|
35
|
+
deltacat_storage: metastore = metastore,
|
35
36
|
deltacat_storage_kwargs: Optional[dict] = None,
|
36
37
|
) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
|
37
38
|
tables = deltacat_storage.download_delta(
|
@@ -39,6 +40,7 @@ def read_delta_file_envelopes(
|
|
39
40
|
max_parallelism=1,
|
40
41
|
file_reader_kwargs_provider=read_kwargs_provider,
|
41
42
|
storage_type=StorageType.LOCAL,
|
43
|
+
all_column_names=all_column_names,
|
42
44
|
**deltacat_storage_kwargs,
|
43
45
|
)
|
44
46
|
annotations = annotated_delta.annotations
|
@@ -80,7 +82,7 @@ def read_delta_file_envelopes(
|
|
80
82
|
def get_local_delta_file_envelopes(
|
81
83
|
uniform_deltas: List[DeltaAnnotated],
|
82
84
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
83
|
-
deltacat_storage=
|
85
|
+
deltacat_storage=metastore,
|
84
86
|
deltacat_storage_kwargs: Optional[dict] = None,
|
85
87
|
) -> Tuple[List[DeltaFileEnvelope], int]:
|
86
88
|
local_dfe_list = []
|
@@ -3,7 +3,7 @@ import functools
|
|
3
3
|
from deltacat.storage import (
|
4
4
|
PartitionLocator,
|
5
5
|
Delta,
|
6
|
-
|
6
|
+
metastore,
|
7
7
|
)
|
8
8
|
from deltacat import logs
|
9
9
|
from deltacat.compute.compactor.utils import io as io_v1
|
@@ -38,7 +38,7 @@ def discover_deltas(
|
|
38
38
|
rebase_source_partition_locator: Optional[PartitionLocator] = None,
|
39
39
|
rebase_source_partition_high_watermark: Optional[int] = None,
|
40
40
|
rcf_high_watermark: Optional[int] = None,
|
41
|
-
deltacat_storage=
|
41
|
+
deltacat_storage=metastore,
|
42
42
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
43
43
|
list_deltas_kwargs: Optional[Dict[str, Any]] = {},
|
44
44
|
) -> List[Delta]:
|
@@ -67,6 +67,11 @@ def discover_deltas(
|
|
67
67
|
f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
|
68
68
|
f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
|
69
69
|
)
|
70
|
+
logger.info(f"DEBUG: source_partition_locator = {source_partition_locator}")
|
71
|
+
logger.info(
|
72
|
+
f"DEBUG: source_partition_locator.partition_id = {getattr(source_partition_locator, 'partition_id', 'NO_PARTITION_ID')}"
|
73
|
+
)
|
74
|
+
logger.info(f"DEBUG: total input deltas found = {len(result)}")
|
70
75
|
|
71
76
|
if rebase_source_partition_locator:
|
72
77
|
previous_compacted_deltas = io_v1._discover_deltas(
|
@@ -93,7 +98,8 @@ def create_uniform_input_deltas(
|
|
93
98
|
hash_bucket_count: int,
|
94
99
|
compaction_audit: CompactionSessionAuditInfo,
|
95
100
|
compact_partition_params: CompactPartitionParams,
|
96
|
-
|
101
|
+
all_column_names: List[str],
|
102
|
+
deltacat_storage=metastore,
|
97
103
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
|
98
104
|
) -> List[DeltaAnnotated]:
|
99
105
|
|
@@ -113,6 +119,7 @@ def create_uniform_input_deltas(
|
|
113
119
|
)
|
114
120
|
append_content_type_params(
|
115
121
|
delta=delta,
|
122
|
+
all_column_names=all_column_names,
|
116
123
|
deltacat_storage=deltacat_storage,
|
117
124
|
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
118
125
|
task_max_parallelism=compact_partition_params.task_max_parallelism,
|
@@ -23,6 +23,7 @@ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
|
|
23
23
|
|
24
24
|
from deltacat.utils.performance import timed_invocation
|
25
25
|
from deltacat.storage import (
|
26
|
+
DeltaType,
|
26
27
|
Partition,
|
27
28
|
)
|
28
29
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
@@ -47,13 +48,21 @@ def materialize(
|
|
47
48
|
# TODO (pdames): compare performance to pandas-native materialize path
|
48
49
|
df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
|
49
50
|
compacted_table = df
|
51
|
+
# Extract schema from table_writer_kwargs to pass as direct parameter
|
52
|
+
# This ensures schema_id is properly set in the manifest
|
53
|
+
schema = None
|
54
|
+
if input.table_writer_kwargs and "schema" in input.table_writer_kwargs:
|
55
|
+
schema = input.table_writer_kwargs["schema"]
|
56
|
+
|
50
57
|
delta, stage_delta_time = timed_invocation(
|
51
58
|
input.deltacat_storage.stage_delta,
|
52
59
|
compacted_table,
|
53
60
|
input.write_to_partition,
|
61
|
+
delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
|
54
62
|
max_records_per_entry=input.max_records_per_output_file,
|
55
63
|
content_type=input.compacted_file_content_type,
|
56
|
-
|
64
|
+
schema=schema, # Pass schema as direct parameter for schema_id extraction
|
65
|
+
table_writer_kwargs=input.table_writer_kwargs,
|
57
66
|
**input.deltacat_storage_kwargs,
|
58
67
|
)
|
59
68
|
compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
|
@@ -112,6 +121,7 @@ def generate_local_merge_input(
|
|
112
121
|
return MergeInput.of(
|
113
122
|
merge_file_groups_provider=LocalMergeFileGroupsProvider(
|
114
123
|
annotated_deltas,
|
124
|
+
all_column_names=params.all_column_names,
|
115
125
|
read_kwargs_provider=params.read_kwargs_provider,
|
116
126
|
deltacat_storage=params.deltacat_storage,
|
117
127
|
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
|
@@ -119,12 +129,13 @@ def generate_local_merge_input(
|
|
119
129
|
write_to_partition=compacted_partition,
|
120
130
|
compacted_file_content_type=params.compacted_file_content_type,
|
121
131
|
primary_keys=params.primary_keys,
|
132
|
+
all_column_names=params.all_column_names,
|
122
133
|
sort_keys=params.sort_keys,
|
123
134
|
drop_duplicates=params.drop_duplicates,
|
124
135
|
max_records_per_output_file=params.records_per_compacted_file,
|
125
136
|
enable_profiler=params.enable_profiler,
|
126
137
|
metrics_config=params.metrics_config,
|
127
|
-
|
138
|
+
table_writer_kwargs=params.table_writer_kwargs,
|
128
139
|
read_kwargs_provider=params.read_kwargs_provider,
|
129
140
|
round_completion_info=round_completion_info,
|
130
141
|
object_store=params.object_store,
|
@@ -134,4 +145,5 @@ def generate_local_merge_input(
|
|
134
145
|
delete_file_envelopes=delete_file_envelopes,
|
135
146
|
disable_copy_by_reference=params.disable_copy_by_reference,
|
136
147
|
hash_bucket_count=params.hash_bucket_count,
|
148
|
+
original_fields=params.original_fields,
|
137
149
|
)
|
@@ -11,7 +11,7 @@ from deltacat.compute.compactor_v2.model.merge_file_group import (
|
|
11
11
|
from deltacat.storage import (
|
12
12
|
Manifest,
|
13
13
|
ManifestEntry,
|
14
|
-
|
14
|
+
metastore,
|
15
15
|
)
|
16
16
|
from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
|
17
17
|
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
|
@@ -77,8 +77,6 @@ def _get_merge_task_options(
|
|
77
77
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
78
78
|
compacted_delta_manifest: Optional[Manifest] = None,
|
79
79
|
primary_keys: Optional[List[str]] = None,
|
80
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
81
|
-
deltacat_storage_kwargs: Optional[Dict] = {},
|
82
80
|
memory_logs_enabled: Optional[bool] = None,
|
83
81
|
) -> Dict[str, Any]:
|
84
82
|
if (
|
@@ -275,8 +273,6 @@ def merge_resource_options_provider(
|
|
275
273
|
compacted_delta_manifest: Optional[Manifest] = None,
|
276
274
|
ray_custom_resources: Optional[Dict] = None,
|
277
275
|
primary_keys: Optional[List[str]] = None,
|
278
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
279
|
-
deltacat_storage_kwargs: Optional[Dict] = {},
|
280
276
|
memory_logs_enabled: Optional[bool] = None,
|
281
277
|
**kwargs,
|
282
278
|
) -> Dict:
|
@@ -306,8 +302,6 @@ def merge_resource_options_provider(
|
|
306
302
|
round_completion_info=round_completion_info,
|
307
303
|
compacted_delta_manifest=compacted_delta_manifest,
|
308
304
|
primary_keys=primary_keys,
|
309
|
-
deltacat_storage=deltacat_storage,
|
310
|
-
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
311
305
|
memory_logs_enabled=memory_logs_enabled,
|
312
306
|
estimate_resources_params=estimate_resources_params,
|
313
307
|
)
|
@@ -322,7 +316,7 @@ def local_merge_resource_options_provider(
|
|
322
316
|
compacted_delta_manifest: Optional[Manifest] = None,
|
323
317
|
ray_custom_resources: Optional[Dict] = None,
|
324
318
|
primary_keys: Optional[List[str]] = None,
|
325
|
-
deltacat_storage=
|
319
|
+
deltacat_storage=metastore,
|
326
320
|
deltacat_storage_kwargs: Optional[Dict] = {},
|
327
321
|
memory_logs_enabled: Optional[bool] = None,
|
328
322
|
**kwargs,
|
@@ -348,8 +342,6 @@ def local_merge_resource_options_provider(
|
|
348
342
|
round_completion_info=round_completion_info,
|
349
343
|
compacted_delta_manifest=compacted_delta_manifest,
|
350
344
|
primary_keys=primary_keys,
|
351
|
-
deltacat_storage=deltacat_storage,
|
352
|
-
deltacat_storage_kwargs=deltacat_storage_kwargs,
|
353
345
|
memory_logs_enabled=memory_logs_enabled,
|
354
346
|
estimate_resources_params=estimate_resources_params,
|
355
347
|
)
|