deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py
@@ -0,0 +1,299 @@
+from typing import List, Dict
+from collections import defaultdict
+import uuid
+from pyiceberg.table import Table
+from pyiceberg.table.metadata import TableMetadata
+from pyiceberg.table.snapshots import (
+    Operation,
+)
+from pyiceberg.manifest import (
+    DataFile,
+    DataFileContent,
+    ManifestContent,
+    ManifestEntry,
+    ManifestEntryStatus,
+    ManifestFile,
+    write_manifest,
+)
+import itertools
+from pyiceberg.utils.concurrent import ExecutorFactory
+from pyiceberg.table.update.snapshot import _SnapshotProducer, UpdateSnapshot
+
+
+def replace_delete_files_override(
+    update_snapshot: UpdateSnapshot,
+) -> "_ReplaceDeleteFilesOverride":
+    commit_uuid = uuid.uuid4()
+    return _ReplaceDeleteFilesOverride(
+        commit_uuid=commit_uuid,
+        operation=Operation.OVERWRITE,
+        transaction=update_snapshot._transaction,
+        io=update_snapshot._io,
+        snapshot_properties=update_snapshot._snapshot_properties,
+    )
+
+
+class _ReplaceDeleteFilesOverride(_SnapshotProducer):
+    def _manifests(self) -> List[ManifestFile]:
+        def _write_added_manifest() -> List[ManifestFile]:
+            if self._added_data_files:
+                with write_manifest(
+                    format_version=self._transaction.table_metadata.format_version,
+                    spec=self._transaction.table_metadata.spec(),
+                    schema=self._transaction.table_metadata.schema(),
+                    output_file=self.new_manifest_output(),
+                    snapshot_id=self._snapshot_id,
+                ) as writer:
+                    for data_file in self._added_data_files:
+                        writer.add(
+                            ManifestEntry(
+                                status=ManifestEntryStatus.ADDED,
+                                snapshot_id=self._snapshot_id,
+                                sequence_number=None,
+                                file_sequence_number=None,
+                                data_file=data_file,
+                            )
+                        )
+                    writer.content = self.writer_content
+                return [writer.to_manifest_file()]
+            else:
+                return []
+
+        def _write_delete_manifest() -> List[ManifestFile]:
+            # Check if we need to mark the files as deleted
+            deleted_entries = self._deleted_entries()
+            if len(deleted_entries) > 0:
+                deleted_manifests = []
+                partition_groups: Dict[int, List[ManifestEntry]] = defaultdict(list)
+                for deleted_entry in deleted_entries:
+                    partition_groups[deleted_entry.data_file.spec_id].append(
+                        deleted_entry
+                    )
+                for spec_id, entries in partition_groups.items():
+                    with write_manifest(
+                        format_version=self._transaction.table_metadata.format_version,
+                        spec=self._transaction.table_metadata.specs()[spec_id],
+                        schema=self._transaction.table_metadata.schema(),
+                        output_file=self.new_manifest_output(),
+                        snapshot_id=self._snapshot_id,
+                    ) as writer:
+                        for entry in entries:
+                            writer.add_entry(entry)
+                    deleted_manifests.append(writer.to_manifest_file())
+                return deleted_manifests
+            else:
+                return []
+
+        executor = ExecutorFactory.get_or_create()
+
+        added_manifests = executor.submit(_write_added_manifest)
+        existing_manifests = executor.submit(self._existing_manifests)
+        delete_manifests = executor.submit(_write_delete_manifest)
+        return self._process_manifests(
+            added_manifests.result()
+            + existing_manifests.result()
+            + delete_manifests.result()
+        )
+
+    def writer_content(self) -> ManifestContent:
+        return ManifestContent.DELETES
+
+    def _existing_manifests(self) -> List[ManifestFile]:
+        """To determine if there are any existing manifest files.
+
+        A fast append will add another ManifestFile to the ManifestList.
+        All the existing manifest files are considered existing.
+        """
+        existing_manifests = []
+
+        if self._parent_snapshot_id is not None:
+            previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                self._parent_snapshot_id
+            )
+
+            if previous_snapshot is None:
+                raise ValueError(
+                    f"Snapshot could not be found: {self._parent_snapshot_id}"
+                )
+
+            for manifest in previous_snapshot.manifests(io=self._io):
+                if (
+                    manifest.has_added_files()
+                    or manifest.has_existing_files()
+                    or manifest.added_snapshot_id == self._snapshot_id
+                ):
+                    existing_manifests.append(manifest)
+
+        return existing_manifests
+
+    def _deleted_entries(self) -> List[ManifestEntry]:
+        if self._parent_snapshot_id is not None:
+            previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                self._parent_snapshot_id
+            )
+            if previous_snapshot is None:
+                # This should never happen since you cannot overwrite an empty table
+                raise ValueError(
+                    f"Could not find the previous snapshot: {self._parent_snapshot_id}"
+                )
+
+            executor = ExecutorFactory.get_or_create()
+
+            def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]:
+                return [
+                    ManifestEntry(
+                        status=ManifestEntryStatus.DELETED,
+                        snapshot_id=entry.snapshot_id,
+                        sequence_number=entry.sequence_number,
+                        file_sequence_number=entry.file_sequence_number,
+                        data_file=entry.data_file,
+                    )
+                    for entry in manifest.fetch_manifest_entry(
+                        self._io, discard_deleted=True
+                    )
+                    if entry.data_file.content == DataFileContent.EQUALITY_DELETES
+                    and entry.data_file in self._deleted_data_files
+                ]
+
+            list_of_entries = executor.map(
+                _get_entries, previous_snapshot.manifests(self._io)
+            )
+            return list(itertools.chain(*list_of_entries))
+        else:
+            return []
+
+
+def commit_append_snapshot(
+    iceberg_table: Table, new_position_delete_files: List[DataFile]
+) -> TableMetadata:
+    tx = iceberg_table.transaction()
+    try:
+        if iceberg_table.metadata.name_mapping() is None:
+            tx.set_properties(
+                **{
+                    "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
+                }
+            )
+        with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
+            if new_position_delete_files:
+                for data_file in new_position_delete_files:
+                    append_snapshot.append_data_file(data_file)
+    except Exception as e:
+        raise e
+    else:
+        return tx.commit_transaction().metadata
+
+
+def append_delete_files_override(
+    update_snapshot: UpdateSnapshot,
+) -> "_AppendDeleteFilesOverride":
+    commit_uuid = uuid.uuid4()
+    return _AppendDeleteFilesOverride(
+        commit_uuid=commit_uuid,
+        operation=Operation.APPEND,
+        transaction=update_snapshot._transaction,
+        io=update_snapshot._io,
+        snapshot_properties=update_snapshot._snapshot_properties,
+    )
+
+
+class _AppendDeleteFilesOverride(_SnapshotProducer):
+    def _manifests(self) -> List[ManifestFile]:
+        def _write_added_manifest() -> List[ManifestFile]:
+            if self._added_data_files:
+                with write_manifest(
+                    format_version=self._transaction.table_metadata.format_version,
+                    spec=self._transaction.table_metadata.spec(),
+                    schema=self._transaction.table_metadata.schema(),
+                    output_file=self.new_manifest_output(),
+                    snapshot_id=self._snapshot_id,
+                ) as writer:
+                    for data_file in self._added_data_files:
+                        writer.add(
+                            ManifestEntry(
+                                status=ManifestEntryStatus.ADDED,
+                                snapshot_id=self._snapshot_id,
+                                sequence_number=None,
+                                file_sequence_number=None,
+                                data_file=data_file,
+                            )
+                        )
+                    writer.content = self.writer_content
+                return [writer.to_manifest_file()]
+            else:
+                return []
+
+        executor = ExecutorFactory.get_or_create()
+
+        added_manifests = executor.submit(_write_added_manifest)
+        existing_manifests = executor.submit(self._existing_manifests)
+
+        return self._process_manifests(
+            added_manifests.result() + existing_manifests.result()
+        )
+
+    def writer_content(self) -> ManifestContent:
+        return ManifestContent.DELETES
+
+    def _existing_manifests(self) -> List[ManifestFile]:
+        """To determine if there are any existing manifest files.
+
+        A fast append will add another ManifestFile to the ManifestList.
+        All the existing manifest files are considered existing.
+        """
+        existing_manifests = []
+
+        if self._parent_snapshot_id is not None:
+            previous_snapshot = self._transaction.table_metadata.snapshot_by_id(
+                self._parent_snapshot_id
+            )
+
+            if previous_snapshot is None:
+                raise ValueError(
+                    f"Snapshot could not be found: {self._parent_snapshot_id}"
+                )
+
+            for manifest in previous_snapshot.manifests(io=self._io):
+                if (
+                    manifest.has_added_files()
+                    or manifest.has_existing_files()
+                    or manifest.added_snapshot_id == self._snapshot_id
+                ):
+                    existing_manifests.append(manifest)
+
+        return existing_manifests
+
+    def _deleted_entries(self) -> List[ManifestEntry]:
+        """To determine if we need to record any deleted manifest entries.
+
+        In case of an append, nothing is deleted.
+        """
+        return []
+
+
+def commit_replace_snapshot(
+    iceberg_table: Table,
+    new_position_delete_files: List[DataFile],
+    to_be_deleted_files: List[DataFile],
+) -> TableMetadata:
+    tx = iceberg_table.transaction()
+    try:
+        if iceberg_table.metadata.name_mapping() is None:
+            tx.set_properties(
+                **{
+                    "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
+                }
+            )
+        with replace_delete_files_override(
+            tx.update_snapshot()
+        ) as replace_delete_snapshot:
+            if new_position_delete_files:
+                for data_file in new_position_delete_files:
+                    replace_delete_snapshot.append_data_file(data_file)
+            if to_be_deleted_files:
+                for delete_file in to_be_deleted_files:
+                    replace_delete_snapshot.delete_data_file(delete_file)
+    except Exception as e:
+        raise e
+    else:
+        return tx.commit_transaction().metadata
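
For context, a minimal sketch of how these commit helpers might be invoked; the catalog name, table identifier, and empty file lists are illustrative placeholders, not part of this diff:

# Hypothetical driver for the commit helpers above; the catalog name,
# table identifier, and file lists are placeholders for illustration.
from pyiceberg.catalog import load_catalog
from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
    commit_append_snapshot,
    commit_replace_snapshot,
)

catalog = load_catalog("my_catalog")  # assumed catalog configuration
table = catalog.load_table("db.events")  # assumed table identifier

# Position-delete DataFiles produced by a convert task, and the equality
# delete DataFiles they supersede (left empty here for illustration).
new_position_delete_files = []
to_be_deleted_files = []

# Swap equality deletes for position deletes in a single OVERWRITE snapshot.
new_metadata = commit_replace_snapshot(
    iceberg_table=table,
    new_position_delete_files=new_position_delete_files,
    to_be_deleted_files=to_be_deleted_files,
)

# When nothing needs to be removed, an APPEND snapshot suffices.
new_metadata = commit_append_snapshot(
    iceberg_table=table,
    new_position_delete_files=new_position_delete_files,
)
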
deltacat/compute/converter/steps/convert.py
@@ -0,0 +1,366 @@
+import pyarrow.compute as pc
+
+import deltacat.compute.converter.utils.iceberg_columns as sc
+import pyarrow as pa
+
+from collections import defaultdict
+import ray
+import logging
+from deltacat.compute.converter.model.convert_input import ConvertInput
+from deltacat.compute.converter.steps.dedupe import dedupe_data_files
+from deltacat.compute.converter.utils.io import write_sliced_table
+from deltacat.compute.converter.utils.io import (
+    download_data_table_and_append_iceberg_columns,
+)
+from deltacat.compute.converter.utils.converter_session_utils import (
+    partition_value_record_to_partition_value_string,
+    sort_data_files_maintaining_order,
+)
+from deltacat.compute.converter.pyiceberg.overrides import (
+    parquet_files_dict_to_iceberg_data_files,
+)
+from deltacat.compute.converter.model.convert_result import ConvertResult
+from pyiceberg.manifest import DataFileContent
+from deltacat import logs
+from fsspec import AbstractFileSystem
+from typing import List, Dict, Tuple, Optional, Any
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+    DataFileListGroup,
+)
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+@ray.remote
+def convert(convert_input: ConvertInput) -> ConvertResult:
+    convert_input_files = convert_input.convert_input_files
+    convert_task_index = convert_input.convert_task_index
+    iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
+    identifier_fields = convert_input.identifier_fields
+    table_io = convert_input.table_io
+    table_metadata = convert_input.table_metadata
+    compact_previous_position_delete_files = (
+        convert_input.compact_previous_position_delete_files
+    )
+    position_delete_for_multiple_data_files = (
+        convert_input.position_delete_for_multiple_data_files
+    )
+    max_parallel_data_file_download = convert_input.max_parallel_data_file_download
+    filesystem = convert_input.filesystem
+    s3_client_kwargs = convert_input.s3_client_kwargs
+    task_memory = convert_input.task_memory
+
+    if not position_delete_for_multiple_data_files:
+        raise NotImplementedError(
+            f"Distributed file level position delete compute is not supported yet"
+        )
+    if compact_previous_position_delete_files:
+        raise NotImplementedError(f"Compact previous position delete not supported yet")
+
+    logger.info(f"Starting convert task index: {convert_task_index}")
+
+    applicable_data_files = convert_input_files.applicable_data_files
+    applicable_equality_delete_files = (
+        convert_input_files.applicable_equality_delete_files
+    )
+
+    all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
+
+    partition_value_str = partition_value_record_to_partition_value_string(
+        convert_input_files.partition_value
+    )
+    partition_value = convert_input_files.partition_value
+
+    if partition_value_str:
+        iceberg_table_warehouse_prefix_with_partition = (
+            f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+        )
+    else:
+        iceberg_table_warehouse_prefix_with_partition = (
+            f"{iceberg_table_warehouse_prefix}"
+        )
+
+    enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
+    total_pos_delete_table = []
+    data_table_after_converting_equality_delete = []
+    if applicable_equality_delete_files:
+        (
+            pos_delete_after_converting_equality_delete,
+            data_table_after_converting_equality_delete,
+        ) = compute_pos_delete_with_limited_parallelism(
+            data_files_list=applicable_data_files,
+            identifier_columns=identifier_fields,
+            equality_delete_files_list=applicable_equality_delete_files,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            convert_task_index=convert_task_index,
+            max_parallel_data_file_download=max_parallel_data_file_download,
+            s3_file_system=filesystem,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+        if pos_delete_after_converting_equality_delete:
+            total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
+
+    if enforce_primary_key_uniqueness:
+        data_files_downloaded_during_convert = []
+        if applicable_data_files:
+            for file_list in applicable_data_files:
+                for file in file_list:
+                    data_files_downloaded_during_convert.append(file)
+
+        data_files_to_dedupe = get_additional_applicable_data_files(
+            all_data_files=all_data_files_for_this_bucket,
+            data_files_downloaded=data_files_downloaded_during_convert,
+        )
+
+        dedupe_file_size_bytes = sum(
+            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
+        )
+        logger.info(
+            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
+        )
+
+        logger.info(
+            f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
+        )
+
+        (
+            pos_delete_after_dedupe,
+            data_file_to_dedupe_record_count,
+            data_file_to_dedupe_size,
+        ) = dedupe_data_files(
+            data_file_to_dedupe=data_files_to_dedupe,
+            identifier_columns=identifier_fields,
+            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
+            merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+        logger.info(
+            f"[Convert task {convert_task_index}]: Dedupe produced {len(pos_delete_after_dedupe)} position delete records."
+        )
+        total_pos_delete_table.append(pos_delete_after_dedupe)
+
+    total_pos_delete = pa.concat_tables(total_pos_delete_table)
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Total position delete produced:{len(total_pos_delete)}"
+    )
+
+    to_be_added_files_list = []
+    if total_pos_delete:
+        to_be_added_files_list_parquet = write_sliced_table(
+            table=total_pos_delete,
+            base_path=iceberg_table_warehouse_prefix_with_partition,
+            table_writer_kwargs={},
+            filesystem=filesystem,
+        )
+
+        to_be_added_files_dict = defaultdict()
+        to_be_added_files_dict[partition_value] = to_be_added_files_list_parquet
+
+        logger.info(
+            f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
+        )
+        file_content_type = DataFileContent.POSITION_DELETES
+        to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
+            io=table_io,
+            table_metadata=table_metadata,
+            files_dict=to_be_added_files_dict,
+            file_content_type=file_content_type,
+        )
+
+    to_be_delete_files_dict = defaultdict()
+
+    if applicable_equality_delete_files:
+        to_be_delete_files_dict[partition_value] = [
+            equality_delete_file[1]
+            for equality_delete_list in applicable_equality_delete_files
+            for equality_delete_file in equality_delete_list
+        ]
+
+    if not enforce_primary_key_uniqueness:
+        data_file_to_dedupe_record_count = 0
+        data_file_to_dedupe_size = 0
+
+    peak_memory_usage_bytes = (
+        get_current_process_peak_memory_usage_in_bytes()
+    )  # Convert KB to bytes
+    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Memory usage stats - "
+        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
+        f"Allocated task memory: {convert_input.task_memory} bytes, "
+        f"Usage percentage: {memory_usage_percentage:.2f}%"
+    )
+
+    convert_res = ConvertResult.of(
+        convert_task_index=convert_task_index,
+        to_be_added_files=to_be_added_files_list,
+        to_be_deleted_files=to_be_delete_files_dict,
+        position_delete_record_count=len(total_pos_delete),
+        input_data_files_record_count=data_file_to_dedupe_record_count,
+        input_data_files_hash_columns_in_memory_sizes=data_file_to_dedupe_size,
+        position_delete_in_memory_sizes=int(total_pos_delete.nbytes),
+        position_delete_on_disk_sizes=sum(
+            file.file_size_in_bytes for file in to_be_added_files_list
+        ),
+        input_data_files_on_disk_size=dedupe_file_size_bytes,
+        peak_memory_usage_bytes=peak_memory_usage_bytes,
+        memory_usage_percentage=memory_usage_percentage,
+    )
+    return convert_res
+
+
+def get_additional_applicable_data_files(
+    all_data_files: DataFileList,
+    data_files_downloaded: DataFileList,
+) -> DataFileList:
+    data_file_to_dedupe = []
+    assert len(set(all_data_files)) >= len(set(data_files_downloaded)), (
+        f"Length of all data files ({len(set(all_data_files))}) should never be less than "
+        f"the length of candidate equality delete data files ({len(set(data_files_downloaded))})"
+    )
+    if data_files_downloaded:
+        # set1.difference(set2) returns elements in set1 but not in set2
+        data_file_to_dedupe.extend(
+            list(set(data_file_to_dedupe).difference(set(data_files_downloaded)))
+        )
+    else:
+        data_file_to_dedupe = all_data_files
+    return data_file_to_dedupe
+
+
+def filter_rows_to_be_deleted(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
+    if equality_delete_table and data_file_table:
+        equality_deletes = pc.is_in(
+            data_file_table[identifier_column],
+            equality_delete_table[identifier_column],
+        )
+        data_file_record_remaining = pc.invert(
+            pc.is_in(
+                data_file_table[identifier_column],
+                equality_delete_table[identifier_column],
+            )
+        )
+        position_delete_table = data_file_table.filter(equality_deletes)
+        remaining_data_table = data_file_table.filter(data_file_record_remaining)
+
+        position_delete_table = position_delete_table.drop(
+            [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+        )
+        assert len(position_delete_table) + len(remaining_data_table) == len(
+            data_file_table
+        ), (
+            f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
+            f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} equality deletes."
+        )
+
+    return position_delete_table, remaining_data_table
+
+
+def compute_pos_delete_converting_equality_deletes(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+    iceberg_table_warehouse_prefix_with_partition: str,
+    s3_file_system: Optional[AbstractFileSystem],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    new_position_delete_table, remaining_data_table = filter_rows_to_be_deleted(
+        data_file_table=data_file_table,
+        equality_delete_table=equality_delete_table,
+        identifier_columns=identifier_columns,
+    )
+    if new_position_delete_table:
+        logger.info(
+            f"Length of position delete table after converting from equality deletes:{len(new_position_delete_table)}"
+        )
+        return new_position_delete_table, remaining_data_table
+    elif not remaining_data_table:
+        return None, None
+    else:
+        return None, remaining_data_table
+
+
+def compute_pos_delete_with_limited_parallelism(
+    data_files_list: DataFileListGroup,
+    identifier_columns: List[str],
+    equality_delete_files_list: DataFileListGroup,
+    iceberg_table_warehouse_prefix_with_partition: str,
+    convert_task_index: int,
+    max_parallel_data_file_download: int,
+    s3_file_system: Optional[AbstractFileSystem],
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    assert len(data_files_list) == len(equality_delete_files_list), (
+        f"Number of lists of data files should equal to number of list of equality delete files, "
+        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
+    )
+
+    new_pos_delete_table_total = []
+    for data_files, equality_delete_files in zip(
+        data_files_list, equality_delete_files_list
+    ):
+        data_table_total = []
+
+        # Sort data files by file sequence number first, then file path to
+        # make sure files having same sequence number are deterministically sorted
+        data_files = sort_data_files_maintaining_order(data_files=data_files)
+
+        for data_file in data_files:
+            data_table = download_data_table_and_append_iceberg_columns(
+                file=data_file[1],
+                columns_to_download=identifier_columns,
+                additional_columns_to_append=[
+                    sc._FILE_PATH_COLUMN_NAME,
+                    sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+                ],
+                s3_client_kwargs=s3_client_kwargs,
+            )
+            data_table_total.append(data_table)
+        data_table_total = pa.concat_tables(data_table_total)
+
+        equality_delete_table_total = []
+        for equality_delete in equality_delete_files:
+            equality_delete_table = download_data_table_and_append_iceberg_columns(
+                file=equality_delete[1],
+                columns_to_download=identifier_columns,
+                s3_client_kwargs=s3_client_kwargs,
+            )
+            equality_delete_table_total.append(equality_delete_table)
+        equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
+
+        (
+            new_pos_delete_table,
+            remaining_data_table,
+        ) = compute_pos_delete_converting_equality_deletes(
+            equality_delete_table=equality_delete_table_total,
+            data_file_table=data_table_total,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
+        )
+        new_pos_delete_table_total.append(new_pos_delete_table)
+
+    if new_pos_delete_table_total:
+        new_pos_delete_table_total = pa.concat_tables(new_pos_delete_table_total)
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Find deletes got {len(data_table_total)} data table records, "
+        f"{len(equality_delete_table_total)} equality deletes as input, "
+        f"Produced {len(new_pos_delete_table_total)} position deletes based off find deletes input."
+    )
+
+    if not new_pos_delete_table_total:
+        logger.info("No records deleted based on equality delete convertion")
+
+    if not remaining_data_table:
+        logger.info("No data table remaining after converting equality deletes")
+
+    return new_pos_delete_table_total, remaining_data_table
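
For intuition, a self-contained sketch of the filtering step that filter_rows_to_be_deleted performs, using a plain "pk_hash" column in place of the module's internal identifier-hash column (all names and values are illustrative):

import pyarrow as pa
import pyarrow.compute as pc

# Data-file rows keyed by a hashed identifier column (illustrative name).
data = pa.table({"pk_hash": ["a", "b", "c", "d"], "pos": [0, 1, 2, 3]})
# Equality deletes name the identifiers whose rows must be removed.
eq_deletes = pa.table({"pk_hash": ["b", "d"]})

# Rows whose identifier appears in the equality deletes become position
# deletes; every other row of the data file survives unchanged.
hit = pc.is_in(data["pk_hash"], value_set=eq_deletes["pk_hash"])
position_deletes = data.filter(hit)
remaining = data.filter(pc.invert(hit))

assert len(position_deletes) + len(remaining) == len(data)
print(position_deletes.to_pydict())  # {'pk_hash': ['b', 'd'], 'pos': [1, 3]}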