deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
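The expanded hunk below is the new test module deltacat/tests/compute/converter/test_convert_session.py (+826 lines), which exercises the Iceberg converter's drop-duplicates, equality-delete, and position-delete paths.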
@@ -0,0 +1,826 @@
+from collections import defaultdict
+import pytest
+import ray
+from typing import List, Dict, Any, Tuple
+from pyiceberg.catalog.rest import RestCatalog
+from pyiceberg.schema import Schema
+from pyiceberg.types import (
+    NestedField,
+    StringType,
+    LongType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import IdentityTransform
+import pyarrow as pa
+import daft
+
+from deltacat.compute.converter.steps.convert import convert
+from deltacat.compute.converter.model.convert_input import ConvertInput
+from deltacat.compute.converter.pyiceberg.overrides import (
+    fetch_all_bucket_files,
+)
+from deltacat.compute.converter.utils.converter_session_utils import (
+    group_all_files_to_each_bucket,
+)
+from deltacat.tests.compute.converter.utils import (
+    get_s3_file_system,
+    drop_table_if_exists,
+    commit_equality_delete_to_table,
+)
+from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
+    commit_append_snapshot,
+    commit_replace_snapshot,
+)
+
+from pyiceberg.typedef import Record
+from deltacat.compute.converter.utils.convert_task_options import BASE_MEMORY_BUFFER
+from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from pyiceberg.catalog import load_catalog
+import os
+import pyarrow.parquet as pq
+from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+from pyiceberg.io.pyarrow import (
+    data_file_statistics_from_parquet_metadata,
+    compute_statistics_plan,
+    parquet_path_to_id_mapping,
+)
+from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
+from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchTableError
+from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+# Task memory in bytes for testing
+TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER
+
+
+# Test data fixtures
+@pytest.fixture
+def base_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_schema_without_metadata():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def multi_key_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def multi_key_schema_without_file_path():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_partition_spec():
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    return PartitionSpec(partition_field_identity)
+
+
+@pytest.fixture
+def table_properties():
+    return {
+        "write.format.default": "parquet",
+        "write.delete.mode": "merge-on-read",
+        "write.update.mode": "merge-on-read",
+        "write.merge.mode": "merge-on-read",
+        "format-version": "2",
+    }
+
+
+def create_test_table(
+    session_catalog: RestCatalog,
+    namespace: str,
+    table_name: str,
+    schema: Schema,
+    partition_spec: PartitionSpec,
+    properties: Dict[str, str],
+) -> str:
+    """Helper function to create a test table"""
+    identifier = f"{namespace}.{table_name}"
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+    return identifier
+
+
+def create_mock_data_tables(test_case: Dict[str, Any]) -> Tuple[daft.DataFrame, ...]:
+    """Helper function to create mock data tables based on test case"""
+    tables = []
+    for data in test_case["mock_data"]:
+        if "primary_key2" in data:  # Multi-key case
+            names = ["primary_key1", "primary_key2"]
+            table = pa.Table.from_arrays(
+                [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                names=names,
+            )
+        else:  # Single key case
+            names = ["primary_key"]
+            table = pa.Table.from_arrays([pa.array(data["primary_key"])], names=names)
+        tables.append(daft.from_arrow(table))
+    if "equality_delete_data_mock" in test_case:
+        for data in test_case["equality_delete_data_mock"]:
+            if "primary_key2" in data:  # Multi-key case
+                names = ["primary_key1", "primary_key2"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                    names=names,
+                )
+            else:  # Single key case
+                names = ["primary_key"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key"])], names=names
+                )
+            tables.append(daft.from_arrow(table))
+    return tuple(tables)
+
+
+def run_spark_commands(spark, sqls: List[str]) -> None:
+    """Helper function to run Spark SQL commands"""
+    for sql in sqls:
+        spark.sql(sql)
+
+
+def insert_test_data(spark, identifier: str, test_case: Dict[str, Any]) -> None:
+    """Helper function to insert test data into the table"""
+    if "primary_key2" in test_case["mock_data"][0]:
+        # Multi-key case
+        for data in test_case["mock_data"]:
+            values = ", ".join(
+                f"(0, '{pk1}', {pk2})"
+                for pk1, pk2 in zip(data["primary_key1"], data["primary_key2"])
+            )
+            run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+    else:
+        # Single key case
+        if test_case["schema"] == "base_schema":
+            # For drop duplicates test, use file_path and pos from mock_data
+            for data in test_case["mock_data"]:
+                values = ", ".join(
+                    f"(0, '{pk}', '{path}', {pos})"
+                    for pk, path, pos in zip(
+                        data["primary_key"], data["file_path"], data["pos"]
+                    )
+                )
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+        else:
+            # For other tests, just include the basic columns
+            for data in test_case["mock_data"]:
+                values = ", ".join(f"(0, '{pk}')" for pk in data["primary_key"])
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+
+
+def create_convert_input(
+    tbl,
+    convert_input_files_for_all_buckets: List[Any],
+    test_case: Dict[str, Any],
+    s3_file_system: Any,
+) -> List[ConvertInput]:
+    """Helper function to create convert inputs"""
+    convert_inputs = []
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=test_case["identifier_fields"],
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            filesystem=s3_file_system,
+            s3_client_kwargs={},
+            task_memory=TASK_MEMORY_BYTES,
+        )
+        convert_inputs.append(convert_input)
+    return convert_inputs
+
+
+def process_convert_result(convert_result: Any) -> Tuple[List[Any], List[Any]]:
+    """Helper function to process convert results
+
+    Args:
+        convert_result: The result from convert_session
+
+    Returns:
+        Tuple[List[Any], List[Any]]: Lists of files to be deleted and added
+    """
+    to_be_deleted_files_list = []
+    to_be_added_files_list = []
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
+    return to_be_deleted_files_list, to_be_added_files_list
+
+
+def verify_result(result, expected_result, verify_pos_index=False):
+    """Verify the result matches the expected result.
+
+    Args:
+        result: The result to verify
+        expected_result: The expected result
+        verify_pos_index: Whether to verify position values for primary keys
+    """
+    if "primary_keys" in expected_result and "primary_key" in result:
+        # Single key case
+        assert set(result["primary_key"]) == set(expected_result["primary_keys"])
+        if verify_pos_index and "pk_to_pos" in expected_result:
+            for index in range(len(result["primary_key"])):
+                assert (
+                    result["pos"][index]
+                    == expected_result["pk_to_pos"][result["primary_key"][index]]
+                )
+    elif "pk_tuples" in expected_result:
+        pk_combined_res = []
+        for pk1, pk2 in zip(
+            result["primary_key1"],
+            result["primary_key2"],
+        ):
+            pk_combined_res.append((pk1, pk2))
+
+        # Multi-key case
+        assert set(pk_combined_res) == set(expected_result["pk_tuples"])
+    else:
+        assert set(result) == set(expected_result["primary_keys"])
+
+
+def verify_spark_read_results(spark, identifier, expected_result):
+    spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+    all_pk = [
+        spark_read_pos_delete[row_idx][1]
+        for row_idx in range(len(spark_read_pos_delete))
+    ]
+    verify_result(all_pk, expected_result, verify_pos_index=False)
+
+
+def get_file_prefix(tbl):
+    """Get the file prefix from a table's data files.
+
+    Args:
+        tbl: The table to get the file prefix from
+
+    Returns:
+        str: The file prefix
+    """
+    df = tbl.inspect.entries()
+    data_files = df.to_pydict()["data_file"]
+    file_link = data_files[0]["file_path"]
+    file_prefix = "/".join(file_link.split("/")[:-1])
+    return file_prefix.split("//")[1]
+
+
+# Test cases configuration
+TEST_CASES = [
+    {
+        "name": "single_key_drop_duplicates",
+        "table_name": "table_converter_ray_drop_duplicates_success",
+        "schema": "base_schema",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [1, 2, 3],
+            },
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [4, 5, 6],
+            },
+            {
+                "primary_key": ["pk4", "pk2", "pk3"],
+                "file_path": ["path4", "path2", "path3"],
+                "pos": [7, 8, 9],
+            },
+        ],
+        "expected_result": {
+            "primary_keys": ["pk1", "pk2", "pk3", "pk4"],
+            "pk_to_pos": {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7},
+        },
+    },
+    {
+        "name": "multi_key_drop_duplicates",
+        "table_name": "table_converter_ray_pos_delete_multiple_identifier_fields",
+        "schema": "multi_key_schema_without_file_path",
+        "identifier_fields": ["primary_key1", "primary_key2"],
+        "mock_data": [
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk4", "pk2", "pk3"], "primary_key2": [1, 3, 4]},
+        ],
+        "expected_result": {
+            "pk_tuples": [
+                ("pk1", 1),
+                ("pk2", 2),
+                ("pk2", 3),
+                ("pk3", 3),
+                ("pk3", 4),
+                ("pk4", 1),
+            ]
+        },
+    },
+    {
+        "name": "equality_delete",
+        "table_name": "table_converter_ray_equality_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "equality_delete_data_mock": [{"primary_key": ["pk1"]}],
+        "equality_delete_data": pa.Table.from_arrays(["pk1"], names=["primary_key"]),
+        "verify_spark_read": True,
+        "expected_result": {"primary_keys": ["pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete",
+        "table_name": "table_converter_ray_position_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete_read_by_spark",
+        "table_name": "table_converter_ray_pos_delete_read_by_spark_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+        "verify_spark_read": True,
+        "expected_spark_count": 4,
+    },
+]
+
+
+@pytest.mark.parametrize("test_case", TEST_CASES)
+@pytest.mark.integration
+def test_converter(
+    test_case: Dict[str, Any],
+    spark,
+    session_catalog: RestCatalog,
+    setup_ray_cluster,
+    mocker,
+    request,
+) -> None:
+    """
+    Parameterized test for converter functionality.
+    Tests drop duplicates, equality delete, and position delete scenarios.
+    """
+    # Get schema fixture based on test case
+    schema = request.getfixturevalue(test_case["schema"])
+
+    # Create test table
+    identifier = create_test_table(
+        session_catalog=session_catalog,
+        namespace="default",
+        table_name=test_case["table_name"],
+        schema=schema,
+        partition_spec=request.getfixturevalue("base_partition_spec"),
+        properties=request.getfixturevalue("table_properties"),
+    )
+
+    # Insert test data
+    insert_test_data(spark, identifier, test_case)
+
+    # Get files and create convert input
+    tbl = session_catalog.load_table(identifier)
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    # Handle equality delete if present
+    if "equality_delete_data" in test_case:
+        tbl = session_catalog.load_table(identifier)
+        file_prefix = get_file_prefix(tbl)
+        partition_value = Record(number_partitioned=0)
+
+        # Note: Just upload to S3 to mock input data here.
+        # NOT committing to Iceberg metadata as equality delete write path not implemented in Pyiceberg/Spark.
+        equality_file_list = commit_equality_delete_to_table(
+            table=tbl,
+            partition_value=partition_value,
+            equality_delete_table=test_case["equality_delete_data"],
+            file_link_prefix=file_prefix,
+        )
+        # Mock equality delete input to converter with latest file sequence, so equality delete can be applied to all data before
+        equality_delete_dict = defaultdict()
+        equality_delete_dict[partition_value] = [(4, equality_file_list[0])]
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    convert_inputs = create_convert_input(
+        tbl, convert_input_files_for_all_buckets, test_case, s3_file_system
+    )
+
+    # Create and set up mock data
+    mock_data_tables = create_mock_data_tables(test_case)
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
+    )
+
+    download_data_mock.side_effect = mock_data_tables
+
+    # Run conversion
+    convert_ref = convert.remote(convert_inputs[0])
+    convert_result = ray.get(convert_ref)
+
+    # Process results
+    to_be_deleted_files_list, to_be_added_files_list = process_convert_result(
+        convert_result
+    )
+
+    if not to_be_deleted_files_list:
+        # Commit changes
+        commit_append_snapshot(
+            iceberg_table=tbl,
+            new_position_delete_files=to_be_added_files_list,
+        )
+    else:
+        commit_replace_snapshot(
+            iceberg_table=tbl,
+            to_be_deleted_files=to_be_deleted_files_list[0],
+            new_position_delete_files=to_be_added_files_list,
+        )
+    tbl.refresh()
+
+    # Verify results
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+
+    # Verify Spark read if required
+    if test_case.get("verify_spark_read", False):
+        verify_spark_read_results(spark, identifier, test_case["expected_result"])
+    else:
+        verify_result(
+            pyiceberg_scan_table_rows,
+            test_case["expected_result"],
+            verify_pos_index=test_case.get("verify_pos_index", False),
+        )
+
+
+def test_converter_session_with_local_filesystem_and_duplicate_ids(
+    setup_ray_cluster,
+) -> None:
+    """
+    Test converter_session functionality with local PyArrow filesystem using duplicate IDs.
+    This test simulates the pattern where duplicate IDs represent updates to existing records.
+    The converter should merge these updates by creating position delete files.
+    """
+    with temp_dir_autocleanup() as temp_catalog_dir:
+        # Create warehouse directory
+        warehouse_path = os.path.join(temp_catalog_dir, "iceberg_warehouse")
+        os.makedirs(warehouse_path, exist_ok=True)
+
+        # Set up local in-memory catalog
+        local_catalog = load_catalog(
+            "local_sql_catalog",
+            **{
+                "type": "in-memory",
+                "warehouse": warehouse_path,
+            },
+        )
+
+        # Create local PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        local_filesystem = pafs.LocalFileSystem()
+
+        # Define schema (id, name, value, version)
+        schema = Schema(
+            NestedField(field_id=1, name="id", field_type=LongType(), required=True),
+            NestedField(
+                field_id=2, name="name", field_type=StringType(), required=False
+            ),
+            NestedField(
+                field_id=3, name="value", field_type=LongType(), required=False
+            ),
+            NestedField(
+                field_id=4, name="version", field_type=LongType(), required=False
+            ),
+            schema_id=0,
+        )
+
+        # Create table properties for merge-on-read
+        properties = {
+            "write.format.default": "parquet",
+            "write.delete.mode": "merge-on-read",
+            "write.update.mode": "merge-on-read",
+            "write.merge.mode": "merge-on-read",
+            "format-version": "2",
+        }
+
+        # Create the table
+        table_identifier = "default.test_duplicate_ids"
+        try:
+            local_catalog.create_namespace("default")
+        except NamespaceAlreadyExistsError:
+            pass  # Namespace may already exist
+        try:
+            local_catalog.drop_table(table_identifier)
+        except NoSuchTableError:
+            pass  # Table may not exist
+
+        local_catalog.create_table(
+            table_identifier,
+            schema=schema,
+            properties=properties,
+        )
+        tbl = local_catalog.load_table(table_identifier)
+
+        # Set the name mapping property so Iceberg can read parquet files without field IDs
+        with tbl.transaction() as tx:
+            tx.set_properties(
+                **{"schema.name-mapping.default": schema.name_mapping.model_dump_json()}
+            )
+
+        # Step 1: Write initial data
+        # Create PyArrow table with explicit schema to match Iceberg schema
+        arrow_schema = schema_to_pyarrow(schema)
+
+        initial_data = pa.table(
+            {
+                "id": [1, 2, 3, 4],
+                "name": ["Alice", "Bob", "Charlie", "David"],
+                "value": [100, 200, 300, 400],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )
+
+        # Step 2: Write additional data
+        additional_data = pa.table(
+            {
+                "id": [5, 6, 7, 8],
+                "name": ["Eve", "Frank", "Grace", "Henry"],
+                "value": [500, 600, 700, 800],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )
+
+        # Step 3: Write updates to existing records (this creates duplicates by ID)
+        # These should overwrite the original records with same IDs
+        updated_data = pa.table(
+            {
+                "id": [2, 3, 9],  # IDs 2 and 3 are duplicates, 9 is new
+                "name": [
+                    "Robert",
+                    "Charles",
+                    "Ivan",
+                ],  # Updated names for Bob and Charlie
+                "value": [201, 301, 900],  # Updated values
+                "version": [2, 2, 1],  # Higher version numbers for updates
+            },
+            schema=arrow_schema,
+        )
+
+        # Write all data to separate parquet files to simulate multiple writes
+        data_files_to_commit = []
+
+        for i, data in enumerate([initial_data, additional_data, updated_data]):
+            data_file_path = os.path.join(warehouse_path, f"data_{i}.parquet")
+            pq.write_table(data, data_file_path)
+
+            # Create DataFile objects for Iceberg
+            parquet_metadata = pq.read_metadata(data_file_path)
+            file_size = os.path.getsize(data_file_path)
+
+            # Check schema compatibility
+            _check_pyarrow_schema_compatible(
+                schema, parquet_metadata.schema.to_arrow_schema()
+            )
+
+            # Calculate statistics
+            statistics = data_file_statistics_from_parquet_metadata(
+                parquet_metadata=parquet_metadata,
+                stats_columns=compute_statistics_plan(schema, tbl.metadata.properties),
+                parquet_column_mapping=parquet_path_to_id_mapping(schema),
+            )
+
+            data_file = DataFile(
+                content=DataFileContent.DATA,
+                file_path=data_file_path,
+                file_format=FileFormat.PARQUET,
+                partition={},  # No partitioning
+                file_size_in_bytes=file_size,
+                sort_order_id=None,
+                spec_id=tbl.metadata.default_spec_id,
+                key_metadata=None,
+                equality_ids=None,
+                **statistics.to_serialized_dict(),
+            )
+            data_files_to_commit.append(data_file)
+
+        # Commit all data files to the table
+        with tbl.transaction() as tx:
+            with tx.update_snapshot().fast_append() as update_snapshot:
+                for data_file in data_files_to_commit:
+                    update_snapshot.append_data_file(data_file)
+
+        tbl.refresh()
+
+        # Verify we have duplicate IDs before conversion
+        initial_scan = tbl.scan().to_arrow().to_pydict()
+        print(f"Before conversion - Records with IDs: {sorted(initial_scan['id'])}")
+
+        # There should be duplicates: [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        expected_duplicate_ids = [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        assert (
+            sorted(initial_scan["id"]) == expected_duplicate_ids
+        ), f"Expected duplicate IDs {expected_duplicate_ids}, got {sorted(initial_scan['id'])}"
+
+        # Now call converter_session to convert equality deletes to position deletes
+        converter_params = ConverterSessionParams.of(
+            {
+                "catalog": local_catalog,
+                "iceberg_table_name": table_identifier,
+                "iceberg_warehouse_bucket_name": warehouse_path,  # Local warehouse path
+                "merge_keys": ["id"],  # Use ID as the merge key
+                "enforce_primary_key_uniqueness": True,
+                "task_max_parallelism": 1,  # Single task for local testing
+                "filesystem": local_filesystem,
+                "location_provider_prefix_override": None,  # Use local filesystem
+                "location_provider_prefix_override": None,  # Let the system auto-generate the prefix
+            }
+        )
+
+        print(f"Running converter_session with local filesystem...")
+        print(f"Warehouse path: {warehouse_path}")
+        print(f"Merge keys: ['id']")
+        print(f"Enforce uniqueness: True")
+
+        # Run the converter
+        converter_session(params=converter_params)
+
+        # Refresh table and scan again
+        tbl.refresh()
+        final_scan = tbl.scan().to_arrow().to_pydict()
+
+        print(f"After conversion - Records with IDs: {sorted(final_scan['id'])}")
+        print(f"Final data: {final_scan}")
+
+        # Verify position delete files were created by checking table metadata
+        latest_snapshot = tbl.metadata.current_snapshot()
+        if latest_snapshot:
+            manifests = latest_snapshot.manifests(tbl.io)
+            position_delete_files = []
+
+            for manifest in manifests:
+                entries = manifest.fetch_manifest_entry(tbl.io)
+                for entry in entries:
+                    if entry.data_file.content == DataFileContent.POSITION_DELETES:
+                        position_delete_files.append(entry.data_file.file_path)
+
+            print(f"Position delete files found: {position_delete_files}")
+            assert (
+                len(position_delete_files) > 0
+            ), "No position delete files were created by converter_session"
+
+        # Verify the final result has unique IDs (duplicates should be resolved)
+        # Expected: Latest values for each ID based on the updates
+        expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # All unique IDs
+        actual_ids = sorted(final_scan["id"])
+
+        print(f"Expected unique IDs: {expected_unique_ids}")
+        print(f"Actual IDs after conversion: {actual_ids}")
+
+        assert (
+            actual_ids == expected_unique_ids
+        ), f"Expected unique IDs {expected_unique_ids}, got {actual_ids}"
+
+        # Verify the updated values are present (higher version should win)
+        final_data_by_id = {}
+        for i, id_val in enumerate(final_scan["id"]):
+            final_data_by_id[id_val] = {
+                "name": final_scan["name"][i],
+                "value": final_scan["value"][i],
+                "version": final_scan["version"][i],
+            }
+
+        # Check that ID 2 has updated value (Robert, 201, version 2)
+        assert (
+            final_data_by_id[2]["name"] == "Robert"
+        ), f"ID 2 should have updated name 'Robert', got '{final_data_by_id[2]['name']}'"
+        assert (
+            final_data_by_id[2]["value"] == 201
+        ), f"ID 2 should have updated value 201, got {final_data_by_id[2]['value']}"
+        assert (
+            final_data_by_id[2]["version"] == 2
+        ), f"ID 2 should have version 2, got {final_data_by_id[2]['version']}"
+
+        # Check that ID 3 has updated value (Charles, 301, version 2)
+        assert (
+            final_data_by_id[3]["name"] == "Charles"
+        ), f"ID 3 should have updated name 'Charles', got '{final_data_by_id[3]['name']}'"
+        assert (
+            final_data_by_id[3]["value"] == 301
+        ), f"ID 3 should have updated value 301, got {final_data_by_id[3]['value']}"
+        assert (
+            final_data_by_id[3]["version"] == 2
+        ), f"ID 3 should have version 2, got {final_data_by_id[3]['version']}"
+
+        # Check that new ID 9 is present
+        assert (
+            final_data_by_id[9]["name"] == "Ivan"
+        ), f"ID 9 should have name 'Ivan', got '{final_data_by_id[9]['name']}'"
+        assert (
+            final_data_by_id[9]["value"] == 900
+        ), f"ID 9 should have value 900, got {final_data_by_id[9]['value']}"
+
+        print(f"✅ Test completed successfully!")
+        print(
+            f"✅ Position delete files were created: {len(position_delete_files)} files"
+        )
+        print(f"✅ Duplicate IDs were resolved correctly")
+        print(
+            f"✅ Updated values were applied (ID 2: Bob->Robert, ID 3: Charlie->Charles)"
+        )
+        print(f"✅ Final table has {len(actual_ids)} unique records")
+        print(f"✅ Temporary warehouse cleaned up at: {temp_catalog_dir}")