deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
+from collections import defaultdict
 import pytest
 import ray
-from typing import List
+from typing import List, Dict, Any, Tuple
 from pyiceberg.catalog.rest import RestCatalog
-from pyiceberg.expressions import EqualTo
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     NestedField,
@@ -12,158 +12,98 @@ from pyiceberg.types import (
 from pyiceberg.partitioning import PartitionSpec, PartitionField
 from pyiceberg.transforms import IdentityTransform
 import pyarrow as pa
+import daft
 
 from deltacat.compute.converter.steps.convert import convert
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.pyiceberg.overrides import (
     fetch_all_bucket_files,
-    parquet_files_dict_to_iceberg_data_files,
 )
-from collections import defaultdict
 from deltacat.compute.converter.utils.converter_session_utils import (
     group_all_files_to_each_bucket,
 )
 from deltacat.tests.compute.converter.utils import (
     get_s3_file_system,
     drop_table_if_exists,
+    commit_equality_delete_to_table,
 )
 from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
     commit_append_snapshot,
+    commit_replace_snapshot,
 )
 
+from pyiceberg.typedef import Record
+from deltacat.compute.converter.utils.convert_task_options import BASE_MEMORY_BUFFER
+from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from pyiceberg.catalog import load_catalog
+import os
+import pyarrow.parquet as pq
+from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+from pyiceberg.io.pyarrow import (
+    data_file_statistics_from_parquet_metadata,
+    compute_statistics_plan,
+    parquet_path_to_id_mapping,
+)
+from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
+from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchTableError
+from pyiceberg.io.pyarrow import schema_to_pyarrow
 
-
-
-        spark.sql(sql)
+# Task memory in bytes for testing
+TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (10, 20), (10, 30)
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (11, 20), (11, 30)
-            """,
-        ],
+# Test data fixtures
+@pytest.fixture
+def base_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
     )
 
-    tbl = session_catalog.load_table(identifier)
-    tbl.delete(EqualTo("number_partitioned", 10))
-
-    # No overwrite operation
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
-        "append",
-        "append",
-        "delete",
-    ]
-    assert tbl.scan().to_arrow().to_pydict() == {
-        "number_partitioned": [11, 11],
-        "number": [20, 30],
-    }
-
-
-@pytest.mark.integration
-def test_spark_position_delete_production_sanity(
-    spark, session_catalog: RestCatalog
-) -> None:
-    """
-    Sanity test to ensure Spark position delete production is successful with `merge-on-read` spec V2.
-    Table has two partition levels. 1. BucketTransform on primary key
-    """
-    identifier = "default.table_spark_position_delete_production_sanity"
-
-    run_spark_commands(
-        spark,
-        [
-            f"DROP TABLE IF EXISTS {identifier}",
-            f"""
-            CREATE TABLE {identifier} (
-                number_partitioned INT,
-                primary_key STRING
-            )
-            USING iceberg
-            PARTITIONED BY (bucket(3, primary_key), number_partitioned)
-            TBLPROPERTIES(
-                'format-version' = 2,
-                'write.delete.mode'='merge-on-read',
-                'write.update.mode'='merge-on-read',
-                'write.merge.mode'='merge-on-read'
-            )
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (0, 'pk1'), (0, 'pk2'), (0, 'pk3')
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (1, 'pk1'), (1, 'pk2'), (1, 'pk3')
-            """,
-        ],
-    )
 
-
-
-
-
-
-
-
+@pytest.fixture
+def base_schema_without_metadata():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        schema_id=0,
     )
 
-    tbl = session_catalog.load_table(identifier)
-    tbl.refresh()
-
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
-        "append",
-        "append",
-        "delete",
-    ]
-
-    assert tbl.scan().to_arrow().to_pydict() == {
-        "number_partitioned": [1, 1, 0, 0],
-        "primary_key": ["pk2", "pk3", "pk2", "pk3"],
-    }
-
-
-@pytest.mark.integration
-def test_converter_drop_duplicates_success(
-    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
-) -> None:
-    """
-    Test for convert compute remote function happy case. Download file results are mocked.
-    """
 
-
-
-
-    identifier = f"{namespace}.{table_name}"
-
-    schema = Schema(
+@pytest.fixture
+def multi_key_schema():
+    return Schema(
         NestedField(
             field_id=1, name="number_partitioned", field_type=LongType(), required=False
         ),
         NestedField(
-            field_id=2, name="
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
        ),
-        # Explicitly define "file_path" and "pos" for assertion of deterministic record after dedupe
         NestedField(
             field_id=2147483546,
             name="file_path",
@@ -176,21 +116,55 @@ def test_converter_drop_duplicates_success(
         schema_id=0,
     )
 
+
+@pytest.fixture
+def multi_key_schema_without_file_path():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_partition_spec():
     partition_field_identity = PartitionField(
         source_id=1,
         field_id=101,
         transform=IdentityTransform(),
         name="number_partitioned",
     )
-
+    return PartitionSpec(partition_field_identity)
 
-    properties = dict()
-    properties["write.format.default"] = "parquet"
-    properties["write.delete.mode"] = "merge-on-read"
-    properties["write.update.mode"] = "merge-on-read"
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"
 
+@pytest.fixture
+def table_properties():
+    return {
+        "write.format.default": "parquet",
+        "write.delete.mode": "merge-on-read",
+        "write.update.mode": "merge-on-read",
+        "write.merge.mode": "merge-on-read",
+        "format-version": "2",
+    }
+
+
+def create_test_table(
+    session_catalog: RestCatalog,
+    namespace: str,
+    table_name: str,
+    schema: Schema,
+    partition_spec: PartitionSpec,
+    properties: Dict[str, str],
+) -> str:
+    """Helper function to create a test table"""
+    identifier = f"{namespace}.{table_name}"
     drop_table_if_exists(identifier, session_catalog)
     session_catalog.create_table(
         identifier,
@@ -198,281 +172,655 @@ def test_converter_drop_duplicates_success(
         partition_spec=partition_spec,
         properties=properties,
     )
+    return identifier
+
+
+def create_mock_data_tables(test_case: Dict[str, Any]) -> Tuple[daft.DataFrame, ...]:
+    """Helper function to create mock data tables based on test case"""
+    tables = []
+    for data in test_case["mock_data"]:
+        if "primary_key2" in data:  # Multi-key case
+            names = ["primary_key1", "primary_key2"]
+            table = pa.Table.from_arrays(
+                [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                names=names,
+            )
+        else:  # Single key case
+            names = ["primary_key"]
+            table = pa.Table.from_arrays([pa.array(data["primary_key"])], names=names)
+        tables.append(daft.from_arrow(table))
+    if "equality_delete_data_mock" in test_case:
+        for data in test_case["equality_delete_data_mock"]:
+            if "primary_key2" in data:  # Multi-key case
+                names = ["primary_key1", "primary_key2"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                    names=names,
+                )
+            else:  # Single key case
+                names = ["primary_key"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key"])], names=names
+                )
+            tables.append(daft.from_arrow(table))
+    return tuple(tables)
 
-    # 2. Use Spark to generate initial data files
-    tbl = session_catalog.load_table(identifier)
-    tbl.refresh()
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 1), (0, "pk2", "path2", 2), (0, "pk3", "path3", 3)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 4), (0, "pk2", "path2", 5), (0, "pk3", "path3", 6)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4", "path4", 7), (0, "pk2", "path2", 8), (0, "pk3", "path3", 9)
-            """
-        ],
-    )
 
-
-
-
+def run_spark_commands(spark, sqls: List[str]) -> None:
+    """Helper function to run Spark SQL commands"""
+    for sql in sqls:
+        spark.sql(sql)
 
-    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
-        data_file_dict=data_file_dict,
-        equality_delete_dict=equality_delete_dict,
-        pos_delete_dict=pos_delete_dict,
-    )
 
-
+def insert_test_data(spark, identifier: str, test_case: Dict[str, Any]) -> None:
+    """Helper function to insert test data into the table"""
+    if "primary_key2" in test_case["mock_data"][0]:
+        # Multi-key case
+        for data in test_case["mock_data"]:
+            values = ", ".join(
+                f"(0, '{pk1}', {pk2})"
+                for pk1, pk2 in zip(data["primary_key1"], data["primary_key2"])
+            )
+            run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+    else:
+        # Single key case
+        if test_case["schema"] == "base_schema":
+            # For drop duplicates test, use file_path and pos from mock_data
+            for data in test_case["mock_data"]:
+                values = ", ".join(
+                    f"(0, '{pk}', '{path}', {pos})"
+                    for pk, path, pos in zip(
+                        data["primary_key"], data["file_path"], data["pos"]
+                    )
+                )
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+        else:
+            # For other tests, just include the basic columns
+            for data in test_case["mock_data"]:
+                values = ", ".join(f"(0, '{pk}')" for pk in data["primary_key"])
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+
+
+def create_convert_input(
+    tbl,
+    convert_input_files_for_all_buckets: List[Any],
+    test_case: Dict[str, Any],
+    s3_file_system: Any,
+) -> List[ConvertInput]:
+    """Helper function to create convert inputs"""
+    convert_inputs = []
     for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
         convert_input = ConvertInput.of(
             convert_input_files=one_bucket_files,
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
-            identifier_fields=["
-
+            identifier_fields=test_case["identifier_fields"],
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
-
+            filesystem=s3_file_system,
+            s3_client_kwargs={},
+            task_memory=TASK_MEMORY_BYTES,
         )
+        convert_inputs.append(convert_input)
+    return convert_inputs
 
-    number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_1 = pa.Table.from_arrays(
-        [number_partitioned_array_1, primary_key_array_1], names=names
-    )
-
-    number_partitioned_array_2 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_2 = pa.Table.from_arrays(
-        [number_partitioned_array_2, primary_key_array_2], names=names
-    )
 
-
-
-    names = ["number_partitioned", "primary_key"]
-    data_table_3 = pa.Table.from_arrays(
-        [number_partitioned_array_3, primary_key_array_3], names=names
-    )
+def process_convert_result(convert_result: Any) -> Tuple[List[Any], List[Any]]:
+    """Helper function to process convert results
 
-
-
-    )
-    download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
-
-    convert_ref = convert.remote(convert_input)
+    Args:
+        convert_result: The result from convert_session
 
+    Returns:
+        Tuple[List[Any], List[Any]]: Lists of files to be deleted and added
+    """
     to_be_deleted_files_list = []
-
-    convert_result
-
-
+    to_be_added_files_list = []
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
+    return to_be_deleted_files_list, to_be_added_files_list
 
-    if convert_result[0]:
-        to_be_deleted_files_list.extend(convert_result[0].values())
 
-
-
+def verify_result(result, expected_result, verify_pos_index=False):
+    """Verify the result matches the expected result.
 
-
-
-
+    Args:
+        result: The result to verify
+        expected_result: The expected result
+        verify_pos_index: Whether to verify position values for primary keys
+    """
+    if "primary_keys" in expected_result and "primary_key" in result:
+        # Single key case
+        assert set(result["primary_key"]) == set(expected_result["primary_keys"])
+        if verify_pos_index and "pk_to_pos" in expected_result:
+            for index in range(len(result["primary_key"])):
+                assert (
+                    result["pos"][index]
+                    == expected_result["pk_to_pos"][result["primary_key"][index]]
+                )
+    elif "pk_tuples" in expected_result:
+        pk_combined_res = []
+        for pk1, pk2 in zip(
+            result["primary_key1"],
+            result["primary_key2"],
+        ):
+            pk_combined_res.append((pk1, pk2))
+
+        # Multi-key case
+        assert set(pk_combined_res) == set(expected_result["pk_tuples"])
+    else:
+        assert set(result) == set(expected_result["primary_keys"])
+
+
+def verify_spark_read_results(spark, identifier, expected_result):
+    spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+    all_pk = [
+        spark_read_pos_delete[row_idx][1]
+        for row_idx in range(len(spark_read_pos_delete))
+    ]
+    verify_result(all_pk, expected_result, verify_pos_index=False)
 
-    # 4. Commit position delete, delete equality deletes from table
-    new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-        io=tbl.io,
-        table_metadata=tbl.metadata,
-        files_dict_list=to_be_added_files_dict_list,
-    )
-    commit_append_snapshot(
-        iceberg_table=tbl,
-        new_position_delete_files=new_position_delete_files,
-    )
-    tbl.refresh()
 
-
-
+def get_file_prefix(tbl):
+    """Get the file prefix from a table's data files.
 
-
-
-    assert all_pk == ["pk1", "pk2", "pk3", "pk4"]
+    Args:
+        tbl: The table to get the file prefix from
 
-
-
-
-
-    )
-
+    Returns:
+        str: The file prefix
+    """
+    df = tbl.inspect.entries()
+    data_files = df.to_pydict()["data_file"]
+    file_link = data_files[0]["file_path"]
+    file_prefix = "/".join(file_link.split("/")[:-1])
+    return file_prefix.split("//")[1]
+
+
+# Test cases configuration
+TEST_CASES = [
+    {
+        "name": "single_key_drop_duplicates",
+        "table_name": "table_converter_ray_drop_duplicates_success",
+        "schema": "base_schema",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [1, 2, 3],
+            },
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [4, 5, 6],
+            },
+            {
+                "primary_key": ["pk4", "pk2", "pk3"],
+                "file_path": ["path4", "path2", "path3"],
+                "pos": [7, 8, 9],
+            },
+        ],
+        "expected_result": {
+            "primary_keys": ["pk1", "pk2", "pk3", "pk4"],
+            "pk_to_pos": {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7},
+        },
+    },
+    {
+        "name": "multi_key_drop_duplicates",
+        "table_name": "table_converter_ray_pos_delete_multiple_identifier_fields",
+        "schema": "multi_key_schema_without_file_path",
+        "identifier_fields": ["primary_key1", "primary_key2"],
+        "mock_data": [
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk4", "pk2", "pk3"], "primary_key2": [1, 3, 4]},
+        ],
+        "expected_result": {
+            "pk_tuples": [
+                ("pk1", 1),
+                ("pk2", 2),
+                ("pk2", 3),
+                ("pk3", 3),
+                ("pk3", 4),
+                ("pk4", 1),
+            ]
+        },
+    },
+    {
+        "name": "equality_delete",
+        "table_name": "table_converter_ray_equality_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "equality_delete_data_mock": [{"primary_key": ["pk1"]}],
+        "equality_delete_data": pa.Table.from_arrays(["pk1"], names=["primary_key"]),
+        "verify_spark_read": True,
+        "expected_result": {"primary_keys": ["pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete",
+        "table_name": "table_converter_ray_position_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete_read_by_spark",
+        "table_name": "table_converter_ray_pos_delete_read_by_spark_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+        "verify_spark_read": True,
+        "expected_spark_count": 4,
+    },
+]
 
 
+@pytest.mark.parametrize("test_case", TEST_CASES)
 @pytest.mark.integration
-def
-
+def test_converter(
+    test_case: Dict[str, Any],
+    spark,
+    session_catalog: RestCatalog,
+    setup_ray_cluster,
+    mocker,
+    request,
 ) -> None:
     """
-
+    Parameterized test for converter functionality.
+    Tests drop duplicates, equality delete, and position delete scenarios.
     """
+    # Get schema fixture based on test case
+    schema = request.getfixturevalue(test_case["schema"])
+
+    # Create test table
+    identifier = create_test_table(
+        session_catalog=session_catalog,
+        namespace="default",
+        table_name=test_case["table_name"],
+        schema=schema,
+        partition_spec=request.getfixturevalue("base_partition_spec"),
+        properties=request.getfixturevalue("table_properties"),
+    )
 
-    #
-
-    table_name = "table_converter_ray_pos_delete_read_by_spark_success"
-    identifier = f"{namespace}.{table_name}"
+    # Insert test data
+    insert_test_data(spark, identifier, test_case)
 
-
-
-
-        ),
-        NestedField(
-            field_id=2, name="primary_key", field_type=StringType(), required=False
-        ),
-        schema_id=0,
-    )
+    # Get files and create convert input
+    tbl = session_catalog.load_table(identifier)
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
 
-
-
-
-
-
+    # Handle equality delete if present
+    if "equality_delete_data" in test_case:
+        tbl = session_catalog.load_table(identifier)
+        file_prefix = get_file_prefix(tbl)
+        partition_value = Record(number_partitioned=0)
+
+        # Note: Just upload to S3 to mock input data here.
+        # NOT committing to Iceberg metadata as equality delete write path not implemented in Pyiceberg/Spark.
+        equality_file_list = commit_equality_delete_to_table(
+            table=tbl,
+            partition_value=partition_value,
+            equality_delete_table=test_case["equality_delete_data"],
+            file_link_prefix=file_prefix,
+        )
+        # Mock equality delete input to converter with latest file sequence, so equality delete can be applied to all data before
+        equality_delete_dict = defaultdict()
+        equality_delete_dict[partition_value] = [(4, equality_file_list[0])]
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
    )
-    partition_spec = PartitionSpec(partition_field_identity)
 
-
-
-
-
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"
+    s3_file_system = get_s3_file_system()
+    convert_inputs = create_convert_input(
+        tbl, convert_input_files_for_all_buckets, test_case, s3_file_system
+    )
 
-
-
-
-
-        partition_spec=partition_spec,
-        properties=properties,
+    # Create and set up mock data
+    mock_data_tables = create_mock_data_tables(test_case)
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
    )
 
-
-    tbl = session_catalog.load_table(identifier)
+    download_data_mock.side_effect = mock_data_tables
 
-
-
-
-
-
-
-
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4"), (0, "pk2"), (0, "pk3")
-            """
-        ],
+    # Run conversion
+    convert_ref = convert.remote(convert_inputs[0])
+    convert_result = ray.get(convert_ref)
+
+    # Process results
+    to_be_deleted_files_list, to_be_added_files_list = process_convert_result(
+        convert_result
    )
+
+    if not to_be_deleted_files_list:
+        # Commit changes
+        commit_append_snapshot(
+            iceberg_table=tbl,
+            new_position_delete_files=to_be_added_files_list,
+        )
+    else:
+        commit_replace_snapshot(
+            iceberg_table=tbl,
+            to_be_deleted_files=to_be_deleted_files_list[0],
+            new_position_delete_files=to_be_added_files_list,
+        )
     tbl.refresh()
 
-    #
-
+    # Verify results
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
 
-
-
-
-
-
+    # Verify Spark read if required
+    if test_case.get("verify_spark_read", False):
+        verify_spark_read_results(spark, identifier, test_case["expected_result"])
+    else:
+        verify_result(
+            pyiceberg_scan_table_rows,
+            test_case["expected_result"],
+            verify_pos_index=test_case.get("verify_pos_index", False),
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
+
+def test_converter_session_with_local_filesystem_and_duplicate_ids(
+    setup_ray_cluster,
+) -> None:
+    """
+    Test converter_session functionality with local PyArrow filesystem using duplicate IDs.
+    This test simulates the pattern where duplicate IDs represent updates to existing records.
+    The converter should merge these updates by creating position delete files.
+    """
+    with temp_dir_autocleanup() as temp_catalog_dir:
+        # Create warehouse directory
+        warehouse_path = os.path.join(temp_catalog_dir, "iceberg_warehouse")
+        os.makedirs(warehouse_path, exist_ok=True)
+
+        # Set up local in-memory catalog
+        local_catalog = load_catalog(
+            "local_sql_catalog",
+            **{
+                "type": "in-memory",
+                "warehouse": warehouse_path,
+            },
        )
 
-
-
-
+        # Create local PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        local_filesystem = pafs.LocalFileSystem()
+
+        # Define schema (id, name, value, version)
+        schema = Schema(
+            NestedField(field_id=1, name="id", field_type=LongType(), required=True),
+            NestedField(
+                field_id=2, name="name", field_type=StringType(), required=False
+            ),
+            NestedField(
+                field_id=3, name="value", field_type=LongType(), required=False
+            ),
+            NestedField(
+                field_id=4, name="version", field_type=LongType(), required=False
+            ),
+            schema_id=0,
+        )
 
-
-
-
+        # Create table properties for merge-on-read
+        properties = {
+            "write.format.default": "parquet",
+            "write.delete.mode": "merge-on-read",
+            "write.update.mode": "merge-on-read",
+            "write.merge.mode": "merge-on-read",
+            "format-version": "2",
+        }
+
+        # Create the table
+        table_identifier = "default.test_duplicate_ids"
+        try:
+            local_catalog.create_namespace("default")
+        except NamespaceAlreadyExistsError:
+            pass  # Namespace may already exist
+        try:
+            local_catalog.drop_table(table_identifier)
+        except NoSuchTableError:
+            pass  # Table may not exist
+
+        local_catalog.create_table(
+            table_identifier,
+            schema=schema,
+            properties=properties,
+        )
+        tbl = local_catalog.load_table(table_identifier)
 
-
-
-
+        # Set the name mapping property so Iceberg can read parquet files without field IDs
+        with tbl.transaction() as tx:
+            tx.set_properties(
+                **{"schema.name-mapping.default": schema.name_mapping.model_dump_json()}
+            )
 
-
-
-
-
+        # Step 1: Write initial data
+        # Create PyArrow table with explicit schema to match Iceberg schema
+        arrow_schema = schema_to_pyarrow(schema)
+
+        initial_data = pa.table(
+            {
+                "id": [1, 2, 3, 4],
+                "name": ["Alice", "Bob", "Charlie", "David"],
+                "value": [100, 200, 300, 400],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )
 
-
+        # Step 2: Write additional data
+        additional_data = pa.table(
+            {
+                "id": [5, 6, 7, 8],
+                "name": ["Eve", "Frank", "Grace", "Henry"],
+                "value": [500, 600, 700, 800],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )
 
-
-
-
+        # Step 3: Write updates to existing records (this creates duplicates by ID)
+        # These should overwrite the original records with same IDs
+        updated_data = pa.table(
+            {
+                "id": [2, 3, 9],  # IDs 2 and 3 are duplicates, 9 is new
+                "name": [
+                    "Robert",
+                    "Charles",
+                    "Ivan",
+                ],  # Updated names for Bob and Charlie
+                "value": [201, 301, 900],  # Updated values
+                "version": [2, 2, 1],  # Higher version numbers for updates
+            },
+            schema=arrow_schema,
+        )
 
-
+        # Write all data to separate parquet files to simulate multiple writes
+        data_files_to_commit = []
 
-
-
+        for i, data in enumerate([initial_data, additional_data, updated_data]):
+            data_file_path = os.path.join(warehouse_path, f"data_{i}.parquet")
+            pq.write_table(data, data_file_path)
 
-
-
+            # Create DataFile objects for Iceberg
+            parquet_metadata = pq.read_metadata(data_file_path)
+            file_size = os.path.getsize(data_file_path)
 
-
-
-
+            # Check schema compatibility
+            _check_pyarrow_schema_compatible(
+                schema, parquet_metadata.schema.to_arrow_schema()
+            )
 
-
-
-
-
-
-
+            # Calculate statistics
+            statistics = data_file_statistics_from_parquet_metadata(
+                parquet_metadata=parquet_metadata,
+                stats_columns=compute_statistics_plan(schema, tbl.metadata.properties),
+                parquet_column_mapping=parquet_path_to_id_mapping(schema),
+            )
 
-
-
-
-
-
+            data_file = DataFile(
+                content=DataFileContent.DATA,
+                file_path=data_file_path,
+                file_format=FileFormat.PARQUET,
+                partition={},  # No partitioning
+                file_size_in_bytes=file_size,
+                sort_order_id=None,
+                spec_id=tbl.metadata.default_spec_id,
+                key_metadata=None,
+                equality_ids=None,
+                **statistics.to_serialized_dict(),
+            )
+            data_files_to_commit.append(data_file)
+
+        # Commit all data files to the table
+        with tbl.transaction() as tx:
+            with tx.update_snapshot().fast_append() as update_snapshot:
+                for data_file in data_files_to_commit:
+                    update_snapshot.append_data_file(data_file)
+
+        tbl.refresh()
+
+        # Verify we have duplicate IDs before conversion
+        initial_scan = tbl.scan().to_arrow().to_pydict()
+        print(f"Before conversion - Records with IDs: {sorted(initial_scan['id'])}")
+
+        # There should be duplicates: [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        expected_duplicate_ids = [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        assert (
+            sorted(initial_scan["id"]) == expected_duplicate_ids
+        ), f"Expected duplicate IDs {expected_duplicate_ids}, got {sorted(initial_scan['id'])}"
+
+        # Now call converter_session to convert equality deletes to position deletes
+        converter_params = ConverterSessionParams.of(
+            {
+                "catalog": local_catalog,
+                "iceberg_table_name": table_identifier,
+                "iceberg_warehouse_bucket_name": warehouse_path,  # Local warehouse path
+                "merge_keys": ["id"],  # Use ID as the merge key
+                "enforce_primary_key_uniqueness": True,
+                "task_max_parallelism": 1,  # Single task for local testing
+                "filesystem": local_filesystem,
+                "location_provider_prefix_override": None,  # Use local filesystem
+                "location_provider_prefix_override": None,  # Let the system auto-generate the prefix
+            }
+        )
 
-
-
-
-
-
-
-
-
+        print(f"Running converter_session with local filesystem...")
+        print(f"Warehouse path: {warehouse_path}")
+        print(f"Merge keys: ['id']")
+        print(f"Enforce uniqueness: True")
+
+        # Run the converter
+        converter_session(params=converter_params)
+
+        # Refresh table and scan again
+        tbl.refresh()
+        final_scan = tbl.scan().to_arrow().to_pydict()
+
+        print(f"After conversion - Records with IDs: {sorted(final_scan['id'])}")
+        print(f"Final data: {final_scan}")
+
+        # Verify position delete files were created by checking table metadata
+        latest_snapshot = tbl.metadata.current_snapshot()
+        if latest_snapshot:
+            manifests = latest_snapshot.manifests(tbl.io)
+            position_delete_files = []
+
+            for manifest in manifests:
+                entries = manifest.fetch_manifest_entry(tbl.io)
+                for entry in entries:
+                    if entry.data_file.content == DataFileContent.POSITION_DELETES:
+                        position_delete_files.append(entry.data_file.file_path)
+
+            print(f"Position delete files found: {position_delete_files}")
+            assert (
+                len(position_delete_files) > 0
+            ), "No position delete files were created by converter_session"
+
+        # Verify the final result has unique IDs (duplicates should be resolved)
+        # Expected: Latest values for each ID based on the updates
+        expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # All unique IDs
+        actual_ids = sorted(final_scan["id"])
+
+        print(f"Expected unique IDs: {expected_unique_ids}")
+        print(f"Actual IDs after conversion: {actual_ids}")
+
+        assert (
+            actual_ids == expected_unique_ids
+        ), f"Expected unique IDs {expected_unique_ids}, got {actual_ids}"
+
+        # Verify the updated values are present (higher version should win)
+        final_data_by_id = {}
+        for i, id_val in enumerate(final_scan["id"]):
+            final_data_by_id[id_val] = {
+                "name": final_scan["name"][i],
+                "value": final_scan["value"][i],
+                "version": final_scan["version"][i],
+            }
+
+        # Check that ID 2 has updated value (Robert, 201, version 2)
+        assert (
+            final_data_by_id[2]["name"] == "Robert"
+        ), f"ID 2 should have updated name 'Robert', got '{final_data_by_id[2]['name']}'"
+        assert (
+            final_data_by_id[2]["value"] == 201
+        ), f"ID 2 should have updated value 201, got {final_data_by_id[2]['value']}"
+        assert (
+            final_data_by_id[2]["version"] == 2
+        ), f"ID 2 should have version 2, got {final_data_by_id[2]['version']}"
+
+        # Check that ID 3 has updated value (Charles, 301, version 2)
+        assert (
+            final_data_by_id[3]["name"] == "Charles"
+        ), f"ID 3 should have updated name 'Charles', got '{final_data_by_id[3]['name']}'"
+        assert (
+            final_data_by_id[3]["value"] == 301
+        ), f"ID 3 should have updated value 301, got {final_data_by_id[3]['value']}"
+        assert (
+            final_data_by_id[3]["version"] == 2
+        ), f"ID 3 should have version 2, got {final_data_by_id[3]['version']}"
+
+        # Check that new ID 9 is present
+        assert (
+            final_data_by_id[9]["name"] == "Ivan"
+        ), f"ID 9 should have name 'Ivan', got '{final_data_by_id[9]['name']}'"
+        assert (
+            final_data_by_id[9]["value"] == 900
+        ), f"ID 9 should have value 900, got {final_data_by_id[9]['value']}"
+
+        print(f"✅ Test completed successfully!")
+        print(
+            f"✅ Position delete files were created: {len(position_delete_files)} files"
+        )
+        print(f"✅ Duplicate IDs were resolved correctly")
+        print(
+            f"✅ Updated values were applied (ID 2: Bob->Robert, ID 3: Charlie->Charles)"
+        )
+        print(f"✅ Final table has {len(actual_ids)} unique records")
+        print(f"✅ Temporary warehouse cleaned up at: {temp_catalog_dir}")