deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
+from collections import defaultdict
 import pytest
 import ray
-from typing import List
+from typing import List, Dict, Any, Tuple
 from pyiceberg.catalog.rest import RestCatalog
-from pyiceberg.expressions import EqualTo
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     NestedField,
@@ -25,144 +25,85 @@ from deltacat.compute.converter.utils.converter_session_utils import (
 from deltacat.tests.compute.converter.utils import (
     get_s3_file_system,
     drop_table_if_exists,
+    commit_equality_delete_to_table,
 )
 from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
     commit_append_snapshot,
+    commit_replace_snapshot,
 )

+from pyiceberg.typedef import Record
+from deltacat.compute.converter.utils.convert_task_options import BASE_MEMORY_BUFFER
+from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from pyiceberg.catalog import load_catalog
+import os
+import pyarrow.parquet as pq
+from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+from pyiceberg.io.pyarrow import (
+    data_file_statistics_from_parquet_metadata,
+    compute_statistics_plan,
+    parquet_path_to_id_mapping,
+)
+from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
+from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchTableError
+from pyiceberg.io.pyarrow import schema_to_pyarrow

-
-
-        spark.sql(sql)
+# Task memory in bytes for testing
+TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER


-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (10, 20), (10, 30)
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (11, 20), (11, 30)
-            """,
-        ],
+# Test data fixtures
+@pytest.fixture
+def base_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
     )

-    tbl = session_catalog.load_table(identifier)
-    tbl.delete(EqualTo("number_partitioned", 10))
-
-    # No overwrite operation
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
-        "append",
-        "append",
-        "delete",
-    ]
-    assert tbl.scan().to_arrow().to_pydict() == {
-        "number_partitioned": [11, 11],
-        "number": [20, 30],
-    }
-
-
-@pytest.mark.integration
-def test_spark_position_delete_production_sanity(
-    spark, session_catalog: RestCatalog
-) -> None:
-    """
-    Sanity test to ensure Spark position delete production is successful with `merge-on-read` spec V2.
-    Table has two partition levels. 1. BucketTransform on primary key
-    """
-    identifier = "default.table_spark_position_delete_production_sanity"
-
-    run_spark_commands(
-        spark,
-        [
-            f"DROP TABLE IF EXISTS {identifier}",
-            f"""
-            CREATE TABLE {identifier} (
-                number_partitioned INT,
-                primary_key STRING
-            )
-            USING iceberg
-            PARTITIONED BY (bucket(3, primary_key), number_partitioned)
-            TBLPROPERTIES(
-                'format-version' = 2,
-                'write.delete.mode'='merge-on-read',
-                'write.update.mode'='merge-on-read',
-                'write.merge.mode'='merge-on-read'
-            )
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (0, 'pk1'), (0, 'pk2'), (0, 'pk3')
-            """,
-            f"""
-            INSERT INTO {identifier} VALUES (1, 'pk1'), (1, 'pk2'), (1, 'pk3')
-            """,
-        ],
-    )

-
-
-
-
-
-
-
+@pytest.fixture
+def base_schema_without_metadata():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        schema_id=0,
     )

-    tbl = session_catalog.load_table(identifier)
-    tbl.refresh()
-
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
-        "append",
-        "append",
-        "delete",
-    ]
-
-    assert tbl.scan().to_arrow().to_pydict() == {
-        "number_partitioned": [1, 1, 0, 0],
-        "primary_key": ["pk2", "pk3", "pk2", "pk3"],
-    }
-
-
-@pytest.mark.integration
-def test_converter_drop_duplicates_success(
-    spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
-) -> None:
-    """
-    Test for convert compute remote function happy case. Download file results are mocked.
-    """
-
-    # 1. Create Iceberg table
-    namespace = "default"
-    table_name = "table_converter_ray_pos_delete_drop_duplicates_compute"
-    identifier = f"{namespace}.{table_name}"

-
+@pytest.fixture
+def multi_key_schema():
+    return Schema(
         NestedField(
             field_id=1, name="number_partitioned", field_type=LongType(), required=False
         ),
         NestedField(
-            field_id=2, name="
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
         ),
-        # Explicitly define "file_path" and "pos" for assertion of deterministic record after dedupe
         NestedField(
             field_id=2147483546,
             name="file_path",
@@ -175,21 +116,55 @@ def test_converter_drop_duplicates_success(
         schema_id=0,
     )

+
+@pytest.fixture
+def multi_key_schema_without_file_path():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_partition_spec():
     partition_field_identity = PartitionField(
         source_id=1,
         field_id=101,
         transform=IdentityTransform(),
         name="number_partitioned",
     )
-
+    return PartitionSpec(partition_field_identity)

-    properties = dict()
-    properties["write.format.default"] = "parquet"
-    properties["write.delete.mode"] = "merge-on-read"
-    properties["write.update.mode"] = "merge-on-read"
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"

+@pytest.fixture
+def table_properties():
+    return {
+        "write.format.default": "parquet",
+        "write.delete.mode": "merge-on-read",
+        "write.update.mode": "merge-on-read",
+        "write.merge.mode": "merge-on-read",
+        "format-version": "2",
+    }
+
+
+def create_test_table(
+    session_catalog: RestCatalog,
+    namespace: str,
+    table_name: str,
+    schema: Schema,
+    partition_spec: PartitionSpec,
+    properties: Dict[str, str],
+) -> str:
+    """Helper function to create a test table"""
+    identifier = f"{namespace}.{table_name}"
     drop_table_if_exists(identifier, session_catalog)
     session_catalog.create_table(
         identifier,
@@ -197,204 +172,323 @@ def test_converter_drop_duplicates_success(
         partition_spec=partition_spec,
         properties=properties,
     )
+    return identifier

-    # 2. Use Spark to generate initial data files
-    tbl = session_catalog.load_table(identifier)
-    tbl.refresh()
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 1), (0, "pk2", "path2", 2), (0, "pk3", "path3", 3)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", "path1", 4), (0, "pk2", "path2", 5), (0, "pk3", "path3", 6)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4", "path4", 7), (0, "pk2", "path2", 8), (0, "pk3", "path3", 9)
-            """
-        ],
-    )

-
-
-
+def create_mock_data_tables(test_case: Dict[str, Any]) -> Tuple[daft.DataFrame, ...]:
+    """Helper function to create mock data tables based on test case"""
+    tables = []
+    for data in test_case["mock_data"]:
+        if "primary_key2" in data:  # Multi-key case
+            names = ["primary_key1", "primary_key2"]
+            table = pa.Table.from_arrays(
+                [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                names=names,
+            )
+        else:  # Single key case
+            names = ["primary_key"]
+            table = pa.Table.from_arrays([pa.array(data["primary_key"])], names=names)
+        tables.append(daft.from_arrow(table))
+    if "equality_delete_data_mock" in test_case:
+        for data in test_case["equality_delete_data_mock"]:
+            if "primary_key2" in data:  # Multi-key case
+                names = ["primary_key1", "primary_key2"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                    names=names,
+                )
+            else:  # Single key case
+                names = ["primary_key"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key"])], names=names
+                )
+            tables.append(daft.from_arrow(table))
+    return tuple(tables)

-    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
-        data_file_dict=data_file_dict,
-        equality_delete_dict=equality_delete_dict,
-        pos_delete_dict=pos_delete_dict,
-    )

-
+def run_spark_commands(spark, sqls: List[str]) -> None:
+    """Helper function to run Spark SQL commands"""
+    for sql in sqls:
+        spark.sql(sql)
+
+
+def insert_test_data(spark, identifier: str, test_case: Dict[str, Any]) -> None:
+    """Helper function to insert test data into the table"""
+    if "primary_key2" in test_case["mock_data"][0]:
+        # Multi-key case
+        for data in test_case["mock_data"]:
+            values = ", ".join(
+                f"(0, '{pk1}', {pk2})"
+                for pk1, pk2 in zip(data["primary_key1"], data["primary_key2"])
+            )
+            run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+    else:
+        # Single key case
+        if test_case["schema"] == "base_schema":
+            # For drop duplicates test, use file_path and pos from mock_data
+            for data in test_case["mock_data"]:
+                values = ", ".join(
+                    f"(0, '{pk}', '{path}', {pos})"
+                    for pk, path, pos in zip(
+                        data["primary_key"], data["file_path"], data["pos"]
+                    )
+                )
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+        else:
+            # For other tests, just include the basic columns
+            for data in test_case["mock_data"]:
+                values = ", ".join(f"(0, '{pk}')" for pk in data["primary_key"])
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+
+
+def create_convert_input(
+    tbl,
+    convert_input_files_for_all_buckets: List[Any],
+    test_case: Dict[str, Any],
+    s3_file_system: Any,
+) -> List[ConvertInput]:
+    """Helper function to create convert inputs"""
+    convert_inputs = []
     for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
         convert_input = ConvertInput.of(
             convert_input_files=one_bucket_files,
             convert_task_index=i,
             iceberg_table_warehouse_prefix="warehouse/default",
-            identifier_fields=["
+            identifier_fields=test_case["identifier_fields"],
             table_io=tbl.io,
             table_metadata=tbl.metadata,
             compact_previous_position_delete_files=False,
             enforce_primary_key_uniqueness=True,
             position_delete_for_multiple_data_files=True,
             max_parallel_data_file_download=10,
-
+            filesystem=s3_file_system,
             s3_client_kwargs={},
+            task_memory=TASK_MEMORY_BYTES,
         )
+        convert_inputs.append(convert_input)
+    return convert_inputs

-    number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_1 = pa.Table.from_arrays(
-        [number_partitioned_array_1, primary_key_array_1], names=names
-    )
-
-    number_partitioned_array_2 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_2 = pa.Table.from_arrays(
-        [number_partitioned_array_2, primary_key_array_2], names=names
-    )
-
-    number_partitioned_array_3 = pa.array([0, 0, 0], type=pa.int32())
-    primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
-    names = ["number_partitioned", "primary_key"]
-    data_table_3 = pa.Table.from_arrays(
-        [number_partitioned_array_3, primary_key_array_3], names=names
-    )
-
-    daft_df_1 = daft.from_arrow(data_table_1)
-    daft_df_2 = daft.from_arrow(data_table_2)
-    daft_df_3 = daft.from_arrow(data_table_3)

-
-
-    )
-    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+def process_convert_result(convert_result: Any) -> Tuple[List[Any], List[Any]]:
+    """Helper function to process convert results

-
+    Args:
+        convert_result: The result from convert_session

+    Returns:
+        Tuple[List[Any], List[Any]]: Lists of files to be deleted and added
+    """
     to_be_deleted_files_list = []
-
-    convert_result = ray.get(convert_ref)
-
     to_be_added_files_list = []
-    # Check if there're files to delete
     if convert_result.to_be_deleted_files:
         to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
     if convert_result.to_be_added_files:
         to_be_added_files_list.extend(convert_result.to_be_added_files)
+    return to_be_deleted_files_list, to_be_added_files_list

-    commit_append_snapshot(
-        iceberg_table=tbl,
-        new_position_delete_files=to_be_added_files_list,
-    )
-    tbl.refresh()

-
-
+def verify_result(result, expected_result, verify_pos_index=False):
+    """Verify the result matches the expected result.
+
+    Args:
+        result: The result to verify
+        expected_result: The expected result
+        verify_pos_index: Whether to verify position values for primary keys
+    """
+    if "primary_keys" in expected_result and "primary_key" in result:
+        # Single key case
+        assert set(result["primary_key"]) == set(expected_result["primary_keys"])
+        if verify_pos_index and "pk_to_pos" in expected_result:
+            for index in range(len(result["primary_key"])):
+                assert (
+                    result["pos"][index]
+                    == expected_result["pk_to_pos"][result["primary_key"][index]]
+                )
+    elif "pk_tuples" in expected_result:
+        pk_combined_res = []
+        for pk1, pk2 in zip(
+            result["primary_key1"],
+            result["primary_key2"],
+        ):
+            pk_combined_res.append((pk1, pk2))
+
+        # Multi-key case
+        assert set(pk_combined_res) == set(expected_result["pk_tuples"])
+    else:
+        assert set(result) == set(expected_result["primary_keys"])
+
+
+def verify_spark_read_results(spark, identifier, expected_result):
+    spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+    all_pk = [
+        spark_read_pos_delete[row_idx][1]
+        for row_idx in range(len(spark_read_pos_delete))
+    ]
+    verify_result(all_pk, expected_result, verify_pos_index=False)
+
+
+def get_file_prefix(tbl):
+    """Get the file prefix from a table's data files.

-
-
-    assert all_pk == ["pk1", "pk2", "pk3", "pk4"]
+    Args:
+        tbl: The table to get the file prefix from

-
-
-
-
-    )
-
+    Returns:
+        str: The file prefix
+    """
+    df = tbl.inspect.entries()
+    data_files = df.to_pydict()["data_file"]
+    file_link = data_files[0]["file_path"]
+    file_prefix = "/".join(file_link.split("/")[:-1])
+    return file_prefix.split("//")[1]
+
+
+# Test cases configuration
+TEST_CASES = [
+    {
+        "name": "single_key_drop_duplicates",
+        "table_name": "table_converter_ray_drop_duplicates_success",
+        "schema": "base_schema",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [1, 2, 3],
+            },
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [4, 5, 6],
+            },
+            {
+                "primary_key": ["pk4", "pk2", "pk3"],
+                "file_path": ["path4", "path2", "path3"],
+                "pos": [7, 8, 9],
+            },
+        ],
+        "expected_result": {
+            "primary_keys": ["pk1", "pk2", "pk3", "pk4"],
+            "pk_to_pos": {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7},
+        },
+    },
+    {
+        "name": "multi_key_drop_duplicates",
+        "table_name": "table_converter_ray_pos_delete_multiple_identifier_fields",
+        "schema": "multi_key_schema_without_file_path",
+        "identifier_fields": ["primary_key1", "primary_key2"],
+        "mock_data": [
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk4", "pk2", "pk3"], "primary_key2": [1, 3, 4]},
+        ],
+        "expected_result": {
+            "pk_tuples": [
+                ("pk1", 1),
+                ("pk2", 2),
+                ("pk2", 3),
+                ("pk3", 3),
+                ("pk3", 4),
+                ("pk4", 1),
+            ]
+        },
+    },
+    {
+        "name": "equality_delete",
+        "table_name": "table_converter_ray_equality_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "equality_delete_data_mock": [{"primary_key": ["pk1"]}],
+        "equality_delete_data": pa.Table.from_arrays(["pk1"], names=["primary_key"]),
+        "verify_spark_read": True,
+        "expected_result": {"primary_keys": ["pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete",
+        "table_name": "table_converter_ray_position_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete_read_by_spark",
+        "table_name": "table_converter_ray_pos_delete_read_by_spark_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+        "verify_spark_read": True,
+        "expected_spark_count": 4,
+    },
+]


+@pytest.mark.parametrize("test_case", TEST_CASES)
 @pytest.mark.integration
-def 
-
+def test_converter(
+    test_case: Dict[str, Any],
+    spark,
+    session_catalog: RestCatalog,
+    setup_ray_cluster,
+    mocker,
+    request,
 ) -> None:
     """
-
+    Parameterized test for converter functionality.
+    Tests drop duplicates, equality delete, and position delete scenarios.
     """
-
-
-
-
-    identifier =
-
-
-
-            field_id=1, name="number_partitioned", field_type=LongType(), required=False
-        ),
-        NestedField(
-            field_id=2, name="primary_key", field_type=StringType(), required=False
-        ),
-        schema_id=0,
-    )
-
-    partition_field_identity = PartitionField(
-        source_id=1,
-        field_id=101,
-        transform=IdentityTransform(),
-        name="number_partitioned",
-    )
-    partition_spec = PartitionSpec(partition_field_identity)
-
-    properties = dict()
-    properties["write.format.default"] = "parquet"
-    properties["write.delete.mode"] = "merge-on-read"
-    properties["write.update.mode"] = "merge-on-read"
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"
-
-    drop_table_if_exists(identifier, session_catalog)
-    session_catalog.create_table(
-        identifier,
+    # Get schema fixture based on test case
+    schema = request.getfixturevalue(test_case["schema"])
+
+    # Create test table
+    identifier = create_test_table(
+        session_catalog=session_catalog,
+        namespace="default",
+        table_name=test_case["table_name"],
         schema=schema,
-        partition_spec=
-        properties=
+        partition_spec=request.getfixturevalue("base_partition_spec"),
+        properties=request.getfixturevalue("table_properties"),
     )

-    #
-
-
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4"), (0, "pk2"), (0, "pk3")
-            """
-        ],
-    )
-    tbl.refresh()
+    # Insert test data
+    insert_test_data(spark, identifier, test_case)

-    #
+    # Get files and create convert input
+    tbl = session_catalog.load_table(identifier)
     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)

+    # Handle equality delete if present
+    if "equality_delete_data" in test_case:
+        tbl = session_catalog.load_table(identifier)
+        file_prefix = get_file_prefix(tbl)
+        partition_value = Record(number_partitioned=0)
+
+        # Note: Just upload to S3 to mock input data here.
+        # NOT committing to Iceberg metadata as equality delete write path not implemented in Pyiceberg/Spark.
+        equality_file_list = commit_equality_delete_to_table(
+            table=tbl,
+            partition_value=partition_value,
+            equality_delete_table=test_case["equality_delete_data"],
+            file_link_prefix=file_prefix,
+        )
+        # Mock equality delete input to converter with latest file sequence, so equality delete can be applied to all data before
+        equality_delete_dict = defaultdict()
+        equality_delete_dict[partition_value] = [(4, equality_file_list[0])]
+
     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
         data_file_dict=data_file_dict,
         equality_delete_dict=equality_delete_dict,
@@ -402,240 +496,331 @@ def test_converter_pos_delete_read_by_spark_success(
     )

     s3_file_system = get_s3_file_system()
-
-
-
-            convert_task_index=i,
-            iceberg_table_warehouse_prefix="warehouse/default",
-            identifier_fields=["primary_key"],
-            table_io=tbl.io,
-            table_metadata=tbl.metadata,
-            compact_previous_position_delete_files=False,
-            enforce_primary_key_uniqueness=True,
-            position_delete_for_multiple_data_files=True,
-            max_parallel_data_file_download=10,
-            s3_file_system=s3_file_system,
-            s3_client_kwargs={},
-        )
-
-    primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["primary_key"]
-    data_table_1 = pa.Table.from_arrays([primary_key_array_1], names=names)
-
-    primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
-    names = ["primary_key"]
-    data_table_2 = pa.Table.from_arrays([primary_key_array_2], names=names)
-
-    primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
-    names = ["primary_key"]
-    data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)
-
-    daft_df_1 = daft.from_arrow(data_table_1)
-    daft_df_2 = daft.from_arrow(data_table_2)
-    daft_df_3 = daft.from_arrow(data_table_3)
+    convert_inputs = create_convert_input(
+        tbl, convert_input_files_for_all_buckets, test_case, s3_file_system
+    )

+    # Create and set up mock data
+    mock_data_tables = create_mock_data_tables(test_case)
     download_data_mock = mocker.patch(
         "deltacat.compute.converter.utils.io.daft_read_parquet"
     )
-    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)

-
+    download_data_mock.side_effect = mock_data_tables

-
-
+    # Run conversion
+    convert_ref = convert.remote(convert_inputs[0])
     convert_result = ray.get(convert_ref)

-
-
-
-        to_be_added_files_list.extend(convert_result.to_be_added_files)
-
-    # 4. Commit position delete, delete equality deletes from table
-    commit_append_snapshot(
-        iceberg_table=tbl,
-        new_position_delete_files=to_be_added_files_list,
+    # Process results
+    to_be_deleted_files_list, to_be_added_files_list = process_convert_result(
+        convert_result
     )
+
+    if not to_be_deleted_files_list:
+        # Commit changes
+        commit_append_snapshot(
+            iceberg_table=tbl,
+            new_position_delete_files=to_be_added_files_list,
+        )
+    else:
+        commit_replace_snapshot(
+            iceberg_table=tbl,
+            to_be_deleted_files=to_be_deleted_files_list[0],
+            new_position_delete_files=to_be_added_files_list,
+        )
     tbl.refresh()

-    #
-
-    all_pk = [
-        spark_read_pos_delete[row_idx][1]
-        for row_idx in range(len(spark_read_pos_delete))
-    ]
-    all_pk_sorted = sorted(all_pk)
-    assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
+    # Verify results
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()

+    # Verify Spark read if required
+    if test_case.get("verify_spark_read", False):
+        verify_spark_read_results(spark, identifier, test_case["expected_result"])
+    else:
+        verify_result(
+            pyiceberg_scan_table_rows,
+            test_case["expected_result"],
+            verify_pos_index=test_case.get("verify_pos_index", False),
+        )

-
-def 
-
+
+def test_converter_session_with_local_filesystem_and_duplicate_ids(
+    setup_ray_cluster,
 ) -> None:
     """
-    Test
+    Test converter_session functionality with local PyArrow filesystem using duplicate IDs.
+    This test simulates the pattern where duplicate IDs represent updates to existing records.
+    The converter should merge these updates by creating position delete files.
     """
+    with temp_dir_autocleanup() as temp_catalog_dir:
+        # Create warehouse directory
+        warehouse_path = os.path.join(temp_catalog_dir, "iceberg_warehouse")
+        os.makedirs(warehouse_path, exist_ok=True)
+
+        # Set up local in-memory catalog
+        local_catalog = load_catalog(
+            "local_sql_catalog",
+            **{
+                "type": "in-memory",
+                "warehouse": warehouse_path,
+            },
+        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    partition_field_identity = PartitionField(
-        source_id=1,
-        field_id=101,
-        transform=IdentityTransform(),
-        name="number_partitioned",
-    )
-    partition_spec = PartitionSpec(partition_field_identity)
-
-    properties = dict()
-    properties["write.format.default"] = "parquet"
-    properties["write.delete.mode"] = "merge-on-read"
-    properties["write.update.mode"] = "merge-on-read"
-    properties["write.merge.mode"] = "merge-on-read"
-    properties["format-version"] = "2"
-
-    drop_table_if_exists(identifier, session_catalog)
-    session_catalog.create_table(
-        identifier,
-        schema=schema,
-        partition_spec=partition_spec,
-        properties=properties,
-    )
-
-    # 2. Use Spark to generate initial data files
-    tbl = session_catalog.load_table(identifier)
-
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk1", 1), (0, "pk2", 2), (0, "pk3", 3)
-            """
-        ],
-    )
-    run_spark_commands(
-        spark,
-        [
-            f"""
-            INSERT INTO {identifier} VALUES (0, "pk4", 1), (0, "pk2", 3), (0, "pk3", 4)
-            """
-        ],
-    )
-    tbl.refresh()
-
-    # 3. Use convert.remote() function to compute position deletes
-    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
-
-    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
-        data_file_dict=data_file_dict,
-        equality_delete_dict=equality_delete_dict,
-        pos_delete_dict=pos_delete_dict,
-    )
-
-    s3_file_system = get_s3_file_system()
-    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
-        convert_input = ConvertInput.of(
-            convert_input_files=one_bucket_files,
-            convert_task_index=i,
-            iceberg_table_warehouse_prefix="warehouse/default",
-            identifier_fields=["primary_key1", "primary_key2"],
-            table_io=tbl.io,
-            table_metadata=tbl.metadata,
-            compact_previous_position_delete_files=False,
-            enforce_primary_key_uniqueness=True,
-            position_delete_for_multiple_data_files=True,
-            max_parallel_data_file_download=10,
-            s3_file_system=s3_file_system,
-            s3_client_kwargs={},
+        # Create local PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        local_filesystem = pafs.LocalFileSystem()
+
+        # Define schema (id, name, value, version)
+        schema = Schema(
+            NestedField(field_id=1, name="id", field_type=LongType(), required=True),
+            NestedField(
+                field_id=2, name="name", field_type=StringType(), required=False
+            ),
+            NestedField(
+                field_id=3, name="value", field_type=LongType(), required=False
+            ),
+            NestedField(
+                field_id=4, name="version", field_type=LongType(), required=False
+            ),
+            schema_id=0,
         )

-
+        # Create table properties for merge-on-read
+        properties = {
+            "write.format.default": "parquet",
+            "write.delete.mode": "merge-on-read",
+            "write.update.mode": "merge-on-read",
+            "write.merge.mode": "merge-on-read",
+            "format-version": "2",
+        }
+
+        # Create the table
+        table_identifier = "default.test_duplicate_ids"
+        try:
+            local_catalog.create_namespace("default")
+        except NamespaceAlreadyExistsError:
+            pass  # Namespace may already exist
+        try:
+            local_catalog.drop_table(table_identifier)
+        except NoSuchTableError:
+            pass  # Table may not exist
+
+        local_catalog.create_table(
+            table_identifier,
+            schema=schema,
+            properties=properties,
+        )
+        tbl = local_catalog.load_table(table_identifier)

-
-
-
-
-
+        # Set the name mapping property so Iceberg can read parquet files without field IDs
+        with tbl.transaction() as tx:
+            tx.set_properties(
+                **{"schema.name-mapping.default": schema.name_mapping.model_dump_json()}
+            )

-
-
-
-
-
+        # Step 1: Write initial data
+        # Create PyArrow table with explicit schema to match Iceberg schema
+        arrow_schema = schema_to_pyarrow(schema)
+
+        initial_data = pa.table(
+            {
+                "id": [1, 2, 3, 4],
+                "name": ["Alice", "Bob", "Charlie", "David"],
+                "value": [100, 200, 300, 400],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )

-
-
-
-
-
+        # Step 2: Write additional data
+        additional_data = pa.table(
+            {
+                "id": [5, 6, 7, 8],
+                "name": ["Eve", "Frank", "Grace", "Henry"],
+                "value": [500, 600, 700, 800],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )

-
-
-
+        # Step 3: Write updates to existing records (this creates duplicates by ID)
+        # These should overwrite the original records with same IDs
+        updated_data = pa.table(
+            {
+                "id": [2, 3, 9],  # IDs 2 and 3 are duplicates, 9 is new
+                "name": [
+                    "Robert",
+                    "Charles",
+                    "Ivan",
+                ],  # Updated names for Bob and Charlie
+                "value": [201, 301, 900],  # Updated values
+                "version": [2, 2, 1],  # Higher version numbers for updates
+            },
+            schema=arrow_schema,
+        )

-
-
-    )
-    download_data_mock.side_effect = (daft_df_1, daft_df_2, daft_df_3)
+        # Write all data to separate parquet files to simulate multiple writes
+        data_files_to_commit = []

-
+        for i, data in enumerate([initial_data, additional_data, updated_data]):
+            data_file_path = os.path.join(warehouse_path, f"data_{i}.parquet")
+            pq.write_table(data, data_file_path)

-
-
-
+            # Create DataFile objects for Iceberg
+            parquet_metadata = pq.read_metadata(data_file_path)
+            file_size = os.path.getsize(data_file_path)

-
-
-
-
+            # Check schema compatibility
+            _check_pyarrow_schema_compatible(
+                schema, parquet_metadata.schema.to_arrow_schema()
+            )

-
+            # Calculate statistics
+            statistics = data_file_statistics_from_parquet_metadata(
+                parquet_metadata=parquet_metadata,
+                stats_columns=compute_statistics_plan(schema, tbl.metadata.properties),
+                parquet_column_mapping=parquet_path_to_id_mapping(schema),
+            )

-
-
-
-
-
+            data_file = DataFile(
+                content=DataFileContent.DATA,
+                file_path=data_file_path,
+                file_format=FileFormat.PARQUET,
+                partition={},  # No partitioning
+                file_size_in_bytes=file_size,
+                sort_order_id=None,
+                spec_id=tbl.metadata.default_spec_id,
+                key_metadata=None,
+                equality_ids=None,
+                **statistics.to_serialized_dict(),
+            )
+            data_files_to_commit.append(data_file)
+
+        # Commit all data files to the table
+        with tbl.transaction() as tx:
+            with tx.update_snapshot().fast_append() as update_snapshot:
+                for data_file in data_files_to_commit:
+                    update_snapshot.append_data_file(data_file)
+
+        tbl.refresh()
+
+        # Verify we have duplicate IDs before conversion
+        initial_scan = tbl.scan().to_arrow().to_pydict()
+        print(f"Before conversion - Records with IDs: {sorted(initial_scan['id'])}")
+
+        # There should be duplicates: [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        expected_duplicate_ids = [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        assert (
+            sorted(initial_scan["id"]) == expected_duplicate_ids
+        ), f"Expected duplicate IDs {expected_duplicate_ids}, got {sorted(initial_scan['id'])}"
+
+        # Now call converter_session to convert equality deletes to position deletes
+        converter_params = ConverterSessionParams.of(
+            {
+                "catalog": local_catalog,
+                "iceberg_table_name": table_identifier,
+                "iceberg_warehouse_bucket_name": warehouse_path,  # Local warehouse path
+                "merge_keys": ["id"],  # Use ID as the merge key
+                "enforce_primary_key_uniqueness": True,
+                "task_max_parallelism": 1,  # Single task for local testing
+                "filesystem": local_filesystem,
+                "location_provider_prefix_override": None,  # Use local filesystem
+                "location_provider_prefix_override": None,  # Let the system auto-generate the prefix
+            }
+        )

-
-
-
-            ("
-
-
-            (
-
-
-
-
-
-
-
-
-
-
-
-
+        print(f"Running converter_session with local filesystem...")
+        print(f"Warehouse path: {warehouse_path}")
+        print(f"Merge keys: ['id']")
+        print(f"Enforce uniqueness: True")
+
+        # Run the converter
+        converter_session(params=converter_params)
+
+        # Refresh table and scan again
+        tbl.refresh()
+        final_scan = tbl.scan().to_arrow().to_pydict()
+
+        print(f"After conversion - Records with IDs: {sorted(final_scan['id'])}")
+        print(f"Final data: {final_scan}")
+
+        # Verify position delete files were created by checking table metadata
+        latest_snapshot = tbl.metadata.current_snapshot()
+        if latest_snapshot:
+            manifests = latest_snapshot.manifests(tbl.io)
+            position_delete_files = []
+
+            for manifest in manifests:
+                entries = manifest.fetch_manifest_entry(tbl.io)
+                for entry in entries:
+                    if entry.data_file.content == DataFileContent.POSITION_DELETES:
+                        position_delete_files.append(entry.data_file.file_path)
+
+            print(f"Position delete files found: {position_delete_files}")
+            assert (
+                len(position_delete_files) > 0
+            ), "No position delete files were created by converter_session"
+
+        # Verify the final result has unique IDs (duplicates should be resolved)
+        # Expected: Latest values for each ID based on the updates
+        expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # All unique IDs
+        actual_ids = sorted(final_scan["id"])
+
+        print(f"Expected unique IDs: {expected_unique_ids}")
+        print(f"Actual IDs after conversion: {actual_ids}")
+
+        assert (
+            actual_ids == expected_unique_ids
+        ), f"Expected unique IDs {expected_unique_ids}, got {actual_ids}"
+
+        # Verify the updated values are present (higher version should win)
+        final_data_by_id = {}
+        for i, id_val in enumerate(final_scan["id"]):
+            final_data_by_id[id_val] = {
+                "name": final_scan["name"][i],
+                "value": final_scan["value"][i],
+                "version": final_scan["version"][i],
+            }
+
+        # Check that ID 2 has updated value (Robert, 201, version 2)
+        assert (
+            final_data_by_id[2]["name"] == "Robert"
+        ), f"ID 2 should have updated name 'Robert', got '{final_data_by_id[2]['name']}'"
+        assert (
+            final_data_by_id[2]["value"] == 201
+        ), f"ID 2 should have updated value 201, got {final_data_by_id[2]['value']}"
+        assert (
+            final_data_by_id[2]["version"] == 2
+        ), f"ID 2 should have version 2, got {final_data_by_id[2]['version']}"
+
+        # Check that ID 3 has updated value (Charles, 301, version 2)
+        assert (
+            final_data_by_id[3]["name"] == "Charles"
+        ), f"ID 3 should have updated name 'Charles', got '{final_data_by_id[3]['name']}'"
+        assert (
+            final_data_by_id[3]["value"] == 301
+        ), f"ID 3 should have updated value 301, got {final_data_by_id[3]['value']}"
+        assert (
+            final_data_by_id[3]["version"] == 2
+        ), f"ID 3 should have version 2, got {final_data_by_id[3]['version']}"
+
+        # Check that new ID 9 is present
+        assert (
+            final_data_by_id[9]["name"] == "Ivan"
+        ), f"ID 9 should have name 'Ivan', got '{final_data_by_id[9]['name']}'"
+        assert (
+            final_data_by_id[9]["value"] == 900
+        ), f"ID 9 should have value 900, got {final_data_by_id[9]['value']}"
+
+        print(f"✅ Test completed successfully!")
+        print(
+            f"✅ Position delete files were created: {len(position_delete_files)} files"
+        )
+        print(f"✅ Duplicate IDs were resolved correctly")
+        print(
+            f"✅ Updated values were applied (ID 2: Bob->Robert, ID 3: Charlie->Charles)"
+        )
+        print(f"✅ Final table has {len(actual_ids)} unique records")
+        print(f"✅ Temporary warehouse cleaned up at: {temp_catalog_dir}")