deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registry.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
--- deltacat/tests/test_deltacat_api.py (2.0.0b11)
+++ deltacat/tests/test_deltacat_api.py (2.0.0b12)
@@ -1,10 +1,30 @@
 import shutil
 import tempfile
+from collections import defaultdict

 import deltacat as dc
 from deltacat.constants import METAFILE_FORMAT_MSGPACK
-from deltacat import
-
+from deltacat import (
+    ContentType,
+    DeltaCatUrl,
+    DatasetType,
+    Namespace,
+    TableProperties,
+    TableWriteMode,
+    TableProperty,
+    TableReadOptimizationLevel,
+)
+from deltacat.storage import (
+    Metafile,
+    Table,
+    TableVersion,
+    Stream,
+    Partition,
+    Delta,
+)
+from deltacat.storage.model.partition import UNPARTITIONED_SCHEME_ID
+from deltacat.catalog import write_to_table
+import pandas as pd

 from deltacat.io import (
     METAFILE_TYPE_COLUMN_NAME,
@@ -36,9 +56,10 @@ class TestDeltaCAT:
             DeltaCatUrl("dc://test_catalog_2/test_namespace"),
         )
         # Expect the catalog namespace created in each catalog
-        # method to be equivalent
+        # method to be equivalent but not equal to the source namespace
+        # (due to different metafile IDs).
         assert namespace_src.equivalent_to(namespace_dst)
-        assert namespace_src == namespace_dst
+        assert not namespace_src == namespace_dst

         # When each catalog namespace is fetched explicitly
         # Expect them to be equivalent but not equal
@@ -46,7 +67,9 @@ class TestDeltaCAT:
         actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
         actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
         assert actual_namespace_src.equivalent_to(actual_namespace_dst)
+        assert actual_namespace_src == namespace_src
         assert not actual_namespace_src == actual_namespace_dst
+        assert namespace_dst == actual_namespace_dst

     def test_catalog_listing_shallow_local_metafiles(self):
         # Given two empty DeltaCAT catalogs.
@@ -78,3 +101,964 @@ class TestDeltaCAT:
         assert actual_namespace.equivalent_to(namespace_src)
         namespace_type = dataset.take(1)[0][METAFILE_TYPE_COLUMN_NAME]
         assert namespace_type == "Namespace"
+
+    def test_recursive_listing_multiple_namespaces_with_tables(self):
+        """
+        Test that recursive listing correctly processes namespaces, tables, and deltas.
+        """
+        # Create multiple namespaces with tables and data
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_alpha"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_beta"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_gamma"))
+
+        # Create tables with data in each namespace
+        test_data = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+
+        # Create tables in each namespace
+        write_to_table(
+            data=test_data,
+            table="table1",
+            namespace="namespace_alpha",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="table2",
+            namespace="namespace_beta",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="table3",
+            namespace="namespace_gamma",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Verify we found objects from ALL namespaces
+        object_types_to_names = defaultdict(list)
+
+        # Verify we found all namespaces, tables, and deltas
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all namespaces
+        expected_namespaces = {"namespace_alpha", "namespace_beta", "namespace_gamma"}
+        assert (
+            len(object_types_to_names[Namespace]) == 3
+        ), f"Expected 3 namespaces, found {len(object_types_to_names[Namespace])}"
+        assert (
+            set(object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {object_types_to_names[Namespace]}"
+
+        # Assert we found all tables
+        expected_tables = {"table1", "table2", "table3"}
+        assert (
+            len(object_types_to_names[Table]) == 3
+        ), f"Expected 3 tables, found {len(object_types_to_names[Table])}"
+        assert (
+            set(object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {object_types_to_names[Table]}"
+
+        # Assert we found all deltas
+        assert (
+            len(object_types_to_names[Delta]) == 3
+        ), f"Expected 3 deltas, found {len(object_types_to_names[Delta])}"
+        expected_deltas = {
+            "1"
+        }  # all 3 deltas should have the same stream position in their respective partitions
+        assert (
+            set(object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {object_types_to_names[Delta]}"
+
+    def test_recursive_listing_multiple_tables_per_namespace(self):
+        """
+        Test that recursive listing finds all tables within a namespace.
+        """
+        # Create one namespace with multiple tables
+        dc.put(DeltaCatUrl("dc://test_catalog_1/multi_table_namespace"))
+
+        test_data = pd.DataFrame({"id": [1, 2], "value": ["x", "y"]})
+
+        # Create multiple tables in the same namespace
+        table_names = ["events", "users", "products", "orders"]
+        for table_name in table_names:
+            write_to_table(
+                data=test_data,
+                table=table_name,
+                namespace="multi_table_namespace",
+                mode=TableWriteMode.CREATE,
+                content_type=ContentType.PARQUET,
+                catalog="test_catalog_1",
+            )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Extract table names from results
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all tables
+        assert len(object_types_to_names[Table]) == len(
+            table_names
+        ), f"Expected {len(table_names)} tables, found {len(object_types_to_names[Table])}"
+        assert set(object_types_to_names[Table]) == set(
+            table_names
+        ), f"Expected tables: {table_names}, found: {object_types_to_names[Table]}"
+
+    def test_recursive_listing_multiple_deltas_per_table(self):
+        """
+        Test that recursive listing finds all deltas within a table.
+        """
+        # Create namespace and table
+        dc.put(DeltaCatUrl("dc://test_catalog_1/delta_test_namespace"))
+
+        # Create table with multiple deltas
+        batch1 = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})
+
+        batch2 = pd.DataFrame({"id": [4, 5, 6], "value": ["d", "e", "f"]})
+
+        # Write first batch (CREATE)
+        write_to_table(
+            data=batch1,
+            table="multi_delta_table",
+            namespace="delta_test_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Write second batch (APPEND - creates second delta)
+        write_to_table(
+            data=batch2,
+            table="multi_delta_table",
+            namespace="delta_test_namespace",
+            mode=TableWriteMode.APPEND,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Extract table names from results
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all deltas
+        expected_deltas = {
+            "1",
+            "2",
+        }  # all deltas should have the same stream position in their respective partitions
+        assert (
+            len(object_types_to_names[Delta]) == 2
+        ), f"Expected 2 deltas, found {len(object_types_to_names[Delta])}"
+        assert (
+            set(object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {object_types_to_names[Delta]}"
+
+    def test_recursive_listing_empty_namespaces_mixed_with_populated(self):
+        """
+        Test that recursive listing handles a mix of empty and populated namespaces correctly.
+        """
+        # Create mix of empty and populated namespaces
+        dc.put(DeltaCatUrl("dc://test_catalog_1/empty_namespace_1"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/empty_namespace_2"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/populated_namespace"))
+
+        # Add data only to the populated namespace
+        test_data = pd.DataFrame({"id": [1, 2], "data": ["test1", "test2"]})
+
+        write_to_table(
+            data=test_data,
+            table="test_table",
+            namespace="populated_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Test recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # Verify we found all namespaces
+        expected_namespaces = {
+            "empty_namespace_1",
+            "empty_namespace_2",
+            "populated_namespace",
+        }
+        assert (
+            len(object_types_to_names[Namespace]) == 3
+        ), f"Expected 3 namespaces, found {len(object_types_to_names[Namespace])}"
+        assert (
+            set(object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {object_types_to_names[Namespace]}"
+
+        # Verify we found the table in the populated namespace
+        expected_tables = {"test_table"}
+        assert (
+            len(object_types_to_names[Table]) == 1
+        ), f"Expected 1 table, found {len(object_types_to_names[Table])}"
+        assert (
+            set(object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {object_types_to_names[Table]}"
+
+    def test_non_recursive_listing_vs_recursive_listing(self):
+        """
+        Test that non-recursive listing only returns top-level objects while recursive returns all.
+        """
+        # Create nested structure
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_one"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/namespace_two"))
+
+        test_data = pd.DataFrame({"id": [1], "value": ["test"]})
+
+        write_to_table(
+            data=test_data,
+            table="table_in_ns1",
+            namespace="namespace_one",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Non-recursive listing (should only get namespaces)
+        shallow_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=False)
+
+        # Recursive listing (should get everything)
+        deep_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Shallow should have fewer objects than deep
+        assert len(shallow_objects) < len(deep_objects)
+
+        # Shallow should only contain namespaces
+        shallow_object_types_to_names = defaultdict(list)
+        for obj in shallow_objects:
+            obj_type = Metafile.get_class(obj)
+            shallow_object_types_to_names[obj_type].append(obj.name)
+
+        # Assert we found all namespaces
+        expected_namespaces = {"namespace_one", "namespace_two"}
+        assert (
+            len(shallow_object_types_to_names[Namespace]) == 2
+        ), f"Expected 2 namespaces, found {len(shallow_object_types_to_names[Namespace])}"
+        assert (
+            set(shallow_object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {shallow_object_types_to_names[Namespace]}"
+        assert (
+            len(shallow_object_types_to_names) == 1
+        ), f"Expected 1 object type, found {len(shallow_object_types_to_names)}"
+        assert set(shallow_object_types_to_names.keys()) == {
+            Namespace
+        }, f"Expected only Namespace object type, found: {shallow_object_types_to_names.keys()}"
+
+        # Deep should contain multiple types (namespaces, tables, streams, partitions, deltas)
+        deep_object_types_to_names = defaultdict(list)
+        for obj in deep_objects:
+            deep_object_types_to_names[Metafile.get_class(obj)].append(obj.name)
+
+        expected_namespaces = {"namespace_one", "namespace_two"}
+        assert (
+            len(deep_object_types_to_names[Namespace]) == 2
+        ), f"Expected 2 namespaces, found {len(deep_object_types_to_names[Namespace])}"
+        assert (
+            set(deep_object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {deep_object_types_to_names[Namespace]}"
+
+        expected_tables = {"table_in_ns1"}
+        assert (
+            len(deep_object_types_to_names[Table]) == 1
+        ), f"Expected 1 table, found {len(deep_object_types_to_names[Table])}"
+        assert (
+            set(deep_object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {deep_object_types_to_names[Table]}"
+
+        expected_table_versions = {"1"}
+        assert (
+            len(deep_object_types_to_names[TableVersion]) == 1
+        ), f"Expected 1 table version, found {len(deep_object_types_to_names[TableVersion])}"
+        assert (
+            set(deep_object_types_to_names[TableVersion]) == expected_table_versions
+        ), f"Expected table versions: {expected_table_versions}, found: {deep_object_types_to_names[TableVersion]}"
+
+        expected_streams = {"deltacat"}
+        assert (
+            len(deep_object_types_to_names[Stream]) == 1
+        ), f"Expected 1 stream, found {len(deep_object_types_to_names[Stream])}"
+        assert (
+            set(deep_object_types_to_names[Stream]) == expected_streams
+        ), f"Expected streams: {expected_streams}, found: {deep_object_types_to_names[Stream]}"
+
+        expected_partitions = {f"None|{UNPARTITIONED_SCHEME_ID}"}
+        assert (
+            len(deep_object_types_to_names[Partition]) == 1
+        ), f"Expected 1 partition, found {len(deep_object_types_to_names[Partition])}"
+        assert (
+            set(deep_object_types_to_names[Partition]) == expected_partitions
+        ), f"Expected partitions: {expected_partitions}, found: {deep_object_types_to_names[Partition]}"
+
+        expected_deltas = {"1"}
+        assert (
+            len(deep_object_types_to_names[Delta]) == 1
+        ), f"Expected 1 delta, found {len(deep_object_types_to_names[Delta])}"
+        assert (
+            set(deep_object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {deep_object_types_to_names[Delta]}"
+
+    def test_recursive_listing_all_children_processed(self):
+        """
+        Ensure that all children are processed at each level of recursive listings.
+        """
+        # Create 3 namespaces
+        dc.put(DeltaCatUrl("dc://test_catalog_1/alpha_namespace"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/beta_namespace"))
+        dc.put(DeltaCatUrl("dc://test_catalog_1/gamma_namespace"))
+
+        # Create test data
+        test_data = pd.DataFrame(
+            {"id": [1, 2], "name": ["test1", "test2"], "value": [100, 200]}
+        )
+
+        # Create tables in EACH namespace
+        write_to_table(
+            data=test_data,
+            table="alpha_table",
+            namespace="alpha_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="beta_table",
+            namespace="beta_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        write_to_table(
+            data=test_data,
+            table="gamma_table",
+            namespace="gamma_namespace",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Perform recursive listing
+        all_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+
+        # Extract all objects found
+        object_types_to_names = defaultdict(list)
+        for obj in all_objects:
+            obj_type = Metafile.get_class(obj)
+            object_types_to_names[obj_type].append(obj.name)
+
+        # All namespaces should be found
+        expected_namespaces = {"alpha_namespace", "beta_namespace", "gamma_namespace"}
+        assert (
+            len(object_types_to_names[Namespace]) == 3
+        ), f"Expected 3 namespaces, found {len(object_types_to_names[Namespace])}"
+        assert (
+            set(object_types_to_names[Namespace]) == expected_namespaces
+        ), f"Expected namespaces: {expected_namespaces}, found: {object_types_to_names[Namespace]}"
+
+        # All tables should be found
+        expected_tables = {"alpha_table", "beta_table", "gamma_table"}
+        assert (
+            len(object_types_to_names[Table]) == 3
+        ), f"Expected 3 tables, found {len(object_types_to_names[Table])}"
+        assert (
+            set(object_types_to_names[Table]) == expected_tables
+        ), f"Expected tables: {expected_tables}, found: {object_types_to_names[Table]}"
+
+        # All table versions should be found
+        expected_table_versions = {"1"}
+        assert (
+            len(object_types_to_names[TableVersion]) == 3
+        ), f"Expected 3 table versions, found {len(object_types_to_names[TableVersion])}"
+        assert (
+            set(object_types_to_names[TableVersion]) == expected_table_versions
+        ), f"Expected table versions: {expected_table_versions}, found: {object_types_to_names[TableVersion]}"
+
+        # All streams should be found
+        expected_streams = {"deltacat"}
+        assert (
+            len(object_types_to_names[Stream]) == 3
+        ), f"Expected 1 stream, found {len(object_types_to_names[Stream])}"
+        assert (
+            set(object_types_to_names[Stream]) == expected_streams
+        ), f"Expected streams: {expected_streams}, found: {object_types_to_names[Stream]}"
+
+        # All partitions should be found
+        expected_partitions = {f"None|{UNPARTITIONED_SCHEME_ID}"}
+        assert (
+            len(object_types_to_names[Partition]) == 3
+        ), f"Expected 1 partition, found {len(object_types_to_names[Partition])}"
+        assert (
+            set(object_types_to_names[Partition]) == expected_partitions
+        ), f"Expected partitions: {expected_partitions}, found: {object_types_to_names[Partition]}"
+
+        # All deltas should be found
+        expected_deltas = {"1"}
+        assert (
+            len(object_types_to_names[Delta]) == 3
+        ), f"Expected 3 deltas, found {len(object_types_to_names[Delta])}"
+        assert (
+            set(object_types_to_names[Delta]) == expected_deltas
+        ), f"Expected deltas: {expected_deltas}, found: {object_types_to_names[Delta]}"
+
+        # Ensure we found the expected objects across all levels of hierarchy
+        total_objects = len(all_objects)
+        assert (
+            total_objects == 18
+        ), f"Expected 18 objects from deep traversal, found only {total_objects}."
+
+    def test_recursive_cross_catalog_copy(self):
+        """
+        Test comprehensive cross-catalog copy using dc.copy with ** pattern.
+        This test validates complete catalog copying with all metadata types:
+        namespaces, tables, table versions, streams, partitions, and deltas.
+        """
+        # Create multiple namespaces, multiple tables, versions, streams, partitions, and deltas
+
+        # Namespace 1: Analytics data with multiple table versions
+        dc.put(DeltaCatUrl("dc://test_catalog_1/analytics"))
+
+        # Create table with multiple versions
+        events_data_v1 = pd.DataFrame(
+            {
+                "event_id": [1, 2, 3],
+                "user_id": ["user_1", "user_2", "user_3"],
+                "event_type": ["click", "view", "purchase"],
+                "timestamp": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"]),
+                "value": [10.5, 20.0, 150.75],
+            }
+        )
+
+        table_properties: TableProperties = {
+            TableProperty.READ_OPTIMIZATION_LEVEL: TableReadOptimizationLevel.MAX,
+            TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER: 1,
+            TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER: 1,
+            TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER: 1,
+        }
+
+        write_to_table(
+            data=events_data_v1,
+            table="events",
+            namespace="analytics",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+            table_properties=table_properties,
+        )
+
+        # Add more data to create additional deltas
+        events_data_v2 = pd.DataFrame(
+            {
+                "event_id": [4, 5, 6, 7],
+                "user_id": ["user_4", "user_1", "user_5", "user_2"],
+                "event_type": ["view", "click", "purchase", "refund"],
+                "timestamp": pd.to_datetime(
+                    ["2023-01-04", "2023-01-05", "2023-01-06", "2023-01-07"]
+                ),
+                "value": [0.0, 5.25, 299.99, -150.75],
+            }
+        )
+
+        write_to_table(
+            data=events_data_v2,
+            table="events",
+            namespace="analytics",
+            mode=TableWriteMode.APPEND,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Create second table in analytics namespace
+        users_data = pd.DataFrame(
+            {
+                "user_id": ["user_1", "user_2", "user_3", "user_4", "user_5"],
+                "username": ["alice", "bob", "charlie", "diana", "eve"],
+                "email": [
+                    "alice@test.com",
+                    "bob@test.com",
+                    "charlie@test.com",
+                    "diana@test.com",
+                    "eve@test.com",
+                ],
+                "created_at": pd.to_datetime(
+                    [
+                        "2022-12-01",
+                        "2022-12-15",
+                        "2022-12-20",
+                        "2023-01-01",
+                        "2023-01-03",
+                    ]
+                ),
+                "is_active": [True, True, False, True, True],
+            }
+        )
+
+        write_to_table(
+            data=users_data,
+            table="users",
+            namespace="analytics",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Create version 2 of the events table to test table version ordering in recursive copy
+        events_data_v3 = pd.DataFrame(
+            {
+                "event_id": [8, 9, 10],
+                "user_id": ["user_3", "user_4", "user_5"],
+                "event_type": ["signup", "login", "logout"],
+                "timestamp": pd.to_datetime(["2023-01-08", "2023-01-09", "2023-01-10"]),
+                "value": [0.0, 0.0, 0.0],
+            }
+        )
+
+        write_to_table(
+            data=events_data_v3,
+            table="events",
+            namespace="analytics",
+            table_version="2",  # Explicitly create version 2
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Namespace 2: Product data with different schema
+        dc.put(DeltaCatUrl("dc://test_catalog_1/products"))
+
+        products_data = pd.DataFrame(
+            {
+                "product_id": ["prod_1", "prod_2", "prod_3"],
+                "name": ["Widget A", "Widget B", "Super Widget"],
+                "category": ["widgets", "widgets", "premium"],
+                "price": [19.99, 29.99, 149.99],
+                "in_stock": [True, False, True],
+                "metadata": [
+                    {"color": "red"},
+                    {"color": "blue", "size": "large"},
+                    {"color": "gold", "premium": True},
+                ],
+            }
+        )
+
+        write_to_table(
+            data=products_data,
+            table="inventory",
+            namespace="products",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Create product categories table
+        categories_data = pd.DataFrame(
+            {
+                "category_id": ["widgets", "premium", "accessories"],
+                "display_name": ["Standard Widgets", "Premium Products", "Accessories"],
+                "description": [
+                    "Basic widget products",
+                    "High-end premium items",
+                    "Additional accessories",
+                ],
+                "active": [True, True, False],
+            }
+        )
+
+        write_to_table(
+            data=categories_data,
+            table="categories",
+            namespace="products",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Namespace 3: Empty namespace (edge case testing)
+        dc.put(DeltaCatUrl("dc://test_catalog_1/empty_data"))
+
+        # Namespace 4: Orders with complex nested data
+        dc.put(DeltaCatUrl("dc://test_catalog_1/orders"))
+
+        orders_data = pd.DataFrame(
+            {
+                "order_id": ["order_1", "order_2", "order_3"],
+                "user_id": ["user_1", "user_2", "user_1"],
+                "product_ids": [["prod_1"], ["prod_2", "prod_3"], ["prod_1", "prod_2"]],
+                "order_date": pd.to_datetime(
+                    ["2023-01-05", "2023-01-06", "2023-01-07"]
+                ),
+                "total_amount": [19.99, 179.98, 49.98],
+                "status": ["completed", "pending", "completed"],
+            }
+        )
+
+        write_to_table(
+            data=orders_data,
+            table="transactions",
+            namespace="orders",
+            mode=TableWriteMode.CREATE,
+            content_type=ContentType.PARQUET,
+            catalog="test_catalog_1",
+        )
+
+        # Verify source catalog structure before copy
+        source_objects = dc.list(DeltaCatUrl("dc://test_catalog_1"), recursive=True)
+        source_urls_by_type = defaultdict(list)
+        source_by_type = defaultdict(list)
+
+        for obj in source_objects:
+            obj_class = Metafile.get_class(obj.to_serializable())
+            source_urls_by_type[obj_class].append(obj.url())
+            source_by_type[obj_class].append(obj)
+
+        assert (
+            len(source_urls_by_type[Namespace]) == 4
+        ), f"Expected 4 namespaces, got {len(source_urls_by_type[Namespace])}"
+        assert (
+            len(source_urls_by_type[Table]) == 5
+        ), f"Expected 5 tables, got {len(source_urls_by_type[Table])}"
+        assert (
+            len(source_urls_by_type[TableVersion]) == 6
+        ), f"Expected 6 table versions, got {len(source_urls_by_type[TableVersion])}"
+        assert (
+            len(source_urls_by_type[Stream]) == 6
+        ), f"Expected 6 streams, got {len(source_urls_by_type[Stream])}"
+        assert (
+            len(source_urls_by_type[Partition]) == 6
+        ), f"Expected 6 partitions, got {len(source_urls_by_type[Partition])}"
+        assert (
+            len(source_urls_by_type[Delta]) == 6
+        ), f"Expected 6 deltas, got {len(source_urls_by_type[Delta])}"
+
+        # Test the /** recursive copy pattern.
+        dc.copy(
+            DeltaCatUrl("dc://test_catalog_1/**"),  # ** means recursive copy all
+            DeltaCatUrl("dc://test_catalog_2/"),
+        )
+
+        # Verify destination catalog has same structure
+        dest_objects = dc.list(DeltaCatUrl("dc://test_catalog_2"), recursive=True)
+        dest_urls_by_type = defaultdict(list)
+        dest_by_type = defaultdict(list)
+
+        assert len(dest_objects) == len(
+            source_objects
+        ), f"Expected {len(source_objects)} objects, got {len(dest_objects)}"
+
+        for obj in dest_objects:
+            obj_class = Metafile.get_class(obj.to_serializable())
+            dest_urls_by_type[obj_class].append(obj.url())
+            dest_by_type[obj_class].append(obj)
+
+        assert sorted(dest_urls_by_type[Namespace]) == sorted(
+            source_urls_by_type[Namespace]
+        ), f"Namespace mismatch: {dest_urls_by_type[Namespace]} vs {source_urls_by_type[Namespace]}"
+        assert sorted(dest_urls_by_type[Table]) == sorted(
+            source_urls_by_type[Table]
+        ), f"Table mismatch: {dest_urls_by_type[Table]} vs {source_urls_by_type[Table]}"
+        assert sorted(dest_urls_by_type[TableVersion]) == sorted(
+            source_urls_by_type[TableVersion]
+        ), f"Table version mismatch: {dest_urls_by_type[TableVersion]} vs {source_urls_by_type[TableVersion]}"
+        assert sorted(dest_urls_by_type[Stream]) == sorted(
+            source_urls_by_type[Stream]
+        ), f"Stream mismatch: {dest_urls_by_type[Stream]} vs {source_urls_by_type[Stream]}"
+        assert sorted(dest_urls_by_type[Partition]) == sorted(
+            source_urls_by_type[Partition]
+        ), f"Partition mismatch: {dest_urls_by_type[Partition]} vs {source_urls_by_type[Partition]}"
+        assert sorted(dest_urls_by_type[Delta]) == sorted(
+            source_urls_by_type[Delta]
+        ), f"Delta mismatch: {dest_urls_by_type[Delta]} vs {source_urls_by_type[Delta]}"
+
+        # Validate each hierarchy level
+        for obj_type in source_by_type.keys():
+            source_count = len(source_by_type.get(obj_type))
+            dest_count = len(dest_by_type.get(obj_type, []))
+            assert (
+                dest_count == source_count
+            ), f"{obj_type} count mismatch: {dest_count} vs {source_count}"
+
+            # Spot check equivalence of each type
+            if obj_type == Namespace and source_count > 0:
+                # Check namespace properties are preserved
+                source_ns = source_by_type[obj_type][0]  # NamespaceModel
+                dest_ns = next(
+                    (
+                        ns
+                        for ns in dest_by_type[obj_type]
+                        if ns.namespace == source_ns.namespace
+                    ),
+                    None,
+                )
+                assert (
+                    dest_ns is not None
+                ), f"Namespace {source_ns.namespace} not found in destination"
+                assert source_ns.equivalent_to(
+                    dest_ns
+                ), f"Namespace {source_ns.namespace} not equivalent to {dest_ns.namespace}"
+            elif obj_type == Table:
+                source_table = source_by_type[obj_type][0]  # TableModel
+                dest_table = next(
+                    (
+                        t
+                        for t in dest_by_type[obj_type]
+                        if t.namespace == source_table.namespace
+                        and t.table_name == source_table.table_name
+                    ),
+                    None,
+                )
+                assert (
+                    dest_table is not None
+                ), f"Table {source_table.namespace}.{source_table.table_name} not found in destination"
+                assert source_table.equivalent_to(
+                    dest_table
+                ), f"Table {source_table.namespace}.{source_table.table_name} not equivalent to {dest_table.namespace}.{dest_table.table_name}"
+            elif obj_type == TableVersion and source_count > 0:
+                # Check table version properties are preserved
+                source_tv = source_by_type[obj_type][0]  # TableVersionModel
+                dest_tv = next(
+                    (
+                        tv
+                        for tv in dest_by_type[obj_type]
+                        if tv.namespace == source_tv.namespace
+                        and tv.table_name == source_tv.table_name
+                        and tv.table_version == source_tv.table_version
+                    ),
+                    None,
+                )
+                assert (
+                    dest_tv is not None
+                ), f"TableVersion {source_tv.namespace}.{source_tv.table_name}.{source_tv.table_version} not found in destination"
+                assert dest_tv.equivalent_to(
+                    source_tv
+                ), f"TableVersion {source_tv.namespace}.{source_tv.table_name}.{source_tv.table_version} not equivalent to {dest_tv.namespace}.{dest_tv.table_name}.{dest_tv.table_version}"
+
+                # Special validation for table version ordering - check that analytics.events has versions 1 and 2
+                analytics_events_versions = [
+                    tv
+                    for tv in dest_by_type[obj_type]
+                    if tv.namespace == "analytics" and tv.table_name == "events"
+                ]
+                if analytics_events_versions:
+                    versions = sorted(
+                        [tv.table_version for tv in analytics_events_versions]
+                    )
+                    assert versions == [
+                        "1",
+                        "2",
+                    ], f"Expected analytics.events versions ['1', '2'], got {versions}"
+            elif obj_type == Stream and source_count > 0:
+                # Check stream properties are preserved
+                source_stream = source_by_type[obj_type][0]  # StreamModel
+                dest_stream = next(
+                    (
+                        s
+                        for s in dest_by_type[obj_type]
+                        if s.namespace == source_stream.namespace
+                        and s.table_name == source_stream.table_name
+                        and s.stream_format == source_stream.stream_format
+                    ),
+                    None,
+                )
+                assert (
+                    dest_stream is not None
+                ), f"Stream {source_stream.namespace}.{source_stream.table_name}.{source_stream.stream_format} not found in destination"
+                assert dest_stream.equivalent_to(
+                    source_stream
+                ), f"Stream {source_stream.namespace}.{source_stream.table_name}.{source_stream.stream_format} not equivalent to {dest_stream.namespace}.{dest_stream.table_name}.{dest_stream.stream_format}"
+            elif obj_type == Partition and source_count > 0:
+                # Check partition properties are preserved (with new partition IDs)
+                source_partition = source_by_type[obj_type][0]  # PartitionModel
+                dest_partition = next(
+                    (
+                        p
+                        for p in dest_by_type[obj_type]
+                        if p.namespace == source_partition.namespace
+                        and p.table_name == source_partition.table_name
+                    ),
+                    None,
+                )
+                assert (
+                    dest_partition is not None
+                ), f"Partition for {source_partition.namespace}.{source_partition.table_name} not found in destination"
+                assert dest_partition.equivalent_to(
+                    source_partition
+                ), f"Partition {source_partition.namespace}.{source_partition.table_name} not equivalent to {dest_partition.namespace}.{dest_partition.table_name}"
+            elif obj_type == Delta and source_count > 0:
+                # Check delta properties are preserved (with same stream positions)
+                source_delta = source_by_type[obj_type][0]  # DeltaModel
+                dest_delta = next(
+                    (
+                        d
+                        for d in dest_by_type[obj_type]
+                        if d.namespace == source_delta.namespace
+                        and d.table_name == source_delta.table_name
+                        and d.stream_position == source_delta.stream_position
+                    ),
+                    None,
+                )
+                assert (
+                    dest_delta is not None
+                ), f"Delta for {source_delta.namespace}.{source_delta.table_name} at position {source_delta.stream_position} not found in destination"
+                assert dest_delta.equivalent_to(
+                    source_delta
+                ), f"Delta {source_delta.namespace}.{source_delta.table_name} at position {source_delta.stream_position} not equivalent to {dest_delta.namespace}.{dest_delta.table_name} at position {dest_delta.stream_position}"
+
+        # Validate each table's data integrity
+        test_cases = [
+            ("analytics", "events"),
+            ("analytics", "users"),
+            ("products", "inventory"),
+            ("products", "categories"),
+            ("orders", "transactions"),
+        ]
+
+        for namespace, table in test_cases:
+            # Check table exists in destination
+            assert dc.table_exists(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+            ), f"Table {namespace}/{table} should exist in destination catalog"
+
+            # Verify table data equivalence using read_table
+            source_df = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                read_as=DatasetType.PANDAS,
+            )
+
+            dest_df = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                read_as=DatasetType.PANDAS,
+            )
+
+            # Verify both datasets are valid pandas DataFrames
+            assert (
+                source_df is not None
+            ), f"Source data should not be None for {namespace}.{table}"
+            assert (
+                dest_df is not None
+            ), f"Destination data should not be None for {namespace}.{table}"
+
+            # Compare DataFrame properties
+            assert len(source_df) == len(
+                dest_df
+            ), f"Row count mismatch for {namespace}.{table}: {len(source_df)} vs {len(dest_df)}"
+            assert list(source_df.columns) == list(
+                dest_df.columns
+            ), f"Column mismatch for {namespace}.{table}"
+
+            # Sort both dataframes by first column for comparison (to handle potential row ordering differences)
+            _assert_data_equivalence(source_df, dest_df)
+
+            # Verify that writing to the source table doesn't affect the destination table
+            dc.write_to_table(
+                data=source_df,
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                mode=TableWriteMode.APPEND,
+            )
+
+            # Verify that the destination table's data hasn't changed
+            dest_df = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                read_as=DatasetType.PANDAS,
+            )
+            _assert_data_equivalence(source_df, dest_df)
+
+            # Verify that the source table has source_df repeated twice
+            source_df_repeated = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                read_as=DatasetType.PANDAS,
+            )
+            assert (
+                len(source_df_repeated) == len(source_df) * 2
+            ), f"Source table {namespace}.{table} should have {len(source_df) * 2} rows"
+
+            # Verify that writing to the destination table doesn't affect the source table
+            dc.write_to_table(
+                data=dest_df,
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                mode=TableWriteMode.APPEND,
+            )
+
+            # Verify that the source table's data hasn't changed
+            source_df_unchanged = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_1",
+                read_as=DatasetType.PANDAS,
+            )
+            _assert_data_equivalence(source_df_repeated, source_df_unchanged)
+
+            # Verify that the destination table's data has dest_df repeated twice
+            dest_df_repeated = dc.read_table(
+                table=table,
+                namespace=namespace,
+                catalog="test_catalog_2",
+                read_as=DatasetType.PANDAS,
+            )
+            assert (
+                len(dest_df_repeated) == len(dest_df) * 2
+            ), f"Destination table {namespace}.{table} should have {len(dest_df) * 2} rows"
+
+        # Verify empty namespace was copied correctly
+        assert dc.namespace_exists(
+            namespace="empty_data",
+            catalog="test_catalog_2",
+        ), "Empty namespace should exist in destination catalog"
+
+
+def _assert_data_equivalence(source_df: pd.DataFrame, dest_df: pd.DataFrame):
+    # Sort both dataframes by first column for comparison (to handle potential row ordering differences)
+    if len(source_df) > 0:
+        first_col = source_df.columns[0]
+        # Handle sorting with potential complex data types
+        source_sorted = source_df.sort_values(first_col).reset_index(drop=True)
+        dest_sorted = dest_df.sort_values(first_col).reset_index(drop=True)
+
+        # Compare data values using pandas testing
+        pd.testing.assert_frame_equal(
+            source_sorted,
+            dest_sorted,
+        )