deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/test_util_constant.py
@@ -1,7 +1,6 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-TEST_S3_RCF_BUCKET_NAME = "test-compaction-artifacts-bucket"
 # REBASE src = spark compacted table to create an initial version of ray compacted table
 BASE_TEST_SOURCE_NAMESPACE = "source_test_namespace"
 BASE_TEST_SOURCE_TABLE_NAME = "source_test_table"
deltacat/tests/experimental/__init__.py
@@ -0,0 +1 @@
+# Test package for experimental DeltaCAT features
deltacat/tests/experimental/compatibility/__init__.py
@@ -0,0 +1 @@
+# Test package for compatibility utilities
deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py
@@ -0,0 +1,582 @@
+#!/usr/bin/env python3
+"""
+Unit tests for the backfill script that migrates catalogs from old to new canonical string format.
+
+Tests verify that catalogs created with the old canonical_string format (with parent hexdigest)
+can be successfully migrated to the new hierarchical format (without parent hexdigest).
+"""
+import os
+import tempfile
+import shutil
+import uuid
+from typing import Dict, Any
+import pandas as pd
+import pyarrow as pa
+
+import deltacat as dc
+from deltacat import Catalog
+from deltacat.catalog.main import impl as catalog
+from deltacat.catalog.model.properties import CatalogProperties
+from deltacat.storage.model.schema import Schema, Field
+from deltacat.types.tables import TableWriteMode
+from deltacat import DatasetType
+from deltacat.utils.url import DeltaCatUrl
+from deltacat.storage.model.metafile import Metafile
+from deltacat.storage.model.namespace import NamespaceLocator
+from deltacat.storage.model.table import TableLocator
+from deltacat.storage.model.table_version import TableVersionLocator
+from deltacat.storage.model.stream import StreamLocator
+
+
+from deltacat.experimental.compatibility.backfill_locator_to_id_mappings import (
+    patched_canonical_string,
+    migrate_catalog,
+)
+
+
+def get_catalog_properties(root: str) -> CatalogProperties:
+    """Helper to create catalog properties for testing."""
+    return CatalogProperties(root=root)
+
+
+def create_test_schema() -> Schema:
+    """Create a basic schema for testing."""
+    return Schema.of(
+        [
+            Field.of(pa.field("id", pa.int64())),
+            Field.of(pa.field("name", pa.string())),
+            Field.of(pa.field("value", pa.float64())),
+        ]
+    )
+
+
+def create_test_data() -> pd.DataFrame:
+    """Create test data for writing to tables."""
+    return pd.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "name": ["Alice", "Bob", "Charlie"],
+            "value": [10.5, 20.0, 30.5],
+        }
+    )
+
+
+class TestBackfillLocatorToIdMappings:
+    """Test the backfill script for canonical string migration."""
+
+    @classmethod
+    def setup_class(cls):
+        """Set up test environment."""
+        cls.temp_dir = tempfile.mkdtemp()
+        cls.dest_dir = tempfile.mkdtemp()
+
+    @classmethod
+    def teardown_class(cls):
+        """Clean up test environment."""
+        shutil.rmtree(cls.temp_dir, ignore_errors=True)
+        shutil.rmtree(cls.dest_dir, ignore_errors=True)
+
+    def setup_method(self):
+        """Set up for each test method."""
+        # Clear directories for each test
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+        shutil.rmtree(self.dest_dir, ignore_errors=True)
+        self.temp_dir = tempfile.mkdtemp()
+        self.dest_dir = tempfile.mkdtemp()
+
+        self.catalog_properties = get_catalog_properties(root=self.temp_dir)
+        self.dest_catalog_properties = get_catalog_properties(root=self.dest_dir)
+
+    def create_old_format_catalog(self) -> Dict[str, Any]:
+        """
+        Create a catalog using the old canonical_string format.
+
+        Returns:
+            Dict with information about created objects for verification
+        """
+        # Initialize DeltaCAT following the correct pattern
+        dc.init()
+        source_catalog_name = f"test_source_{uuid.uuid4()}"
+        dc.put_catalog(
+            source_catalog_name, catalog=Catalog(config=self.catalog_properties)
+        )
+
+        # Create catalog structure using old canonical_string format
+        with patched_canonical_string(use_old_format=True):
+            # Create namespace
+            namespace_name = "test_namespace"
+            catalog.create_namespace(
+                namespace=namespace_name, inner=self.catalog_properties
+            )
+
+            # Create multiple tables with different stream formats
+            tables_info = []
+
+            # Table 1: Basic table with deltacat stream
+            table1_name = "table_one"
+            table1_data = create_test_data()
+            catalog.write_to_table(
+                data=table1_data,
+                table=table1_name,
+                namespace=namespace_name,
+                mode=TableWriteMode.CREATE,
+                inner=self.catalog_properties,
+                # This will create deltacat format stream by default
+            )
+            tables_info.append(
+                {
+                    "name": table1_name,
+                    "expected_streams": ["deltacat"],
+                    "expected_partitions": [("default",)],
+                }
+            )
+
+            # Table 2: Table with additional data (creates more partitions/deltas)
+            table2_name = "table_two"
+            table2_data1 = create_test_data()
+            table2_data2 = pd.DataFrame(
+                {
+                    "id": [4, 5, 6],
+                    "name": ["David", "Eve", "Frank"],
+                    "value": [40.0, 50.5, 60.0],
+                }
+            )
+
+            # Create table
+            catalog.write_to_table(
+                data=table2_data1,
+                table=table2_name,
+                namespace=namespace_name,
+                mode=TableWriteMode.CREATE,
+                inner=self.catalog_properties,
+            )
+
+            # Append more data (creates additional delta)
+            catalog.write_to_table(
+                data=table2_data2,
+                table=table2_name,
+                namespace=namespace_name,
+                mode=TableWriteMode.APPEND,
+                inner=self.catalog_properties,
+            )
+
+            tables_info.append(
+                {
+                    "name": table2_name,
+                    "expected_streams": ["deltacat"],
+                    "expected_partitions": [("default",)],
+                }
+            )
+
+            # Table 3: Table with explicit schema (might create different stream characteristics)
+            table3_name = "table_three"
+            table3_data = create_test_data()
+            catalog.write_to_table(
+                data=table3_data,
+                table=table3_name,
+                namespace=namespace_name,
+                mode=TableWriteMode.CREATE,
+                schema=create_test_schema(),
+                inner=self.catalog_properties,
+            )
+            tables_info.append(
+                {
+                    "name": table3_name,
+                    "expected_streams": ["deltacat"],
+                    "expected_partitions": [("default",)],
+                }
+            )
+
+        return {
+            "namespace": namespace_name,
+            "tables": tables_info,
+            "catalog_root": self.temp_dir,
+            "catalog_name": source_catalog_name,
+        }
+
+    def verify_catalog_integrity(
+        self, catalog_root: str, expected_objects: Dict[str, Any]
+    ):
+        """
+        Verify that a catalog contains the expected objects and they can be read.
+
+        Args:
+            catalog_root: Path to catalog root
+            expected_objects: Dict with expected namespace, tables, etc.
+        """
+        # Use the catalog name from expected_objects if available, otherwise create a new one
+        if "catalog_name" in expected_objects:
+            verify_catalog_name = expected_objects["catalog_name"]
+        else:
+            # Fallback: create a new catalog for verification
+            verify_catalog_name = f"verify_{uuid.uuid4()}"
+            catalog_props = get_catalog_properties(root=catalog_root)
+            dc.put_catalog(verify_catalog_name, catalog=Catalog(config=catalog_props))
+
+        namespace_name = expected_objects["namespace"]
+
+        # Verify namespace exists
+        assert dc.namespace_exists(
+            namespace=namespace_name, catalog=verify_catalog_name
+        ), f"Namespace {namespace_name} should exist"
+
+        # Verify each table exists and can be read
+        for table_info in expected_objects["tables"]:
+            table_name = table_info["name"]
+
+            # Check table exists with specific table version (default is "1")
+            assert dc.table_exists(
+                table=table_name,
+                namespace=namespace_name,
+                catalog=verify_catalog_name,
+                table_version="1",
+            ), f"Table {namespace_name}/{table_name} should exist"
+
+            # Check we can get table definition with specific table version
+            table_def = dc.get_table(
+                table=table_name,
+                namespace=namespace_name,
+                catalog=verify_catalog_name,
+                table_version="1",
+            )
+            assert table_def is not None
+            assert table_def.table.table_name == table_name
+
+            # Check we can list table versions using dc.list_tables
+            tables_list = dc.list_tables(
+                namespace=namespace_name,
+                catalog=verify_catalog_name,
+                table=table_name,  # List versions of this specific table
+            )
+            assert (
+                len(tables_list.all_items()) > 0
+            ), f"Table {table_name} should have versions"
+
+            # Try to read some data from the table to verify it works
+            table_data = dc.read_table(
+                table=table_name,
+                namespace=namespace_name,
+                catalog=verify_catalog_name,
+                table_version="1",
+                read_as=DatasetType.PANDAS,
+            )
+            assert (
+                table_data is not None
+            ), f"Should be able to read data from {table_name}"
+
+            # Verify the data matches expected test data structure
+            expected_columns = ["id", "name", "value"]
+            assert (
+                list(table_data.columns) == expected_columns
+            ), f"Table {table_name} should have columns {expected_columns}"
+            assert (
+                len(table_data) >= 3
+            ), f"Table {table_name} should have at least 3 rows of test data"
+
+        # List all objects in the catalog to verify complete structure including deltas
+        all_objects = dc.list(
+            DeltaCatUrl(f"dc://{verify_catalog_name}/"), recursive=True
+        )
+
+        # Count objects by type to verify deltas are present
+        object_counts = {}
+        for obj in all_objects:
+            obj_class_name = Metafile.get_class(obj.to_serializable()).__name__
+            object_counts[obj_class_name] = object_counts.get(obj_class_name, 0) + 1
+
+        print(f"Migrated catalog object counts: {object_counts}")
+
+        # Verify deltas are present (table_two should have 2 deltas due to APPEND operation)
+        assert (
+            "Delta" in object_counts
+        ), "No deltas found in migrated catalog - delta migration may have failed"
+        assert (
+            object_counts["Delta"] >= 3
+        ), f"Expected at least 3 deltas (one for each table, plus one for append), got {object_counts.get('Delta', 0)}"
+
+    def test_patched_canonical_string_context_manager(self):
+        """Test that the canonical_string patching works correctly."""
+        # Create test locators
+        ns_locator = NamespaceLocator({"namespace": "test_ns"})
+        table_locator = TableLocator(
+            {"namespaceLocator": ns_locator, "tableName": "test_table"}
+        )
+
+        # Test normal (new) format
+        normal_result = table_locator.canonical_string()
+        assert normal_result == "test_table"
+
+        # Test patched (old) format
+        with patched_canonical_string(use_old_format=True):
+            old_result = table_locator.canonical_string()
+            # Should include parent hexdigest
+            assert old_result != normal_result
+            assert old_result.endswith("|test_table")
+            assert len(old_result.split("|")) == 2
+
+        # Test that patch is restored
+        restored_result = table_locator.canonical_string()
+        assert restored_result == normal_result
+
+    def test_migrate_catalog_dry_run(self):
+        """Test migration in dry-run mode."""
+        # Create catalog with old format
+        old_catalog_info = self.create_old_format_catalog()
+
+        # Create destination catalog
+        dest_catalog_name = f"test_dest_{uuid.uuid4()}"
+        dc.put_catalog(
+            dest_catalog_name, catalog=Catalog(config=self.dest_catalog_properties)
+        )
+
+        # Use catalog names in URLs, not directory paths
+        source_url = f"dc://{old_catalog_info['catalog_name']}/"
+        dest_url = f"dc://{dest_catalog_name}/"
+
+        # Test dry run migration
+        success = migrate_catalog(source_url, dest_url, dry_run=True)
+        assert success, "Dry run migration should succeed"
+
+        # Destination should be empty after dry run
+        dest_contents = (
+            os.listdir(self.dest_dir) if os.path.exists(self.dest_dir) else []
+        )
+        assert len(dest_contents) == 0, "Destination should be empty after dry run"
+
+    def test_migrate_catalog_full_migration(self):
+        """Test full migration from old to new canonical string format."""
+        # Create catalog with old canonical_string format
+        old_catalog_info = self.create_old_format_catalog()
+
+        # Verify the old catalog works (using patched canonical_string since it was created with old format)
+        with patched_canonical_string(use_old_format=True):
+            self.verify_catalog_integrity(self.temp_dir, old_catalog_info)
+
+        # Create destination catalog
+        dest_catalog_name = f"test_dest_{uuid.uuid4()}"
+        dc.put_catalog(
+            dest_catalog_name, catalog=Catalog(config=self.dest_catalog_properties)
+        )
+
+        # Perform migration using catalog names
+        source_url = f"dc://{old_catalog_info['catalog_name']}/"
+        dest_url = f"dc://{dest_catalog_name}/"
+
+        success = migrate_catalog(source_url, dest_url, dry_run=False)
+        assert success, "Migration should succeed"
+
+        # Verify migrated catalog has same structure and data (update catalog_name for destination)
+        migrated_catalog_info = old_catalog_info.copy()
+        migrated_catalog_info["catalog_name"] = dest_catalog_name
+        migrated_catalog_info["catalog_root"] = self.dest_dir
+        self.verify_catalog_integrity(self.dest_dir, migrated_catalog_info)
+
+        # Additional verification: Compare object counts between source and destination
+        # This ensures all object types are migrated
+        with patched_canonical_string(use_old_format=True):
+            source_objects = dc.list(
+                DeltaCatUrl(f"dc://{old_catalog_info['catalog_name']}/"), recursive=True
+            )
+
+        dest_objects = dc.list(
+            DeltaCatUrl(f"dc://{dest_catalog_name}/"), recursive=True
+        )
+
+        # Count objects by type in both catalogs
+        source_counts = {}
+        dest_counts = {}
+
+        for obj in source_objects:
+            obj_class_name = Metafile.get_class(obj.to_serializable()).__name__
+            source_counts[obj_class_name] = source_counts.get(obj_class_name, 0) + 1
+
+        for obj in dest_objects:
+            obj_class_name = Metafile.get_class(obj.to_serializable()).__name__
+            dest_counts[obj_class_name] = dest_counts.get(obj_class_name, 0) + 1
+
+        # Verify all object types are migrated
+        for obj_type, count in source_counts.items():
+            assert (
+                obj_type in dest_counts
+            ), f"Object type {obj_type} missing from destination"
+            assert (
+                dest_counts[obj_type] == count
+            ), f"Object count mismatch for {obj_type}: source={count}, dest={dest_counts[obj_type]}"
+
+    def test_migrate_catalog_preserves_data_integrity(self):
+        """Test that migration preserves data integrity."""
+        # Create catalog with old format
+        old_catalog_info = self.create_old_format_catalog()
+
+        # Get expected test data structure for validation
+        expected_test_data = create_test_data()  # Get the expected data structure
+
+        # Create destination catalog for migration
+        dest_catalog_name = f"test_dest_{uuid.uuid4()}"
+        dc.put_catalog(
+            dest_catalog_name, catalog=Catalog(config=self.dest_catalog_properties)
+        )
+
+        # Perform migration
+        source_url = f"dc://{old_catalog_info['catalog_name']}/"
+        dest_url = f"dc://{dest_catalog_name}/"
+
+        success = migrate_catalog(source_url, dest_url, dry_run=False)
+        assert success, "Migration should succeed"
+
+        # Read data from migrated catalog and compare
+        for table_info in old_catalog_info["tables"]:
+            table_name = table_info["name"]
+            migrated_data = dc.read_table(
+                table=table_name,
+                namespace=old_catalog_info["namespace"],
+                catalog=dest_catalog_name,
+                table_version="1",
+                read_as=DatasetType.PANDAS,
+            )
+
+            # Verify migrated data structure and content matches expected test data exactly
+            assert (
+                migrated_data is not None
+            ), f"Should be able to read migrated data from {table_name}"
+            assert list(migrated_data.columns) == list(
+                expected_test_data.columns
+            ), f"Migrated {table_name} should have expected columns {list(expected_test_data.columns)}"
+
+            # For tables that append data, expect at least the base test data rows
+            if table_name == "table_two":  # This table has appended data
+                assert (
+                    len(migrated_data) == len(expected_test_data) * 2
+                ), f"Migrated {table_name} should have at least {len(expected_test_data)} rows (base data)"
+            else:
+                assert len(migrated_data) == len(
+                    expected_test_data
+                ), f"Migrated {table_name} should have exactly {len(expected_test_data)} rows"
+
+            # Verify that the migrated data contains the expected test data
+            # Sort both dataframes for consistent comparison using 'id' column
+            expected_sorted = expected_test_data.sort_values("id").reset_index(
+                drop=True
+            )
+            migrated_sorted = migrated_data.sort_values("id").reset_index(drop=True)
+
+            # For tables with appended data, check that the original data is present
+            if table_name == "table_two":
+                # Check that all expected rows are present in the migrated data
+                for _, expected_row in expected_sorted.iterrows():
+                    matching_rows = migrated_sorted[
+                        migrated_sorted["id"] == expected_row["id"]
+                    ]
+                    assert (
+                        len(matching_rows) > 0
+                    ), f"Expected row with id {expected_row['id']} not found in migrated {table_name}"
+                    # Verify the first matching row has the expected values
+                    actual_row = matching_rows.iloc[0]
+                    assert (
+                        actual_row["name"] == expected_row["name"]
+                    ), f"Name mismatch for id {expected_row['id']} in {table_name}"
+                    assert (
+                        actual_row["value"] == expected_row["value"]
+                    ), f"Value mismatch for id {expected_row['id']} in {table_name}"
+            else:
+                # For tables without appended data, expect exact match
+                try:
+                    pd.testing.assert_frame_equal(
+                        expected_sorted,
+                        migrated_sorted,
+                        check_dtype=False,  # Allow minor type differences
+                    )
+                except AssertionError as e:
+                    raise AssertionError(
+                        f"Data content should match expected test data for {table_name} after migration: {e}"
+                    )
+
+    def test_migrate_empty_catalog(self):
+        """Test migration of an empty catalog."""
+        # Create empty catalog with old format
+        dc.init()
+        empty_catalog_name = f"empty_{uuid.uuid4()}"
+        dc.put_catalog(
+            empty_catalog_name, catalog=Catalog(config=self.catalog_properties)
+        )
+
+        with patched_canonical_string(use_old_format=True):
+            # Just create a namespace, no tables
+            dc.create_namespace(namespace="empty_namespace", catalog=empty_catalog_name)
+
+        # Create destination catalog for migration
+        dest_catalog_name = f"dest_{uuid.uuid4()}"
+        dc.put_catalog(
+            dest_catalog_name, catalog=Catalog(config=self.dest_catalog_properties)
+        )
+
+        # Perform migration
+        source_url = f"dc://{empty_catalog_name}/"
+        dest_url = f"dc://{dest_catalog_name}/"
+
+        success = migrate_catalog(source_url, dest_url, dry_run=False)
+        assert success, "Migration of empty catalog should succeed"
+
+        # Verify namespace exists in destination
+        assert dc.namespace_exists(
+            namespace="empty_namespace", catalog=dest_catalog_name
+        ), "Namespace should exist in migrated catalog"
+
+    def test_migration_error_handling(self):
+        """Test migration error handling for invalid inputs."""
+        # Test migration with non-existent source
+        invalid_source = f"dc://{self.temp_dir}/nonexistent/"
+        dest_url = f"dc://{self.dest_dir}/"
+
+        # This should handle the error gracefully
+        success = migrate_catalog(invalid_source, dest_url, dry_run=True)
+        # May succeed or fail depending on implementation, but shouldn't crash
+        assert isinstance(success, bool), "Should return boolean result"
+
+    def test_canonical_string_format_differences(self):
+        """Test that old and new canonical string formats are actually different."""
+        # Create hierarchy of locators
+        ns_locator = NamespaceLocator({"namespace": "test_ns"})
+        table_locator = TableLocator(
+            {"namespaceLocator": ns_locator, "tableName": "test_table"}
+        )
+        table_version_locator = TableVersionLocator(
+            {"tableLocator": table_locator, "version": "1"}
+        )
+        stream_locator = StreamLocator(
+            {"tableVersionLocator": table_version_locator, "streamFormat": "deltacat"}
+        )
+
+        # Test each level shows difference between old and new format
+        test_cases = [
+            ("namespace", ns_locator, True),  # Namespace should be same (no parent)
+            ("table", table_locator, False),  # Table should be different
+            (
+                "table_version",
+                table_version_locator,
+                False,
+            ),  # Table version should be different
+            ("stream", stream_locator, False),  # Stream should be different
+        ]
+
+        for obj_type, locator, should_be_same in test_cases:
+            new_format = locator.canonical_string()
+
+            with patched_canonical_string(use_old_format=True):
+                old_format = locator.canonical_string()
+
+            if should_be_same:
+                assert (
+                    old_format == new_format
+                ), f"{obj_type} canonical strings should be the same"
+            else:
+                assert (
+                    old_format != new_format
+                ), f"{obj_type} canonical strings should be different"
+                assert (
+                    "|" in old_format
+                ), f"{obj_type} old format should contain separator"
+                # New format should be a suffix of old format
+                assert old_format.endswith(
+                    f"|{new_format}"
+                ), f"{obj_type} old format should end with new format"
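The new `deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py` module exercised by the tests above exposes `migrate_catalog` and `patched_canonical_string`. The sketch below condenses the workflow from those tests into a minimal migration script; it only uses calls that appear in the diff, and the catalog names and root paths are illustrative placeholders, not values from this release. As in the tests, the destination catalog is registered with `dc.put_catalog` before the migration runs.

```python
# Minimal migration sketch based on the test workflow above.
# Assumptions: "legacy"/"migrated" catalog names and /tmp roots are placeholders.
import deltacat as dc
from deltacat import Catalog
from deltacat.catalog.model.properties import CatalogProperties
from deltacat.experimental.compatibility.backfill_locator_to_id_mappings import (
    migrate_catalog,
)

dc.init()

# Register the old-format source catalog and an empty destination catalog.
dc.put_catalog(
    "legacy", catalog=Catalog(config=CatalogProperties(root="/tmp/legacy_catalog"))
)
dc.put_catalog(
    "migrated", catalog=Catalog(config=CatalogProperties(root="/tmp/migrated_catalog"))
)

# Preview the migration first; per the tests, a dry run leaves the destination untouched.
assert migrate_catalog("dc://legacy/", "dc://migrated/", dry_run=True)

# Then perform the actual migration to the new canonical string format.
migrate_catalog("dc://legacy/", "dc://migrated/", dry_run=False)
```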