deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,22 @@
|
|
1
|
+
"""
|
2
|
+
Common utility functions for main storage compaction tests.
|
3
|
+
|
4
|
+
These functions are shared between incremental and multiple rounds compaction tests.
|
5
|
+
"""
|
1
6
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
7
|
from __future__ import annotations
|
3
8
|
from enum import Enum
|
4
|
-
from typing import Any, Dict, List, Optional
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple
|
5
10
|
import datetime as dt
|
6
|
-
from boto3.resources.base import ServiceResource
|
7
11
|
from datetime import timezone
|
8
12
|
|
9
|
-
|
10
|
-
|
11
|
-
|
13
|
+
import tempfile
|
14
|
+
import os
|
15
|
+
import shutil
|
16
|
+
|
17
|
+
import pyarrow as pa
|
18
|
+
|
19
|
+
|
12
20
|
from deltacat.tests.compute.test_util_constant import (
|
13
21
|
BASE_TEST_SOURCE_NAMESPACE,
|
14
22
|
BASE_TEST_SOURCE_TABLE_NAME,
|
@@ -26,11 +34,10 @@ from deltacat.compute.compactor import (
|
|
26
34
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
27
35
|
CompactionSessionAuditInfo,
|
28
36
|
)
|
29
|
-
|
30
37
|
from deltacat.storage.model.partition import (
|
31
38
|
PartitionLocator,
|
32
39
|
PartitionScheme,
|
33
|
-
PartitionKey as
|
40
|
+
PartitionKey as StoragePartitionKey,
|
34
41
|
)
|
35
42
|
from deltacat.storage.model.stream import StreamLocator
|
36
43
|
from deltacat.storage.model.table_version import TableVersionLocator
|
@@ -39,8 +46,22 @@ from deltacat.storage.model.namespace import NamespaceLocator
|
|
39
46
|
from deltacat.storage.model.sort_key import (
|
40
47
|
SortScheme,
|
41
48
|
)
|
49
|
+
from deltacat.storage.model.delta import (
|
50
|
+
Delta,
|
51
|
+
DeltaType,
|
52
|
+
)
|
53
|
+
from deltacat.storage.model.partition import (
|
54
|
+
Partition,
|
55
|
+
PartitionKeyList,
|
56
|
+
)
|
57
|
+
from deltacat.storage.model.stream import Stream
|
58
|
+
from deltacat.storage.model.transform import IdentityTransform
|
59
|
+
from deltacat.storage.model.schema import Schema
|
42
60
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
43
61
|
|
62
|
+
from deltacat.storage import metastore
|
63
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
64
|
+
|
44
65
|
|
45
66
|
class PartitionKeyType(str, Enum):
|
46
67
|
INT = "int"
|
@@ -80,113 +101,741 @@ def get_test_partition_locator(partition_id):
|
|
80
101
|
return partition_locator
|
81
102
|
|
82
103
|
|
83
|
-
def
|
104
|
+
def create_main_deltacat_storage_kwargs() -> Dict[str, Any]:
|
105
|
+
"""
|
106
|
+
Helper function to create main deltacat storage kwargs
|
107
|
+
|
108
|
+
Returns: kwargs to use for main deltacat storage, i.e. {"catalog": CatalogProperties(...)}
|
109
|
+
"""
|
110
|
+
temp_dir = tempfile.mkdtemp()
|
111
|
+
catalog = CatalogProperties(root=temp_dir)
|
112
|
+
return {"catalog": catalog}
|
113
|
+
|
114
|
+
|
115
|
+
def clean_up_main_deltacat_storage_kwargs(storage_kwargs: Dict[str, Any]):
|
116
|
+
"""
|
117
|
+
Cleans up directory created by create_main_deltacat_storage_kwargs
|
118
|
+
"""
|
119
|
+
catalog = storage_kwargs["catalog"]
|
120
|
+
if hasattr(catalog, "root") and os.path.exists(catalog.root):
|
121
|
+
shutil.rmtree(catalog.root)
|
122
|
+
|
123
|
+
|
124
|
+
def _create_table_main(
|
84
125
|
namespace: str,
|
85
126
|
table_name: str,
|
86
127
|
table_version: str,
|
87
128
|
sort_keys: Optional[List[Any]],
|
88
129
|
partition_keys: Optional[List[PartitionKey]],
|
130
|
+
input_deltas: Optional[pa.Table],
|
89
131
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
90
132
|
):
|
91
|
-
|
92
|
-
|
133
|
+
"""
|
134
|
+
Main storage version of _create_table that works for both incremental and multiple rounds tests.
|
135
|
+
|
136
|
+
For incremental tests, input_deltas is provided to extract schema.
|
137
|
+
For multiple rounds tests, input_deltas can be None and we use a simpler approach.
|
138
|
+
"""
|
139
|
+
# Create namespace first
|
140
|
+
metastore.create_namespace(namespace=namespace, **ds_mock_kwargs)
|
141
|
+
|
142
|
+
# Handle schema creation
|
143
|
+
if input_deltas is not None:
|
144
|
+
# Incremental test approach - extract schema from input deltas
|
145
|
+
schema = input_deltas.schema
|
146
|
+
|
147
|
+
# Add partition key fields to schema if they're not already present
|
148
|
+
if partition_keys:
|
149
|
+
for pk in partition_keys:
|
150
|
+
field_name = pk.key_name
|
151
|
+
if field_name not in schema.names:
|
152
|
+
# Add partition key field with appropriate type
|
153
|
+
if pk.key_type == PartitionKeyType.INT:
|
154
|
+
field_type = pa.int32()
|
155
|
+
elif pk.key_type == PartitionKeyType.STRING:
|
156
|
+
field_type = pa.string()
|
157
|
+
elif (
|
158
|
+
pk.key_type == PartitionKeyType.TIMESTAMP
|
159
|
+
): # Handle timestamp type properly
|
160
|
+
field_type = pa.timestamp("us")
|
161
|
+
else:
|
162
|
+
field_type = pa.string() # Default to string
|
163
|
+
|
164
|
+
schema = schema.append(pa.field(field_name, field_type))
|
165
|
+
|
166
|
+
schema_obj = Schema.of(schema=schema)
|
167
|
+
else:
|
168
|
+
# Multiple rounds test approach - use None for schema (will be set later)
|
169
|
+
schema_obj = None
|
93
170
|
|
94
|
-
ds.create_namespace(namespace, {}, **ds_mock_kwargs)
|
95
|
-
partition_scheme = (
|
96
|
-
PartitionScheme.of(
|
97
|
-
[PartitionSchemeKey.of(key.key_name) for key in partition_keys]
|
98
|
-
)
|
99
|
-
if partition_keys
|
100
|
-
else None
|
101
|
-
)
|
102
171
|
sort_scheme = SortScheme.of(sort_keys) if sort_keys else None
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
172
|
+
|
173
|
+
# Convert test partition keys to storage partition keys
|
174
|
+
storage_partition_keys = []
|
175
|
+
if partition_keys:
|
176
|
+
for pk in partition_keys:
|
177
|
+
storage_partition_key = StoragePartitionKey.of(
|
178
|
+
key=[pk.key_name],
|
179
|
+
name=pk.key_name,
|
180
|
+
transform=IdentityTransform.of(),
|
181
|
+
)
|
182
|
+
storage_partition_keys.append(storage_partition_key)
|
183
|
+
|
184
|
+
# Create partition scheme
|
185
|
+
partition_scheme = None
|
186
|
+
if storage_partition_keys:
|
187
|
+
partition_scheme = PartitionScheme.of(
|
188
|
+
keys=PartitionKeyList.of(storage_partition_keys),
|
189
|
+
scheme_id="default_partition_scheme",
|
190
|
+
)
|
191
|
+
|
192
|
+
# Create table version (which creates table and stream automatically)
|
193
|
+
metastore.create_table_version(
|
194
|
+
namespace=namespace,
|
195
|
+
table_name=table_name,
|
196
|
+
table_version=table_version,
|
197
|
+
schema=schema_obj,
|
108
198
|
partition_scheme=partition_scheme,
|
109
|
-
|
199
|
+
sort_keys=sort_scheme,
|
110
200
|
**ds_mock_kwargs,
|
111
201
|
)
|
202
|
+
|
112
203
|
return namespace, table_name, table_version
|
113
204
|
|
114
205
|
|
115
|
-
def
|
206
|
+
def create_src_table_main(
|
116
207
|
sort_keys: Optional[List[Any]],
|
117
208
|
partition_keys: Optional[List[PartitionKey]],
|
209
|
+
input_deltas: Optional[pa.Table],
|
118
210
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
119
211
|
):
|
212
|
+
"""
|
213
|
+
Main storage version of create_src_table
|
214
|
+
"""
|
120
215
|
source_namespace: str = BASE_TEST_SOURCE_NAMESPACE
|
121
216
|
source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME
|
122
217
|
source_table_version: str = BASE_TEST_SOURCE_TABLE_VERSION
|
123
|
-
return
|
218
|
+
return _create_table_main(
|
124
219
|
source_namespace,
|
125
220
|
source_table_name,
|
126
221
|
source_table_version,
|
127
222
|
sort_keys,
|
128
223
|
partition_keys,
|
224
|
+
input_deltas,
|
129
225
|
ds_mock_kwargs,
|
130
226
|
)
|
131
227
|
|
132
228
|
|
133
|
-
def
|
229
|
+
def create_destination_table_main(
|
134
230
|
sort_keys: Optional[List[Any]],
|
135
231
|
partition_keys: Optional[List[PartitionKey]],
|
232
|
+
input_deltas: Optional[pa.Table],
|
136
233
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
137
234
|
):
|
235
|
+
"""
|
236
|
+
Main storage version of create_destination_table
|
237
|
+
"""
|
138
238
|
destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE
|
139
239
|
destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME
|
140
240
|
destination_table_version: str = BASE_TEST_DESTINATION_TABLE_VERSION
|
141
|
-
return
|
241
|
+
return _create_table_main(
|
142
242
|
destination_namespace,
|
143
243
|
destination_table_name,
|
144
244
|
destination_table_version,
|
145
245
|
sort_keys,
|
146
246
|
partition_keys,
|
247
|
+
input_deltas,
|
147
248
|
ds_mock_kwargs,
|
148
249
|
)
|
149
250
|
|
150
251
|
|
151
|
-
def
|
252
|
+
def create_rebase_table_main(
|
152
253
|
sort_keys: Optional[List[Any]],
|
153
254
|
partition_keys: Optional[List[PartitionKey]],
|
255
|
+
input_deltas: Optional[pa.Table],
|
154
256
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
155
257
|
):
|
258
|
+
"""
|
259
|
+
Main storage version of create_rebase_table
|
260
|
+
"""
|
156
261
|
rebasing_namespace = REBASING_NAMESPACE
|
157
262
|
rebasing_table_name = REBASING_TABLE_NAME
|
158
263
|
rebasing_table_version = REBASING_TABLE_VERSION
|
159
|
-
return
|
264
|
+
return _create_table_main(
|
160
265
|
rebasing_namespace,
|
161
266
|
rebasing_table_name,
|
162
267
|
rebasing_table_version,
|
163
268
|
sort_keys,
|
164
269
|
partition_keys,
|
270
|
+
input_deltas,
|
165
271
|
ds_mock_kwargs,
|
166
272
|
)
|
167
273
|
|
168
274
|
|
169
|
-
def
|
170
|
-
|
275
|
+
def get_rci_from_partition(
|
276
|
+
partition_locator: PartitionLocator, deltacat_storage=None, **kwargs
|
277
|
+
) -> RoundCompletionInfo:
|
278
|
+
"""
|
279
|
+
Read RoundCompletionInfo from a partition metafile.
|
280
|
+
|
281
|
+
Args:
|
282
|
+
partition_locator: Locator of the partition containing the RoundCompletionInfo
|
283
|
+
deltacat_storage: Storage implementation (defaults to metastore)
|
284
|
+
**kwargs: Additional arguments to pass to deltacat_storage.get_partition (e.g., catalog)
|
285
|
+
|
286
|
+
Returns:
|
287
|
+
RoundCompletionInfo object from the partition, or None if not found
|
288
|
+
"""
|
289
|
+
from deltacat.storage import metastore
|
290
|
+
|
291
|
+
if deltacat_storage is None:
|
292
|
+
deltacat_storage = metastore
|
293
|
+
|
294
|
+
partition = deltacat_storage.get_partition(
|
295
|
+
partition_locator.stream_locator, partition_locator.partition_values, **kwargs
|
296
|
+
)
|
297
|
+
|
298
|
+
if partition and partition.compaction_round_completion_info:
|
299
|
+
return partition.compaction_round_completion_info
|
300
|
+
|
301
|
+
return None
|
171
302
|
|
172
|
-
|
173
|
-
|
174
|
-
|
303
|
+
|
304
|
+
def _add_deltas_to_partition_main(
|
305
|
+
deltas_ingredients: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
|
306
|
+
partition: Optional[Partition],
|
307
|
+
ds_mock_kwargs: Optional[Dict[str, Any]],
|
308
|
+
) -> Tuple[Optional[Delta], int]:
|
309
|
+
"""
|
310
|
+
Add deltas to a partition using main storage
|
311
|
+
"""
|
312
|
+
all_deltas_length = 0
|
313
|
+
incremental_delta = None
|
314
|
+
for (delta_data, delta_type, delete_parameters) in deltas_ingredients:
|
315
|
+
staged_delta: Delta = metastore.stage_delta(
|
316
|
+
delta_data,
|
317
|
+
partition,
|
318
|
+
delta_type,
|
319
|
+
entry_params=delete_parameters,
|
320
|
+
**ds_mock_kwargs,
|
321
|
+
)
|
322
|
+
incremental_delta = metastore.commit_delta(
|
323
|
+
staged_delta,
|
324
|
+
**ds_mock_kwargs,
|
325
|
+
)
|
326
|
+
all_deltas_length += len(delta_data) if delta_data else 0
|
327
|
+
return incremental_delta, all_deltas_length
|
328
|
+
|
329
|
+
|
330
|
+
def add_late_deltas_to_partition_main(
|
331
|
+
late_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
|
332
|
+
source_partition: Optional[Partition],
|
333
|
+
ds_mock_kwargs: Optional[Dict[str, Any]],
|
334
|
+
) -> Tuple[Optional[Delta], int]:
|
335
|
+
"""
|
336
|
+
Add late deltas to a partition using main storage
|
337
|
+
"""
|
338
|
+
return _add_deltas_to_partition_main(late_deltas, source_partition, ds_mock_kwargs)
|
339
|
+
|
340
|
+
|
341
|
+
def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
|
342
|
+
sort_keys: Optional[List[Any]],
|
343
|
+
partition_keys: Optional[List[PartitionKey]],
|
344
|
+
input_deltas: List[pa.Table],
|
345
|
+
partition_values: Optional[List[Any]],
|
346
|
+
ds_mock_kwargs: Optional[Dict[str, Any]],
|
347
|
+
) -> Tuple[Stream, Stream, Optional[Stream], bool]:
|
348
|
+
"""
|
349
|
+
Main storage version of multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy
|
350
|
+
"""
|
351
|
+
# For multiple rounds, we need to extract the first delta to get schema
|
352
|
+
first_delta_table = input_deltas[0][0] if input_deltas else None
|
353
|
+
source_namespace, source_table_name, source_table_version = create_src_table_main(
|
354
|
+
sort_keys, partition_keys, first_delta_table, ds_mock_kwargs
|
175
355
|
)
|
176
|
-
return RoundCompletionInfo(**rcf_file_output)
|
177
356
|
|
357
|
+
source_table_stream: Stream = metastore.get_stream(
|
358
|
+
namespace=source_namespace,
|
359
|
+
table_name=source_table_name,
|
360
|
+
table_version=source_table_version,
|
361
|
+
**ds_mock_kwargs,
|
362
|
+
)
|
363
|
+
|
364
|
+
# Convert partition values to correct types
|
365
|
+
converted_partition_values = []
|
366
|
+
if partition_values and partition_keys:
|
367
|
+
for i, (value, pk) in enumerate(zip(partition_values, partition_keys)):
|
368
|
+
if pk.key_type == PartitionKeyType.INT:
|
369
|
+
converted_partition_values.append(int(value))
|
370
|
+
else:
|
371
|
+
converted_partition_values.append(value)
|
372
|
+
else:
|
373
|
+
converted_partition_values = partition_values
|
374
|
+
|
375
|
+
staged_partition: Partition = metastore.stage_partition(
|
376
|
+
source_table_stream,
|
377
|
+
converted_partition_values,
|
378
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
379
|
+
**ds_mock_kwargs,
|
380
|
+
)
|
381
|
+
|
382
|
+
is_delete = False
|
383
|
+
input_delta_length = 0
|
384
|
+
for (
|
385
|
+
input_delta,
|
386
|
+
input_delta_type,
|
387
|
+
input_delta_parameters,
|
388
|
+
) in input_deltas:
|
389
|
+
if input_delta_type is DeltaType.DELETE:
|
390
|
+
is_delete = True
|
391
|
+
staged_delta = metastore.stage_delta(
|
392
|
+
input_delta,
|
393
|
+
staged_partition,
|
394
|
+
input_delta_type,
|
395
|
+
entry_params=input_delta_parameters,
|
396
|
+
**ds_mock_kwargs,
|
397
|
+
)
|
398
|
+
metastore.commit_delta(staged_delta, **ds_mock_kwargs)
|
399
|
+
input_delta_length += len(input_delta)
|
400
|
+
metastore.commit_partition(staged_partition, **ds_mock_kwargs)
|
401
|
+
|
402
|
+
(
|
403
|
+
destination_table_namespace,
|
404
|
+
destination_table_name,
|
405
|
+
destination_table_version,
|
406
|
+
) = create_destination_table_main(
|
407
|
+
sort_keys, partition_keys, first_delta_table, ds_mock_kwargs
|
408
|
+
)
|
409
|
+
destination_table_stream: Stream = metastore.get_stream(
|
410
|
+
namespace=destination_table_namespace,
|
411
|
+
table_name=destination_table_name,
|
412
|
+
table_version=destination_table_version,
|
413
|
+
**ds_mock_kwargs,
|
414
|
+
)
|
415
|
+
|
416
|
+
# Always create rebase table for multiple rounds tests
|
417
|
+
(
|
418
|
+
rebasing_table_namespace,
|
419
|
+
rebasing_table_name,
|
420
|
+
rebasing_table_version,
|
421
|
+
) = create_rebase_table_main(
|
422
|
+
sort_keys, partition_keys, first_delta_table, ds_mock_kwargs
|
423
|
+
)
|
424
|
+
rebasing_table_stream: Stream = metastore.get_stream(
|
425
|
+
namespace=rebasing_table_namespace,
|
426
|
+
table_name=rebasing_table_name,
|
427
|
+
table_version=rebasing_table_version,
|
428
|
+
**ds_mock_kwargs,
|
429
|
+
)
|
430
|
+
|
431
|
+
# Stage partition and add deltas to rebase table
|
432
|
+
rebased_staged_partition: Partition = metastore.stage_partition(
|
433
|
+
rebasing_table_stream,
|
434
|
+
converted_partition_values,
|
435
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
436
|
+
**ds_mock_kwargs,
|
437
|
+
)
|
438
|
+
|
439
|
+
for (
|
440
|
+
input_delta,
|
441
|
+
input_delta_type,
|
442
|
+
input_delta_parameters,
|
443
|
+
) in input_deltas:
|
444
|
+
staged_delta = metastore.stage_delta(
|
445
|
+
input_delta,
|
446
|
+
rebased_staged_partition,
|
447
|
+
input_delta_type,
|
448
|
+
entry_params=input_delta_parameters,
|
449
|
+
**ds_mock_kwargs,
|
450
|
+
)
|
451
|
+
metastore.commit_delta(staged_delta, **ds_mock_kwargs)
|
452
|
+
metastore.commit_partition(rebased_staged_partition, **ds_mock_kwargs)
|
453
|
+
|
454
|
+
return (
|
455
|
+
source_table_stream,
|
456
|
+
destination_table_stream,
|
457
|
+
rebasing_table_stream,
|
458
|
+
is_delete,
|
459
|
+
)
|
460
|
+
|
461
|
+
|
462
|
+
def create_src_w_deltas_destination_plus_destination_main(
|
463
|
+
sort_keys: Optional[List[Any]],
|
464
|
+
partition_keys: Optional[List[PartitionKey]],
|
465
|
+
input_deltas: pa.Table,
|
466
|
+
input_delta_type: DeltaType,
|
467
|
+
partition_values: Optional[List[Any]],
|
468
|
+
ds_mock_kwargs: Optional[Dict[str, Any]],
|
469
|
+
simulate_is_inplace: bool = False,
|
470
|
+
) -> Tuple[Stream, Stream, Optional[Stream], str, str, str]:
|
471
|
+
"""
|
472
|
+
Create source with deltas and destination tables for incremental compaction testing
|
473
|
+
"""
|
474
|
+
source_namespace, source_table_name, source_table_version = create_src_table_main(
|
475
|
+
sort_keys, partition_keys, input_deltas, ds_mock_kwargs
|
476
|
+
)
|
178
477
|
|
179
|
-
|
180
|
-
|
478
|
+
source_table_stream: Stream = metastore.get_stream(
|
479
|
+
namespace=source_namespace,
|
480
|
+
table_name=source_table_name,
|
481
|
+
table_version=source_table_version,
|
482
|
+
**ds_mock_kwargs,
|
483
|
+
)
|
484
|
+
|
485
|
+
# Convert partition values to correct types
|
486
|
+
converted_partition_values = []
|
487
|
+
if partition_values and partition_keys:
|
488
|
+
for i, (value, pk) in enumerate(zip(partition_values, partition_keys)):
|
489
|
+
if pk.key_type == PartitionKeyType.INT:
|
490
|
+
converted_partition_values.append(int(value))
|
491
|
+
else:
|
492
|
+
converted_partition_values.append(value)
|
493
|
+
else:
|
494
|
+
converted_partition_values = partition_values
|
495
|
+
|
496
|
+
staged_partition: Partition = metastore.stage_partition(
|
497
|
+
source_table_stream,
|
498
|
+
converted_partition_values,
|
499
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
500
|
+
**ds_mock_kwargs,
|
501
|
+
)
|
502
|
+
metastore.commit_delta(
|
503
|
+
metastore.stage_delta(
|
504
|
+
input_deltas, staged_partition, input_delta_type, **ds_mock_kwargs
|
505
|
+
),
|
506
|
+
**ds_mock_kwargs,
|
507
|
+
)
|
508
|
+
metastore.commit_partition(staged_partition, **ds_mock_kwargs)
|
509
|
+
source_table_stream_after_committed: Stream = metastore.get_stream(
|
510
|
+
namespace=source_namespace,
|
511
|
+
table_name=source_table_name,
|
512
|
+
table_version=source_table_version,
|
513
|
+
**ds_mock_kwargs,
|
514
|
+
)
|
515
|
+
|
516
|
+
destination_table_namespace: Optional[str] = None
|
517
|
+
destination_table_name: Optional[str] = None
|
518
|
+
destination_table_version: Optional[str] = None
|
519
|
+
if not simulate_is_inplace:
|
520
|
+
(
|
521
|
+
destination_table_namespace,
|
522
|
+
destination_table_name,
|
523
|
+
destination_table_version,
|
524
|
+
) = create_destination_table_main(
|
525
|
+
sort_keys, partition_keys, input_deltas, ds_mock_kwargs
|
526
|
+
)
|
527
|
+
else:
|
528
|
+
destination_table_namespace = source_namespace
|
529
|
+
destination_table_name = source_table_name
|
530
|
+
destination_table_version = source_table_version
|
531
|
+
|
532
|
+
destination_table_stream: Stream = metastore.get_stream(
|
533
|
+
namespace=destination_table_namespace,
|
534
|
+
table_name=destination_table_name,
|
535
|
+
table_version=destination_table_version,
|
536
|
+
**ds_mock_kwargs,
|
537
|
+
)
|
538
|
+
|
539
|
+
return (
|
540
|
+
source_table_stream_after_committed,
|
541
|
+
destination_table_stream,
|
542
|
+
None,
|
543
|
+
source_namespace,
|
544
|
+
source_table_name,
|
545
|
+
source_table_version,
|
546
|
+
)
|
547
|
+
|
548
|
+
|
549
|
+
def create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: pa.Table,
    input_delta_type: DeltaType,
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Stream, Stream, Optional[Stream]]:
    """
    Main storage version of create_src_w_deltas_destination_rebase_w_deltas_strategy

    Creates source table with deltas, destination table, and rebase table for rebase testing.
    This test scenario sets up different source and rebase partition locators to simulate
    scenarios like hash bucket count changes.

    Args:
        sort_keys: Optional sort keys forwarded to the table-creation helpers.
        partition_keys: Optional partition keys; drives partition-value conversion.
        input_deltas: PyArrow table committed as a delta to both source and rebase tables.
        input_delta_type: Delta type used when committing the source-table delta.
        partition_values: Optional raw partition values, converted per key type.
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.

    Returns:
        Tuple of (source stream after commit, destination stream,
        rebase stream after commit).
    """
    from deltacat.utils.common import current_time_ms

    last_stream_position = current_time_ms()
    source_namespace, source_table_name, source_table_version = create_src_table_main(
        sort_keys, partition_keys, input_deltas, ds_mock_kwargs
    )

    source_table_stream: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **ds_mock_kwargs,
    )

    # Convert partition values to correct types, including timestamp handling.
    # (Fixed idiom: the index from enumerate() was unused, so plain zip suffices.)
    converted_partition_values = []
    if partition_values and partition_keys:
        for value, pk in zip(partition_values, partition_keys):
            if pk.key_type == PartitionKeyType.INT:
                converted_partition_values.append(int(value))
            elif pk.key_type == PartitionKeyType.TIMESTAMP:
                # Handle ISO-8601 UTC timestamp strings (e.g. "2023-01-01T00:00:00Z")
                if isinstance(value, str) and "T" in value and value.endswith("Z"):
                    import pandas as pd

                    ts = pd.to_datetime(value)
                    # Convert to microseconds since epoch for PyArrow timestamp[us]
                    converted_partition_values.append(int(ts.timestamp() * 1_000_000))
                else:
                    converted_partition_values.append(value)
            else:
                converted_partition_values.append(value)
    else:
        converted_partition_values = partition_values

    staged_partition: Partition = metastore.stage_partition(
        source_table_stream,
        converted_partition_values,
        partition_scheme_id="default_partition_scheme" if partition_keys else None,
        **ds_mock_kwargs,
    )
    staged_delta: Delta = metastore.stage_delta(
        input_deltas, staged_partition, input_delta_type, **ds_mock_kwargs
    )
    staged_delta.locator.stream_position = last_stream_position
    metastore.commit_delta(staged_delta, **ds_mock_kwargs)
    metastore.commit_partition(staged_partition, **ds_mock_kwargs)

    source_table_stream_after_committed: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **ds_mock_kwargs,
    )

    # Create the destination table
    (
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    ) = create_destination_table_main(
        sort_keys, partition_keys, input_deltas, ds_mock_kwargs
    )

    # Create the rebase table
    (
        rebase_table_namespace,
        rebase_table_name,
        rebase_table_version,
    ) = create_rebase_table_main(
        sort_keys, partition_keys, input_deltas, ds_mock_kwargs
    )

    rebasing_table_stream: Stream = metastore.get_stream(
        namespace=rebase_table_namespace,
        table_name=rebase_table_name,
        table_version=rebase_table_version,
        **ds_mock_kwargs,
    )

    staged_partition: Partition = metastore.stage_partition(
        rebasing_table_stream,
        converted_partition_values,
        partition_scheme_id="default_partition_scheme" if partition_keys else None,
        **ds_mock_kwargs,
    )
    # NOTE(review): unlike the source-table commit above, no delta type is passed
    # here, so the rebase delta uses stage_delta's default type rather than
    # input_delta_type — confirm this asymmetry is intentional.
    staged_delta: Delta = metastore.stage_delta(
        input_deltas, staged_partition, **ds_mock_kwargs
    )
    staged_delta.locator.stream_position = last_stream_position
    metastore.commit_delta(staged_delta, **ds_mock_kwargs)
    metastore.commit_partition(staged_partition, **ds_mock_kwargs)

    # Get destination stream
    destination_table_stream: Stream = metastore.get_stream(
        namespace=destination_table_namespace,
        table_name=destination_table_name,
        table_version=destination_table_version,
        **ds_mock_kwargs,
    )

    rebased_stream_after_committed: Stream = metastore.get_stream(
        namespace=rebase_table_namespace,
        table_name=rebase_table_name,
        table_version=rebase_table_version,
        **ds_mock_kwargs,
    )

    return (
        source_table_stream_after_committed,
        destination_table_stream,
        rebased_stream_after_committed,
    )
|
677
|
+
|
678
|
+
|
679
|
+
def create_incremental_deltas_on_source_table_main(
    source_namespace: str,
    source_table_name: str,
    source_table_version: str,
    source_table_stream: Stream,
    partition_values_param: Optional[List[Any]],
    incremental_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
    ds_mock_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Optional[PartitionLocator], Optional[Delta], int, bool]:
    """
    Main storage version of create_incremental_deltas_on_source_table

    Commits each non-None incremental delta to the source table's partition,
    creating (staging + committing) the partition first if it does not exist yet.

    Args:
        source_namespace: Namespace of the source table.
        source_table_name: Name of the source table.
        source_table_version: Version of the source table.
        source_table_stream: Stream the partition is looked up / staged against.
        partition_values_param: Raw partition values; converted below for lookup.
        incremental_deltas: List of (table, delta type, entry params) tuples;
            entries with a None table are skipped.
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.

    Returns:
        Tuple of (partition locator after commits, last committed delta,
        total record count across committed deltas, whether any delta was a
        DELETE). The first two elements are None when every delta table was None.
    """
    total_records = 0
    has_delete_deltas = False
    new_delta = None

    # Convert partition values for partition lookup (same as in other helper functions)
    converted_partition_values_for_lookup = partition_values_param
    if (
        partition_values_param
        and source_table_stream.partition_scheme
        and source_table_stream.partition_scheme.keys
    ):
        converted_partition_values_for_lookup = []

        # Get partition field names from the storage partition scheme
        storage_partition_keys = source_table_stream.partition_scheme.keys
        partition_field_names = []

        for storage_key in storage_partition_keys:
            # Each storage PartitionKey has a 'key' property that contains FieldLocators
            # Extract the field name from the first FieldLocator
            field_name = storage_key.key[0] if storage_key.key else None
            partition_field_names.append(field_name)

        for i, value in enumerate(partition_values_param):
            # For timestamp fields like 'region_id', we need to convert the timestamp string
            if i < len(partition_field_names):
                # NOTE(review): field_name is bound but conversion below keys off
                # the VALUE's format, not the field name — confirm that is intended.
                field_name = partition_field_names[i]

                # Check if this is likely a timestamp field based on the value format
                if isinstance(value, str) and "T" in value and value.endswith("Z"):
                    # This looks like a timestamp string - convert it
                    import pandas as pd

                    ts = pd.to_datetime(value)
                    # Convert to microseconds since epoch for PyArrow timestamp[us]
                    converted_partition_values_for_lookup.append(
                        int(ts.timestamp() * 1_000_000)
                    )
                elif isinstance(value, str) and value.isdigit():
                    # This looks like an integer string
                    converted_partition_values_for_lookup.append(int(value))
                else:
                    # Keep as-is
                    converted_partition_values_for_lookup.append(value)
            else:
                # More values than partition fields: pass extras through unchanged.
                converted_partition_values_for_lookup.append(value)

    # Get the current partition to stage deltas against
    try:
        source_partition: Partition = metastore.get_partition(
            source_table_stream.locator,
            converted_partition_values_for_lookup,
            **ds_mock_kwargs,
        )
    except Exception:
        # If we can't get the partition, it might not exist yet. Try to create it.
        # Stage a new partition if it doesn't exist
        staged_partition: Partition = metastore.stage_partition(
            source_table_stream,
            converted_partition_values_for_lookup,
            partition_scheme_id="default_partition_scheme"
            if source_table_stream.partition_scheme
            else None,
            **ds_mock_kwargs,
        )
        # Commit the empty partition first
        metastore.commit_partition(staged_partition, **ds_mock_kwargs)

        # Now try to get it again
        source_partition: Partition = metastore.get_partition(
            source_table_stream.locator,
            converted_partition_values_for_lookup,
            **ds_mock_kwargs,
        )

    if source_partition is None:
        raise ValueError(
            f"Could not create or retrieve partition for values: {converted_partition_values_for_lookup}"
        )

    for delta_table, delta_type, properties_dict in incremental_deltas:
        # Skip None deltas (empty incremental deltas)
        if delta_table is None:
            continue

        total_records += len(delta_table)

        if delta_type == DeltaType.DELETE:
            has_delete_deltas = True

        # Stage and commit the delta
        staged_delta: Delta = metastore.stage_delta(
            delta_table,
            source_partition,
            delta_type,
            entry_params=properties_dict,
            **ds_mock_kwargs,
        )
        # new_delta tracks the most recently committed delta; only the last one
        # is returned to the caller.
        new_delta = metastore.commit_delta(staged_delta, **ds_mock_kwargs)

    # If all deltas were None, return None for new_delta
    if new_delta is None:
        return None, None, total_records, has_delete_deltas

    # Get updated stream after deltas were committed
    source_table_stream_after_committed: Stream = metastore.get_stream(
        source_namespace,
        source_table_name,
        source_table_version,
        **ds_mock_kwargs,
    )

    # Get updated partition after deltas were committed
    source_partition_after_committed: Partition = metastore.get_partition(
        source_table_stream_after_committed.locator,
        converted_partition_values_for_lookup,
        **ds_mock_kwargs,
    )

    return (
        source_partition_after_committed.locator,
        new_delta,
        total_records,
        has_delete_deltas,
    )
|
816
|
+
|
817
|
+
|
818
|
+
def get_compacted_delta_locator_from_partition(
    partition_locator: PartitionLocator, deltacat_storage=None, **kwargs
):
    """
    Get compacted delta locator from partition RoundCompletionInfo.

    Args:
        partition_locator: Locator of the partition containing the RoundCompletionInfo
        deltacat_storage: Storage implementation (defaults to metastore)
        **kwargs: Additional arguments to pass to get_rci_from_partition (e.g., catalog)

    Returns:
        DeltaLocator of the compacted delta, or None when the partition has no
        round completion info.
    """
    rci: RoundCompletionInfo = get_rci_from_partition(
        partition_locator, deltacat_storage, **kwargs
    )
    # A falsy RCI means no compaction round has completed for this partition.
    return rci.compacted_delta_locator if rci else None
|
190
839
|
|
191
840
|
|
192
841
|
def offer_iso8601_timestamp_list(
|
@@ -325,3 +974,27 @@ def assert_compaction_audit_no_hash_bucket(
|
|
325
974
|
for entry in audit_entries:
|
326
975
|
assert entry is not None
|
327
976
|
return True
|
977
|
+
|
978
|
+
|
979
|
+
def read_audit_file(audit_file_path: str, catalog_root: str) -> Dict[str, Any]:
    """
    Read audit file from any filesystem.

    Args:
        audit_file_path: Relative path to the audit file from catalog root
        catalog_root: Absolute path to the catalog root directory

    Returns:
        Dictionary containing audit data
    """
    import json
    import posixpath

    from deltacat.utils.filesystem import resolve_path_and_filesystem

    # Join under the catalog root to obtain the audit file's absolute location.
    full_path = posixpath.join(catalog_root, audit_file_path)
    resolved_path, fs = resolve_path_and_filesystem(full_path)

    # Read the raw UTF-8 bytes through the resolved filesystem and parse as JSON.
    with fs.open_input_stream(resolved_path) as stream:
        return json.loads(stream.read().decode("utf-8"))