deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff compares the content of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in that registry.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/steps/merge.py

@@ -7,10 +7,11 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
-from typing import Callable, Iterator, List, Optional, Tuple
+from typing import Callable, Iterator, List, Optional, Tuple, Set
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -31,13 +32,14 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
+    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
     DeltaLocator,
     DeltaType,
     Partition,
-
+    metastore,
 )
 from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
@@ -46,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
+    BUCKETING_SPEC_COMPLIANCE_PROFILE,
+    BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -57,6 +62,10 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+_EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
+_INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
+
+
 def _append_delta_type_column(table: pa.Table, value: np.bool_):
     return table.append_column(
         sc._DELTA_TYPE_COLUMN_FIELD,
@@ -85,9 +94,12 @@ def _build_incremental_table(
     # sort by delta file stream position now instead of sorting every row later
     is_delete = False
     for df_envelope in df_envelopes:
-
-
-
+        # Allow APPEND, UPSERT, and DELETE delta types
+        assert df_envelope.delta_type in (
+            DeltaType.APPEND,
+            DeltaType.UPSERT,
+            DeltaType.DELETE,
+        ), "Only APPEND, UPSERT, and DELETE delta types are supported"
         if df_envelope.delta_type == DeltaType.DELETE:
             is_delete = True
 
@@ -99,14 +111,35 @@ def _build_incremental_table(
         )
 
         hb_tables.append(table)
-    result =
+    result = _concat_or_coerce_tables(hb_tables)
     return result
 
 
+def _concat_or_coerce_tables(all_tables: List[pa.Table]) -> pa.Table:
+    try:
+        return pa.concat_tables(all_tables)
+    except pa.ArrowInvalid:
+        # Fallback path: schema evolution needed - try PyArrow's built-in unification
+        if all_tables:
+            try:
+                return pa.concat_tables(
+                    all_tables, promote_options="permissive", unify_schemas=True
+                )
+            except (pa.ArrowInvalid, TypeError, pa.ArrowNotImplementedError):
+                # If PyArrow unification fails, re-raise the original error
+                raise
+        else:
+            # Empty table list - should not happen but handle gracefully
+            raise RuntimeError("Expected at least one table to merge, but found none.")
+
+
 def _merge_tables(
     table: pa.Table,
     primary_keys: List[str],
     can_drop_duplicates: bool,
+    hb_index: int,
+    num_buckets: int,
+    original_fields: Set[str],
     compacted_table: Optional[pa.Table] = None,
 ) -> pa.Table:
     """
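The new `_concat_or_coerce_tables` helper above relies on PyArrow's schema promotion as its fallback. A minimal standalone sketch of that behavior (illustrative only, not deltacat code; assumes pyarrow >= 14.0 for `promote_options`):

```python
import pyarrow as pa

t1 = pa.table({"pk": ["a", "b"], "x": [1, 2]})
t2 = pa.table({"pk": ["c"], "x": [3], "y": [0.5]})  # evolved schema: extra column "y"

try:
    # Default behavior (promote_options="none") rejects mismatched schemas.
    pa.concat_tables([t1, t2])
except pa.ArrowInvalid as err:
    print("plain concat failed:", err)

# Permissive promotion unifies the schemas and null-fills the missing column.
unified = pa.concat_tables([t1, t2], promote_options="permissive")
print(unified.schema)             # pk: string, x: int64, y: double
print(unified["y"].to_pylist())   # [None, None, 0.5]
```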
@@ -125,6 +158,20 @@ def _merge_tables(
 
     all_tables.append(table)
 
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            table=all_tables[incremental_idx],
+            num_buckets=num_buckets,
+            primary_keys=primary_keys,
+            hb_index=hb_index,
+            log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
+        )
+
     if not primary_keys or not can_drop_duplicates:
         logger.info(
             f"Not dropping duplicates for primary keys={primary_keys} "
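The compliance check wired in above enforces that every record found in a hash bucket still maps back to that bucket's index. A hedged sketch of the invariant; the digest and bucketing below are illustrative stand-ins, not the package's `pk_digest_to_hash_bucket_index`:

```python
import hashlib

def illustrative_bucket_index(pk_digest: bytes, num_buckets: int) -> int:
    # Any stable digest -> integer -> modulo scheme gives a deterministic bucket;
    # deltacat's real digest and bucketing may differ.
    return int.from_bytes(hashlib.sha1(pk_digest).digest()[:8], "big") % num_buckets

num_buckets = 8
pk_digest = b"customer-42"
assigned = illustrative_bucket_index(pk_digest, num_buckets)
# Re-deriving the bucket for a record stored under `assigned` must yield the
# same index; anything else is the "hash bucket drift" the new check reports.
assert illustrative_bucket_index(pk_digest, num_buckets) == assigned
```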
@@ -134,7 +181,7 @@ def _merge_tables(
             all_tables[incremental_idx], DeltaType.DELETE
         )
         # we need not drop duplicates
-        return
+        return _concat_or_coerce_tables(all_tables)
 
     all_tables = generate_pk_hash_column(all_tables, primary_keys=primary_keys)
 
@@ -144,36 +191,170 @@ def _merge_tables(
         all_tables[incremental_idx], on=sc._PK_HASH_STRING_COLUMN_NAME
     )
 
+    # Always drop DELETE rows from incremental table
+    incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
+
+    # Default to using incremental records as-is, override only if merging is needed
+    incremental_data = incremental_table
+
     if compacted_table:
         compacted_table = all_tables[0]
 
-
-
-
-
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
             )
+
+        records_to_update = pc.is_in(
+            compacted_pk_hash_str,
+            incremental_pk_hash_str,
         )
 
+        records_to_keep = pc.invert(records_to_update)
+
+        # Keep records that don't have updates
         result_table_list.append(compacted_table.filter(records_to_keep))
 
-
-
+        # Override default if merging is needed
+        if pc.sum(records_to_update).as_py() > 0:  # There are records to update
+            old_records_to_update = compacted_table.filter(records_to_update)
+            # Perform partial UPSERT: merge old and new records field by field
+            incremental_data = _merge_records_partially(
+                old_records=old_records_to_update,
+                new_records=incremental_table,
+                original_fields=original_fields,
+            )
+
+    # Add the determined incremental data
+    result_table_list.append(incremental_data)
 
-    final_table =
+    final_table = _concat_or_coerce_tables(result_table_list)
     final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])
 
     return final_table
 
 
+def _merge_records_partially(
+    old_records: pa.Table, new_records: pa.Table, original_fields: Set[str]
+) -> pa.Table:
+    """
+    Merge records field by field for partial UPSERT behavior. Fills missing
+    fields in new_records with values from old_records.
+
+    Args:
+        old_records: Records from the compacted table that need updates
+        new_records: New records with potential partial field updates
+
+    Returns:
+        Table with merged records where missing fields preserve old values
+    """
+    # Get field sets (excluding hash column which is used for joining)
+    old_fields = set(old_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
+    new_fields = set(new_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
+
+    # Find fields that are missing from new_records but exist in old_records
+    missing_fields = old_fields - new_fields
+
+    # Find fields that were auto-added by schema coercion (missing from original user data)
+    # These should be treated as missing fields and filled from old_records
+    auto_added_null_fields = set()
+
+    # Use definitive information about which fields were originally provided
+    # Any field that exists in both tables but was NOT in the original user data
+    # should be treated as auto-added by schema coercion
+    for field_name in old_fields & new_fields:  # Fields that exist in both
+        if field_name not in original_fields:
+            auto_added_null_fields.add(field_name)
+
+    # Combine missing fields with auto-added null fields
+    fields_to_fill = missing_fields | auto_added_null_fields
+
+    # Start with new_records and add missing fields from old_records
+    result_columns = {}
+
+    # Copy all existing columns from new_records
+    for column_name in new_records.column_names:
+        result_columns[column_name] = new_records[column_name]
+
+    # Fill in missing/auto-added null fields with values from old_records
+    for field_name in fields_to_fill:
+        # For missing fields, use the old values entirely
+        result_columns[field_name] = old_records[field_name]
+
+    # Create the enhanced new_records table with all fields filled
+    enhanced_new_records = pa.table(result_columns)
+
+    # Now we can return the enhanced table - it has all the fields with proper values
+    # Missing fields are filled with old values, explicitly null fields remain null
+    return enhanced_new_records
+
+
+def _validate_bucketing_spec_compliance(
+    table: pa.Table,
+    num_buckets: int,
+    hb_index: int,
+    primary_keys: List[str],
+    rci: Optional[RoundCompletionInfo] = None,
+    log_prefix=None,
+) -> None:
+    if rci is not None:
+        message_prefix = f"{log_prefix}{rci.compacted_delta_locator.namespace}.{rci.compacted_delta_locator.table_name}.{rci.compacted_delta_locator.table_version}.{rci.compacted_delta_locator.partition_id}.{rci.compacted_delta_locator.partition_values}"
+    else:
+        message_prefix = f"{log_prefix}"
+    pki_table = generate_pk_hash_column(
+        [table], primary_keys=primary_keys, requires_hash=True
+    )[0]
+    is_not_compliant: bool = False
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+        if hash_bucket != hb_index:
+            is_not_compliant = True
+            logger.info(
+                f"{message_prefix} has non-compliant bucketing spec at index: {index} "
+                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
+            )
+            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
+                raise AssertionError(
+                    f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
+                    f" to be {hb_index} but found {hash_bucket}"
+                )
+            # No further checks necessary
+            break
+    if not is_not_compliant:
+        logger.debug(
+            f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
+        )
+
+
 def _download_compacted_table(
     hb_index: int,
-
+    rci: RoundCompletionInfo,
+    primary_keys: List[str],
+    all_column_names: List[str],
+    compacted_delta_manifest: Optional[Manifest] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    deltacat_storage=
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> pa.Table:
     tables = []
-    hb_index_to_indices =
+    hb_index_to_indices = rci.hb_index_to_entry_range
 
     if str(hb_index) not in hb_index_to_indices:
         return None
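The rewritten upsert path above splits the compacted table with a hash-membership test before appending the incremental records. A self-contained sketch of that pattern with toy column names (not the package's schema):

```python
import pyarrow as pa
import pyarrow.compute as pc

compacted = pa.table({"pk_hash": ["h1", "h2", "h3"], "v": [1, 2, 3]})
incremental = pa.table({"pk_hash": ["h2"], "v": [20]})

# Compacted rows whose key reappears in the incremental table are "updates";
# everything else is kept as-is.
updated_mask = pc.is_in(compacted["pk_hash"], value_set=incremental["pk_hash"])
kept = compacted.filter(pc.invert(updated_mask))
merged = pa.concat_tables([kept, incremental])
print(merged.to_pylist())
# [{'pk_hash': 'h1', 'v': 1}, {'pk_hash': 'h3', 'v': 3}, {'pk_hash': 'h2', 'v': 20}]

# For key columns holding >= 2 GiB of string data, casting to large_string
# (as the diff does when nbytes >= MAX_INT_BYTES) avoids ArrowCapacityError
# when is_in combines the chunks into a single value set.
big_keys = pc.cast(compacted["pk_hash"], pa.large_string())
```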
@@ -183,30 +364,52 @@ def _download_compacted_table(
     ), "indices should not be none and contains exactly two elements"
     for offset in range(indices[1] - indices[0]):
         table = deltacat_storage.download_delta_manifest_entry(
-
+            Delta.of(
+                rci.compacted_delta_locator,
+                DeltaType.APPEND,
+                compacted_delta_manifest.meta,
+                None,
+                compacted_delta_manifest,
+            ),
             entry_index=(indices[0] + offset),
             file_reader_kwargs_provider=read_kwargs_provider,
+            all_column_names=all_column_names,
             **deltacat_storage_kwargs,
         )
 
         tables.append(table)
 
-
+    compacted_table = pa.concat_tables(tables)
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    logger.debug(
+        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
+        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
+    )
+
+    # Bucketing spec compliance isn't required without primary keys
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            compacted_table,
+            rci.hash_bucket_count,
+            hb_index,
+            primary_keys,
+            rci=rci,
+            log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
+        )
+    return compacted_table
 
 
 def _copy_all_manifest_files_from_old_hash_buckets(
     hb_index_copy_by_reference: List[int],
     round_completion_info: RoundCompletionInfo,
     write_to_partition: Partition,
-
-    deltacat_storage_kwargs: Optional[dict] = None,
+    compacted_manifest: Optional[Manifest] = None,
 ) -> List[MaterializeResult]:
 
-    compacted_delta_locator = round_completion_info.compacted_delta_locator
-    manifest = deltacat_storage.get_delta_manifest(
-        compacted_delta_locator, **deltacat_storage_kwargs
-    )
-
     manifest_entry_referenced_list = []
     materialize_result_list = []
     hb_index_to_indices = round_completion_info.hb_index_to_entry_range
@@ -223,27 +426,27 @@ def _copy_all_manifest_files_from_old_hash_buckets(
         for offset in range(indices[1] - indices[0]):
             entry_index = indices[0] + offset
             assert entry_index < len(
-
-            ), f"entry index: {entry_index} >= {len(
-            manifest_entry =
+                compacted_manifest.entries
+            ), f"entry index: {entry_index} >= {len(compacted_manifest.entries)}"
+            manifest_entry = compacted_manifest.entries[entry_index]
             manifest_entry_referenced_list.append(manifest_entry)
 
-
+        compacted_manifest = Manifest.of(
             entries=manifest_entry_referenced_list, uuid=str(uuid4())
         )
         delta = Delta.of(
             locator=DeltaLocator.of(write_to_partition.locator),
-            delta_type=DeltaType.
-            meta=
-            manifest=
+            delta_type=DeltaType.APPEND,  # Compaction always produces APPEND deltas
+            meta=compacted_manifest.meta,
+            manifest=compacted_manifest,
            previous_stream_position=write_to_partition.stream_position,
            properties={},
        )
        referenced_pyarrow_write_result = PyArrowWriteResult.of(
            len(manifest_entry_referenced_list),
-
-
-
+            compacted_manifest.meta.source_content_length,
+            compacted_manifest.meta.content_length,
+            compacted_manifest.meta.record_count,
        )
        materialize_result = MaterializeResult.of(
            delta=delta,
@@ -268,6 +471,7 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
     """
     return (
         input.round_completion_info
+        and input.compacted_manifest is not None
         and input.round_completion_info.hb_index_to_entry_range
         and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
         is not None
@@ -285,6 +489,7 @@ def _can_copy_by_reference(
         not has_delete
         and not merge_file_group.dfe_groups
         and input.round_completion_info is not None
+        and input.compacted_manifest is not None
     )
 
     if input.disable_copy_by_reference:
@@ -383,9 +588,9 @@ def _compact_tables(
         delete_file_envelopes + df_envelopes
     )
     assert all(
-        dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
+        dfe.delta_type in (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE)
         for dfe in reordered_all_dfes
-    ), "All reordered delta file envelopes must be of the UPSERT or DELETE"
+    ), "All reordered delta file envelopes must be of the APPEND, UPSERT or DELETE"
     table = compacted_table
     aggregated_incremental_len = 0
     aggregated_deduped_records = 0
@@ -393,13 +598,13 @@ def _compact_tables(
     for i, (delta_type, delta_type_sequence) in enumerate(
         _group_sequence_by_delta_type(reordered_all_dfes)
     ):
-        if delta_type is DeltaType.UPSERT:
-            (
-
-
-
-
-            )
+        if delta_type is DeltaType.UPSERT or delta_type is DeltaType.APPEND:
+            (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
+                input=input,
+                dfe_list=delta_type_sequence,
+                hb_idx=hb_idx,
+                prev_table=table,
+            )
             logger.info(
                 f" [Merge task index {input.merge_task_index}] Merged"
                 f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -434,8 +639,9 @@ def _apply_upserts(
     prev_table=None,
 ) -> Tuple[pa.Table, int, int, int]:
     assert all(
-        dfe.delta_type is DeltaType.UPSERT
-
+        dfe.delta_type is DeltaType.UPSERT or dfe.delta_type is DeltaType.APPEND
+        for dfe in dfe_list
+    ), "All incoming delta file envelopes must of the DeltaType.UPSERT or DeltaType.APPEND"
     logger.info(
         f"[Hash bucket index {hb_idx}] Reading dedupe input for "
         f"{len(dfe_list)} delta file envelope lists..."
@@ -459,6 +665,9 @@ def _apply_upserts(
         table=table,
         primary_keys=input.primary_keys,
         can_drop_duplicates=input.drop_duplicates,
+        hb_index=hb_idx,
+        num_buckets=input.hash_bucket_count,
+        original_fields=input.original_fields,
         compacted_table=prev_table,
     )
     deduped_records = hb_table_record_count - len(table)
@@ -476,8 +685,7 @@ def _copy_manifests_from_hash_bucketing(
             hb_index_copy_by_reference_ids,
             input.round_completion_info,
             input.write_to_partition,
-            input.
-            input.deltacat_storage_kwargs,
+            input.compacted_manifest,
         )
     )
     logger.info(
@@ -494,9 +702,11 @@
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with
-        f"merge_{worker_id}_{task_id}.bin"
-
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
@@ -515,11 +725,13 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         ):
             hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
             continue
-
         if _has_previous_compacted_table(input, merge_file_group.hb_index):
             compacted_table = _download_compacted_table(
                 hb_index=merge_file_group.hb_index,
-
+                rci=input.round_completion_info,
+                primary_keys=input.primary_keys,
+                all_column_names=input.all_column_names,
+                compacted_delta_manifest=input.compacted_manifest,
                 read_kwargs_provider=input.read_kwargs_provider,
                 deltacat_storage=input.deltacat_storage,
                 deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -604,5 +816,5 @@ def merge(input: MergeInput) -> MergeResult:
         merge_result[3],
         merge_result[4],
         np.double(emit_metrics_time),
-        merge_result[
+        merge_result[6],
     )
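`_timed_merge` above now wraps its body in a conditional context manager so profiling is a no-op unless requested. A minimal sketch of the pattern (the `run` function and `profile_path` argument are illustrative, not deltacat APIs):

```python
from contextlib import nullcontext

def run(enable_profiler: bool, profile_path: str = "merge_profile.bin"):
    try:
        import memray
        # Only attach the tracker when profiling was asked for; nullcontext()
        # is a do-nothing stand-in that still satisfies the `with` protocol.
        tracker = memray.Tracker(profile_path) if enable_profiler else nullcontext()
    except ImportError:
        tracker = nullcontext()
    with tracker:
        return sum(i * i for i in range(1_000))

run(enable_profiler=False)
```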
deltacat/compute/compactor_v2/utils/content_type_params.py

@@ -1,19 +1,21 @@
 import logging
 import ray
 import functools
+from typing import List
 from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     MAX_PARQUET_METADATA_SIZE,
 )
+from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
     Delta,
     ManifestEntry,
-
+    metastore,
 )
 from typing import Dict, Optional, Any
-from deltacat.types.media import
+from deltacat.types.media import DatasetType
 from deltacat.types.media import ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.exceptions import RetryableError
@@ -73,13 +75,26 @@ class AppendContentTypeParamsCache:
 def _download_parquet_metadata_for_manifest_entry(
     delta: Delta,
     entry_index: int,
-
+    all_column_names: List[str],
+    deltacat_storage: metastore,
     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> Dict[str, Any]:
+    logger.info(
+        f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
+    )
+    if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
+        logger.info(
+            "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
+        )
+        deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
+
     pq_file = deltacat_storage.download_delta_manifest_entry(
         delta,
         entry_index=entry_index,
-        table_type=
+        table_type=DatasetType.PYARROW_PARQUET,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
+        all_column_names=all_column_names,
         **deltacat_storage_kwargs,
     )
 
@@ -93,15 +108,20 @@ def _download_parquet_metadata_for_manifest_entry(
 
 def append_content_type_params(
     delta: Delta,
+    all_column_names: List[str],
     task_max_parallelism: int = TASK_MAX_PARALLELISM,
     max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
-    deltacat_storage=
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> bool:
     """
     This operation appends content type params into the delta entry. Note
     that this operation can be time consuming, hence we cache it in a Ray actor.
     """
+    logger.info(
+        f"Appending the content type params for Delta with locator {delta.locator}..."
+    )
 
     if not delta.meta:
         logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -157,17 +177,25 @@ def append_content_type_params(
         max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
     )
 
+    # create a copy of deltacat_storage_kwargs without transaction key
+    deltacat_storage_kwargs_copy = {
+        k: v for k, v in deltacat_storage_kwargs.items() if k != "transaction"
+    }
+
     def input_provider(index, item) -> Dict:
         return {
-            "
+            "file_reader_kwargs_provider": file_reader_kwargs_provider,
+            "deltacat_storage_kwargs": deltacat_storage_kwargs_copy,
             "deltacat_storage": deltacat_storage,
             "delta": delta,
             "entry_index": item,
+            "all_column_names": all_column_names,
         }
 
     logger.info(
         f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
     )
+
     pq_files_promise = invoke_parallel(
         entry_indices_to_download,
         ray_task=_download_parquet_metadata_for_manifest_entry,

deltacat/compute/compactor_v2/utils/primary_key_index.py

@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
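The `type=pa.int32()` pin added to `_create_chunked_index_array` above makes the chunked array's Arrow type explicit instead of leaving it to inference from the NumPy chunks. A small illustration (assumes a platform where NumPy's default integer is int64):

```python
import numpy as np
import pyarrow as pa

chunks = [np.array([0, 1, 2]), np.array([3, 4])]       # NumPy default integer dtype
print(pa.chunked_array(chunks).type)                   # int64 -- inferred from chunks
print(pa.chunked_array(chunks, type=pa.int32()).type)  # int32 -- pinned explicitly
```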
deltacat/compute/compactor_v2/utils/delta.py

@@ -9,7 +9,7 @@ from deltacat.storage import (
     Delta,
 )
 from deltacat.storage.model.delta import DeltaType
-from deltacat.storage import
+from deltacat.storage import metastore
 from deltacat.types.media import StorageType
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat import logs
@@ -30,8 +30,9 @@ def contains_delete_deltas(deltas: List[Delta]) -> bool:
 
 def read_delta_file_envelopes(
     annotated_delta: DeltaAnnotated,
+    all_column_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
     tables = deltacat_storage.download_delta(
@@ -39,6 +40,7 @@ def read_delta_file_envelopes(
         max_parallelism=1,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        all_column_names=all_column_names,
         **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
@@ -80,7 +82,7 @@ def read_delta_file_envelopes(
 def get_local_delta_file_envelopes(
     uniform_deltas: List[DeltaAnnotated],
     read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> Tuple[List[DeltaFileEnvelope], int]:
     local_dfe_list = []