deltacat 1.1.35__py3-none-any.whl → 1.1.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/compaction_session.py +6 -3
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +1 -0
- deltacat/compute/compactor_v2/steps/merge.py +53 -13
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +1 -1
- {deltacat-1.1.35.dist-info → deltacat-1.1.36.dist-info}/METADATA +1 -1
- {deltacat-1.1.35.dist-info → deltacat-1.1.36.dist-info}/RECORD +12 -12
- {deltacat-1.1.35.dist-info → deltacat-1.1.36.dist-info}/LICENSE +0 -0
- {deltacat-1.1.35.dist-info → deltacat-1.1.36.dist-info}/WHEEL +0 -0
- {deltacat-1.1.35.dist-info → deltacat-1.1.36.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -69,14 +69,17 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
|
|
69
69
|
assert (
|
70
70
|
params.hash_bucket_count is not None and params.hash_bucket_count >= 1
|
71
71
|
), "hash_bucket_count is a required arg for compactor v2"
|
72
|
+
assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
|
72
73
|
if params.num_rounds > 1:
|
73
74
|
assert (
|
74
75
|
not params.drop_duplicates
|
75
76
|
), "num_rounds > 1, drop_duplicates must be False but is True"
|
76
77
|
|
77
|
-
with
|
78
|
-
"compaction_partition.bin"
|
79
|
-
|
78
|
+
with (
|
79
|
+
memray.Tracker("compaction_partition.bin")
|
80
|
+
if params.enable_profiler
|
81
|
+
else nullcontext()
|
82
|
+
):
|
80
83
|
execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
|
81
84
|
params,
|
82
85
|
**kwargs,
|
@@ -48,6 +48,7 @@ class MergeInput(Dict):
|
|
48
48
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
49
49
|
memory_logs_enabled: Optional[bool] = None,
|
50
50
|
disable_copy_by_reference: Optional[bool] = None,
|
51
|
+
hash_bucket_count: Optional[int] = None,
|
51
52
|
) -> MergeInput:
|
52
53
|
|
53
54
|
result = MergeInput()
|
@@ -71,6 +72,7 @@ class MergeInput(Dict):
|
|
71
72
|
result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
|
72
73
|
result["memory_logs_enabled"] = memory_logs_enabled
|
73
74
|
result["disable_copy_by_reference"] = disable_copy_by_reference
|
75
|
+
result["hash_bucket_count"] = hash_bucket_count
|
74
76
|
return result
|
75
77
|
|
76
78
|
@property
|
@@ -154,3 +156,7 @@ class MergeInput(Dict):
|
|
154
156
|
@property
|
155
157
|
def disable_copy_by_reference(self) -> bool:
|
156
158
|
return self["disable_copy_by_reference"]
|
159
|
+
|
160
|
+
@property
|
161
|
+
def hash_bucket_count(self) -> int:
|
162
|
+
return self["hash_bucket_count"]
|
@@ -62,6 +62,10 @@ if importlib.util.find_spec("memray"):
|
|
62
62
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
63
63
|
|
64
64
|
|
65
|
+
_EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
|
66
|
+
_INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
|
67
|
+
|
68
|
+
|
65
69
|
def _append_delta_type_column(table: pa.Table, value: np.bool_):
|
66
70
|
return table.append_column(
|
67
71
|
sc._DELTA_TYPE_COLUMN_FIELD,
|
@@ -112,6 +116,8 @@ def _merge_tables(
|
|
112
116
|
table: pa.Table,
|
113
117
|
primary_keys: List[str],
|
114
118
|
can_drop_duplicates: bool,
|
119
|
+
hb_index: int,
|
120
|
+
num_buckets: int,
|
115
121
|
compacted_table: Optional[pa.Table] = None,
|
116
122
|
) -> pa.Table:
|
117
123
|
"""
|
@@ -130,6 +136,20 @@ def _merge_tables(
|
|
130
136
|
|
131
137
|
all_tables.append(table)
|
132
138
|
|
139
|
+
check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
|
140
|
+
BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
|
141
|
+
BUCKETING_SPEC_COMPLIANCE_ASSERT,
|
142
|
+
]
|
143
|
+
|
144
|
+
if primary_keys and check_bucketing_spec:
|
145
|
+
_validate_bucketing_spec_compliance(
|
146
|
+
table=all_tables[incremental_idx],
|
147
|
+
num_buckets=num_buckets,
|
148
|
+
primary_keys=primary_keys,
|
149
|
+
hb_index=hb_index,
|
150
|
+
log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
|
151
|
+
)
|
152
|
+
|
133
153
|
if not primary_keys or not can_drop_duplicates:
|
134
154
|
logger.info(
|
135
155
|
f"Not dropping duplicates for primary keys={primary_keys} "
|
@@ -193,27 +213,40 @@ def _merge_tables(
|
|
193
213
|
|
194
214
|
|
195
215
|
def _validate_bucketing_spec_compliance(
|
196
|
-
table: pa.Table,
|
216
|
+
table: pa.Table,
|
217
|
+
num_buckets: int,
|
218
|
+
hb_index: int,
|
219
|
+
primary_keys: List[str],
|
220
|
+
rcf: RoundCompletionInfo = None,
|
221
|
+
log_prefix=None,
|
197
222
|
) -> None:
|
223
|
+
if rcf is not None:
|
224
|
+
message_prefix = f"{log_prefix}{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}.{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}.{rcf.compacted_delta_locator.partition_values}"
|
225
|
+
else:
|
226
|
+
message_prefix = f"{log_prefix}"
|
198
227
|
pki_table = generate_pk_hash_column(
|
199
228
|
[table], primary_keys=primary_keys, requires_hash=True
|
200
229
|
)[0]
|
230
|
+
is_not_compliant: bool = False
|
201
231
|
for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
|
202
|
-
hash_bucket = pk_digest_to_hash_bucket_index(hash_value,
|
232
|
+
hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
|
203
233
|
if hash_bucket != hb_index:
|
234
|
+
is_not_compliant = True
|
204
235
|
logger.info(
|
205
|
-
f"{
|
206
|
-
f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
|
207
|
-
f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
|
236
|
+
f"{message_prefix} has non-compliant bucketing spec at index: {index} "
|
208
237
|
f"Expected hash bucket is {hb_index} but found {hash_bucket}."
|
209
238
|
)
|
210
239
|
if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
|
211
240
|
raise AssertionError(
|
212
|
-
"Hash bucket drift detected. Expected hash bucket index"
|
241
|
+
f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
|
213
242
|
f" to be {hb_index} but found {hash_bucket}"
|
214
243
|
)
|
215
244
|
# No further checks necessary
|
216
245
|
break
|
246
|
+
if not is_not_compliant:
|
247
|
+
logger.debug(
|
248
|
+
f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
|
249
|
+
)
|
217
250
|
|
218
251
|
|
219
252
|
def _download_compacted_table(
|
@@ -257,7 +290,12 @@ def _download_compacted_table(
|
|
257
290
|
# Bucketing spec compliance isn't required without primary keys
|
258
291
|
if primary_keys and check_bucketing_spec:
|
259
292
|
_validate_bucketing_spec_compliance(
|
260
|
-
compacted_table,
|
293
|
+
compacted_table,
|
294
|
+
rcf.hash_bucket_count,
|
295
|
+
hb_index,
|
296
|
+
primary_keys,
|
297
|
+
rcf=rcf,
|
298
|
+
log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
|
261
299
|
)
|
262
300
|
return compacted_table
|
263
301
|
|
@@ -462,12 +500,12 @@ def _compact_tables(
|
|
462
500
|
_group_sequence_by_delta_type(reordered_all_dfes)
|
463
501
|
):
|
464
502
|
if delta_type is DeltaType.UPSERT:
|
465
|
-
(
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
)
|
503
|
+
(table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
|
504
|
+
input=input,
|
505
|
+
dfe_list=delta_type_sequence,
|
506
|
+
hb_idx=hb_idx,
|
507
|
+
prev_table=table,
|
508
|
+
)
|
471
509
|
logger.info(
|
472
510
|
f" [Merge task index {input.merge_task_index}] Merged"
|
473
511
|
f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
|
@@ -526,6 +564,8 @@ def _apply_upserts(
|
|
526
564
|
primary_keys=input.primary_keys,
|
527
565
|
can_drop_duplicates=input.drop_duplicates,
|
528
566
|
compacted_table=prev_table,
|
567
|
+
hb_index=hb_idx,
|
568
|
+
num_buckets=input.hash_bucket_count,
|
529
569
|
)
|
530
570
|
deduped_records = hb_table_record_count - len(table)
|
531
571
|
return table, incremental_len, deduped_records, merge_time
|
@@ -804,7 +804,7 @@ class TestCompactionSession:
|
|
804
804
|
)
|
805
805
|
|
806
806
|
assert (
|
807
|
-
"Hash bucket drift detected. Expected hash bucket index to be 1 but found 0"
|
807
|
+
"Hash bucket drift detected at index: 0. Expected hash bucket index to be 1 but found 0"
|
808
808
|
in str(excinfo.value)
|
809
809
|
)
|
810
810
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=9vJMHGceWew6atD_3VqKurlBJ3crD5mwAQIgSB1yjNY,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -50,7 +50,7 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=fFevhUuveCvrU3g
|
|
50
50
|
deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
|
51
51
|
deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
|
52
52
|
deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
|
-
deltacat/compute/compactor_v2/compaction_session.py,sha256=
|
53
|
+
deltacat/compute/compactor_v2/compaction_session.py,sha256=RbO_du0qX7nlyXO-ZSksX8RqWuRwfdvWddpTJjLDVNk,8185
|
54
54
|
deltacat/compute/compactor_v2/constants.py,sha256=F5Phrh-2JgnWvtjHXacxOG5Z2ivKcHnboerI12rc1zk,3632
|
55
55
|
deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
|
@@ -63,19 +63,19 @@ deltacat/compute/compactor_v2/model/evaluate_compaction_result.py,sha256=XAaEEAd
|
|
63
63
|
deltacat/compute/compactor_v2/model/hash_bucket_input.py,sha256=iJy8kLi1dIpFIyfoAjkaAtZvg8Np1z7BsUNGAcWfFm4,3042
|
64
64
|
deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcLKn3kGWzAX4s4BTR2vYyPUB-wAEOc,309
|
65
65
|
deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViVO1SVljCj6f0B3MfB3hqtGm2S0s,7410
|
66
|
-
deltacat/compute/compactor_v2/model/merge_input.py,sha256
|
66
|
+
deltacat/compute/compactor_v2/model/merge_input.py,sha256=D-6WuHK4X7m9-P6Hskz6RRemeWrNf6IPdhc14O3KDAg,5860
|
67
67
|
deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
|
68
68
|
deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
|
-
deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=
|
69
|
+
deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=nz2N3YZVE9bNwOqRXoQYkArJhyUJRis2s9BweZ3tad8,30989
|
70
70
|
deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
71
|
deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
|
72
|
-
deltacat/compute/compactor_v2/steps/merge.py,sha256=
|
72
|
+
deltacat/compute/compactor_v2/steps/merge.py,sha256=4rKQ__SeWO_QLZl2btcFrYHCMOn-8R3kja74UrWOMgg,26225
|
73
73
|
deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
74
74
|
deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=t2j9H9IdFRH9EfpL-9g5XvZs9WK9HybqBGA7fDi82EM,8310
|
75
75
|
deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
|
76
76
|
deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
|
77
77
|
deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
|
78
|
-
deltacat/compute/compactor_v2/utils/merge.py,sha256=
|
78
|
+
deltacat/compute/compactor_v2/utils/merge.py,sha256=fAzEYwQYH2ia8MLdEFdZFivWHpi6qZu8AyyEK0H0vwE,5363
|
79
79
|
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
|
80
80
|
deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
|
81
81
|
deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
|
@@ -152,7 +152,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
|
|
152
152
|
deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
|
153
153
|
deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
|
154
154
|
deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
155
|
-
deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
|
155
|
+
deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=F1DFaranHekHB7HSNH-0_hV5ovdR5HfF9JqTVDw6Vh8,42575
|
156
156
|
deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
|
157
157
|
deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
158
158
|
deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoiDuBUhgCmc3DYKCXL1g4QWtmROhZ0RJCQgePMY9as,9959
|
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
212
212
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
213
213
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
214
214
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
215
|
-
deltacat-1.1.
|
216
|
-
deltacat-1.1.
|
217
|
-
deltacat-1.1.
|
218
|
-
deltacat-1.1.
|
219
|
-
deltacat-1.1.
|
215
|
+
deltacat-1.1.36.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
216
|
+
deltacat-1.1.36.dist-info/METADATA,sha256=wIZbEGHnJWq_TBKi0u463p4-PgG9R_0MApw7IIwmnRc,1733
|
217
|
+
deltacat-1.1.36.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
218
|
+
deltacat-1.1.36.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
219
|
+
deltacat-1.1.36.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|