deltacat 1.1.15__py3-none-any.whl → 1.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/compaction_session.py +1 -6
- deltacat/compute/compactor_v2/private/compaction_utils.py +10 -2
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +146 -0
- {deltacat-1.1.15.dist-info → deltacat-1.1.16.dist-info}/METADATA +1 -1
- {deltacat-1.1.15.dist-info → deltacat-1.1.16.dist-info}/RECORD +9 -9
- {deltacat-1.1.15.dist-info → deltacat-1.1.16.dist-info}/LICENSE +0 -0
- {deltacat-1.1.15.dist-info → deltacat-1.1.16.dist-info}/WHEEL +0 -0
- {deltacat-1.1.15.dist-info → deltacat-1.1.16.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -43,7 +43,6 @@ from deltacat.compute.compactor_v2.private.compaction_utils import (
|
|
43
43
|
_stage_new_partition,
|
44
44
|
_run_hash_and_merge,
|
45
45
|
_process_merge_results,
|
46
|
-
_upload_compaction_audit,
|
47
46
|
_write_new_round_completion_file,
|
48
47
|
_commit_compaction_result,
|
49
48
|
)
|
@@ -201,11 +200,6 @@ def _execute_compaction(
|
|
201
200
|
|
202
201
|
compaction_audit.save_round_completion_stats(mat_results)
|
203
202
|
|
204
|
-
_upload_compaction_audit(
|
205
|
-
params,
|
206
|
-
compaction_audit,
|
207
|
-
round_completion_info,
|
208
|
-
)
|
209
203
|
compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
|
210
204
|
params,
|
211
205
|
compaction_audit,
|
@@ -215,5 +209,6 @@ def _execute_compaction(
|
|
215
209
|
rcf_source_partition_locator,
|
216
210
|
new_compacted_delta_locator,
|
217
211
|
pyarrow_write_result,
|
212
|
+
round_completion_info,
|
218
213
|
)
|
219
214
|
return compaction_result
|
@@ -365,6 +365,7 @@ def _run_hash_and_merge(
|
|
365
365
|
if mutable_compaction_audit.telemetry_time_in_seconds
|
366
366
|
else 0.0
|
367
367
|
)
|
368
|
+
|
368
369
|
mutable_compaction_audit.set_telemetry_time_in_seconds(
|
369
370
|
telemetry_this_round + previous_telemetry
|
370
371
|
)
|
@@ -598,10 +599,10 @@ def _process_merge_results(
|
|
598
599
|
return merged_delta, mat_results, hb_id_to_entry_indices_range
|
599
600
|
|
600
601
|
|
601
|
-
def _upload_compaction_audit(
|
602
|
+
def _update_and_upload_compaction_audit(
|
602
603
|
params: CompactPartitionParams,
|
603
604
|
mutable_compaction_audit: CompactionSessionAuditInfo,
|
604
|
-
round_completion_info: RoundCompletionInfo,
|
605
|
+
round_completion_info: Optional[RoundCompletionInfo] = None,
|
605
606
|
) -> None:
|
606
607
|
|
607
608
|
# After all incremental delta related calculations, we update
|
@@ -637,6 +638,7 @@ def _write_new_round_completion_file(
|
|
637
638
|
rcf_source_partition_locator: rcf.PartitionLocator,
|
638
639
|
new_compacted_delta_locator: DeltaLocator,
|
639
640
|
pyarrow_write_result: PyArrowWriteResult,
|
641
|
+
prev_round_completion_info: Optional[RoundCompletionInfo] = None,
|
640
642
|
) -> ExecutionCompactionResult:
|
641
643
|
input_inflation = None
|
642
644
|
input_average_record_size_bytes = None
|
@@ -664,6 +666,12 @@ def _write_new_round_completion_file(
|
|
664
666
|
f" and average record size={input_average_record_size_bytes}"
|
665
667
|
)
|
666
668
|
|
669
|
+
_update_and_upload_compaction_audit(
|
670
|
+
params,
|
671
|
+
mutable_compaction_audit,
|
672
|
+
prev_round_completion_info,
|
673
|
+
)
|
674
|
+
|
667
675
|
new_round_completion_info = RoundCompletionInfo.of(
|
668
676
|
high_watermark=params.last_stream_position_to_compact,
|
669
677
|
compacted_delta_locator=new_compacted_delta_locator,
|
@@ -87,6 +87,7 @@ class TestCompactionSession:
|
|
87
87
|
INCREMENTAL_FILE_PATH = (
|
88
88
|
"deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
|
89
89
|
)
|
90
|
+
ERROR_RATE = 0.05
|
90
91
|
|
91
92
|
def test_compact_partition_when_no_input_deltas_to_compact(
|
92
93
|
self, local_deltacat_storage_kwargs
|
@@ -253,3 +254,148 @@ class TestCompactionSession:
|
|
253
254
|
# as it should be running incremental
|
254
255
|
assert compaction_audit.uniform_deltas_created == 1
|
255
256
|
assert compaction_audit.input_records == 6
|
257
|
+
|
258
|
+
def test_compact_partition_when_incremental_then_rcf_stats_accurate(
|
259
|
+
self, s3_resource, local_deltacat_storage_kwargs
|
260
|
+
):
|
261
|
+
"""
|
262
|
+
A test case which asserts the RCF stats are correctly generated for
|
263
|
+
a rebase and incremental use-case.
|
264
|
+
"""
|
265
|
+
|
266
|
+
# setup
|
267
|
+
staged_source = stage_partition_from_file_paths(
|
268
|
+
self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
|
269
|
+
)
|
270
|
+
|
271
|
+
source_delta = commit_delta_to_staged_partition(
|
272
|
+
staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
|
273
|
+
)
|
274
|
+
|
275
|
+
staged_dest = stage_partition_from_file_paths(
|
276
|
+
self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
|
277
|
+
)
|
278
|
+
dest_partition = ds.commit_partition(
|
279
|
+
staged_dest, **local_deltacat_storage_kwargs
|
280
|
+
)
|
281
|
+
|
282
|
+
# action
|
283
|
+
rcf_url = compact_partition(
|
284
|
+
CompactPartitionParams.of(
|
285
|
+
{
|
286
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
287
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
288
|
+
"dd_max_parallelism_ratio": 1.0,
|
289
|
+
"deltacat_storage": ds,
|
290
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
291
|
+
"destination_partition_locator": dest_partition.locator,
|
292
|
+
"drop_duplicates": True,
|
293
|
+
"hash_bucket_count": 2,
|
294
|
+
"last_stream_position_to_compact": source_delta.stream_position,
|
295
|
+
"list_deltas_kwargs": {
|
296
|
+
**local_deltacat_storage_kwargs,
|
297
|
+
**{"equivalent_table_types": []},
|
298
|
+
},
|
299
|
+
"primary_keys": ["pk"],
|
300
|
+
"rebase_source_partition_locator": source_delta.partition_locator,
|
301
|
+
"rebase_source_partition_high_watermark": source_delta.stream_position,
|
302
|
+
"records_per_compacted_file": 4000,
|
303
|
+
"s3_client_kwargs": {},
|
304
|
+
"source_partition_locator": source_delta.partition_locator,
|
305
|
+
}
|
306
|
+
)
|
307
|
+
)
|
308
|
+
|
309
|
+
backfill_rcf = get_rcf(s3_resource, rcf_url)
|
310
|
+
_, compaction_audit_key = backfill_rcf.compaction_audit_url.strip(
|
311
|
+
"s3://"
|
312
|
+
).split("/", 1)
|
313
|
+
compaction_audit = CompactionSessionAuditInfo(
|
314
|
+
**read_s3_contents(
|
315
|
+
s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
|
316
|
+
)
|
317
|
+
)
|
318
|
+
|
319
|
+
assert abs(backfill_rcf.input_inflation - 0.05235042735042735) <= 1e-5
|
320
|
+
assert abs(backfill_rcf.input_average_record_size_bytes - 12.25) <= 1e-5
|
321
|
+
|
322
|
+
assert compaction_audit.input_records == 4
|
323
|
+
assert compaction_audit.records_deduped == 0
|
324
|
+
assert compaction_audit.records_deleted == 0
|
325
|
+
assert compaction_audit.untouched_file_count == 0
|
326
|
+
assert compaction_audit.untouched_record_count == 0
|
327
|
+
assert compaction_audit.untouched_size_bytes == 0
|
328
|
+
assert compaction_audit.untouched_file_ratio == 0
|
329
|
+
assert compaction_audit.uniform_deltas_created == 1
|
330
|
+
assert compaction_audit.hash_bucket_count == 2
|
331
|
+
assert compaction_audit.input_file_count == 1
|
332
|
+
assert compaction_audit.output_file_count == 2
|
333
|
+
assert abs(compaction_audit.output_size_bytes - 1832) / 1832 <= self.ERROR_RATE
|
334
|
+
assert abs(compaction_audit.input_size_bytes - 936) / 936 <= self.ERROR_RATE
|
335
|
+
|
336
|
+
# Now run an incremental compaction and verify if the previous RCF was read properly.
|
337
|
+
new_source_delta = commit_delta_to_partition(
|
338
|
+
source_delta.partition_locator,
|
339
|
+
[self.INCREMENTAL_FILE_PATH],
|
340
|
+
**local_deltacat_storage_kwargs,
|
341
|
+
)
|
342
|
+
|
343
|
+
new_destination_partition = ds.get_partition(
|
344
|
+
dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
|
345
|
+
)
|
346
|
+
|
347
|
+
new_rcf_url = compact_partition(
|
348
|
+
CompactPartitionParams.of(
|
349
|
+
{
|
350
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
351
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
352
|
+
"dd_max_parallelism_ratio": 1.0,
|
353
|
+
"deltacat_storage": ds,
|
354
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
355
|
+
"destination_partition_locator": new_destination_partition.locator,
|
356
|
+
"drop_duplicates": True,
|
357
|
+
"hash_bucket_count": 2,
|
358
|
+
"last_stream_position_to_compact": new_source_delta.stream_position,
|
359
|
+
"list_deltas_kwargs": {
|
360
|
+
**local_deltacat_storage_kwargs,
|
361
|
+
**{"equivalent_table_types": []},
|
362
|
+
},
|
363
|
+
"primary_keys": ["pk"],
|
364
|
+
"rebase_source_partition_locator": None,
|
365
|
+
"rebase_source_partition_high_watermark": None,
|
366
|
+
"records_per_compacted_file": 4000,
|
367
|
+
"s3_client_kwargs": {},
|
368
|
+
"source_partition_locator": new_source_delta.partition_locator,
|
369
|
+
}
|
370
|
+
)
|
371
|
+
)
|
372
|
+
|
373
|
+
new_rcf = get_rcf(s3_resource, new_rcf_url)
|
374
|
+
_, compaction_audit_key = new_rcf.compaction_audit_url.strip("s3://").split(
|
375
|
+
"/", 1
|
376
|
+
)
|
377
|
+
compaction_audit = CompactionSessionAuditInfo(
|
378
|
+
**read_s3_contents(
|
379
|
+
s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
|
380
|
+
)
|
381
|
+
)
|
382
|
+
|
383
|
+
# as it should be running incremental
|
384
|
+
assert abs(new_rcf.input_inflation - 0.027292576419213975) <= 1e-5
|
385
|
+
assert abs(new_rcf.input_average_record_size_bytes - 12.5) <= 1e-5
|
386
|
+
|
387
|
+
assert compaction_audit.input_records == 6
|
388
|
+
assert compaction_audit.records_deduped == 1
|
389
|
+
assert compaction_audit.records_deleted == 0
|
390
|
+
assert compaction_audit.untouched_file_count == 1
|
391
|
+
assert compaction_audit.untouched_record_count == 2
|
392
|
+
assert (
|
393
|
+
abs(compaction_audit.untouched_size_bytes - 916) / 916 <= self.ERROR_RATE
|
394
|
+
) # 5% error
|
395
|
+
assert abs(compaction_audit.untouched_file_ratio - 50) <= 1e-5
|
396
|
+
assert compaction_audit.uniform_deltas_created == 1
|
397
|
+
assert compaction_audit.hash_bucket_count == 2
|
398
|
+
assert compaction_audit.input_file_count == 3
|
399
|
+
assert compaction_audit.output_file_count == 2
|
400
|
+
assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
|
401
|
+
assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=o3hcQ85nWUeDYkaXi3ADAbqwM_c5ajyxzlx-Z2jdKbI,1778
|
2
2
|
deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
|
3
3
|
deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
|
4
4
|
deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
|
@@ -50,7 +50,7 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=_rl8lBSO9KFW07Z
|
|
50
50
|
deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
|
51
51
|
deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
|
52
52
|
deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
|
-
deltacat/compute/compactor_v2/compaction_session.py,sha256=
|
53
|
+
deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
|
54
54
|
deltacat/compute/compactor_v2/constants.py,sha256=4HkSebuRWlAzOnZ-_nYmMsf6d3koTwfrlBx9KxuoGe4,2417
|
55
55
|
deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
|
@@ -66,7 +66,7 @@ deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViV
|
|
66
66
|
deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
|
67
67
|
deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
|
68
68
|
deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
|
-
deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=
|
69
|
+
deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=HIr3ikF4iu_ztdy3FbtOw8vUKjc_RBP93ogH8EzMV64,30294
|
70
70
|
deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
71
|
deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
|
72
72
|
deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
|
@@ -164,7 +164,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
|
|
164
164
|
deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
|
165
165
|
deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
|
166
166
|
deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
167
|
-
deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
|
167
|
+
deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=Ln3NxUy_2oC8cTfFSNy28lIRK8iNEabtxaqWzIqzyEY,16260
|
168
168
|
deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
|
169
169
|
deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
170
170
|
deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=4fc5MJTLm3hFlFHK_-5MfyfzeZtOo8D2kBqDE2b8lh4,862
|
@@ -220,8 +220,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
220
220
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
221
221
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
222
222
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
223
|
-
deltacat-1.1.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
224
|
-
deltacat-1.1.
|
225
|
-
deltacat-1.1.15.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
226
|
-
deltacat-1.1.15.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
227
|
-
deltacat-1.1.15.dist-info/RECORD,,
|
223
|
+
deltacat-1.1.16.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
224
|
+
deltacat-1.1.16.dist-info/METADATA,sha256=EvOmjI60akKZ6BKEWQ2_KZJxr-Bp6wMZOU6-zV1EDos,1734
|
225
|
+
deltacat-1.1.16.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
226
|
+
deltacat-1.1.16.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
227
|
+
deltacat-1.1.16.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|