deltacat 1.1.15__py3-none-any.whl → 1.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.15"
47
+ __version__ = "1.1.16"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -43,7 +43,6 @@ from deltacat.compute.compactor_v2.private.compaction_utils import (
43
43
  _stage_new_partition,
44
44
  _run_hash_and_merge,
45
45
  _process_merge_results,
46
- _upload_compaction_audit,
47
46
  _write_new_round_completion_file,
48
47
  _commit_compaction_result,
49
48
  )
@@ -201,11 +200,6 @@ def _execute_compaction(
201
200
 
202
201
  compaction_audit.save_round_completion_stats(mat_results)
203
202
 
204
- _upload_compaction_audit(
205
- params,
206
- compaction_audit,
207
- round_completion_info,
208
- )
209
203
  compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
210
204
  params,
211
205
  compaction_audit,
@@ -215,5 +209,6 @@ def _execute_compaction(
215
209
  rcf_source_partition_locator,
216
210
  new_compacted_delta_locator,
217
211
  pyarrow_write_result,
212
+ round_completion_info,
218
213
  )
219
214
  return compaction_result
@@ -365,6 +365,7 @@ def _run_hash_and_merge(
365
365
  if mutable_compaction_audit.telemetry_time_in_seconds
366
366
  else 0.0
367
367
  )
368
+
368
369
  mutable_compaction_audit.set_telemetry_time_in_seconds(
369
370
  telemetry_this_round + previous_telemetry
370
371
  )
@@ -598,10 +599,10 @@ def _process_merge_results(
598
599
  return merged_delta, mat_results, hb_id_to_entry_indices_range
599
600
 
600
601
 
601
- def _upload_compaction_audit(
602
+ def _update_and_upload_compaction_audit(
602
603
  params: CompactPartitionParams,
603
604
  mutable_compaction_audit: CompactionSessionAuditInfo,
604
- round_completion_info: RoundCompletionInfo,
605
+ round_completion_info: Optional[RoundCompletionInfo] = None,
605
606
  ) -> None:
606
607
 
607
608
  # After all incremental delta related calculations, we update
@@ -637,6 +638,7 @@ def _write_new_round_completion_file(
637
638
  rcf_source_partition_locator: rcf.PartitionLocator,
638
639
  new_compacted_delta_locator: DeltaLocator,
639
640
  pyarrow_write_result: PyArrowWriteResult,
641
+ prev_round_completion_info: Optional[RoundCompletionInfo] = None,
640
642
  ) -> ExecutionCompactionResult:
641
643
  input_inflation = None
642
644
  input_average_record_size_bytes = None
@@ -664,6 +666,12 @@ def _write_new_round_completion_file(
664
666
  f" and average record size={input_average_record_size_bytes}"
665
667
  )
666
668
 
669
+ _update_and_upload_compaction_audit(
670
+ params,
671
+ mutable_compaction_audit,
672
+ prev_round_completion_info,
673
+ )
674
+
667
675
  new_round_completion_info = RoundCompletionInfo.of(
668
676
  high_watermark=params.last_stream_position_to_compact,
669
677
  compacted_delta_locator=new_compacted_delta_locator,
@@ -87,6 +87,7 @@ class TestCompactionSession:
87
87
  INCREMENTAL_FILE_PATH = (
88
88
  "deltacat/tests/compute/compactor_v2/data/incremental_source_date_pk.csv"
89
89
  )
90
+ ERROR_RATE = 0.05
90
91
 
91
92
  def test_compact_partition_when_no_input_deltas_to_compact(
92
93
  self, local_deltacat_storage_kwargs
@@ -253,3 +254,148 @@ class TestCompactionSession:
253
254
  # as it should be running incremental
254
255
  assert compaction_audit.uniform_deltas_created == 1
255
256
  assert compaction_audit.input_records == 6
257
+
258
+ def test_compact_partition_when_incremental_then_rcf_stats_accurate(
259
+ self, s3_resource, local_deltacat_storage_kwargs
260
+ ):
261
+ """
262
+ A test case which asserts the RCF stats are correctly generated for
263
+ a rebase and incremental use-case.
264
+ """
265
+
266
+ # setup
267
+ staged_source = stage_partition_from_file_paths(
268
+ self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
269
+ )
270
+
271
+ source_delta = commit_delta_to_staged_partition(
272
+ staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
273
+ )
274
+
275
+ staged_dest = stage_partition_from_file_paths(
276
+ self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
277
+ )
278
+ dest_partition = ds.commit_partition(
279
+ staged_dest, **local_deltacat_storage_kwargs
280
+ )
281
+
282
+ # action
283
+ rcf_url = compact_partition(
284
+ CompactPartitionParams.of(
285
+ {
286
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
287
+ "compacted_file_content_type": ContentType.PARQUET,
288
+ "dd_max_parallelism_ratio": 1.0,
289
+ "deltacat_storage": ds,
290
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
291
+ "destination_partition_locator": dest_partition.locator,
292
+ "drop_duplicates": True,
293
+ "hash_bucket_count": 2,
294
+ "last_stream_position_to_compact": source_delta.stream_position,
295
+ "list_deltas_kwargs": {
296
+ **local_deltacat_storage_kwargs,
297
+ **{"equivalent_table_types": []},
298
+ },
299
+ "primary_keys": ["pk"],
300
+ "rebase_source_partition_locator": source_delta.partition_locator,
301
+ "rebase_source_partition_high_watermark": source_delta.stream_position,
302
+ "records_per_compacted_file": 4000,
303
+ "s3_client_kwargs": {},
304
+ "source_partition_locator": source_delta.partition_locator,
305
+ }
306
+ )
307
+ )
308
+
309
+ backfill_rcf = get_rcf(s3_resource, rcf_url)
310
+ _, compaction_audit_key = backfill_rcf.compaction_audit_url.strip(
311
+ "s3://"
312
+ ).split("/", 1)
313
+ compaction_audit = CompactionSessionAuditInfo(
314
+ **read_s3_contents(
315
+ s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
316
+ )
317
+ )
318
+
319
+ assert abs(backfill_rcf.input_inflation - 0.05235042735042735) <= 1e-5
320
+ assert abs(backfill_rcf.input_average_record_size_bytes - 12.25) <= 1e-5
321
+
322
+ assert compaction_audit.input_records == 4
323
+ assert compaction_audit.records_deduped == 0
324
+ assert compaction_audit.records_deleted == 0
325
+ assert compaction_audit.untouched_file_count == 0
326
+ assert compaction_audit.untouched_record_count == 0
327
+ assert compaction_audit.untouched_size_bytes == 0
328
+ assert compaction_audit.untouched_file_ratio == 0
329
+ assert compaction_audit.uniform_deltas_created == 1
330
+ assert compaction_audit.hash_bucket_count == 2
331
+ assert compaction_audit.input_file_count == 1
332
+ assert compaction_audit.output_file_count == 2
333
+ assert abs(compaction_audit.output_size_bytes - 1832) / 1832 <= self.ERROR_RATE
334
+ assert abs(compaction_audit.input_size_bytes - 936) / 936 <= self.ERROR_RATE
335
+
336
+ # Now run an incremental compaction and verify if the previous RCF was read properly.
337
+ new_source_delta = commit_delta_to_partition(
338
+ source_delta.partition_locator,
339
+ [self.INCREMENTAL_FILE_PATH],
340
+ **local_deltacat_storage_kwargs,
341
+ )
342
+
343
+ new_destination_partition = ds.get_partition(
344
+ dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
345
+ )
346
+
347
+ new_rcf_url = compact_partition(
348
+ CompactPartitionParams.of(
349
+ {
350
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
351
+ "compacted_file_content_type": ContentType.PARQUET,
352
+ "dd_max_parallelism_ratio": 1.0,
353
+ "deltacat_storage": ds,
354
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
355
+ "destination_partition_locator": new_destination_partition.locator,
356
+ "drop_duplicates": True,
357
+ "hash_bucket_count": 2,
358
+ "last_stream_position_to_compact": new_source_delta.stream_position,
359
+ "list_deltas_kwargs": {
360
+ **local_deltacat_storage_kwargs,
361
+ **{"equivalent_table_types": []},
362
+ },
363
+ "primary_keys": ["pk"],
364
+ "rebase_source_partition_locator": None,
365
+ "rebase_source_partition_high_watermark": None,
366
+ "records_per_compacted_file": 4000,
367
+ "s3_client_kwargs": {},
368
+ "source_partition_locator": new_source_delta.partition_locator,
369
+ }
370
+ )
371
+ )
372
+
373
+ new_rcf = get_rcf(s3_resource, new_rcf_url)
374
+ _, compaction_audit_key = new_rcf.compaction_audit_url.strip("s3://").split(
375
+ "/", 1
376
+ )
377
+ compaction_audit = CompactionSessionAuditInfo(
378
+ **read_s3_contents(
379
+ s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
380
+ )
381
+ )
382
+
383
+ # as it should be running incremental
384
+ assert abs(new_rcf.input_inflation - 0.027292576419213975) <= 1e-5
385
+ assert abs(new_rcf.input_average_record_size_bytes - 12.5) <= 1e-5
386
+
387
+ assert compaction_audit.input_records == 6
388
+ assert compaction_audit.records_deduped == 1
389
+ assert compaction_audit.records_deleted == 0
390
+ assert compaction_audit.untouched_file_count == 1
391
+ assert compaction_audit.untouched_record_count == 2
392
+ assert (
393
+ abs(compaction_audit.untouched_size_bytes - 916) / 916 <= self.ERROR_RATE
394
+ ) # 5% error
395
+ assert abs(compaction_audit.untouched_file_ratio - 50) <= 1e-5
396
+ assert compaction_audit.uniform_deltas_created == 1
397
+ assert compaction_audit.hash_bucket_count == 2
398
+ assert compaction_audit.input_file_count == 3
399
+ assert compaction_audit.output_file_count == 2
400
+ assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
401
+ assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.15
3
+ Version: 1.1.16
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=B3feGTJ_HURh5NXPYw8eS-uft2b2FuwzVsZjfVXCa5c,1778
1
+ deltacat/__init__.py,sha256=o3hcQ85nWUeDYkaXi3ADAbqwM_c5ajyxzlx-Z2jdKbI,1778
2
2
  deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
3
3
  deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
4
4
  deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
@@ -50,7 +50,7 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=_rl8lBSO9KFW07Z
50
50
  deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
51
51
  deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
52
52
  deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- deltacat/compute/compactor_v2/compaction_session.py,sha256=mRy7Rh2hIejKmfmVUXTbAD8ArUDRudDAY5YxlKV8dBM,8186
53
+ deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
54
54
  deltacat/compute/compactor_v2/constants.py,sha256=4HkSebuRWlAzOnZ-_nYmMsf6d3koTwfrlBx9KxuoGe4,2417
55
55
  deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
@@ -66,7 +66,7 @@ deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViV
66
66
  deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
67
67
  deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
68
68
  deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
- deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=B1i9y74gAhAQCu2vpjuN2muH425GHffnc2BhK-l6wQk,30061
69
+ deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=HIr3ikF4iu_ztdy3FbtOw8vUKjc_RBP93ogH8EzMV64,30294
70
70
  deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
71
  deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
72
72
  deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
@@ -164,7 +164,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
164
164
  deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
165
165
  deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
166
166
  deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
167
- deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=X1B47USzXXGt-b_BL6WyFENOXx2aqp6NLIdL_gCukNw,9641
167
+ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=Ln3NxUy_2oC8cTfFSNy28lIRK8iNEabtxaqWzIqzyEY,16260
168
168
  deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
169
169
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
170
  deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=4fc5MJTLm3hFlFHK_-5MfyfzeZtOo8D2kBqDE2b8lh4,862
@@ -220,8 +220,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
220
220
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
221
221
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
222
222
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
223
- deltacat-1.1.15.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
224
- deltacat-1.1.15.dist-info/METADATA,sha256=KT4i-8YvQv2xdRV4DsIF7Htj5U-mJEsY68uornP81hg,1734
225
- deltacat-1.1.15.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
226
- deltacat-1.1.15.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
227
- deltacat-1.1.15.dist-info/RECORD,,
223
+ deltacat-1.1.16.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
224
+ deltacat-1.1.16.dist-info/METADATA,sha256=EvOmjI60akKZ6BKEWQ2_KZJxr-Bp6wMZOU6-zV1EDos,1734
225
+ deltacat-1.1.16.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
226
+ deltacat-1.1.16.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
227
+ deltacat-1.1.16.dist-info/RECORD,,