deltacat 1.1.37__py3-none-any.whl → 1.1.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.37"
47
+ __version__ = "1.1.38"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -322,6 +322,17 @@ class CompactionSessionAuditInfo(dict):
322
322
  """
323
323
  return self.get("outputSizePyarrowBytes")
324
324
 
325
+ @property
326
+ def output_record_count(self) -> int:
327
+ """
328
+ The total number of records in the compacted output (includes untouched records).
329
+
330
+ Represents the final record count after compaction, including:
331
+ - Records that were processed and materialized
332
+ - Records that were untouched and copied by reference
333
+ """
334
+ return self.get("outputRecordCount")
335
+
325
336
  @property
326
337
  def total_cluster_memory_bytes(self) -> float:
327
338
  """
@@ -672,6 +683,19 @@ class CompactionSessionAuditInfo(dict):
672
683
  self["outputSizeBytes"] = output_size_bytes
673
684
  return output_size_bytes
674
685
 
686
+ def set_output_record_count(
687
+ self, output_records: int
688
+ ) -> CompactionSessionAuditInfo:
689
+ """
690
+ This includes both processed records and untouched records copied by reference.
691
+ """
692
+ if output_records < 0:
693
+ raise ValueError(
694
+ f"Output record count cannot be negative: {output_records}"
695
+ )
696
+ self["outputRecordCount"] = output_records
697
+ return self
698
+
675
699
  def set_output_size_pyarrow_bytes(
676
700
  self, output_size_pyarrow_bytes: float
677
701
  ) -> CompactionSessionAuditInfo:
@@ -902,6 +926,10 @@ class CompactionSessionAuditInfo(dict):
902
926
  self.set_output_file_count(pyarrow_write_result.files)
903
927
  self.set_output_size_bytes(pyarrow_write_result.file_bytes)
904
928
  self.set_output_size_pyarrow_bytes(pyarrow_write_result.pyarrow_bytes)
929
+ # NOTE: Aggregating untouched_record_count with records to get a total of record count in the compacted table
930
+ self.set_output_record_count(
931
+ pyarrow_write_result.records + untouched_file_record_count
932
+ )
905
933
 
906
934
  self.set_peak_memory_used_bytes_per_task(
907
935
  max(
@@ -93,7 +93,7 @@ def _append_table_by_hash_bucket(
93
93
  all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
94
94
  assert (
95
95
  len(all_buckets) == 1
96
- ), f"Only one hash bucket is allowed by found {len(all_buckets)}"
96
+ ), f"Only one hash bucket is allowed but found {len(all_buckets)}"
97
97
  assert (
98
98
  all_buckets[0].as_py() == hb_idx
99
99
  ), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
@@ -153,7 +153,6 @@ def _optimized_group_record_batches_by_hash_bucket(
153
153
  record_batches.append(record_batch)
154
154
 
155
155
  if record_batches:
156
- print(f"{len(record_batches)} -- END")
157
156
  appended_len, append_latency = timed_invocation(
158
157
  _append_table_by_hash_bucket,
159
158
  pa.Table.from_batches(record_batches),
@@ -356,6 +356,7 @@ class TestCompactionSession:
356
356
  assert compaction_audit.hash_bucket_count == 2
357
357
  assert compaction_audit.input_file_count == 1
358
358
  assert compaction_audit.output_file_count == 2
359
+ assert compaction_audit.output_record_count == 4
359
360
  assert abs(compaction_audit.output_size_bytes - 1832) / 1832 <= self.ERROR_RATE
360
361
  assert abs(compaction_audit.input_size_bytes - 936) / 936 <= self.ERROR_RATE
361
362
 
@@ -423,9 +424,18 @@ class TestCompactionSession:
423
424
  assert compaction_audit.hash_bucket_count == 2
424
425
  assert compaction_audit.input_file_count == 3
425
426
  assert compaction_audit.output_file_count == 2
427
+ assert compaction_audit.output_record_count == 7
426
428
  assert abs(compaction_audit.output_size_bytes - 1843) / 1843 <= self.ERROR_RATE
427
429
  assert abs(compaction_audit.input_size_bytes - 2748) / 2748 <= self.ERROR_RATE
428
430
 
431
+ record_invariant = compaction_audit.output_record_count == (
432
+ compaction_audit.input_records
433
+ - compaction_audit.records_deduped
434
+ - compaction_audit.records_deleted
435
+ + compaction_audit.untouched_record_count
436
+ )
437
+ assert record_invariant is True
438
+
429
439
  def test_compact_partition_when_incremental_then_intelligent_estimation_sanity(
430
440
  self, s3_resource, local_deltacat_storage_kwargs
431
441
  ):
@@ -1006,3 +1016,81 @@ class TestCompactionSession:
1006
1016
 
1007
1017
  rcf = get_rcf(s3_resource, new_uri)
1008
1018
  assert rcf.hash_bucket_count == 4
1019
+
1020
+ def test_compaction_with_zero_records(
1021
+ self, s3_resource, local_deltacat_storage_kwargs
1022
+ ):
1023
+ """
1024
+ Test case where compaction results in 0 records.
1025
+ Verify audit handles this correctly without crashing.
1026
+ """
1027
+ # setup - create empty source delta
1028
+ staged_source = stage_partition_from_file_paths(
1029
+ self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
1030
+ )
1031
+
1032
+ # Create an empty table that will result in 0 records after compaction
1033
+ empty_table = pa.table({"pk": pa.array([])})
1034
+ source_delta = commit_delta_to_staged_partition(
1035
+ staged_source, pa_table=empty_table, **local_deltacat_storage_kwargs
1036
+ )
1037
+
1038
+ staged_dest = stage_partition_from_file_paths(
1039
+ self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
1040
+ )
1041
+ dest_partition = ds.commit_partition(
1042
+ staged_dest, **local_deltacat_storage_kwargs
1043
+ )
1044
+
1045
+ # action
1046
+ rcf_url = compact_partition(
1047
+ CompactPartitionParams.of(
1048
+ {
1049
+ "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
1050
+ "compacted_file_content_type": ContentType.PARQUET,
1051
+ "dd_max_parallelism_ratio": 1.0,
1052
+ "deltacat_storage": ds,
1053
+ "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
1054
+ "destination_partition_locator": dest_partition.locator,
1055
+ "drop_duplicates": True,
1056
+ "hash_bucket_count": 1,
1057
+ "last_stream_position_to_compact": source_delta.stream_position,
1058
+ "list_deltas_kwargs": {
1059
+ **local_deltacat_storage_kwargs,
1060
+ **{"equivalent_table_types": []},
1061
+ },
1062
+ "primary_keys": ["pk"],
1063
+ "rebase_source_partition_locator": source_delta.partition_locator,
1064
+ "rebase_source_partition_high_watermark": source_delta.stream_position,
1065
+ "records_per_compacted_file": 4000,
1066
+ "s3_client_kwargs": {},
1067
+ "source_partition_locator": source_delta.partition_locator,
1068
+ }
1069
+ )
1070
+ )
1071
+
1072
+ # verify - compaction should complete successfully with 0 records
1073
+ assert rcf_url is not None
1074
+ rcf = get_rcf(s3_resource, rcf_url)
1075
+
1076
+ _, compaction_audit_key = rcf.compaction_audit_url.strip("s3://").split("/", 1)
1077
+ compaction_audit = CompactionSessionAuditInfo(
1078
+ **read_s3_contents(
1079
+ s3_resource, TEST_S3_RCF_BUCKET_NAME, compaction_audit_key
1080
+ )
1081
+ )
1082
+
1083
+ # Verify that audit handles zero records correctly
1084
+ assert compaction_audit.input_records == 0
1085
+ assert compaction_audit.output_record_count == 0
1086
+ assert compaction_audit.records_deduped == 0
1087
+ assert compaction_audit.records_deleted == 0
1088
+ assert compaction_audit.untouched_record_count == 0
1089
+ assert compaction_audit.output_file_count >= 0 # May still create empty files
1090
+ record_invariant = compaction_audit.output_record_count == (
1091
+ compaction_audit.input_records
1092
+ - compaction_audit.records_deduped
1093
+ - compaction_audit.records_deleted
1094
+ + compaction_audit.untouched_record_count
1095
+ )
1096
+ assert record_invariant is True
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.37
3
+ Version: 1.1.38
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=u00X92zHfZJzS08a-2kx3kCLcz40L-THm0HowDiBOiA,1778
1
+ deltacat/__init__.py,sha256=N5dMIxvjmqQP9-zhdx6gAxQ4tZIS3t3PIPLXaIsxWjI,1778
2
2
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
3
3
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
4
4
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -25,7 +25,7 @@ deltacat/compute/compactor/compaction_session.py,sha256=YthBYNpj6qvr6SqfVfXTy5yl
25
25
  deltacat/compute/compactor/repartition_session.py,sha256=AAPwNZtPpC_Mtoja855_alBdXDA6efp7zcvkE-MANaQ,7254
26
26
  deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
27
  deltacat/compute/compactor/model/compact_partition_params.py,sha256=jjvpUiHfGAw-Dy7s4wyTINtruf8Nk4EPMma7Y4KMF2U,19067
28
- deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=Jjt4YOEO8lc-kiV4fB7rOD_Xd17_BS6pRDzqbtZp0GI,31350
28
+ deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=48DjIuJ6_Gv7GVoUfLSA5sjrCyZ4qY3tWnR-bYXh3oc,32440
29
29
  deltacat/compute/compactor/model/compactor_version.py,sha256=RwRvManiCxZmzjAWzm1OPDxjB1BEHu1d0fBJyGhXKxA,87
30
30
  deltacat/compute/compactor/model/dedupe_result.py,sha256=1OCV944qJdLQ_-8scisVKl45ej1eRv9OV539QYZtQ-U,292
31
31
  deltacat/compute/compactor/model/delta_annotated.py,sha256=bCE9H5mrBoHfd1lbL6tYWC4_dbAgucAlFLjOtyPLW14,12515
@@ -76,7 +76,7 @@ deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hd
76
76
  deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
77
77
  deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
78
78
  deltacat/compute/compactor_v2/utils/merge.py,sha256=fAzEYwQYH2ia8MLdEFdZFivWHpi6qZu8AyyEK0H0vwE,5363
79
- deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=Qsn0BQrlBWSLqu4srd-LJUX8BaVqG6Wo1oAros7LYWw,12677
79
+ deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=1IolwUoS9PkqG7yIASVHgZXIlIt8Px_8DIrqCajCqCs,12631
80
80
  deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
81
81
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
82
82
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -152,7 +152,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
152
152
  deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
153
153
  deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
154
154
  deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=F1DFaranHekHB7HSNH-0_hV5ovdR5HfF9JqTVDw6Vh8,42575
155
+ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=R98elvDY9EwHI8vrC09VX4aFgKVtg8H4xFo7PNNNvZs,46446
156
156
  deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
157
157
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
158
  deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoiDuBUhgCmc3DYKCXL1g4QWtmROhZ0RJCQgePMY9as,9959
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
212
212
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
213
213
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
214
214
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
215
- deltacat-1.1.37.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
216
- deltacat-1.1.37.dist-info/METADATA,sha256=iHlaZ9sS-CrQby0kxCrOigl1ZGZKpniwf9LyYbagwzI,1733
217
- deltacat-1.1.37.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
218
- deltacat-1.1.37.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
219
- deltacat-1.1.37.dist-info/RECORD,,
215
+ deltacat-1.1.38.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
216
+ deltacat-1.1.38.dist-info/METADATA,sha256=ywP_JPvdLP0cpQUlwAHPSVjeu4nlFQyJGbGevs_RrWc,1733
217
+ deltacat-1.1.38.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
218
+ deltacat-1.1.38.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
219
+ deltacat-1.1.38.dist-info/RECORD,,