deltacat 1.1.35__py3-none-any.whl → 1.1.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.35"
47
+ __version__ = "1.1.36"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -69,14 +69,17 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
69
69
  assert (
70
70
  params.hash_bucket_count is not None and params.hash_bucket_count >= 1
71
71
  ), "hash_bucket_count is a required arg for compactor v2"
72
+ assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
72
73
  if params.num_rounds > 1:
73
74
  assert (
74
75
  not params.drop_duplicates
75
76
  ), "num_rounds > 1, drop_duplicates must be False but is True"
76
77
 
77
- with memray.Tracker(
78
- "compaction_partition.bin"
79
- ) if params.enable_profiler else nullcontext():
78
+ with (
79
+ memray.Tracker("compaction_partition.bin")
80
+ if params.enable_profiler
81
+ else nullcontext()
82
+ ):
80
83
  execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
81
84
  params,
82
85
  **kwargs,
@@ -48,6 +48,7 @@ class MergeInput(Dict):
48
48
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
49
49
  memory_logs_enabled: Optional[bool] = None,
50
50
  disable_copy_by_reference: Optional[bool] = None,
51
+ hash_bucket_count: Optional[int] = None,
51
52
  ) -> MergeInput:
52
53
 
53
54
  result = MergeInput()
@@ -71,6 +72,7 @@ class MergeInput(Dict):
71
72
  result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
72
73
  result["memory_logs_enabled"] = memory_logs_enabled
73
74
  result["disable_copy_by_reference"] = disable_copy_by_reference
75
+ result["hash_bucket_count"] = hash_bucket_count
74
76
  return result
75
77
 
76
78
  @property
@@ -154,3 +156,7 @@ class MergeInput(Dict):
154
156
  @property
155
157
  def disable_copy_by_reference(self) -> bool:
156
158
  return self["disable_copy_by_reference"]
159
+
160
+ @property
161
+ def hash_bucket_count(self) -> int:
162
+ return self["hash_bucket_count"]
@@ -438,6 +438,7 @@ def _merge(
438
438
  delete_file_envelopes=delete_file_envelopes,
439
439
  memory_logs_enabled=params.memory_logs_enabled,
440
440
  disable_copy_by_reference=params.disable_copy_by_reference,
441
+ hash_bucket_count=params.hash_bucket_count,
441
442
  )
442
443
  }
443
444
 
@@ -62,6 +62,10 @@ if importlib.util.find_spec("memray"):
62
62
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
63
63
 
64
64
 
65
+ _EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
66
+ _INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
67
+
68
+
65
69
  def _append_delta_type_column(table: pa.Table, value: np.bool_):
66
70
  return table.append_column(
67
71
  sc._DELTA_TYPE_COLUMN_FIELD,
@@ -112,6 +116,8 @@ def _merge_tables(
112
116
  table: pa.Table,
113
117
  primary_keys: List[str],
114
118
  can_drop_duplicates: bool,
119
+ hb_index: int,
120
+ num_buckets: int,
115
121
  compacted_table: Optional[pa.Table] = None,
116
122
  ) -> pa.Table:
117
123
  """
@@ -130,6 +136,20 @@ def _merge_tables(
130
136
 
131
137
  all_tables.append(table)
132
138
 
139
+ check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
140
+ BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
141
+ BUCKETING_SPEC_COMPLIANCE_ASSERT,
142
+ ]
143
+
144
+ if primary_keys and check_bucketing_spec:
145
+ _validate_bucketing_spec_compliance(
146
+ table=all_tables[incremental_idx],
147
+ num_buckets=num_buckets,
148
+ primary_keys=primary_keys,
149
+ hb_index=hb_index,
150
+ log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
151
+ )
152
+
133
153
  if not primary_keys or not can_drop_duplicates:
134
154
  logger.info(
135
155
  f"Not dropping duplicates for primary keys={primary_keys} "
@@ -193,27 +213,40 @@ def _merge_tables(
193
213
 
194
214
 
195
215
  def _validate_bucketing_spec_compliance(
196
- table: pa.Table, rcf: RoundCompletionInfo, hb_index: int, primary_keys: List[str]
216
+ table: pa.Table,
217
+ num_buckets: int,
218
+ hb_index: int,
219
+ primary_keys: List[str],
220
+ rcf: RoundCompletionInfo = None,
221
+ log_prefix=None,
197
222
  ) -> None:
223
+ if rcf is not None:
224
+ message_prefix = f"{log_prefix}{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}.{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}.{rcf.compacted_delta_locator.partition_values}"
225
+ else:
226
+ message_prefix = f"{log_prefix}"
198
227
  pki_table = generate_pk_hash_column(
199
228
  [table], primary_keys=primary_keys, requires_hash=True
200
229
  )[0]
230
+ is_not_compliant: bool = False
201
231
  for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
202
- hash_bucket = pk_digest_to_hash_bucket_index(hash_value, rcf.hash_bucket_count)
232
+ hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
203
233
  if hash_bucket != hb_index:
234
+ is_not_compliant = True
204
235
  logger.info(
205
- f"{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}"
206
- f".{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}"
207
- f".{rcf.compacted_delta_locator.partition_values} has non-compliant bucketing spec. "
236
+ f"{message_prefix} has non-compliant bucketing spec at index: {index} "
208
237
  f"Expected hash bucket is {hb_index} but found {hash_bucket}."
209
238
  )
210
239
  if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
211
240
  raise AssertionError(
212
- "Hash bucket drift detected. Expected hash bucket index"
241
+ f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
213
242
  f" to be {hb_index} but found {hash_bucket}"
214
243
  )
215
244
  # No further checks necessary
216
245
  break
246
+ if not is_not_compliant:
247
+ logger.debug(
248
+ f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
249
+ )
217
250
 
218
251
 
219
252
  def _download_compacted_table(
@@ -257,7 +290,12 @@ def _download_compacted_table(
257
290
  # Bucketing spec compliance isn't required without primary keys
258
291
  if primary_keys and check_bucketing_spec:
259
292
  _validate_bucketing_spec_compliance(
260
- compacted_table, rcf, hb_index, primary_keys
293
+ compacted_table,
294
+ rcf.hash_bucket_count,
295
+ hb_index,
296
+ primary_keys,
297
+ rcf=rcf,
298
+ log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
261
299
  )
262
300
  return compacted_table
263
301
 
@@ -462,12 +500,12 @@ def _compact_tables(
462
500
  _group_sequence_by_delta_type(reordered_all_dfes)
463
501
  ):
464
502
  if delta_type is DeltaType.UPSERT:
465
- (
466
- table,
467
- incremental_len,
468
- deduped_records,
469
- merge_time,
470
- ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
503
+ (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
504
+ input=input,
505
+ dfe_list=delta_type_sequence,
506
+ hb_idx=hb_idx,
507
+ prev_table=table,
508
+ )
471
509
  logger.info(
472
510
  f" [Merge task index {input.merge_task_index}] Merged"
473
511
  f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -526,6 +564,8 @@ def _apply_upserts(
526
564
  primary_keys=input.primary_keys,
527
565
  can_drop_duplicates=input.drop_duplicates,
528
566
  compacted_table=prev_table,
567
+ hb_index=hb_idx,
568
+ num_buckets=input.hash_bucket_count,
529
569
  )
530
570
  deduped_records = hb_table_record_count - len(table)
531
571
  return table, incremental_len, deduped_records, merge_time
@@ -133,4 +133,5 @@ def generate_local_merge_input(
133
133
  delete_strategy=delete_strategy,
134
134
  delete_file_envelopes=delete_file_envelopes,
135
135
  disable_copy_by_reference=params.disable_copy_by_reference,
136
+ hash_bucket_count=params.hash_bucket_count,
136
137
  )
@@ -804,7 +804,7 @@ class TestCompactionSession:
804
804
  )
805
805
 
806
806
  assert (
807
- "Hash bucket drift detected. Expected hash bucket index to be 1 but found 0"
807
+ "Hash bucket drift detected at index: 0. Expected hash bucket index to be 1 but found 0"
808
808
  in str(excinfo.value)
809
809
  )
810
810
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.35
3
+ Version: 1.1.36
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=br2aQSDj5eFS_j0mwGUSEQF386HRAXjiYg421vB9pME,1778
1
+ deltacat/__init__.py,sha256=9vJMHGceWew6atD_3VqKurlBJ3crD5mwAQIgSB1yjNY,1778
2
2
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
3
3
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
4
4
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -50,7 +50,7 @@ deltacat/compute/compactor/utils/round_completion_file.py,sha256=fFevhUuveCvrU3g
50
50
  deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZRG2NqBlCw1X3_FBc,2397
51
51
  deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
52
52
  deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
53
+ deltacat/compute/compactor_v2/compaction_session.py,sha256=RbO_du0qX7nlyXO-ZSksX8RqWuRwfdvWddpTJjLDVNk,8185
54
54
  deltacat/compute/compactor_v2/constants.py,sha256=F5Phrh-2JgnWvtjHXacxOG5Z2ivKcHnboerI12rc1zk,3632
55
55
  deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
@@ -63,19 +63,19 @@ deltacat/compute/compactor_v2/model/evaluate_compaction_result.py,sha256=XAaEEAd
63
63
  deltacat/compute/compactor_v2/model/hash_bucket_input.py,sha256=iJy8kLi1dIpFIyfoAjkaAtZvg8Np1z7BsUNGAcWfFm4,3042
64
64
  deltacat/compute/compactor_v2/model/hash_bucket_result.py,sha256=EsY9BPPywhmxlcLKn3kGWzAX4s4BTR2vYyPUB-wAEOc,309
65
65
  deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViVO1SVljCj6f0B3MfB3hqtGm2S0s,7410
66
- deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
66
+ deltacat/compute/compactor_v2/model/merge_input.py,sha256=D-6WuHK4X7m9-P6Hskz6RRemeWrNf6IPdhc14O3KDAg,5860
67
67
  deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
68
68
  deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
- deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
69
+ deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=nz2N3YZVE9bNwOqRXoQYkArJhyUJRis2s9BweZ3tad8,30989
70
70
  deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
71
  deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
72
- deltacat/compute/compactor_v2/steps/merge.py,sha256=T2G2AaVsezYzo6oJtpuXH-bYv8nt-yFHA5ZbDIGodQg,24971
72
+ deltacat/compute/compactor_v2/steps/merge.py,sha256=4rKQ__SeWO_QLZl2btcFrYHCMOn-8R3kja74UrWOMgg,26225
73
73
  deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
74
  deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=t2j9H9IdFRH9EfpL-9g5XvZs9WK9HybqBGA7fDi82EM,8310
75
75
  deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
76
76
  deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
77
77
  deltacat/compute/compactor_v2/utils/io.py,sha256=Xjs7_D-0xKSetvllIe4o96aM1elfdjt1Ii7YfsHPvZs,6108
78
- deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
78
+ deltacat/compute/compactor_v2/utils/merge.py,sha256=fAzEYwQYH2ia8MLdEFdZFivWHpi6qZu8AyyEK0H0vwE,5363
79
79
  deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
80
80
  deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
81
81
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
@@ -152,7 +152,7 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
152
152
  deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
153
153
  deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
154
154
  deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
- deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=zEXOIilybDpKuQt1ZRxGg4x_kUacBOcHE8KWcOmL01s,42563
155
+ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=F1DFaranHekHB7HSNH-0_hV5ovdR5HfF9JqTVDw6Vh8,42575
156
156
  deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
157
157
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
158
  deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py,sha256=eoiDuBUhgCmc3DYKCXL1g4QWtmROhZ0RJCQgePMY9as,9959
@@ -212,8 +212,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
212
212
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
213
213
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
214
214
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
215
- deltacat-1.1.35.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
216
- deltacat-1.1.35.dist-info/METADATA,sha256=b8Z4aVdNYjBoy0_uh0m4yoU_8h2w8v7I2AZOwacv5Es,1733
217
- deltacat-1.1.35.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
218
- deltacat-1.1.35.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
219
- deltacat-1.1.35.dist-info/RECORD,,
215
+ deltacat-1.1.36.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
216
+ deltacat-1.1.36.dist-info/METADATA,sha256=wIZbEGHnJWq_TBKi0u463p4-PgG9R_0MApw7IIwmnRc,1733
217
+ deltacat-1.1.36.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
218
+ deltacat-1.1.36.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
219
+ deltacat-1.1.36.dist-info/RECORD,,