deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0

deltacat/compute/compactor/model/compaction_session_audit_info.py

@@ -1,5 +1,6 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations
+ import pyarrow as pa
  import logging
  from deltacat import logs
  from typing import List, Union
@@ -18,6 +19,7 @@ class CompactionSessionAuditInfo(dict):
      DEDUPE_STEP_NAME = "dedupe"
      MATERIALIZE_STEP_NAME = "materialize"
      HASH_BUCKET_STEP_NAME = "hashBucket"
+     MERGE_STEP_NAME = "merge"

      def __init__(self, deltacat_version: str, audit_url: str):
          self.set_deltacat_version(deltacat_version)
@@ -52,7 +54,7 @@ class CompactionSessionAuditInfo(dict):
      @property
      def uniform_deltas_created(self) -> int:
          """
-         The total number of unitform deltas fed into the hash bucket step.
+         The total number of uniform deltas fed into the hash bucket step.
          """
          return self.get("uniformDeltasCreated")

@@ -68,7 +70,7 @@ class CompactionSessionAuditInfo(dict):
      @property
      def input_size_bytes(self) -> float:
          """
-         The on-disk size in bytes of the input.
+         The on-disk size in bytes of the input. Analogous to bytes scanned
          """
          return self.get("inputSizeBytes")

@@ -142,6 +144,15 @@ class CompactionSessionAuditInfo(dict):
          """
          return self.get("materializeTaskPeakMemoryUsedBytes")

+     @property
+     def peak_memory_used_bytes_per_merge_task(self) -> float:
+         """
+         The peak memory used by a single merge python process. Note that
+         the result may be the max of merge and hash bucketing, as
+         processes are reused by Ray to run all compaction steps.
+         """
+         return self.get("mergeTaskPeakMemoryUsedBytes")
+
      @property
      def hash_bucket_post_object_store_memory_used_bytes(self) -> float:
          """
@@ -164,6 +175,13 @@ class CompactionSessionAuditInfo(dict):
          """
          return self.get("materializePostObjectStoreMemoryUsedBytes")

+     @property
+     def merge_post_object_store_memory_used_bytes(self) -> float:
+         """
+         The total object store memory used after merge step.
+         """
+         return self.get("mergePostObjectStoreMemoryUsedBytes")
+
      @property
      def materialize_buckets(self) -> int:
          """
@@ -233,11 +251,33 @@ class CompactionSessionAuditInfo(dict):
      @property
      def materialize_result_wait_time_in_seconds(self) -> float:
          """
-         The time it takes ray.get() to resolve after the last hash bucket task has completed.
+         The time it takes ray.get() to resolve after the last materialize task has completed.
          This value may not be accurate at less than 1 second precision.
          """
          return self.get("materializeResultWaitTimeInSeconds")

+     @property
+     def merge_result_wait_time_in_seconds(self) -> float:
+         """
+         The time it takes ray.get() to resolve after the last task has completed.
+         This value may not be accurate at less than 1 second precision.
+         """
+         return self.get("mergeResultWaitTimeInSeconds")
+
+     @property
+     def merge_time_in_seconds(self) -> float:
+         """
+         The time taken by merge step. This includes all merge tasks.
+         """
+         return self.get("mergeTimeInSeconds")
+
+     @property
+     def merge_invoke_time_in_seconds(self) -> float:
+         """
+         The time taken to invoke all merge tasks.
+         """
+         return self.get("mergeInvokeTimeInSeconds")
+
      @property
      def delta_discovery_time_in_seconds(self) -> float:
          """
@@ -337,6 +377,13 @@ class CompactionSessionAuditInfo(dict):
          """
          return self.get("materializeResultSize")

+     @property
+     def merge_result_size(self) -> float:
+         """
+         The size of the results returned by merge step.
+         """
+         return self.get("mergeResultSize")
+
      @property
      def peak_memory_used_bytes_by_compaction_session_process(self) -> float:
          """
@@ -344,6 +391,42 @@ class CompactionSessionAuditInfo(dict):
          """
          return self.get("peakMemoryUsedBytesCompactionSessionProcess")

+     @property
+     def estimated_in_memory_size_bytes_during_discovery(self) -> float:
+         """
+         The estimated in-memory size during the discovery. This can be used
+         to determine the accuracy of memory estimation logic.
+         """
+         return self.get("estimatedInMemorySizeBytesDuringDiscovery")
+
+     @property
+     def hash_bucket_processed_size_bytes(self) -> int:
+         """
+         The total size of the input data processed during the hash bucket step.
+         """
+         return self.get("hashBucketProcessedSizeBytes")
+
+     @property
+     def total_cpu_seconds(self) -> float:
+         """
+         Total number of vCPUs provisioned in the cluster weighted over time.
+         """
+         return self.get("totalCPUSeconds")
+
+     @property
+     def used_cpu_seconds(self) -> float:
+         """
+         Total used vCPU in the cluster weighted over time.
+         """
+         return self.get("usedCPUSeconds")
+
+     @property
+     def pyarrow_version(self) -> str:
+         """
+         The version of PyArrow used.
+         """
+         return self.get("pyarrowVersion")
+
      # Setters follow

      def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -428,6 +511,12 @@ class CompactionSessionAuditInfo(dict):
          ] = peak_memory_used_bytes_per_materialize_task
          return self

+     def set_peak_memory_used_bytes_per_merge_task(
+         self, peak_memory_used_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["mergeTaskPeakMemoryUsedBytes"] = peak_memory_used_bytes
+         return self
+
      def set_hash_bucket_post_object_store_memory_used_bytes(
          self, object_store_memory_used_bytes_by_hb: float
      ) -> CompactionSessionAuditInfo:
@@ -452,6 +541,12 @@ class CompactionSessionAuditInfo(dict):
          ] = object_store_memory_used_bytes_by_dedupe
          return self

+     def set_merge_post_object_store_memory_used_bytes(
+         self, object_store_memory_used_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["mergePostObjectStoreMemoryUsedBytes"] = object_store_memory_used_bytes
+         return self
+
      def set_materialize_buckets(
          self, materialize_buckets: int
      ) -> CompactionSessionAuditInfo:
@@ -512,6 +607,24 @@ class CompactionSessionAuditInfo(dict):
          self["materializeResultWaitTimeInSeconds"] = wait_time
          return self

+     def set_merge_time_in_seconds(
+         self, time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["mergeTimeInSeconds"] = time_in_seconds
+         return self
+
+     def set_merge_invoke_time_in_seconds(
+         self, invoke_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["mergeInvokeTimeInSeconds"] = invoke_time
+         return self
+
+     def set_merge_result_wait_time_in_seconds(
+         self, wait_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["mergeResultWaitTimeInSeconds"] = wait_time
+         return self
+
      def set_delta_discovery_time_in_seconds(
          self, delta_discovery_time_in_seconds: float
      ) -> CompactionSessionAuditInfo:
@@ -598,12 +711,42 @@ class CompactionSessionAuditInfo(dict):
          self["materializeResultSize"] = materialize_result_size_bytes
          return self

+     def set_merge_result_size_bytes(
+         self, merge_result_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["mergeResultSize"] = merge_result_size_bytes
+         return self
+
      def set_peak_memory_used_bytes_by_compaction_session_process(
          self, peak_memory: float
      ) -> CompactionSessionAuditInfo:
          self["peakMemoryUsedBytesCompactionSessionProcess"] = peak_memory
          return self

+     def set_estimated_in_memory_size_bytes_during_discovery(
+         self, memory: float
+     ) -> CompactionSessionAuditInfo:
+         self["estimatedInMemorySizeBytesDuringDiscovery"] = memory
+         return self
+
+     def set_hash_bucket_processed_size_bytes(
+         self, size: int
+     ) -> CompactionSessionAuditInfo:
+         self["hashBucketProcessedSizeBytes"] = size
+         return self
+
+     def set_total_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+         self["totalCPUSeconds"] = value
+         return self
+
+     def set_used_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+         self["usedCPUSeconds"] = value
+         return self
+
+     def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
+         self["pyarrowVersion"] = value
+         return self
+
      # High level methods to save stats
      def save_step_stats(
          self,
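
A note on the two CPU counters set above: totalCPUSeconds and usedCPUSeconds record vCPU capacity and utilization "weighted over time". A rough illustration of what that weighting means, using a hypothetical helper over periodic cluster samples (this is not deltacat's implementation; the actual accounting lives in deltacat/utils/resources.py per the file list):

def cpu_seconds_from_samples(samples):
    """
    samples: (timestamp_s, provisioned_vcpus, used_vcpus) tuples collected by
    periodically polling the cluster. Returns (total, used) vCPU-seconds as a
    left Riemann sum weighted by the time between samples.
    """
    total_cpu_seconds = 0.0
    used_cpu_seconds = 0.0
    for (t0, provisioned, used), (t1, _, _) in zip(samples, samples[1:]):
        dt = t1 - t0
        total_cpu_seconds += provisioned * dt
        used_cpu_seconds += used * dt
    return total_cpu_seconds, used_cpu_seconds

# Example: 3 samples taken 10s apart on a 16 vCPU cluster at roughly 50% utilization.
samples = [(0, 16, 8.0), (10, 16, 7.5), (20, 16, 8.5)]
print(cpu_seconds_from_samples(samples))  # (320.0, 155.0)
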
@@ -673,7 +816,10 @@ class CompactionSessionAuditInfo(dict):
          )

          total_count_of_src_dfl_not_touched = sum(
-             m.referenced_pyarrow_write_result.files for m in mat_results
+             m.referenced_pyarrow_write_result.files
+             if m.referenced_pyarrow_write_result
+             else 0
+             for m in mat_results
          )

          logger.info(
@@ -697,10 +843,16 @@ class CompactionSessionAuditInfo(dict):
          )

          untouched_file_record_count = sum(
-             m.referenced_pyarrow_write_result.records for m in mat_results
+             m.referenced_pyarrow_write_result.records
+             if m.referenced_pyarrow_write_result
+             else 0
+             for m in mat_results
          )
          untouched_file_size_bytes = sum(
-             m.referenced_pyarrow_write_result.file_bytes for m in mat_results
+             m.referenced_pyarrow_write_result.file_bytes
+             if m.referenced_pyarrow_write_result
+             else 0
+             for m in mat_results
          )

          self.set_untouched_file_count(total_count_of_src_dfl_not_touched)
@@ -715,11 +867,13 @@ class CompactionSessionAuditInfo(dict):
          self.set_peak_memory_used_bytes_per_task(
              max(
                  [
-                     self.peak_memory_used_bytes_per_hash_bucket_task,
-                     self.peak_memory_used_bytes_per_dedupe_task,
-                     self.peak_memory_used_bytes_per_materialize_task,
+                     self.peak_memory_used_bytes_per_hash_bucket_task or 0,
+                     self.peak_memory_used_bytes_per_dedupe_task or 0,
+                     self.peak_memory_used_bytes_per_materialize_task or 0,
+                     self.peak_memory_used_bytes_per_merge_task or 0,
                  ]
              )
          )

+         self.set_pyarrow_version(pa.__version__)
          self.set_telemetry_time_in_seconds(total_telemetry_time)
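
Taken together, the merge fields added above mirror the existing hash bucket, dedupe, and materialize audit fields. A short usage sketch of the new setters and getters (the values and audit URL are made up; only methods that appear in this diff are called):

import pyarrow as pa
from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

audit = CompactionSessionAuditInfo("0.1.18b16", "s3://my-bucket/compaction-audit.json")

# Record merge-step telemetry alongside the existing per-step metrics.
audit.set_merge_time_in_seconds(42.0)            # wall time across all merge tasks
audit.set_merge_invoke_time_in_seconds(1.5)      # time spent submitting merge tasks
audit.set_merge_result_size_bytes(10 * 1024**2)
audit.set_peak_memory_used_bytes_per_merge_task(2 * 1024**3)
audit.set_pyarrow_version(pa.__version__)

print(audit.merge_time_in_seconds, audit.pyarrow_version)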

deltacat/compute/compactor/model/delta_annotated.py

@@ -2,7 +2,9 @@
  from __future__ import annotations

  import logging
- from types import FunctionType
+ import copy
+ from deltacat.types.media import ContentType, ContentEncoding
+ from deltacat.types.partial_download import PartialParquetParameters
  from typing import Callable, List, Optional, Union

  from deltacat import logs
@@ -64,7 +66,9 @@ class DeltaAnnotated(Delta):
          annotated_deltas: List[DeltaAnnotated],
          min_delta_bytes: float,
          min_file_counts: Optional[Union[int, float]] = float("inf"),
-         estimation_function: Optional[Callable] = None,
+         estimation_function: Optional[
+             Callable[[ManifestEntry], float]
+         ] = lambda entry: entry.meta.content_length,
      ) -> List[DeltaAnnotated]:
          """
          Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -76,11 +80,21 @@
          of bytes at rest for the associated object. Returns the list of annotated
          delta groups.
          """
-         groups = []
+         split_annotated_deltas: List[DeltaAnnotated] = []
+         groups: List[DeltaAnnotated] = []
          new_da = DeltaAnnotated()
          new_da_bytes = 0
          da_group_entry_count = 0
-         for src_da in annotated_deltas:
+
+         for delta_annotated in annotated_deltas:
+             split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+
+         logger.info(
+             f"Split the {len(annotated_deltas)} annotated deltas "
+             f"into {len(split_annotated_deltas)} groups."
+         )
+
+         for src_da in split_annotated_deltas:
              src_da_annotations = src_da.annotations
              src_da_entries = src_da.manifest.entries
              assert (
@@ -105,11 +119,7 @@
                      src_da, new_da, src_entry, src_da_annotations[i]
                  )
                  # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                 estimated_new_da_bytes = (
-                     estimation_function(src_entry.meta.content_length)
-                     if type(estimation_function) is FunctionType
-                     else src_entry.meta.content_length
-                 )
+                 estimated_new_da_bytes = estimation_function(src_entry)
                  new_da_bytes += estimated_new_da_bytes
                  da_group_entry_count += 1
                  if (
@@ -132,6 +142,7 @@
                      da_group_entry_count = 0
          if new_da:
              groups.append(new_da)
+
          return groups

      @staticmethod
@@ -207,3 +218,78 @@
              dst_da.type = None
              entries.append(src_entry)
              dst_da.annotations.append(src_annotation)
+
+     @staticmethod
+     def _split_single(delta_annotated: DeltaAnnotated) -> List[DeltaAnnotated]:
+         """
+         Split a single delta annotated into multiple granular
+         annotated entries. Note that split is not always guaranteed.
+
+         Note: Currently we are only able to split the Parquet File downloads.
+         """
+
+         result = []
+
+         if (
+             delta_annotated.meta
+             and delta_annotated.manifest
+             and delta_annotated.meta.content_type == ContentType.PARQUET
+             and delta_annotated.meta.content_encoding == ContentEncoding.IDENTITY
+         ):
+             # we split by row groups
+             for entry_index, entry in enumerate(delta_annotated.manifest.entries):
+                 input_split_params = None
+                 if entry.meta and entry.meta.content_type_parameters:
+                     for type_params in entry.meta.content_type_parameters:
+                         if (
+                             isinstance(type_params, PartialParquetParameters)
+                             and type_params.num_row_groups > 1
+                             and type_params.pq_metadata
+                         ):
+                             input_split_params = type_params
+                             break
+
+                 if input_split_params:
+                     logger.info(
+                         f"Splitting input file with URI: {entry.uri} into "
+                         f"different {input_split_params.num_row_groups} entries"
+                     )
+
+                     for rg in input_split_params.row_groups_to_download:
+                         new_da = DeltaAnnotated()
+                         new_entry_dict = copy.deepcopy(entry)
+                         new_entry = ManifestEntry(new_entry_dict)
+
+                         row_group_meta = input_split_params.pq_metadata.row_group(rg)
+
+                         new_partial_params = PartialParquetParameters.of(
+                             row_groups_to_download=[rg],
+                             num_row_groups=1,
+                             num_rows=row_group_meta.num_rows,
+                             in_memory_size_bytes=row_group_meta.total_byte_size,
+                             pq_metadata=input_split_params.pq_metadata,
+                         )
+
+                         new_entry.meta.content_type_parameters = [new_partial_params]
+                         for type_params in entry.meta.content_type_parameters:
+                             if not isinstance(type_params, PartialParquetParameters):
+                                 new_entry.meta.content_type_parameters.append(
+                                     type_params
+                                 )
+
+                         DeltaAnnotated._append_annotated_entry(
+                             delta_annotated,
+                             new_da,
+                             new_entry,
+                             delta_annotated.annotations[entry_index],
+                         )
+
+                         result.append(new_da)
+                 else:
+                     return [delta_annotated]
+
+         logger.info(
+             f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+         )
+
+         return [delta_annotated]
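
The rebatch docstring above describes a simple greedy strategy: split multi-row-group Parquet entries first, then append entries to the current group until the estimated byte size or file count crosses a threshold, at which point a new group is started. A standalone sketch of that grouping idea (plain dicts stand in for manifest entries; this is not the DeltaAnnotated API):

from typing import Callable, Dict, List

def greedy_group(
    entries: List[Dict],
    min_group_bytes: float,
    min_file_count: float = float("inf"),
    estimate_bytes: Callable[[Dict], float] = lambda e: e["content_length"],
) -> List[List[Dict]]:
    # Accumulate entries into the current group until either threshold trips.
    groups, current, current_bytes = [], [], 0.0
    for entry in entries:
        current.append(entry)
        current_bytes += estimate_bytes(entry)
        if current_bytes >= min_group_bytes or len(current) >= min_file_count:
            groups.append(current)
            current, current_bytes = [], 0.0
    if current:  # flush the trailing, under-sized group
        groups.append(current)
    return groups

entries = [{"content_length": n} for n in (60, 50, 10, 200, 5)]
print([len(g) for g in greedy_group(entries, min_group_bytes=100)])  # [2, 2, 1]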

deltacat/compute/compactor/model/delta_file_envelope.py

@@ -2,6 +2,7 @@
  from __future__ import annotations

  import numpy as np
+ import pyarrow as pa

  from deltacat.storage import DeltaType, LocalTable

@@ -37,8 +38,6 @@ class DeltaFileEnvelope(dict):
          """
          if stream_position is None:
              raise ValueError("Missing delta file envelope stream position.")
-         if file_index is None:
-             raise ValueError("Missing delta file envelope file index.")
          if delta_type is None:
              raise ValueError("Missing Delta file envelope delta type.")
          if table is None:
@@ -75,3 +74,16 @@
      @property
      def file_record_count(self) -> int:
          return self["file_record_count"]
+
+     @property
+     def table_size_bytes(self) -> int:
+         if isinstance(self.table, pa.Table):
+             return self.table.nbytes
+         else:
+             raise ValueError(
+                 f"Table type: {type(self.table)} not supported for the size method."
+             )
+
+     @property
+     def table_num_rows(self) -> int:
+         return len(self.table)
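
The new table_size_bytes property relies on pyarrow's Table.nbytes, so it only applies when the envelope holds a pyarrow table; other table types raise. For example:

import pyarrow as pa

table = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})
print(table.nbytes)  # total bytes referenced by the table's buffers
print(len(table))    # 3 rows, which is what table_num_rows returns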

deltacat/compute/compactor/model/round_completion_info.py

@@ -1,6 +1,7 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations

+ from typing import Tuple
  from deltacat.storage import DeltaLocator, PartitionLocator
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -40,9 +41,11 @@ class RoundCompletionInfo(dict):
          compacted_delta_locator: DeltaLocator,
          compacted_pyarrow_write_result: PyArrowWriteResult,
          sort_keys_bit_width: int,
-         rebase_source_partition_locator: Optional[PartitionLocator],
+         rebase_source_partition_locator: Optional[PartitionLocator] = None,
          manifest_entry_copied_by_reference_ratio: Optional[float] = None,
          compaction_audit_url: Optional[str] = None,
+         hash_bucket_count: Optional[int] = None,
+         hb_index_to_entry_range: Optional[Dict[int, Tuple[int, int]]] = None,
      ) -> RoundCompletionInfo:

          rci = RoundCompletionInfo()
@@ -55,6 +58,8 @@ class RoundCompletionInfo(dict):
              "manifestEntryCopiedByReferenceRatio"
          ] = manifest_entry_copied_by_reference_ratio
          rci["compactionAuditUrl"] = compaction_audit_url
+         rci["hashBucketCount"] = hash_bucket_count
+         rci["hbIndexToEntryRange"] = hb_index_to_entry_range
          return rci

      @property
@@ -97,3 +102,14 @@ class RoundCompletionInfo(dict):
      @property
      def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
          return self["manifestEntryCopiedByReferenceRatio"]
+
+     @property
+     def hash_bucket_count(self) -> Optional[int]:
+         return self["hashBucketCount"]
+
+     @property
+     def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
+         """
+         The start index is inclusive and end index is exclusive by default.
+         """
+         return self["hbIndexToEntryRange"]
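
The new hb_index_to_entry_range field lets compactor v2 map each hash bucket index to the [start, end) slice of manifest entries it wrote in the compacted delta. A hedged sketch of how a reader could use it (how the compacted manifest entries are obtained is left to the caller and is an assumption here):

def entries_for_hash_bucket(rci, compacted_manifest_entries, hb_index):
    # rci: a RoundCompletionInfo; entries are assumed to be in write order.
    entry_range = rci.hb_index_to_entry_range or {}
    if hb_index not in entry_range:
        return []
    start, end = entry_range[hb_index]  # start inclusive, end exclusive
    return compacted_manifest_entries[start:end]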

deltacat/compute/compactor/repartition_session.py

@@ -54,6 +54,7 @@ def repartition(
      pg_config: Optional[PlacementGroupConfig] = None,
      list_deltas_kwargs: Optional[Dict[str, Any]] = None,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
      s3_client_kwargs: Optional[Dict[str, Any]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
      **kwargs,
@@ -91,7 +92,7 @@ def repartition(
              source_partition_locator.partition_values,
          ).stream_position,
          deltacat_storage,
-         **list_deltas_kwargs,
+         list_deltas_kwargs,
      )

      uniform_deltas = []
@@ -131,6 +132,7 @@ def repartition(
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
          read_kwargs_provider=read_kwargs_provider,
+         s3_table_writer_kwargs=s3_table_writer_kwargs,
          repartitioned_file_content_type=repartitioned_file_content_type,
          deltacat_storage=deltacat_storage,
      )
@@ -162,6 +164,7 @@ def repartition(
          source_partition_locator,
          sort_keys,
          deltacat_storage,
+         deltacat_storage_kwargs={},
      )
      repartition_completion_info = RoundCompletionInfo.of(
          last_stream_position_to_compact,

deltacat/compute/compactor/steps/dedupe.py

@@ -107,20 +107,21 @@ def _timed_dedupe(
      dedupe_task_index: int,
      enable_profiler: bool,
      object_store: Optional[IObjectStore],
+     **kwargs,
  ):
      task_id = get_current_ray_task_id()
      worker_id = get_current_ray_worker_id()
      with memray.Tracker(
          f"dedupe_{worker_id}_{task_id}.bin"
      ) if enable_profiler else nullcontext():
-         # TODO (pdames): mitigate risk of running out of memory here in cases of
-         # severe skew of primary key updates in deltas
+         # TODO (pdames): mitigate risk of running out of memory here in cases of severe skew of primary key updates in deltas
          logger.info(
              f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
              f"groups for {len(object_ids)} object refs..."
          )
-
-         delta_file_envelope_groups_list = object_store.get_many(object_ids)
+         delta_file_envelope_groups_list: List[object] = object_store.get_many(
+             object_ids
+         )
          hb_index_to_delta_file_envelopes_list = defaultdict(list)
          for delta_file_envelope_groups in delta_file_envelope_groups_list:
              for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -171,7 +172,8 @@ def _timed_dedupe(

              hb_table_record_count = len(table)
              table, drop_time = timed_invocation(
-                 func=_drop_duplicates_by_primary_key_hash, table=table
+                 func=_drop_duplicates_by_primary_key_hash,
+                 table=table,
              )
              deduped_record_count = hb_table_record_count - len(table)
              total_deduped_records += deduped_record_count
@@ -227,7 +229,6 @@ def _timed_dedupe(
          )

          peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
-
          return DedupeResult(
              mat_bucket_to_dd_idx_obj_id,
              np.int64(total_deduped_records),
@@ -246,6 +247,7 @@ def dedupe(
      enable_profiler: bool,
      metrics_config: MetricsConfig,
      object_store: Optional[IObjectStore],
+     **kwargs,
  ) -> DedupeResult:
      logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
      dedupe_result, duration = timed_invocation(
@@ -256,6 +258,7 @@ def dedupe(
          dedupe_task_index=dedupe_task_index,
          enable_profiler=enable_profiler,
          object_store=object_store,
+         **kwargs,
      )

      emit_metrics_time = 0.0
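
The dedupe task above delegates duplicate removal to _drop_duplicates_by_primary_key_hash and only measures the dropped record count and timing. A rough standalone sketch of dropping duplicates on a primary-key-hash column with pyarrow, keeping the last occurrence (an illustration of the idea, not deltacat's implementation):

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

def drop_duplicates_keep_last(table: pa.Table, key_column: str) -> pa.Table:
    # Tag each row with its position so "last occurrence" is well defined.
    table = table.append_column("_row_idx", pa.array(np.arange(len(table))))
    # The max row index per key identifies the last record for that key.
    last_rows = table.group_by(key_column).aggregate([("_row_idx", "max")])
    mask = pc.is_in(table["_row_idx"], value_set=last_rows["_row_idx_max"])
    return table.filter(mask).drop(["_row_idx"])

t = pa.table({"pk_hash": ["a", "b", "a"], "val": [1, 2, 3]})
print(drop_duplicates_keep_last(t, "pk_hash").num_rows)  # 2 (the first "a" row is dropped)

Note that Table.group_by requires pyarrow >= 7.0.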