deltacat 0.1.18b15__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +11 -1
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +13 -0
  4. deltacat/compute/compactor/model/delta_annotated.py +10 -6
  5. deltacat/compute/compactor/repartition_session.py +2 -0
  6. deltacat/compute/compactor/steps/repartition.py +6 -0
  7. deltacat/compute/compactor_v2/compaction_session.py +72 -69
  8. deltacat/compute/compactor_v2/constants.py +3 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +17 -1
  10. deltacat/compute/compactor_v2/steps/merge.py +430 -2
  11. deltacat/compute/compactor_v2/utils/content_type_params.py +43 -14
  12. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  13. deltacat/compute/compactor_v2/utils/io.py +11 -8
  14. deltacat/compute/compactor_v2/utils/primary_key_index.py +58 -25
  15. deltacat/compute/compactor_v2/utils/task_options.py +8 -15
  16. deltacat/tests/compute/common.py +1 -1
  17. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -0
  18. deltacat/tests/compute/test_compaction_session_incremental.py +16 -1
  19. deltacat/tests/compute/testcases.py +7 -2
  20. deltacat/tests/test_utils/pyarrow.py +23 -6
  21. deltacat/types/partial_download.py +1 -0
  22. deltacat/types/tables.py +5 -0
  23. deltacat/utils/arguments.py +1 -2
  24. deltacat/utils/pyarrow.py +5 -0
  25. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +1 -1
  26. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +29 -30
  27. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  28. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +0 -199
  29. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  30. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +0 -0
  31. {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "0.1.18b15"
+ __version__ = "0.1.18b16"
 
 
__all__ = [
deltacat/compute/compactor/model/compact_partition_params.py CHANGED
@@ -19,6 +19,7 @@ from deltacat.compute.compactor_v2.constants import (
MIN_FILES_IN_BATCH,
AVERAGE_RECORD_SIZE_BYTES,
TASK_MAX_PARALLELISM,
+ DROP_DUPLICATES,
)
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -88,6 +89,7 @@ class CompactPartitionParams(dict):
result.hash_group_count = params.get(
"hash_group_count", result.hash_bucket_count
)
+ result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
 
if not importlib.util.find_spec("memray"):
result.enable_profiler = False
@@ -196,7 +198,7 @@ class CompactPartitionParams(dict):
 
@property
def min_delta_bytes_in_batch(self) -> float:
- return self["min_files_in_batch"]
+ return self["min_delta_bytes_in_batch"]
 
@min_delta_bytes_in_batch.setter
def min_delta_bytes_in_batch(self, min_delta_bytes_in_batch: float) -> None:
@@ -258,6 +260,14 @@
def records_per_compacted_file(self, count: int) -> None:
self["records_per_compacted_file"] = count
 
+ @property
+ def drop_duplicates(self) -> bool:
+ return self["drop_duplicates"]
+
+ @drop_duplicates.setter
+ def drop_duplicates(self, value: bool):
+ self["drop_duplicates"] = value
+
@property
def bit_width_of_sort_keys(self) -> int:
return self["bit_width_of_sort_keys"]
deltacat/compute/compactor/model/compaction_session_audit_info.py CHANGED
@@ -1,5 +1,6 @@
# Allow classes to use self-referencing Type hints in Python 3.7.
from __future__ import annotations
+ import pyarrow as pa
import logging
from deltacat import logs
from typing import List, Union
@@ -419,6 +420,13 @@ class CompactionSessionAuditInfo(dict):
"""
return self.get("usedCPUSeconds")
 
+ @property
+ def pyarrow_version(self) -> str:
+ """
+ The version of PyArrow used.
+ """
+ return self.get("pyarrowVersion")
+
# Setters follow
 
def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -735,6 +743,10 @@ class CompactionSessionAuditInfo(dict):
self["usedCPUSeconds"] = value
return self
 
+ def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
+ self["pyarrowVersion"] = value
+ return self
+
# High level methods to save stats
def save_step_stats(
self,
@@ -863,4 +875,5 @@ class CompactionSessionAuditInfo(dict):
)
)
 
+ self.set_pyarrow_version(pa.__version__)
self.set_telemetry_time_in_seconds(total_telemetry_time)
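For context, a minimal sketch of the new audit field, assuming CompactionSessionAuditInfo can be constructed directly as the bare dict subclass shown above (in practice the compaction session populates it):

    import pyarrow as pa
    from deltacat.compute.compactor.model.compaction_session_audit_info import (
        CompactionSessionAuditInfo,
    )

    audit = CompactionSessionAuditInfo()       # assumed bare construction
    audit.set_pyarrow_version(pa.__version__)  # stored under the "pyarrowVersion" key
    print(audit.pyarrow_version)               # e.g. "12.0.1"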
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -89,6 +89,11 @@ class DeltaAnnotated(Delta):
for delta_annotated in annotated_deltas:
split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
 
+ logger.info(
+ f"Split the {len(annotated_deltas)} annotated deltas "
+ f"into {len(split_annotated_deltas)} groups."
+ )
+
for src_da in split_annotated_deltas:
src_da_annotations = src_da.annotations
src_da_entries = src_da.manifest.entries
@@ -280,12 +285,11 @@
)
 
result.append(new_da)
+ else:
+ return [delta_annotated]
 
- if result:
- return result
- else:
- logger.info(
- f"Split was not performed on the delta with locator: {delta_annotated.locator}"
- )
+ logger.info(
+ f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+ )
 
return [delta_annotated]
deltacat/compute/compactor/repartition_session.py CHANGED
@@ -54,6 +54,7 @@ def repartition(
pg_config: Optional[PlacementGroupConfig] = None,
list_deltas_kwargs: Optional[Dict[str, Any]] = None,
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
s3_client_kwargs: Optional[Dict[str, Any]] = None,
deltacat_storage=unimplemented_deltacat_storage,
**kwargs,
@@ -131,6 +132,7 @@
enable_profiler=enable_profiler,
metrics_config=metrics_config,
read_kwargs_provider=read_kwargs_provider,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
repartitioned_file_content_type=repartitioned_file_content_type,
deltacat_storage=deltacat_storage,
)
deltacat/compute/compactor/steps/repartition.py CHANGED
@@ -56,6 +56,7 @@ def repartition_range(
destination_partition: Partition,
repartition_args: dict,
max_records_per_output_file: int,
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
deltacat_storage=unimplemented_deltacat_storage,
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
@@ -145,6 +146,7 @@
destination_partition,
max_records_per_entry=max_records_per_output_file,
content_type=repartitioned_file_content_type,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
**deltacat_storage_kwargs,
)
partition_deltas.append(partition_delta)
@@ -166,6 +168,7 @@ def _timed_repartition(
max_records_per_output_file: int,
enable_profiler: bool,
read_kwargs_provider: Optional[ReadKwargsProvider],
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
deltacat_storage=unimplemented_deltacat_storage,
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
@@ -189,6 +192,7 @@
destination_partition=destination_partition,
repartition_args=repartition_args,
max_records_per_output_file=max_records_per_output_file,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
repartitioned_file_content_type=repartitioned_file_content_type,
deltacat_storage=deltacat_storage,
deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -209,6 +213,7 @@ def repartition(
enable_profiler: bool,
metrics_config: Optional[MetricsConfig],
read_kwargs_provider: Optional[ReadKwargsProvider],
+ s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
repartitioned_file_content_type: ContentType = ContentType.PARQUET,
deltacat_storage=unimplemented_deltacat_storage,
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
@@ -226,6 +231,7 @@
max_records_per_output_file=max_records_per_output_file,
enable_profiler=enable_profiler,
read_kwargs_provider=read_kwargs_provider,
+ s3_table_writer_kwargs=s3_table_writer_kwargs,
repartitioned_file_content_type=repartitioned_file_content_type,
deltacat_storage=deltacat_storage,
deltacat_storage_kwargs=deltacat_storage_kwargs,
deltacat/compute/compactor_v2/compaction_session.py CHANGED
@@ -133,7 +133,7 @@ def _execute_compaction(
# read the results from any previously completed compaction round
round_completion_info = None
high_watermark = None
- previous_compacted_delta = None
+ previous_compacted_delta_manifest = None
 
if not params.rebase_source_partition_locator:
round_completion_info = rcf.read_round_completion_file(
@@ -147,13 +147,11 @@
)
else:
compacted_delta_locator = round_completion_info.compacted_delta_locator
- previous_compacted_delta = params.deltacat_storage.get_delta(
- namespace=compacted_delta_locator.namespace,
- table_name=compacted_delta_locator.table_name,
- table_version=compacted_delta_locator.table_version,
- stream_position=compacted_delta_locator.stream_position,
- include_manifest=True,
- **params.deltacat_storage_kwargs,
+
+ previous_compacted_delta_manifest = (
+ params.deltacat_storage.get_delta_manifest(
+ compacted_delta_locator, **params.deltacat_storage_kwargs
+ )
)
 
high_watermark = round_completion_info.high_watermark
@@ -182,7 +180,22 @@
params.list_deltas_kwargs,
)
 
+ uniform_deltas = io.create_uniform_input_deltas(
+ input_deltas=input_deltas,
+ hash_bucket_count=params.hash_bucket_count,
+ compaction_audit=compaction_audit,
+ deltacat_storage=params.deltacat_storage,
+ previous_inflation=params.previous_inflation,
+ min_delta_bytes=params.min_delta_bytes_in_batch,
+ min_file_counts=params.min_files_in_batch,
+ # disable input split during rebase as the rebase files are already uniform
+ enable_input_split=params.rebase_source_partition_locator is None,
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+ )
+
delta_discovery_end = time.monotonic()
+
+ compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
compaction_audit.set_delta_discovery_time_in_seconds(
delta_discovery_end - delta_discovery_start
)
@@ -197,19 +210,6 @@
logger.info("No input deltas found to compact.")
return None, None, None
 
- uniform_deltas = io.create_uniform_input_deltas(
- input_deltas=input_deltas,
- hash_bucket_count=params.hash_bucket_count,
- compaction_audit=compaction_audit,
- deltacat_storage=params.deltacat_storage,
- previous_inflation=params.previous_inflation,
- min_delta_bytes=params.min_delta_bytes_in_batch,
- min_file_counts=params.min_files_in_batch,
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
- )
-
- compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
-
hb_options_provider = functools.partial(
task_resource_options_provider,
pg_config=params.pg_config,
@@ -221,20 +221,21 @@
 
hb_start = time.monotonic()
 
- hash_bucket_input_provider = lambda index, item: {
- "input": HashBucketInput.of(
- item,
- primary_keys=params.primary_keys,
- num_hash_buckets=params.hash_bucket_count,
- num_hash_groups=params.hash_group_count,
- enable_profiler=params.enable_profiler,
- metrics_config=params.metrics_config,
- read_kwargs_provider=params.read_kwargs_provider,
- object_store=params.object_store,
- deltacat_storage=params.deltacat_storage,
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
- )
- }
+ def hash_bucket_input_provider(index, item):
+ return {
+ "input": HashBucketInput.of(
+ item,
+ primary_keys=params.primary_keys,
+ num_hash_buckets=params.hash_bucket_count,
+ num_hash_groups=params.hash_group_count,
+ enable_profiler=params.enable_profiler,
+ metrics_config=params.metrics_config,
+ read_kwargs_provider=params.read_kwargs_provider,
+ object_store=params.object_store,
+ deltacat_storage=params.deltacat_storage,
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+ )
+ }
 
hb_tasks_pending = invoke_parallel(
items=uniform_deltas,
@@ -332,33 +333,36 @@
hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
hash_group_num_rows=all_hash_group_idx_to_num_rows,
round_completion_info=round_completion_info,
- compacted_delta=previous_compacted_delta,
+ compacted_delta_manifest=previous_compacted_delta_manifest,
primary_keys=params.primary_keys,
deltacat_storage=params.deltacat_storage,
deltacat_storage_kwargs=params.deltacat_storage_kwargs,
)
 
- merge_input_provider = lambda index, item: {
- "input": MergeInput.of(
- dfe_groups_refs=item[1],
- write_to_partition=compacted_partition,
- compacted_file_content_type=params.compacted_file_content_type,
- primary_keys=params.primary_keys,
- sort_keys=params.sort_keys,
- merge_task_index=index,
- hash_group_index=item[0],
- num_hash_groups=params.hash_group_count,
- max_records_per_output_file=params.records_per_compacted_file,
- enable_profiler=params.enable_profiler,
- metrics_config=params.metrics_config,
- s3_table_writer_kwargs=params.s3_table_writer_kwargs,
- read_kwargs_provider=params.read_kwargs_provider,
- round_completion_info=round_completion_info,
- object_store=params.object_store,
- deltacat_storage=params.deltacat_storage,
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
- )
- }
+ def merge_input_provider(index, item):
+ return {
+ "input": MergeInput.of(
+ dfe_groups_refs=item[1],
+ write_to_partition=compacted_partition,
+ compacted_file_content_type=params.compacted_file_content_type,
+ primary_keys=params.primary_keys,
+ sort_keys=params.sort_keys,
+ merge_task_index=index,
+ hash_bucket_count=params.hash_bucket_count,
+ drop_duplicates=params.drop_duplicates,
+ hash_group_index=item[0],
+ num_hash_groups=params.hash_group_count,
+ max_records_per_output_file=params.records_per_compacted_file,
+ enable_profiler=params.enable_profiler,
+ metrics_config=params.metrics_config,
+ s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+ read_kwargs_provider=params.read_kwargs_provider,
+ round_completion_info=round_completion_info,
+ object_store=params.object_store,
+ deltacat_storage=params.deltacat_storage,
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+ )
+ }
 
merge_start = time.monotonic()
 
@@ -399,25 +403,25 @@
mat_results, key=lambda m: m.task_index
)
 
- deltas = [m.delta for m in mat_results]
-
hb_id_to_entry_indices_range = {}
file_index = 0
previous_task_index = -1
 
- for m in mat_results:
- assert m.pyarrow_write_result.files >= 1, "Atleast file must be materialized"
- assert m.task_index != previous_task_index, (
- "Multiple materialize results found for a " f"hash bucket: {m.task_index}"
- )
+ for mat_result in mat_results:
+ assert (
+ mat_result.pyarrow_write_result.files >= 1
+ ), "Atleast one file must be materialized"
+ assert (
+ mat_result.task_index != previous_task_index
+ ), f"Multiple materialize results found for a hash bucket: {mat_result.task_index}"
 
- hb_id_to_entry_indices_range[str(m.task_index)] = (
+ hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
file_index,
- file_index + m.pyarrow_write_result.files - 1,
+ file_index + mat_result.pyarrow_write_result.files,
)
 
- file_index += m.pyarrow_write_result.files
- previous_task_index = m.task_index
+ file_index += mat_result.pyarrow_write_result.files
+ previous_task_index = mat_result.task_index
 
s3_utils.upload(
compaction_audit.audit_url,
@@ -425,7 +429,6 @@
**params.s3_client_kwargs,
)
 
- mat_results = sorted(mat_results, key=lambda m: m.task_index)
deltas = [m.delta for m in mat_results]
 
# Note: An appropriate last stream position must be set
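One behavioral detail in the hunk above: the per-hash-bucket manifest entry ranges switch from an inclusive end (files - 1) to an exclusive end (files). A small, self-contained example mirroring the new loop:

    # Two materialize results that wrote 3 and 2 files, respectively.
    mat_files = [(0, 3), (1, 2)]  # (task_index, pyarrow_write_result.files)
    hb_id_to_entry_indices_range = {}
    file_index = 0
    for task_index, files in mat_files:
        # End index is now exclusive: (start, start + files).
        hb_id_to_entry_indices_range[str(task_index)] = (file_index, file_index + files)
        file_index += files
    print(hb_id_to_entry_indices_range)  # {'0': (0, 3), '1': (3, 5)}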
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -32,3 +32,6 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 20
# Since, sorting is nlogn, we ensure that is not performed
# on a very large dataset for best performance.
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
+
+ # Whether to drop duplicates during merge.
+ DROP_DUPLICATES = True
deltacat/compute/compactor_v2/model/merge_input.py CHANGED
@@ -10,6 +10,10 @@ from deltacat.storage import (
SortKey,
interface as unimplemented_deltacat_storage,
)
+ from deltacat.compute.compactor_v2.constants import (
+ DROP_DUPLICATES,
+ MAX_RECORDS_PER_COMPACTED_FILE,
+ )
from deltacat.types.media import ContentType
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
@@ -24,9 +28,11 @@ class MergeInput(Dict):
primary_keys: List[str],
hash_group_index: int,
num_hash_groups: int,
+ hash_bucket_count: int,
+ drop_duplicates: Optional[bool] = DROP_DUPLICATES,
sort_keys: Optional[List[SortKey]] = None,
merge_task_index: Optional[int] = 0,
- max_records_per_output_file: Optional[int] = 4_000_000,
+ max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
enable_profiler: Optional[bool] = False,
metrics_config: Optional[MetricsConfig] = None,
s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
@@ -44,6 +50,8 @@ class MergeInput(Dict):
result["primary_keys"] = primary_keys
result["hash_group_index"] = hash_group_index
result["num_hash_groups"] = num_hash_groups
+ result["hash_bucket_count"] = hash_bucket_count
+ result["drop_duplicates"] = drop_duplicates
result["sort_keys"] = sort_keys
result["merge_task_index"] = merge_task_index
result["max_records_per_output_file"] = max_records_per_output_file
@@ -82,6 +90,14 @@ class MergeInput(Dict):
def num_hash_groups(self) -> int:
return self["num_hash_groups"]
 
+ @property
+ def hash_bucket_count(self) -> int:
+ return self["hash_bucket_count"]
+
+ @property
+ def drop_duplicates(self) -> int:
+ return self["drop_duplicates"]
+
@property
def sort_keys(self) -> Optional[List[SortKey]]:
return self.get("sort_keys")