deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/redshift/__init__.py +4 -0
  3. deltacat/aws/redshift/model/manifest.py +93 -1
  4. deltacat/aws/s3u.py +250 -111
  5. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  6. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  7. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  8. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  9. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  10. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  11. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  12. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  13. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  14. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  15. deltacat/compute/compactor_v2/utils/task_options.py +47 -4
  16. deltacat/compute/merge_on_read/__init__.py +4 -0
  17. deltacat/compute/merge_on_read/daft.py +40 -0
  18. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  20. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  21. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  22. deltacat/storage/interface.py +10 -2
  23. deltacat/storage/model/types.py +3 -11
  24. deltacat/tests/catalog/__init__.py +0 -0
  25. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  26. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  27. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  28. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  29. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  30. deltacat/tests/test_utils/pyarrow.py +33 -14
  31. deltacat/tests/utils/test_daft.py +42 -2
  32. deltacat/types/media.py +5 -0
  33. deltacat/types/tables.py +7 -1
  34. deltacat/utils/daft.py +78 -13
  35. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
  36. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
  37. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
  38. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
  39. {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/compaction_session.py
@@ -6,18 +6,24 @@ import logging
 import ray
 import time
 import json
+
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    RemoteMergeFileGroupsProvider,
+)
+from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+
+from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+
 from deltacat.aws import s3u as s3_utils
 import deltacat
 from deltacat import logs
-from deltacat.compute.compactor import (
-    PyArrowWriteResult,
-    RoundCompletionInfo,
-)
-from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+from deltacat.compute.compactor import PyArrowWriteResult, RoundCompletionInfo
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
-from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor_v2.utils.merge import (
+    generate_local_merge_input,
+)
 from deltacat.storage import (
     Delta,
     DeltaLocator,
@@ -210,107 +216,6 @@ def _execute_compaction(
         logger.info("No input deltas found to compact.")
         return None, None, None
 
-    hb_options_provider = functools.partial(
-        task_resource_options_provider,
-        pg_config=params.pg_config,
-        resource_amount_provider=hash_bucket_resource_options_provider,
-        previous_inflation=params.previous_inflation,
-        average_record_size_bytes=params.average_record_size_bytes,
-        primary_keys=params.primary_keys,
-        ray_custom_resources=params.ray_custom_resources,
-    )
-
-    hb_start = time.monotonic()
-
-    def hash_bucket_input_provider(index, item):
-        return {
-            "input": HashBucketInput.of(
-                item,
-                primary_keys=params.primary_keys,
-                num_hash_buckets=params.hash_bucket_count,
-                num_hash_groups=params.hash_group_count,
-                enable_profiler=params.enable_profiler,
-                metrics_config=params.metrics_config,
-                read_kwargs_provider=params.read_kwargs_provider,
-                object_store=params.object_store,
-                deltacat_storage=params.deltacat_storage,
-                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-            )
-        }
-
-    hb_tasks_pending = invoke_parallel(
-        items=uniform_deltas,
-        ray_task=hb.hash_bucket,
-        max_parallelism=task_max_parallelism,
-        options_provider=hb_options_provider,
-        kwargs_provider=hash_bucket_input_provider,
-    )
-
-    hb_invoke_end = time.monotonic()
-
-    logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
-    hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
-    logger.info(f"Got {len(hb_results)} hash bucket results.")
-    hb_end = time.monotonic()
-
-    # we use time.time() here because time.monotonic() has no reference point
-    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
-    # to compare time.time()s captured in different nodes.
-    hb_results_retrieved_at = time.time()
-
-    telemetry_time_hb = compaction_audit.save_step_stats(
-        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
-        hb_results,
-        hb_results_retrieved_at,
-        hb_invoke_end - hb_start,
-        hb_end - hb_start,
-    )
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    all_hash_group_idx_to_obj_id = defaultdict(list)
-    all_hash_group_idx_to_size_bytes = defaultdict(int)
-    all_hash_group_idx_to_num_rows = defaultdict(int)
-    hb_data_processed_size_bytes = np.int64(0)
-    total_hb_record_count = np.int64(0)
-
-    # initialize all hash groups
-    for hb_group in range(params.hash_group_count):
-        all_hash_group_idx_to_num_rows[hb_group] = 0
-        all_hash_group_idx_to_obj_id[hb_group] = []
-        all_hash_group_idx_to_size_bytes[hb_group] = 0
-
-    for hb_result in hb_results:
-        hb_data_processed_size_bytes += hb_result.hb_size_bytes
-        total_hb_record_count += hb_result.hb_record_count
-
-        for hash_group_index, object_id_size_tuple in enumerate(
-            hb_result.hash_bucket_group_to_obj_id_tuple
-        ):
-            if object_id_size_tuple:
-                all_hash_group_idx_to_obj_id[hash_group_index].append(
-                    object_id_size_tuple[0]
-                )
-                all_hash_group_idx_to_size_bytes[
-                    hash_group_index
-                ] += object_id_size_tuple[1].item()
-                all_hash_group_idx_to_num_rows[
-                    hash_group_index
-                ] += object_id_size_tuple[2].item()
-
-    logger.info(
-        f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
-    )
-
-    compaction_audit.set_input_records(total_hb_record_count.item())
-    compaction_audit.set_hash_bucket_processed_size_bytes(
-        hb_data_processed_size_bytes.item()
-    )
-
     # create a new stream for this round
     compacted_stream_locator = params.destination_partition_locator.stream_locator
     compacted_stream = params.deltacat_storage.get_stream(
@@ -325,60 +230,176 @@ def _execute_compaction(
         **params.deltacat_storage_kwargs,
     )
 
-    # BSP Step 2: Merge
-    merge_options_provider = functools.partial(
+    hb_options_provider = functools.partial(
         task_resource_options_provider,
         pg_config=params.pg_config,
-        resource_amount_provider=merge_resource_options_provider,
-        num_hash_groups=params.hash_group_count,
-        hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
-        hash_group_num_rows=all_hash_group_idx_to_num_rows,
-        round_completion_info=round_completion_info,
-        compacted_delta_manifest=previous_compacted_delta_manifest,
+        resource_amount_provider=hash_bucket_resource_options_provider,
+        previous_inflation=params.previous_inflation,
+        average_record_size_bytes=params.average_record_size_bytes,
         primary_keys=params.primary_keys,
-        deltacat_storage=params.deltacat_storage,
-        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
     )
 
-    def merge_input_provider(index, item):
-        return {
-            "input": MergeInput.of(
-                dfe_groups_refs=item[1],
-                write_to_partition=compacted_partition,
-                compacted_file_content_type=params.compacted_file_content_type,
-                primary_keys=params.primary_keys,
-                sort_keys=params.sort_keys,
-                merge_task_index=index,
-                hash_bucket_count=params.hash_bucket_count,
-                drop_duplicates=params.drop_duplicates,
-                hash_group_index=item[0],
-                num_hash_groups=params.hash_group_count,
-                max_records_per_output_file=params.records_per_compacted_file,
-                enable_profiler=params.enable_profiler,
-                metrics_config=params.metrics_config,
-                s3_table_writer_kwargs=params.s3_table_writer_kwargs,
-                read_kwargs_provider=params.read_kwargs_provider,
-                round_completion_info=round_completion_info,
-                object_store=params.object_store,
-                deltacat_storage=params.deltacat_storage,
-                deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-            )
-        }
+    total_input_records_count = np.int64(0)
+    total_hb_record_count = np.int64(0)
+    telemetry_time_hb = 0
+    if params.hash_bucket_count == 1:
+        merge_start = time.monotonic()
+        local_merge_input = generate_local_merge_input(
+            params, uniform_deltas, compacted_partition, round_completion_info
+        )
+        local_merge_result = ray.get(mg.merge.remote(local_merge_input))
+        total_input_records_count += local_merge_result.input_record_count
+        merge_results = [local_merge_result]
+        merge_invoke_end = time.monotonic()
+    else:
+        hb_start = time.monotonic()
+
+        def hash_bucket_input_provider(index, item):
+            return {
+                "input": HashBucketInput.of(
+                    item,
+                    primary_keys=params.primary_keys,
+                    hb_task_index=index,
+                    num_hash_buckets=params.hash_bucket_count,
+                    num_hash_groups=params.hash_group_count,
+                    enable_profiler=params.enable_profiler,
+                    metrics_config=params.metrics_config,
+                    read_kwargs_provider=params.read_kwargs_provider,
+                    object_store=params.object_store,
+                    deltacat_storage=params.deltacat_storage,
+                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                )
+            }
+
+        all_hash_group_idx_to_obj_id = defaultdict(list)
+        all_hash_group_idx_to_size_bytes = defaultdict(int)
+        all_hash_group_idx_to_num_rows = defaultdict(int)
+        hb_tasks_pending = invoke_parallel(
+            items=uniform_deltas,
+            ray_task=hb.hash_bucket,
+            max_parallelism=task_max_parallelism,
+            options_provider=hb_options_provider,
+            kwargs_provider=hash_bucket_input_provider,
+        )
 
-    merge_start = time.monotonic()
+        hb_invoke_end = time.monotonic()
 
-    merge_tasks_pending = invoke_parallel(
-        items=all_hash_group_idx_to_obj_id.items(),
-        ray_task=mg.merge,
-        max_parallelism=task_max_parallelism,
-        options_provider=merge_options_provider,
-        kwargs_provider=merge_input_provider,
-    )
+        logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
+        hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
+        logger.info(f"Got {len(hb_results)} hash bucket results.")
+        hb_end = time.monotonic()
+
+        # we use time.time() here because time.monotonic() has no reference point
+        # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+        # to compare time.time()s captured in different nodes.
+        hb_results_retrieved_at = time.time()
+
+        telemetry_time_hb = compaction_audit.save_step_stats(
+            CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+            hb_results,
+            hb_results_retrieved_at,
+            hb_invoke_end - hb_start,
+            hb_end - hb_start,
+        )
+
+        s3_utils.upload(
+            compaction_audit.audit_url,
+            str(json.dumps(compaction_audit)),
+            **params.s3_client_kwargs,
+        )
+
+        hb_data_processed_size_bytes = np.int64(0)
+
+        # initialize all hash groups
+        for hb_group in range(params.hash_group_count):
+            all_hash_group_idx_to_num_rows[hb_group] = 0
+            all_hash_group_idx_to_obj_id[hb_group] = []
+            all_hash_group_idx_to_size_bytes[hb_group] = 0
+
+        for hb_result in hb_results:
+            hb_data_processed_size_bytes += hb_result.hb_size_bytes
+            total_input_records_count += hb_result.hb_record_count
+
+            for hash_group_index, object_id_size_tuple in enumerate(
+                hb_result.hash_bucket_group_to_obj_id_tuple
+            ):
+                if object_id_size_tuple:
+                    all_hash_group_idx_to_obj_id[hash_group_index].append(
+                        object_id_size_tuple[0],
+                    )
+                    all_hash_group_idx_to_size_bytes[
+                        hash_group_index
+                    ] += object_id_size_tuple[1].item()
+                    all_hash_group_idx_to_num_rows[
+                        hash_group_index
+                    ] += object_id_size_tuple[2].item()
+
+        logger.info(
+            f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
+        )
+
+        total_hb_record_count = total_input_records_count
+        compaction_audit.set_hash_bucket_processed_size_bytes(
+            hb_data_processed_size_bytes.item()
+        )
+
+        # BSP Step 2: Merge
+        merge_options_provider = functools.partial(
+            task_resource_options_provider,
+            pg_config=params.pg_config,
+            resource_amount_provider=merge_resource_options_provider,
+            num_hash_groups=params.hash_group_count,
+            hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
+            hash_group_num_rows=all_hash_group_idx_to_num_rows,
+            round_completion_info=round_completion_info,
+            compacted_delta_manifest=previous_compacted_delta_manifest,
+            primary_keys=params.primary_keys,
+            deltacat_storage=params.deltacat_storage,
+            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+            ray_custom_resources=params.ray_custom_resources,
+        )
+
+        def merge_input_provider(index, item):
+            return {
+                "input": MergeInput.of(
+                    merge_file_groups_provider=RemoteMergeFileGroupsProvider(
+                        hash_group_index=item[0],
+                        dfe_groups_refs=item[1],
+                        hash_bucket_count=params.hash_bucket_count,
+                        num_hash_groups=params.hash_group_count,
+                        object_store=params.object_store,
+                    ),
+                    write_to_partition=compacted_partition,
+                    compacted_file_content_type=params.compacted_file_content_type,
+                    primary_keys=params.primary_keys,
+                    sort_keys=params.sort_keys,
+                    merge_task_index=index,
+                    drop_duplicates=params.drop_duplicates,
+                    max_records_per_output_file=params.records_per_compacted_file,
+                    enable_profiler=params.enable_profiler,
+                    metrics_config=params.metrics_config,
+                    s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+                    read_kwargs_provider=params.read_kwargs_provider,
+                    round_completion_info=round_completion_info,
+                    object_store=params.object_store,
+                    deltacat_storage=params.deltacat_storage,
+                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                )
+            }
+
+        merge_start = time.monotonic()
+        merge_tasks_pending = invoke_parallel(
+            items=all_hash_group_idx_to_obj_id.items(),
+            ray_task=mg.merge,
+            max_parallelism=task_max_parallelism,
+            options_provider=merge_options_provider,
+            kwargs_provider=merge_input_provider,
+        )
+        merge_invoke_end = time.monotonic()
+        logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
+        merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
 
-    merge_invoke_end = time.monotonic()
-    logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
-    merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
     logger.info(f"Got {len(merge_results)} merge results.")
 
     merge_results_retrieved_at = time.time()
@@ -387,6 +408,8 @@ def _execute_compaction(
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
     logger.info(f"Deduped {total_dd_record_count} records...")
 
+    compaction_audit.set_input_records(total_input_records_count.item())
+
    telemetry_time_merge = compaction_audit.save_step_stats(
        CompactionSessionAuditInfo.MERGE_STEP_NAME,
        merge_results,
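
Note: the net effect of the compaction_session.py changes above is that a compaction with a single hash bucket now skips the hash-bucket step entirely and submits one local merge task, while the multi-bucket path keeps the hash-bucket-then-merge flow but passes its object refs through a RemoteMergeFileGroupsProvider. The following is a condensed sketch paraphrased from the diff, not a drop-in excerpt of the module; all names are the session-scoped values visible in the hunks above.

# Condensed from the diff above (illustrative sketch).
if params.hash_bucket_count == 1:
    # New in 1.0.0: bypass hash bucketing and run a single local merge task.
    local_merge_input = generate_local_merge_input(
        params, uniform_deltas, compacted_partition, round_completion_info
    )
    local_merge_result = ray.get(mg.merge.remote(local_merge_input))
    merge_results = [local_merge_result]
else:
    # Same shape as before: hash bucket in parallel, then merge each hash group,
    # with the dfe group refs now wrapped in a RemoteMergeFileGroupsProvider.
    ...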
deltacat/compute/compactor_v2/model/hash_bucket_input.py
@@ -15,6 +15,7 @@ class HashBucketInput(Dict):
         primary_keys: List[str],
         num_hash_buckets: int,
         num_hash_groups: int,
+        hb_task_index: Optional[int] = 0,
         enable_profiler: Optional[bool] = False,
         metrics_config: Optional[MetricsConfig] = None,
         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
@@ -26,6 +27,7 @@ class HashBucketInput(Dict):
         result = HashBucketInput()
         result["annotated_delta"] = annotated_delta
         result["primary_keys"] = primary_keys
+        result["hb_task_index"] = hb_task_index
         result["num_hash_buckets"] = num_hash_buckets
         result["num_hash_groups"] = num_hash_groups
         result["enable_profiler"] = enable_profiler
@@ -45,6 +47,10 @@ class HashBucketInput(Dict):
     def primary_keys(self) -> List[str]:
         return self["primary_keys"]
 
+    @property
+    def hb_task_index(self) -> List[str]:
+        return self["hb_task_index"]
+
     @property
     def num_hash_buckets(self) -> int:
         return self["num_hash_buckets"]
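
Note: hb_task_index is the only field added to HashBucketInput in this release, and HashBucketInput is a plain dict subclass. A minimal illustration of passing it follows; the annotated_delta variable is a stand-in for the DeltaAnnotated produced upstream in the session, not something defined in this diff.

from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput

hb_input = HashBucketInput.of(
    annotated_delta,      # stand-in for a DeltaAnnotated from the session
    primary_keys=["id"],
    num_hash_buckets=8,
    num_hash_groups=2,
    hb_task_index=0,      # new optional field; defaults to 0
)
assert hb_input.hb_task_index == 0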
deltacat/compute/compactor_v2/model/merge_file_group.py (new file)
@@ -0,0 +1,213 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+
+import logging
+import time
+from abc import ABC, abstractmethod
+from collections import defaultdict
+
+from deltacat.utils.common import ReadKwargsProvider
+from ray.types import ObjectRef
+
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+from deltacat.compute.compactor_v2.utils.delta import read_delta_file_envelopes
+
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    hash_group_index_to_hash_bucket_indices,
+)
+
+from deltacat.storage import interface as unimplemented_deltacat_storage
+
+from deltacat.io.object_store import IObjectStore
+
+from deltacat import logs
+
+from deltacat.compute.compactor import DeltaFileEnvelope, DeltaAnnotated
+
+from typing import List, Optional
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class MergeFileGroup(dict):
+    @staticmethod
+    def of(hb_index: int, dfe_groups: Optional[List[List[DeltaFileEnvelope]]] = None):
+        """
+        Creates a container with delta file envelope groupings and other
+        additional properties used primarily for the merging step.
+
+        Args:
+            hb_index: This signifies the hash bucket index corresponding to the envelope delta file groups.
+            dfe_groups: A list of delta file envelope groups.
+                If not present, the provided hash bucket index is a copy by reference candidate during the merge step.
+
+        Returns:
+            A dict
+
+        """
+        d = MergeFileGroup()
+        d["hb_index"] = hb_index
+        d["dfe_groups"] = dfe_groups
+        return d
+
+    @property
+    def dfe_groups(self) -> Optional[List[List[DeltaFileEnvelope]]]:
+        return self["dfe_groups"]
+
+    @property
+    def hb_index(self) -> int:
+        return self["hb_index"]
+
+
+class MergeFileGroupsProvider(ABC):
+    @abstractmethod
+    def create(self) -> List[MergeFileGroup]:
+        """
+        Creates a list of merge file groups.
+
+        Returns: a list of merge file groups.
+
+        """
+        raise NotImplementedError("Method not implemented")
+
+    @property
+    @abstractmethod
+    def hash_group_index(self):
+        raise NotImplementedError("Method not implemented")
+
+
+class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
+    """
+    A factory class for producing merge file groups given local delta file envelopes.
+    """
+
+    LOCAL_HASH_BUCKET_INDEX = 0
+    LOCAL_HASH_GROUP_INDEX = 0
+
+    def __init__(
+        self,
+        uniform_deltas: List[DeltaAnnotated],
+        read_kwargs_provider: Optional[ReadKwargsProvider],
+        deltacat_storage=unimplemented_deltacat_storage,
+        deltacat_storage_kwargs: Optional[dict] = None,
+    ):
+        self._deltas = uniform_deltas
+        self._read_kwargs_provider = read_kwargs_provider
+        self._deltacat_storage = deltacat_storage
+        self._deltacat_storage_kwargs = deltacat_storage_kwargs
+        self._loaded_deltas = False
+
+    def _read_deltas_locally(self):
+        local_dfe_list = []
+        input_records_count = 0
+        uniform_deltas = self._deltas
+        logger.info(f"Getting {len(uniform_deltas)} DFE Tasks.")
+        dfe_start = time.monotonic()
+        for annotated_delta in uniform_deltas:
+            (
+                delta_file_envelopes,
+                total_record_count,
+                total_size_bytes,
+            ) = read_delta_file_envelopes(
+                annotated_delta,
+                self._read_kwargs_provider,
+                self._deltacat_storage,
+                self._deltacat_storage_kwargs,
+            )
+            if delta_file_envelopes:
+                local_dfe_list.extend(delta_file_envelopes)
+                input_records_count += total_record_count
+        dfe_end = time.monotonic()
+        logger.info(
+            f"Retrieved {len(local_dfe_list)} DFE Tasks in {dfe_end - dfe_start}s."
+        )
+
+        self._dfe_groups = [local_dfe_list] if len(local_dfe_list) > 0 else None
+        self._loaded_deltas = True
+
+    def create(self) -> List[MergeFileGroup]:
+        if not self._loaded_deltas:
+            self._read_deltas_locally()
+
+        # Since hash bucketing is skipped for local merges, we use a fixed index here.
+        return [
+            MergeFileGroup.of(
+                hb_index=LocalMergeFileGroupsProvider.LOCAL_HASH_BUCKET_INDEX,
+                dfe_groups=self._dfe_groups,
+            )
+        ]
+
+    @property
+    def hash_group_index(self):
+        return LocalMergeFileGroupsProvider.LOCAL_HASH_GROUP_INDEX
+
+
+class RemoteMergeFileGroupsProvider(MergeFileGroupsProvider):
+    """
+    A factory class for producing merge file groups given delta file envelope object refs
+    and hash bucketing parameters. Delta file envelopes are pulled from the object store
+    remotely and loaded with in-memory pyarrow tables.
+    """
+
+    def __init__(
+        self,
+        hash_group_index: int,
+        dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
+        hash_bucket_count: int,
+        num_hash_groups: int,
+        object_store: IObjectStore,
+    ):
+        self.hash_bucket_count = hash_bucket_count
+        self.num_hash_groups = num_hash_groups
+        self.object_store = object_store
+        self._hash_group_index = hash_group_index
+        self._dfe_groups_refs = dfe_groups_refs
+        self._dfe_groups = []
+        self._loaded_from_object_store = False
+
+    def _load_deltas_from_object_store(self):
+        delta_file_envelope_groups_list = self.object_store.get_many(
+            self._dfe_groups_refs
+        )
+        hb_index_to_delta_file_envelopes_list = defaultdict(list)
+        for delta_file_envelope_groups in delta_file_envelope_groups_list:
+            assert self.hash_bucket_count == len(delta_file_envelope_groups), (
+                f"The hash bucket count must match the dfe size as {self.hash_bucket_count}"
+                f" != {len(delta_file_envelope_groups)}"
+            )
+
+            for hb_idx, dfes in enumerate(delta_file_envelope_groups):
+                if dfes:
+                    hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
+        valid_hb_indices_iterable = hash_group_index_to_hash_bucket_indices(
+            self.hash_group_index, self.hash_bucket_count, self.num_hash_groups
+        )
+
+        total_dfes_found = 0
+        dfe_list_groups = []
+        for hb_idx in valid_hb_indices_iterable:
+            dfe_list = hb_index_to_delta_file_envelopes_list.get(hb_idx)
+            if dfe_list:
+                total_dfes_found += 1
+                dfe_list_groups.append(
+                    MergeFileGroup.of(hb_index=hb_idx, dfe_groups=dfe_list)
+                )
+            else:
+                dfe_list_groups.append(MergeFileGroup.of(hb_index=hb_idx))
+
+        assert total_dfes_found == len(hb_index_to_delta_file_envelopes_list), (
+            "The total dfe list does not match the input dfes from hash bucket as "
+            f"{total_dfes_found} != {len(hb_index_to_delta_file_envelopes_list)}"
+        )
+        self._dfe_groups = dfe_list_groups
+        self._loaded_from_object_store = True
+
+    def create(self) -> List[MergeFileGroup]:
+        if not self._loaded_from_object_store:
+            self._load_deltas_from_object_store()
+
+        return self._dfe_groups
+
+    @property
+    def hash_group_index(self):
+        return self._hash_group_index
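
Note: MergeFileGroup is a plain dict subclass, so it can be exercised directly; the two providers above are the ways the compaction session builds these groups (LocalMergeFileGroupsProvider when hash bucketing is skipped, RemoteMergeFileGroupsProvider when the dfe groups live in the object store). A small, self-contained usage sketch of the container itself:

from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup

# A group carrying delta file envelopes for hash bucket 3 (payload elided here).
group_with_data = MergeFileGroup.of(hb_index=3, dfe_groups=[[]])

# Per the docstring above, omitting dfe_groups marks the hash bucket as a
# copy-by-reference candidate during the merge step.
copy_by_reference = MergeFileGroup.of(hb_index=7)

assert group_with_data.hb_index == 3
assert copy_by_reference.dfe_groups is None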
deltacat/compute/compactor_v2/model/merge_input.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
-from ray.types import ObjectRef
 from typing import Dict, List, Optional, Any
+
+from deltacat.compute.compactor_v2.model.merge_file_group import (
+    MergeFileGroupsProvider,
+)
 from deltacat.utils.metrics import MetricsConfig
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.io.object_store import IObjectStore
@@ -16,19 +19,15 @@ from deltacat.compute.compactor_v2.constants import (
 )
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
-from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 
 
 class MergeInput(Dict):
     @staticmethod
     def of(
-        dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
+        merge_file_groups_provider: MergeFileGroupsProvider,
         write_to_partition: Partition,
         compacted_file_content_type: ContentType,
         primary_keys: List[str],
-        hash_group_index: int,
-        num_hash_groups: int,
-        hash_bucket_count: int,
         drop_duplicates: Optional[bool] = DROP_DUPLICATES,
         sort_keys: Optional[List[SortKey]] = None,
         merge_task_index: Optional[int] = 0,
@@ -44,13 +43,10 @@ class MergeInput(Dict):
     ) -> MergeInput:
 
         result = MergeInput()
-        result["dfe_groups_refs"] = dfe_groups_refs
+        result["merge_file_groups_provider"] = merge_file_groups_provider
         result["write_to_partition"] = write_to_partition
         result["compacted_file_content_type"] = compacted_file_content_type
         result["primary_keys"] = primary_keys
-        result["hash_group_index"] = hash_group_index
-        result["num_hash_groups"] = num_hash_groups
-        result["hash_bucket_count"] = hash_bucket_count
         result["drop_duplicates"] = drop_duplicates
         result["sort_keys"] = sort_keys
         result["merge_task_index"] = merge_task_index
@@ -67,8 +63,8 @@ class MergeInput(Dict):
         return result
 
     @property
-    def dfe_groups_refs(self) -> List[ObjectRef[DeltaFileEnvelopeGroups]]:
-        return self["dfe_groups_refs"]
+    def merge_file_groups_provider(self) -> MergeFileGroupsProvider:
+        return self["merge_file_groups_provider"]
 
     @property
     def write_to_partition(self) -> Partition:
@@ -82,18 +78,6 @@ class MergeInput(Dict):
     def primary_keys(self) -> List[str]:
         return self["primary_keys"]
 
-    @property
-    def hash_group_index(self) -> int:
-        return self["hash_group_index"]
-
-    @property
-    def num_hash_groups(self) -> int:
-        return self["num_hash_groups"]
-
-    @property
-    def hash_bucket_count(self) -> int:
-        return self["hash_bucket_count"]
-
     @property
     def drop_duplicates(self) -> int:
         return self["drop_duplicates"]
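
Note: for callers constructing MergeInput directly, the migration implied by this diff is to wrap the removed dfe_groups_refs, hash_group_index, num_hash_groups, and hash_bucket_count arguments in a RemoteMergeFileGroupsProvider, mirroring merge_input_provider in compaction_session.py above. The sketch below is illustrative only; dfe_refs, hash_group_idx, partition, and the params fields are stand-ins for caller-side state, not values defined in this diff.

from deltacat.compute.compactor_v2.model.merge_file_group import (
    RemoteMergeFileGroupsProvider,
)
from deltacat.compute.compactor_v2.model.merge_input import MergeInput
from deltacat.types.media import ContentType

merge_input = MergeInput.of(
    merge_file_groups_provider=RemoteMergeFileGroupsProvider(
        hash_group_index=hash_group_idx,   # stand-in
        dfe_groups_refs=dfe_refs,          # stand-in: ObjectRefs from hash bucketing
        hash_bucket_count=params.hash_bucket_count,
        num_hash_groups=params.hash_group_count,
        object_store=params.object_store,
    ),
    write_to_partition=partition,          # stand-in: the compacted partition
    compacted_file_content_type=ContentType.PARQUET,
    primary_keys=["id"],
)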