deltacat 0.2.10__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/s3u.py +250 -111
  3. deltacat/catalog/default_catalog_impl/__init__.py +369 -0
  4. deltacat/compute/compactor_v2/compaction_session.py +175 -152
  5. deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
  6. deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
  7. deltacat/compute/compactor_v2/model/merge_input.py +8 -24
  8. deltacat/compute/compactor_v2/model/merge_result.py +1 -0
  9. deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
  10. deltacat/compute/compactor_v2/steps/merge.py +106 -171
  11. deltacat/compute/compactor_v2/utils/delta.py +97 -0
  12. deltacat/compute/compactor_v2/utils/merge.py +126 -0
  13. deltacat/compute/compactor_v2/utils/task_options.py +16 -4
  14. deltacat/compute/merge_on_read/__init__.py +4 -0
  15. deltacat/compute/merge_on_read/daft.py +40 -0
  16. deltacat/compute/merge_on_read/model/__init__.py +0 -0
  17. deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
  18. deltacat/compute/merge_on_read/utils/__init__.py +0 -0
  19. deltacat/compute/merge_on_read/utils/delta.py +42 -0
  20. deltacat/storage/interface.py +10 -2
  21. deltacat/storage/model/types.py +3 -11
  22. deltacat/tests/catalog/__init__.py +0 -0
  23. deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
  24. deltacat/tests/compute/compact_partition_test_cases.py +126 -1
  25. deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
  26. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
  27. deltacat/tests/local_deltacat_storage/__init__.py +19 -2
  28. deltacat/tests/test_utils/pyarrow.py +33 -14
  29. deltacat/tests/utils/test_daft.py +42 -2
  30. deltacat/types/media.py +5 -0
  31. deltacat/types/tables.py +7 -1
  32. deltacat/utils/daft.py +78 -13
  33. {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
  34. {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/RECORD +37 -25
  35. {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
  36. {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
  37. {deltacat-0.2.10.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/model/merge_result.py

@@ -6,6 +6,7 @@ import numpy as np
 
 class MergeResult(NamedTuple):
     materialize_results: List[MaterializeResult]
+    input_record_count: np.int64
     deduped_record_count: np.int64
     peak_memory_usage_bytes: np.double
     telemetry_time_in_seconds: np.double
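
For orientation, the class after this one-line addition would read roughly as follows. This is a sketch reconstructed only from the lines visible in the hunk above (any trailing fields outside the hunk are omitted), and the MaterializeResult import path is taken from the merge.py hunks later in this diff:

import numpy as np
from typing import List, NamedTuple

from deltacat.compute.compactor.model.materialize_result import MaterializeResult


class MergeResult(NamedTuple):
    materialize_results: List[MaterializeResult]
    input_record_count: np.int64  # new in 1.0.0
    deduped_record_count: np.int64
    peak_memory_usage_bytes: np.double
    telemetry_time_in_seconds: np.double
    # Any remaining fields outside this hunk are unchanged and omitted here.
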
deltacat/compute/compactor_v2/steps/hash_bucket.py

@@ -5,7 +5,6 @@ from contextlib import nullcontext
 from typing import List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
 import numpy as np
-import pyarrow as pa
 import ray
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -14,12 +13,12 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor_v2.utils.delta import read_delta_file_envelopes
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     group_hash_bucket_indices,
     group_by_pk_hash_bucket,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
-from deltacat.types.media import StorageType
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
     get_current_ray_worker_id,
@@ -39,57 +38,6 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def _read_delta_file_envelopes(
-    annotated_delta: DeltaAnnotated,
-    read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[dict] = None,
-) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
-
-    tables = deltacat_storage.download_delta(
-        annotated_delta,
-        max_parallelism=1,
-        file_reader_kwargs_provider=read_kwargs_provider,
-        storage_type=StorageType.LOCAL,
-        **deltacat_storage_kwargs,
-    )
-    annotations = annotated_delta.annotations
-    assert (
-        len(tables) == len(annotations),
-        f"Unexpected Error: Length of downloaded delta manifest tables "
-        f"({len(tables)}) doesn't match the length of delta manifest "
-        f"annotations ({len(annotations)}).",
-    )
-    if not tables:
-        return None, 0, 0
-
-    delta_stream_position = annotations[0].annotation_stream_position
-    delta_type = annotations[0].annotation_delta_type
-
-    for annotation in annotations:
-        assert annotation.annotation_stream_position == delta_stream_position, (
-            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
-            f"!= {delta_stream_position}"
-        )
-        assert annotation.annotation_delta_type == delta_type, (
-            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
-            f"!= {delta_type}"
-        )
-
-    delta_file_envelopes = []
-    table = pa.concat_tables(tables)
-    total_record_count = len(table)
-    total_size_bytes = int(table.nbytes)
-
-    delta_file = DeltaFileEnvelope.of(
-        stream_position=delta_stream_position,
-        delta_type=delta_type,
-        table=table,
-    )
-    delta_file_envelopes.append(delta_file)
-    return delta_file_envelopes, total_record_count, total_size_bytes
-
-
 def _group_file_records_by_pk_hash_bucket(
     annotated_delta: DeltaAnnotated,
     num_hash_buckets: int,
@@ -103,7 +51,7 @@ def _group_file_records_by_pk_hash_bucket(
         delta_file_envelopes,
         total_record_count,
         total_size_bytes,
-    ) = _read_delta_file_envelopes(
+    ) = read_delta_file_envelopes(
         annotated_delta,
         read_kwargs_provider,
         deltacat_storage,
@@ -187,7 +135,7 @@ def _timed_hash_bucket(input: HashBucketInput):
 @ray.remote
 def hash_bucket(input: HashBucketInput) -> HashBucketResult:
     with ProcessUtilizationOverTimeRange() as process_util:
-        logger.info(f"Starting hash bucket task...")
+        logger.info(f"Starting hash bucket task {input.hb_task_index}...")
 
         # Log node peak memory utilization every 10 seconds
         def log_peak_memory():
@@ -212,7 +160,7 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
         )
         emit_metrics_time = latency
 
-        logger.info(f"Finished hash bucket task...")
+        logger.info(f"Finished hash bucket task {input.hb_task_index}...")
         return HashBucketResult(
             hash_bucket_result[0],
             hash_bucket_result[1],
deltacat/compute/compactor_v2/steps/merge.py

@@ -6,28 +6,21 @@ import pyarrow as pa
 import ray
 import time
 import pyarrow.compute as pc
+import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
-from collections import defaultdict
 from deltacat import logs
-from typing import List, Optional
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+from typing import List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
-from deltacat.compute.compactor import (
-    RoundCompletionInfo,
-    DeltaFileEnvelope,
-)
+from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope
 from deltacat.utils.common import ReadKwargsProvider
-
 from contextlib import nullcontext
-from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
     get_current_ray_worker_id,
 )
 from deltacat.compute.compactor.utils import system_columns as sc
-
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics
 from deltacat.utils.resources import (
@@ -36,7 +29,6 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
-    hash_group_index_to_hash_bucket_indices,
 )
 from deltacat.storage import (
     Delta,
@@ -77,14 +69,9 @@ def _drop_delta_type_rows(table: pa.Table, delta_type: DeltaType) -> pa.Table:
 
 
 def _build_incremental_table(
-    hash_bucket_index: int,
     df_envelopes_list: List[List[DeltaFileEnvelope]],
 ) -> pa.Table:
 
-    logger.info(
-        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
-        f"{len(df_envelopes_list)} delta file envelope lists..."
-    )
     hb_tables = []
     # sort by delta file stream position now instead of sorting every row later
     df_envelopes = [d for dfe_list in df_envelopes_list for d in dfe_list]
@@ -270,174 +257,120 @@ def _copy_all_manifest_files_from_old_hash_buckets(
     return materialize_result_list
 
 
-def _timed_merge(input: MergeInput) -> MergeResult:
-    def _materialize(
-        hash_bucket_index,
-        compacted_tables: List[pa.Table],
-    ) -> MaterializeResult:
-        compacted_table = pa.concat_tables(compacted_tables)
-        if input.compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
-            # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
-            # TODO (pdames): compare performance to pandas-native materialize path
-            df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
-            compacted_table = df
-        delta, stage_delta_time = timed_invocation(
-            input.deltacat_storage.stage_delta,
-            compacted_table,
-            input.write_to_partition,
-            max_records_per_entry=input.max_records_per_output_file,
-            content_type=input.compacted_file_content_type,
-            s3_table_writer_kwargs=input.s3_table_writer_kwargs,
-            **input.deltacat_storage_kwargs,
-        )
-        compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
-            compacted_table
-        )
-        logger.debug(
-            f"Time taken for materialize task"
-            f" to upload {len(compacted_table)} records"
-            f" of size {compacted_table_size} is: {stage_delta_time}s"
+def _compact_tables(
+    input: MergeInput, dfe_list: List[List[DeltaFileEnvelope]], hb_idx: int
+) -> Tuple[pa.Table, int, int]:
+    logger.info(
+        f"[Hash bucket index {hb_idx}] Reading dedupe input for "
+        f"{len(dfe_list)} delta file envelope lists..."
+    )
+    table = _build_incremental_table(dfe_list)
+
+    incremental_len = len(table)
+    logger.info(
+        f"[Hash bucket index {hb_idx}] Got the incremental table of length {incremental_len}"
+    )
+
+    if input.sort_keys:
+        # Incremental is sorted and merged, as sorting
+        # on non event based sort key does not produce consistent
+        # compaction results. E.g., compaction(delta1, delta2, delta3)
+        # will not be equal to compaction(compaction(delta1, delta2), delta3).
+        table = table.sort_by(input.sort_keys)
+
+    compacted_table = None
+
+    if (
+        input.round_completion_info
+        and input.round_completion_info.hb_index_to_entry_range
+        and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
+        is not None
+    ):
+        compacted_table = _download_compacted_table(
+            hb_index=hb_idx,
+            rcf=input.round_completion_info,
+            read_kwargs_provider=input.read_kwargs_provider,
+            deltacat_storage=input.deltacat_storage,
+            deltacat_storage_kwargs=input.deltacat_storage_kwargs,
         )
-        manifest = delta.manifest
-        manifest_records = manifest.meta.record_count
-        assert manifest_records == len(compacted_table), (
-            f"Unexpected Error: Materialized delta manifest record count "
-            f"({manifest_records}) does not equal compacted table record count "
-            f"({len(compacted_table)})"
+
+    hb_table_record_count = len(table) + (
+        len(compacted_table) if compacted_table else 0
+    )
+
+    table, merge_time = timed_invocation(
+        func=_merge_tables,
+        table=table,
+        primary_keys=input.primary_keys,
+        can_drop_duplicates=input.drop_duplicates,
+        compacted_table=compacted_table,
+    )
+    total_deduped_records = hb_table_record_count - len(table)
+
+    logger.info(
+        f"[Merge task index {input.merge_task_index}] Merged "
+        f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
+    )
+
+    return table, incremental_len, total_deduped_records
+
+
+def _copy_manifests_from_hash_bucketing(
+    input: MergeInput, hb_index_copy_by_reference_ids: List[int]
+) -> List[MaterializeResult]:
+    materialized_results: List[MaterializeResult] = []
+
+    if input.round_completion_info:
+        referenced_materialized_results = (
+            _copy_all_manifest_files_from_old_hash_buckets(
+                hb_index_copy_by_reference_ids,
+                input.round_completion_info,
+                input.write_to_partition,
+                input.deltacat_storage,
+                input.deltacat_storage_kwargs,
+            )
         )
-        materialize_result = MaterializeResult.of(
-            delta=delta,
-            task_index=hash_bucket_index,
-            # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
-            # and in-memory-table-bytes instead of tight coupling to paBytes
-            pyarrow_write_result=PyArrowWriteResult.of(
-                len(manifest.entries),
-                TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
-                manifest.meta.content_length,
-                len(compacted_table),
-            ),
+        logger.info(
+            f"Copying {len(referenced_materialized_results)} manifest files by reference..."
        )
-        logger.info(f"Materialize result: {materialize_result}")
-        return materialize_result
+        materialized_results.extend(referenced_materialized_results)
 
+    return materialized_results
+
+
+def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"merge_{worker_id}_{task_id}.bin"
     ) if input.enable_profiler else nullcontext():
-        # In V2, we need to mitigate risk of running out of memory here in cases of
-        # severe skew of primary key updates in deltas. By severe skew, we mean
-        # one hash bucket require more memory than a worker instance have.
-        logger.info(
-            f"[Merge task {input.merge_task_index}] Getting delta file envelope "
-            f"groups for {len(input.dfe_groups_refs)} object refs..."
-        )
-
-        delta_file_envelope_groups_list = input.object_store.get_many(
-            input.dfe_groups_refs
-        )
-        hb_index_to_delta_file_envelopes_list = defaultdict(list)
-        for delta_file_envelope_groups in delta_file_envelope_groups_list:
-            assert input.hash_bucket_count == len(delta_file_envelope_groups), (
-                f"The hash bucket count must match the dfe size as {input.hash_bucket_count}"
-                f" != {len(delta_file_envelope_groups)}"
-            )
-
-            for hb_idx, dfes in enumerate(delta_file_envelope_groups):
-                if dfes:
-                    hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
-
-        valid_hb_indices_iterable = hash_group_index_to_hash_bucket_indices(
-            input.hash_group_index, input.hash_bucket_count, input.num_hash_groups
-        )
+        total_input_records, total_deduped_records = 0, 0
+        materialized_results: List[MaterializeResult] = []
+        merge_file_groups = input.merge_file_groups_provider.create()
+        hb_index_copy_by_ref_ids = []
 
-        total_deduped_records = 0
-        total_dfes_found = 0
+        for merge_file_group in merge_file_groups:
+            if not merge_file_group.dfe_groups:
+                hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
+                continue
 
-        materialized_results: List[MaterializeResult] = []
-        hb_index_copy_by_reference = []
-        for hb_idx in valid_hb_indices_iterable:
-            dfe_list = hb_index_to_delta_file_envelopes_list.get(hb_idx)
-
-            if dfe_list:
-                total_dfes_found += 1
-                table = _build_incremental_table(hb_idx, dfe_list)
-
-                incremental_len = len(table)
-                logger.info(
-                    f"Got the incremental table of length {incremental_len} for hash bucket {hb_idx}"
-                )
-
-                if input.sort_keys:
-                    # Incremental is sorted and merged, as sorting
-                    # on non event based sort key does not produce consistent
-                    # compaction results. E.g., compaction(delta1, delta2, delta3)
-                    # will not be equal to compaction(compaction(delta1, delta2), delta3).
-                    table = table.sort_by(input.sort_keys)
-
-                compacted_table = None
-                if (
-                    input.round_completion_info
-                    and input.round_completion_info.hb_index_to_entry_range
-                    and input.round_completion_info.hb_index_to_entry_range.get(
-                        str(hb_idx)
-                    )
-                    is not None
-                ):
-
-                    compacted_table = _download_compacted_table(
-                        hb_index=hb_idx,
-                        rcf=input.round_completion_info,
-                        read_kwargs_provider=input.read_kwargs_provider,
-                        deltacat_storage=input.deltacat_storage,
-                        deltacat_storage_kwargs=input.deltacat_storage_kwargs,
-                    )
-
-                hb_table_record_count = len(table) + (
-                    len(compacted_table) if compacted_table else 0
-                )
-
-                table, merge_time = timed_invocation(
-                    func=_merge_tables,
-                    table=table,
-                    primary_keys=input.primary_keys,
-                    can_drop_duplicates=input.drop_duplicates,
-                    compacted_table=compacted_table,
-                )
-                total_deduped_records += hb_table_record_count - len(table)
-
-                logger.info(
-                    f"[Merge task index {input.merge_task_index}] Merged "
-                    f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
-                )
-
-                materialized_results.append(_materialize(hb_idx, [table]))
-            else:
-                hb_index_copy_by_reference.append(hb_idx)
-
-        if input.round_completion_info and hb_index_copy_by_reference:
-            referenced_materialized_results = (
-                _copy_all_manifest_files_from_old_hash_buckets(
-                    hb_index_copy_by_reference,
-                    input.round_completion_info,
-                    input.write_to_partition,
-                    input.deltacat_storage,
-                    input.deltacat_storage_kwargs,
-                )
+            table, input_records, deduped_records = _compact_tables(
+                input, merge_file_group.dfe_groups, merge_file_group.hb_index
             )
-            logger.info(
-                f"Copying {len(referenced_materialized_results)} manifest files by reference..."
+            total_input_records += input_records
+            total_deduped_records += deduped_records
+            materialized_results.append(
+                merge_utils.materialize(input, merge_file_group.hb_index, [table])
             )
-            materialized_results.extend(referenced_materialized_results)
 
-        logger.info(
-            "Total number of materialized results produced for "
-            f"hash group index: {input.hash_group_index} is {len(materialized_results)}"
-        )
+        if hb_index_copy_by_ref_ids:
+            materialized_results.extend(
+                _copy_manifests_from_hash_bucketing(input, hb_index_copy_by_ref_ids)
+            )
 
-        assert total_dfes_found == len(hb_index_to_delta_file_envelopes_list), (
-            "The total dfe list does not match the input dfes from hash bucket as "
-            f"{total_dfes_found} != {len(hb_index_to_delta_file_envelopes_list)}"
+        logger.info(
+            f"[Hash group index: {input.merge_file_groups_provider.hash_group_index}]"
+            f" Total number of materialized results produced: {len(materialized_results)} "
         )
 
         peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
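
The sort step in the new _compact_tables helper above delegates to pyarrow's Table.sort_by, which accepts a list of (column, order) pairs. A minimal, self-contained illustration of that call follows; the column names and rows are invented for the example, and deltacat's real sort keys come from MergeInput.sort_keys rather than being hard-coded:

import pyarrow as pa

# Toy incremental table; "pk", "event_ts", and "value" are hypothetical columns.
incremental = pa.table(
    {
        "pk": [2, 1, 2],
        "event_ts": [5, 3, 4],
        "value": ["b", "a", "c"],
    }
)

# Analogous in spirit to `table = table.sort_by(input.sort_keys)` in the hunk above.
sorted_incremental = incremental.sort_by([("event_ts", "ascending")])
print(sorted_incremental.to_pydict())

As the added comment notes, this sort happens before merging because sorting on a non event based key alone does not make compaction associative across deltas.
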
@@ -447,6 +380,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
 
         return MergeResult(
             materialized_results,
+            np.int64(total_input_records),
             np.int64(total_deduped_records),
             np.double(peak_memory_usage_bytes),
             np.double(0.0),
@@ -457,7 +391,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
 @ray.remote
 def merge(input: MergeInput) -> MergeResult:
     with ProcessUtilizationOverTimeRange() as process_util:
-        logger.info(f"Starting merge task...")
+        logger.info(f"Starting merge task {input.merge_task_index}...")
 
         # Log node peak memory utilization every 10 seconds
         def log_peak_memory():
@@ -480,11 +414,12 @@ def merge(input: MergeInput) -> MergeResult:
         )
         emit_metrics_time = latency
 
-        logger.info(f"Finished merge task...")
+        logger.info(f"Finished merge task {input.merge_task_index}...")
         return MergeResult(
             merge_result[0],
             merge_result[1],
             merge_result[2],
+            merge_result[3],
             np.double(emit_metrics_time),
             merge_result[4],
         )
deltacat/compute/compactor_v2/utils/delta.py (new file)

@@ -0,0 +1,97 @@
+import time
+from typing import List, Optional, Tuple
+
+from deltacat.compute.compactor import (
+    DeltaAnnotated,
+    DeltaFileEnvelope,
+)
+
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.types.media import StorageType
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat import logs
+
+import pyarrow as pa
+import logging
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def read_delta_file_envelopes(
+    annotated_delta: DeltaAnnotated,
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[dict] = None,
+) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
+    tables = deltacat_storage.download_delta(
+        annotated_delta,
+        max_parallelism=1,
+        file_reader_kwargs_provider=read_kwargs_provider,
+        storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
+    )
+    annotations = annotated_delta.annotations
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
+    if not tables:
+        return None, 0, 0
+
+    delta_stream_position = annotations[0].annotation_stream_position
+    delta_type = annotations[0].annotation_delta_type
+
+    for annotation in annotations:
+        assert annotation.annotation_stream_position == delta_stream_position, (
+            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
+            f"!= {delta_stream_position}"
+        )
+        assert annotation.annotation_delta_type == delta_type, (
+            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
+            f"!= {delta_type}"
+        )
+
+    delta_file_envelopes = []
+    table = pa.concat_tables(tables)
+    total_record_count = len(table)
+    total_size_bytes = int(table.nbytes)
+
+    delta_file = DeltaFileEnvelope.of(
+        stream_position=delta_stream_position,
+        delta_type=delta_type,
+        table=table,
+    )
+    delta_file_envelopes.append(delta_file)
+    return delta_file_envelopes, total_record_count, total_size_bytes
+
+
+def get_local_delta_file_envelopes(
+    uniform_deltas: List[DeltaAnnotated],
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[dict] = None,
+) -> Tuple[List[DeltaFileEnvelope], int]:
+    local_dfe_list = []
+    input_records_count = 0
+    logger.info(f"Getting {len(uniform_deltas)} DFE Tasks.")
+    dfe_start = time.monotonic()
+    for annotated_delta in uniform_deltas:
+        (
+            delta_file_envelopes,
+            total_record_count,
+            total_size_bytes,
+        ) = read_delta_file_envelopes(
+            annotated_delta,
+            read_kwargs_provider,
+            deltacat_storage,
+            deltacat_storage_kwargs,
+        )
+        if delta_file_envelopes:
+            local_dfe_list.extend(delta_file_envelopes)
+            input_records_count += total_record_count
+    dfe_end = time.monotonic()
+    logger.info(f"Retrieved {len(local_dfe_list)} DFE Tasks in {dfe_end - dfe_start}s.")
+    return local_dfe_list, input_records_count
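
A hedged usage sketch of get_local_delta_file_envelopes as added above; only the function name and signature come from this file, while uniform_deltas (a List[DeltaAnnotated] built upstream) and my_storage (a module implementing the deltacat storage interface) are placeholders for this example:

from deltacat.compute.compactor_v2.utils.delta import get_local_delta_file_envelopes

# `uniform_deltas` and `my_storage` are assumed to exist in the caller's scope.
dfe_list, input_record_count = get_local_delta_file_envelopes(
    uniform_deltas,
    read_kwargs_provider=None,
    deltacat_storage=my_storage,
    deltacat_storage_kwargs={},  # forwarded to download_delta via ** inside the helper
)
print(f"{input_record_count} input records across {len(dfe_list)} envelopes")

Passing an explicit dict (even an empty one) for deltacat_storage_kwargs is the safe choice here, since read_delta_file_envelopes unpacks it with ** when calling download_delta.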