deltacat 0.1.18b1__py3-none-any.whl → 0.1.18b3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (28)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +62 -25
  3. deltacat/compute/compactor/model/delta_annotated.py +1 -1
  4. deltacat/compute/compactor/model/materialize_result.py +16 -2
  5. deltacat/compute/compactor/model/repartition_result.py +6 -0
  6. deltacat/compute/compactor/model/round_completion_info.py +8 -0
  7. deltacat/compute/compactor/repartition_session.py +174 -0
  8. deltacat/compute/compactor/steps/materialize.py +116 -27
  9. deltacat/compute/compactor/steps/repartition.py +210 -0
  10. deltacat/compute/compactor/utils/io.py +131 -49
  11. deltacat/compute/compactor/utils/round_completion_file.py +14 -16
  12. deltacat/constants.py +2 -0
  13. deltacat/storage/interface.py +1 -1
  14. deltacat/storage/model/types.py +10 -2
  15. deltacat/tests/compactor/utils/__init__.py +0 -0
  16. deltacat/tests/compactor/utils/test_io.py +69 -0
  17. deltacat/tests/test_repartition.py +193 -0
  18. deltacat/tests/test_utils/__init__.py +0 -0
  19. deltacat/tests/test_utils/constants.py +7 -0
  20. deltacat/tests/utils/test_resources.py +36 -0
  21. deltacat/utils/ray_utils/concurrency.py +2 -0
  22. deltacat/utils/resources.py +72 -0
  23. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/METADATA +2 -5
  24. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/RECORD +28 -18
  25. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/WHEEL +1 -1
  26. /deltacat/{utils/profiling.py → tests/compactor/__init__.py} +0 -0
  27. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/LICENSE +0 -0
  28. {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
 
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "0.1.18.beta1"
+ __version__ = "0.1.18b3"
 
 
  __all__ = [
deltacat/compute/compactor/compaction_session.py CHANGED
@@ -37,6 +37,7 @@ from deltacat.utils.placement import PlacementGroupConfig
  from typing import List, Set, Optional, Tuple, Dict, Any
  from collections import defaultdict
  from deltacat.utils.metrics import MetricsConfig
+ from deltacat.utils.resources import log_current_cluster_utilization
 
  if importlib.util.find_spec("memray"):
      import memray
@@ -113,8 +114,11 @@ def compact_partition(
          f"compaction_partition.bin"
      ) if enable_profiler else nullcontext():
          partition = None
-         new_rcf_s3_url = None
-         (new_partition, new_rci, new_rcf_s3_url,) = _execute_compaction_round(
+         (
+             new_partition,
+             new_rci,
+             new_rcf_partition_locator,
+         ) = _execute_compaction_round(
              source_partition_locator,
              destination_partition_locator,
              primary_keys,
@@ -144,12 +148,19 @@ def compact_partition(
      logger.info(
          f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
      )
+     round_completion_file_s3_url = None
      if partition:
          logger.info(f"Committing compacted partition to: {partition.locator}")
          partition = deltacat_storage.commit_partition(partition)
          logger.info(f"Committed compacted partition: {partition}")
+
+         round_completion_file_s3_url = rcf.write_round_completion_file(
+             compaction_artifact_s3_bucket,
+             new_rcf_partition_locator,
+             new_rci,
+         )
      logger.info(f"Completed compaction session for: {source_partition_locator}")
-     return new_rcf_s3_url
+     return round_completion_file_s3_url
 
 
  def _execute_compaction_round(
@@ -283,13 +294,22 @@ def _execute_compaction_round(
          hash_bucket_count,
          last_stream_position_compacted,
          require_multiple_rounds,
-     ) = io.limit_input_deltas(
-         input_deltas,
-         cluster_resources,
-         hash_bucket_count,
-         min_hash_bucket_chunk_size,
-         input_deltas_stats=input_deltas_stats,
-         deltacat_storage=deltacat_storage,
+     ) = (
+         io.fit_input_deltas(
+             input_deltas,
+             cluster_resources,
+             hash_bucket_count,
+             deltacat_storage=deltacat_storage,
+         )
+         if input_deltas_stats is None
+         else io.limit_input_deltas(
+             input_deltas,
+             cluster_resources,
+             hash_bucket_count,
+             min_hash_bucket_chunk_size,
+             input_deltas_stats=input_deltas_stats,
+             deltacat_storage=deltacat_storage,
+         )
      )
 
      assert hash_bucket_count is not None and hash_bucket_count > 0, (
@@ -435,11 +455,39 @@ def _execute_compaction_round(
      )
      logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
      mat_results = ray.get(mat_tasks_pending)
+     total_count_of_src_dfl_not_touched = sum(
+         m.count_of_src_dfl_not_touched for m in mat_results
+     )
+     total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
+     logger.info(
+         f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
+     )
+     logger.info(
+         f"Got total of {total_length_src_dfl} manifest files during compaction."
+     )
+     manifest_entry_copied_by_reference_ratio = (
+         (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
+         if total_length_src_dfl != 0
+         else None
+     )
+     logger.info(
+         f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
+     )
+
      logger.info(f"Got {len(mat_results)} materialize result(s).")
 
+     log_current_cluster_utilization(log_identifier="post_materialize")
+
      mat_results = sorted(mat_results, key=lambda m: m.task_index)
      deltas = [m.delta for m in mat_results]
-     merged_delta = Delta.merge_deltas(deltas)
+
+     # Note: An appropriate last stream position must be set
+     # to avoid correctness issue.
+     merged_delta = Delta.merge_deltas(
+         deltas,
+         stream_position=last_stream_position_to_compact,
+     )
+
      record_info_msg = (
          f"Hash bucket records: {total_hb_record_count},"
          f" Deduped records: {total_dd_record_count}, "
@@ -463,35 +511,24 @@ def _execute_compaction_round(
          compacted_delta.stream_position,
      )
 
-     rci_high_watermark = (
-         rebase_source_partition_high_watermark
-         if rebase_source_partition_high_watermark
-         else last_stream_position_compacted
-     )
-
      last_rebase_source_partition_locator = rebase_source_partition_locator or (
          round_completion_info.rebase_source_partition_locator
          if round_completion_info
          else None
      )
      new_round_completion_info = RoundCompletionInfo.of(
-         rci_high_watermark,
+         last_stream_position_compacted,
          new_compacted_delta_locator,
          PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
          bit_width_of_sort_keys,
          last_rebase_source_partition_locator,
+         manifest_entry_copied_by_reference_ratio,
      )
      rcf_source_partition_locator = (
          rebase_source_partition_locator
          if rebase_source_partition_locator
          else source_partition_locator
      )
-     round_completion_file_s3_url = rcf.write_round_completion_file(
-         compaction_artifact_s3_bucket,
-         rcf_source_partition_locator,
-         new_round_completion_info,
-     )
-
      logger.info(
          f"partition-{source_partition_locator.partition_values},"
          f"compacted at: {last_stream_position_compacted},"
@@ -500,5 +537,5 @@ def _execute_compaction_round(
      return (
          partition,
          new_round_completion_info,
-         round_completion_file_s3_url,
+         rcf_source_partition_locator,
      )
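
The new manifest_entry_copied_by_reference_ratio logged above is simply the share of source manifest files that were carried over untouched, expressed as a percentage. A minimal standalone sketch of the same arithmetic, with made-up counts:

# Illustrative only: recompute the copy-by-reference ratio with invented counts.
total_count_of_src_dfl_not_touched = 75  # manifest files referenced as-is
total_length_src_dfl = 120               # all source manifest files seen by materialize

manifest_entry_copied_by_reference_ratio = (
    round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100
    if total_length_src_dfl != 0
    else None
)
print(manifest_entry_copied_by_reference_ratio)  # 62.5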
deltacat/compute/compactor/model/delta_annotated.py CHANGED
@@ -62,7 +62,7 @@ class DeltaAnnotated(Delta):
      @staticmethod
      def rebatch(
          annotated_deltas: List[DeltaAnnotated],
-         min_delta_bytes,
+         min_delta_bytes: float,
          min_file_counts: Optional[Union[int, float]] = float("inf"),
          estimation_function: Optional[Callable] = None,
      ) -> List[DeltaAnnotated]:
deltacat/compute/compactor/model/materialize_result.py CHANGED
@@ -1,7 +1,7 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations
 
- from typing import Any, Dict
+ from typing import Any, Dict, Optional
 
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
  from deltacat.storage import Delta
@@ -10,12 +10,18 @@ from deltacat.storage import Delta
  class MaterializeResult(dict):
      @staticmethod
      def of(
-         delta: Delta, task_index: int, pyarrow_write_result: PyArrowWriteResult
+         delta: Delta,
+         task_index: int,
+         pyarrow_write_result: PyArrowWriteResult,
+         count_of_src_dfl_not_touched: Optional[int] = 0,
+         count_of_src_dfl: Optional[int] = 0,
      ) -> MaterializeResult:
          materialize_result = MaterializeResult()
          materialize_result["delta"] = delta
          materialize_result["taskIndex"] = task_index
          materialize_result["paWriteResult"] = pyarrow_write_result
+         materialize_result["countOfSrcFileNotTouched"] = count_of_src_dfl_not_touched
+         materialize_result["countOfSrcFile"] = count_of_src_dfl
          return materialize_result
 
      @property
@@ -35,3 +41,11 @@ class MaterializeResult(dict):
          if val is not None and not isinstance(val, PyArrowWriteResult):
              self["paWriteResult"] = val = PyArrowWriteResult(val)
          return val
+
+     @property
+     def count_of_src_dfl_not_touched(self) -> int:
+         return self["countOfSrcFileNotTouched"]
+
+     @property
+     def count_of_src_dfl(self) -> int:
+         return self["countOfSrcFile"]
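
Because MaterializeResult is a plain dict subclass, the two new counters round-trip through the countOfSrcFileNotTouched and countOfSrcFile keys. A minimal usage sketch, assuming deltacat 0.1.18b3 is installed; the None placeholders stand in for a real Delta and PyArrowWriteResult and are never dereferenced here:

from deltacat.compute.compactor.model.materialize_result import MaterializeResult

# of() only stores the values it is given, so None placeholders are safe
# as long as the delta/paWriteResult accessors are not exercised.
result = MaterializeResult.of(
    delta=None,
    task_index=0,
    pyarrow_write_result=None,
    count_of_src_dfl_not_touched=3,
    count_of_src_dfl=10,
)
assert result.count_of_src_dfl_not_touched == 3
assert result["countOfSrcFile"] == 10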
deltacat/compute/compactor/model/repartition_result.py ADDED
@@ -0,0 +1,6 @@
+ from typing import NamedTuple, List
+ from deltacat.storage import Delta
+
+
+ class RepartitionResult(NamedTuple):
+     range_deltas: List[Delta]
deltacat/compute/compactor/model/round_completion_info.py CHANGED
@@ -38,6 +38,7 @@ class RoundCompletionInfo(dict):
          compacted_pyarrow_write_result: PyArrowWriteResult,
          sort_keys_bit_width: int,
          rebase_source_partition_locator: Optional[PartitionLocator],
+         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
      ) -> RoundCompletionInfo:
 
          rci = RoundCompletionInfo()
@@ -46,6 +47,9 @@ class RoundCompletionInfo(dict):
          rci["compactedPyarrowWriteResult"] = compacted_pyarrow_write_result
          rci["sortKeysBitWidth"] = sort_keys_bit_width
          rci["rebaseSourcePartitionLocator"] = rebase_source_partition_locator
+         rci[
+             "manifestEntryCopiedByReferenceRatio"
+         ] = manifest_entry_copied_by_reference_ratio
          return rci
 
      @property
@@ -80,3 +84,7 @@ class RoundCompletionInfo(dict):
      @property
      def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
          return self.get("rebaseSourcePartitionLocator")
+
+     @property
+     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
+         return self["manifestEntryCopiedByReferenceRatio"]
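
The ratio rides along as an optional trailing argument and is stored under manifestEntryCopiedByReferenceRatio. A minimal sketch under the same assumptions as above (None placeholders, values not dereferenced); note that the new property indexes the dict directly, so round completion files written before this key existed would raise KeyError rather than return None:

from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo

rci = RoundCompletionInfo.of(
    1000,  # high watermark (last compacted stream position)
    None,  # compacted delta locator placeholder
    None,  # compacted pyarrow write result placeholder
    8,     # sort keys bit width
    None,  # rebase source partition locator
    62.5,  # manifest_entry_copied_by_reference_ratio
)
assert rci.manifest_entry_copied_by_reference_ratio == 62.5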
deltacat/compute/compactor/repartition_session.py ADDED
@@ -0,0 +1,174 @@
+ import ray
+ import time
+ import logging
+ from deltacat import logs
+ from deltacat.utils.common import ReadKwargsProvider
+ import functools
+ import itertools
+ from deltacat.compute.compactor import (
+     RoundCompletionInfo,
+     SortKey,
+ )
+ from deltacat.types.media import ContentType
+ from deltacat.compute.compactor import DeltaAnnotated
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     round_robin_options_provider,
+ )
+
+ from deltacat.compute.compactor.model.repartition_result import RepartitionResult
+ from deltacat.utils.placement import PlacementGroupConfig
+ from typing import List, Optional, Dict, Any
+ from deltacat.utils.ray_utils.runtime import live_node_resource_keys
+ from deltacat.compute.compactor.utils import io
+ from deltacat.compute.compactor.utils import round_completion_file as rcf
+ from deltacat.compute.compactor.steps import repartition as repar
+ from deltacat.compute.compactor.steps.repartition import RepartitionType
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     PartitionLocator,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.utils.metrics import MetricsConfig
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ # TODO: move this repartition function to a separate module under compute
+ def repartition(
+     source_partition_locator: PartitionLocator,
+     destination_partition_locator: PartitionLocator,
+     repartition_args: Any,
+     repartition_completion_file_s3_url: str,
+     last_stream_position_to_compact: int,
+     repartition_type: RepartitionType = RepartitionType.RANGE,
+     sort_keys: List[SortKey] = None,
+     records_per_repartitioned_file: int = 4_000_000,
+     min_file_count: int = 1000,
+     min_delta_bytes: int = 200 * 2**20,
+     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+     enable_profiler: bool = False,
+     metrics_config: Optional[MetricsConfig] = None,
+     pg_config: Optional[PlacementGroupConfig] = None,
+     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
+     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     **kwargs,
+ ) -> Optional[str]:
+
+     node_resource_keys = None
+     if pg_config: # use resource in each placement group
+         cluster_resources = pg_config.resource
+         cluster_cpus = cluster_resources["CPU"]
+     else: # use all cluster resource
+         cluster_resources = ray.cluster_resources()
+         logger.info(f"Total cluster resources: {cluster_resources}")
+         logger.info(f"Available cluster resources: {ray.available_resources()}")
+         cluster_cpus = int(cluster_resources["CPU"])
+         logger.info(f"Total cluster CPUs: {cluster_cpus}")
+         node_resource_keys = live_node_resource_keys()
+         logger.info(
+             f"Found {len(node_resource_keys)} live cluster nodes: "
+             f"{node_resource_keys}"
+         )
+
+     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
+     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
+     round_robin_opt_provider = functools.partial(
+         round_robin_options_provider,
+         resource_keys=node_resource_keys,
+         pg_config=pg_config.opts if pg_config else None,
+     )
+
+     deltas = io._discover_deltas(
+         source_partition_locator,
+         None,
+         deltacat_storage.get_partition(
+             source_partition_locator.stream_locator,
+             source_partition_locator.partition_values,
+         ).stream_position,
+         deltacat_storage,
+         **list_deltas_kwargs,
+     )
+
+     uniform_deltas = []
+     for delta in deltas:
+         uniform_deltas_part = DeltaAnnotated.rebatch(
+             [DeltaAnnotated.of(delta)],
+             min_delta_bytes=min_delta_bytes,
+             min_file_counts=min_file_count,
+         )
+         uniform_deltas.extend(uniform_deltas_part)
+
+     logger.info(f"Retrieved a total of {len(uniform_deltas)} uniform deltas.")
+
+     max_parallelism = cluster_cpus
+     # create a new stream for this round
+     compacted_stream_locator = destination_partition_locator.stream_locator
+     stream = deltacat_storage.get_stream(
+         compacted_stream_locator.namespace,
+         compacted_stream_locator.table_name,
+         compacted_stream_locator.table_version,
+     )
+     partition = deltacat_storage.stage_partition(
+         stream,
+         destination_partition_locator.partition_values,
+     )
+     new_compacted_partition_locator = partition.locator
+     repar_start = time.time()
+     repar_tasks_pending = invoke_parallel(
+         items=uniform_deltas,
+         ray_task=repar.repartition,
+         max_parallelism=max_parallelism,
+         options_provider=round_robin_opt_provider,
+         repartition_type=repartition_type,
+         repartition_args=repartition_args,
+         max_records_per_output_file=records_per_repartitioned_file,
+         destination_partition=partition,
+         enable_profiler=enable_profiler,
+         metrics_config=metrics_config,
+         read_kwargs_provider=read_kwargs_provider,
+         repartitioned_file_content_type=repartitioned_file_content_type,
+         deltacat_storage=deltacat_storage,
+     )
+     logger.info(f"Getting {len(repar_tasks_pending)} task results...")
+     repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
+     repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
+     transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
+     ordered_deltas: List[Delta] = [
+         i for sublist in transposed for i in sublist if i is not None
+     ]
+     repar_end = time.time()
+     logger.info(f"repartition {repar_end - repar_start} seconds")
+     logger.info(f"Got {len(ordered_deltas)} task results.")
+     # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
+     merged_delta = Delta.merge_deltas(ordered_deltas)
+     compacted_delta = deltacat_storage.commit_delta(
+         merged_delta, properties=kwargs.get("properties", {})
+     )
+     deltacat_storage.commit_partition(partition)
+     logger.info(f"Committed final delta: {compacted_delta}")
+     logger.info(f"Job run completed successfully!")
+     new_compacted_delta_locator = DeltaLocator.of(
+         new_compacted_partition_locator,
+         compacted_delta.stream_position,
+     )
+     bit_width_of_sort_keys = SortKey.validate_sort_keys(
+         source_partition_locator,
+         sort_keys,
+         deltacat_storage,
+     )
+     repartition_completion_info = RoundCompletionInfo.of(
+         last_stream_position_to_compact,
+         new_compacted_delta_locator,
+         None,
+         bit_width_of_sort_keys,
+         None,
+     )
+     return rcf.write_round_completion_file(
+         None,
+         None,
+         repartition_completion_info,
+         repartition_completion_file_s3_url,
+     )
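
The non-obvious step above is the zip_longest transpose: each repartition task returns its deltas in range order (for example [cold_i, hot_i]), and transposing before flattening groups all cold deltas ahead of all hot deltas, as the inline comment notes. A minimal sketch with strings standing in for Delta objects:

import itertools

# Each task's range_deltas, in range order; a task may not produce every range.
repar_results = [
    ["cold1", "hot1"],
    ["cold2", "hot2"],
    ["cold3"],
]

transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
ordered_deltas = [i for sublist in transposed for i in sublist if i is not None]
print(ordered_deltas)  # ['cold1', 'cold2', 'cold3', 'hot1', 'hot2']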
deltacat/compute/compactor/steps/materialize.py CHANGED
@@ -1,10 +1,11 @@
  import importlib
  import logging
  import time
+ from uuid import uuid4
  from collections import defaultdict
  from contextlib import nullcontext
  from itertools import chain, repeat
- from typing import List, Optional, Tuple, Dict, Any
+ from typing import List, Optional, Tuple, Dict, Any, Union
  import pyarrow as pa
  import ray
  from ray import cloudpickle
@@ -18,7 +19,18 @@ from deltacat.compute.compactor.steps.dedupe import (
      DedupeTaskIndexWithObjectId,
      DeltaFileLocatorToRecords,
  )
- from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     DeltaType,
+     Partition,
+     PartitionLocator,
+     Manifest,
+     ManifestEntry,
+     LocalDataset,
+     LocalTable,
+     DistributedDataset,
+ )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -56,12 +68,44 @@ def materialize(
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
      s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
      deltacat_storage=unimplemented_deltacat_storage,
- ) -> MaterializeResult:
+ ):
+     def _stage_delta_implementation(
+         data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
+         partition: Partition,
+         stage_delta_from_existing_manifest: Optional[bool],
+     ) -> Delta:
+         if stage_delta_from_existing_manifest:
+             delta = Delta.of(
+                 locator=DeltaLocator.of(partition.locator),
+                 delta_type=DeltaType.UPSERT,
+                 meta=manifest.meta,
+                 manifest=data,
+                 previous_stream_position=partition.stream_position,
+                 properties={},
+             )
+         return delta
+
+     def _stage_delta_from_manifest_entry_reference_list(
+         manifest_entry_list_reference: List[ManifestEntry],
+         partition: Partition,
+         delta_type: DeltaType = DeltaType.UPSERT,
+     ) -> Delta:
+         assert (
+             delta_type == DeltaType.UPSERT
+         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
+         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
+         delta = _stage_delta_implementation(
+             data=manifest,
+             partition=partition,
+             delta_type=delta_type,
+             stage_delta_from_existing_manifest=True,
+         )
+         return delta
+
      # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
      # https://github.com/ray-project/deltacat/issues/79
      def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
          compacted_table = pa.concat_tables(compacted_tables)
-
          if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
              # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
              # TODO (pdames): compare performance to pandas-native materialize path
@@ -92,11 +136,11 @@ def materialize(
              f"({len(compacted_table)})",
          )
          materialize_result = MaterializeResult.of(
-             delta,
-             mat_bucket_index,
+             delta=delta,
+             task_index=mat_bucket_index,
              # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
              # and in-memory-table-bytes instead of tight coupling to paBytes
-             PyArrowWriteResult.of(
+             pyarrow_write_result=PyArrowWriteResult.of(
                  len(manifest.entries),
                  TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
                  manifest.meta.content_length,
@@ -138,6 +182,9 @@ def materialize(
      manifest_cache = {}
      materialized_results: List[MaterializeResult] = []
      record_batch_tables = RecordBatchTables(max_records_per_output_file)
+     count_of_src_dfl = 0
+     manifest_entry_list_reference = []
+     referenced_pyarrow_write_results = []
      for src_dfl in sorted(all_src_file_records.keys()):
          record_numbers_dd_task_idx_tpl_list: List[
              Tuple[DeltaFileLocatorToRecords, repeat]
@@ -148,11 +195,13 @@ def materialize(
          is_src_partition_file_np = src_dfl.is_source_delta
          src_stream_position_np = src_dfl.stream_position
          src_file_idx_np = src_dfl.file_index
+         count_of_src_dfl += 1
          src_file_partition_locator = (
              source_partition_locator
              if is_src_partition_file_np
              else round_completion_info.compacted_delta_locator.partition_locator
          )
+
          delta_locator = DeltaLocator.of(
              src_file_partition_locator,
              src_stream_position_np.item(),
@@ -185,39 +234,79 @@ def materialize(
              f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
              f" is: {download_delta_manifest_entry_time}s"
          )
-         mask_pylist = list(repeat(False, len(pa_table)))
          record_numbers = chain.from_iterable(record_numbers_tpl)
-         # TODO(raghumdani): reference the same file URIs while writing the files
-         # instead of copying the data over and creating new files.
+         record_numbers_length = 0
+         mask_pylist = list(repeat(False, len(pa_table)))
          for record_number in record_numbers:
+             record_numbers_length += 1
              mask_pylist[record_number] = True
-         mask = pa.array(mask_pylist)
-         pa_table = pa_table.filter(mask)
-         record_batch_tables.append(pa_table)
-         if record_batch_tables.has_batches():
-             batched_tables = record_batch_tables.evict()
-             materialized_results.append(_materialize(batched_tables))
+         if (
+             record_numbers_length == len(pa_table)
+             and src_file_partition_locator
+             == round_completion_info.compacted_delta_locator.partition_locator
+         ):
+             logger.debug(
+                 f"Untouched manifest file found, "
+                 f"record numbers length: {record_numbers_length} "
+                 f"same as downloaded table length: {len(pa_table)}"
+             )
+             untouched_src_manifest_entry = manifest.entries[src_file_idx_np.item()]
+             manifest_entry_list_reference.append(untouched_src_manifest_entry)
+             referenced_pyarrow_write_result = PyArrowWriteResult.of(
+                 len(untouched_src_manifest_entry.entries),
+                 TABLE_CLASS_TO_SIZE_FUNC[type(pa_table)](pa_table),
+                 manifest.meta.content_length,
+                 len(pa_table),
+             )
+             referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
+         else:
+             mask = pa.array(mask_pylist)
+             pa_table = pa_table.filter(mask)
+             record_batch_tables.append(pa_table)
+             if record_batch_tables.has_batches():
+                 batched_tables = record_batch_tables.evict()
+                 materialized_results.append(_materialize(batched_tables))
 
      if record_batch_tables.has_remaining():
          materialized_results.append(_materialize(record_batch_tables.remaining))
 
-     merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
-     assert (
-         materialized_results and len(materialized_results) > 0
-     ), f"Expected at least one materialized result in materialize step."
+     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
+
+     referenced_manifest_delta = (
+         _stage_delta_from_manifest_entry_reference_list(
+             manifest_entry_list_reference
+         )
+         if manifest_entry_list_reference
+         else None
+     )
+     if referenced_manifest_delta:
+         logger.info(
+             f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
+         )
+
+     merged_materialized_delta = [mr.delta for mr in materialized_results]
+     merged_materialized_delta.append(referenced_manifest_delta)
+     merged_delta = Delta.merge_deltas(
+         [d for d in merged_materialized_delta if d is not None]
+     )
+
+     write_results_union = referenced_pyarrow_write_results
+     if materialized_results:
+         for mr in materialized_results:
+             write_results_union.append(mr.pyarrow_write_result)
+     write_result = PyArrowWriteResult.union(write_results_union)
 
-     write_results = [mr.pyarrow_write_result for mr in materialized_results]
      logger.debug(
-         f"{len(write_results)} files written"
-         f" with records: {[wr.records for wr in write_results]}"
+         f"{len(write_results_union)} files written"
+         f" with records: {[wr.records for wr in write_results_union]}"
      )
      # Merge all new deltas into one for this materialize bucket index
      merged_materialize_result = MaterializeResult.of(
          merged_delta,
-         materialized_results[0].task_index,
-         PyArrowWriteResult.union(
-             [mr.pyarrow_write_result for mr in materialized_results]
-         ),
+         mat_bucket_index,
+         write_result,
+         len(manifest_entry_list_reference),
+         count_of_src_dfl,
      )
 
      logger.info(f"Finished materialize task...")