deltacat 0.1.18b1__py3-none-any.whl → 0.1.18b3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +62 -25
- deltacat/compute/compactor/model/delta_annotated.py +1 -1
- deltacat/compute/compactor/model/materialize_result.py +16 -2
- deltacat/compute/compactor/model/repartition_result.py +6 -0
- deltacat/compute/compactor/model/round_completion_info.py +8 -0
- deltacat/compute/compactor/repartition_session.py +174 -0
- deltacat/compute/compactor/steps/materialize.py +116 -27
- deltacat/compute/compactor/steps/repartition.py +210 -0
- deltacat/compute/compactor/utils/io.py +131 -49
- deltacat/compute/compactor/utils/round_completion_file.py +14 -16
- deltacat/constants.py +2 -0
- deltacat/storage/interface.py +1 -1
- deltacat/storage/model/types.py +10 -2
- deltacat/tests/compactor/utils/__init__.py +0 -0
- deltacat/tests/compactor/utils/test_io.py +69 -0
- deltacat/tests/test_repartition.py +193 -0
- deltacat/tests/test_utils/__init__.py +0 -0
- deltacat/tests/test_utils/constants.py +7 -0
- deltacat/tests/utils/test_resources.py +36 -0
- deltacat/utils/ray_utils/concurrency.py +2 -0
- deltacat/utils/resources.py +72 -0
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/METADATA +2 -5
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/RECORD +28 -18
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/WHEEL +1 -1
- /deltacat/{utils/profiling.py → tests/compactor/__init__.py} +0 -0
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/compaction_session.py
CHANGED
```diff
@@ -37,6 +37,7 @@ from deltacat.utils.placement import PlacementGroupConfig
 from typing import List, Set, Optional, Tuple, Dict, Any
 from collections import defaultdict
 from deltacat.utils.metrics import MetricsConfig
+from deltacat.utils.resources import log_current_cluster_utilization
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -113,8 +114,11 @@ def compact_partition(
         f"compaction_partition.bin"
     ) if enable_profiler else nullcontext():
         partition = None
-
-
+        (
+            new_partition,
+            new_rci,
+            new_rcf_partition_locator,
+        ) = _execute_compaction_round(
             source_partition_locator,
             destination_partition_locator,
             primary_keys,
@@ -144,12 +148,19 @@ def compact_partition(
         logger.info(
             f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
         )
+        round_completion_file_s3_url = None
         if partition:
             logger.info(f"Committing compacted partition to: {partition.locator}")
             partition = deltacat_storage.commit_partition(partition)
             logger.info(f"Committed compacted partition: {partition}")
+
+            round_completion_file_s3_url = rcf.write_round_completion_file(
+                compaction_artifact_s3_bucket,
+                new_rcf_partition_locator,
+                new_rci,
+            )
         logger.info(f"Completed compaction session for: {source_partition_locator}")
-        return
+        return round_completion_file_s3_url
 
 
 def _execute_compaction_round(
@@ -283,13 +294,22 @@ def _execute_compaction_round(
         hash_bucket_count,
         last_stream_position_compacted,
         require_multiple_rounds,
-    ) =
-
-
-
-
-
-
+    ) = (
+        io.fit_input_deltas(
+            input_deltas,
+            cluster_resources,
+            hash_bucket_count,
+            deltacat_storage=deltacat_storage,
+        )
+        if input_deltas_stats is None
+        else io.limit_input_deltas(
+            input_deltas,
+            cluster_resources,
+            hash_bucket_count,
+            min_hash_bucket_chunk_size,
+            input_deltas_stats=input_deltas_stats,
+            deltacat_storage=deltacat_storage,
+        )
     )
 
     assert hash_bucket_count is not None and hash_bucket_count > 0, (
@@ -435,11 +455,39 @@ def _execute_compaction_round(
     )
     logger.info(f"Getting {len(mat_tasks_pending)} materialize result(s)...")
     mat_results = ray.get(mat_tasks_pending)
+    total_count_of_src_dfl_not_touched = sum(
+        m.count_of_src_dfl_not_touched for m in mat_results
+    )
+    total_length_src_dfl = sum(m.count_of_src_dfl for m in mat_results)
+    logger.info(
+        f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
+    )
+    logger.info(
+        f"Got total of {total_length_src_dfl} manifest files during compaction."
+    )
+    manifest_entry_copied_by_reference_ratio = (
+        (round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100)
+        if total_length_src_dfl != 0
+        else None
+    )
+    logger.info(
+        f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
+    )
+
     logger.info(f"Got {len(mat_results)} materialize result(s).")
 
+    log_current_cluster_utilization(log_identifier="post_materialize")
+
     mat_results = sorted(mat_results, key=lambda m: m.task_index)
     deltas = [m.delta for m in mat_results]
-
+
+    # Note: An appropriate last stream position must be set
+    # to avoid correctness issue.
+    merged_delta = Delta.merge_deltas(
+        deltas,
+        stream_position=last_stream_position_to_compact,
+    )
+
     record_info_msg = (
         f"Hash bucket records: {total_hb_record_count},"
         f" Deduped records: {total_dd_record_count}, "
@@ -463,35 +511,24 @@ def _execute_compaction_round(
         compacted_delta.stream_position,
     )
 
-    rci_high_watermark = (
-        rebase_source_partition_high_watermark
-        if rebase_source_partition_high_watermark
-        else last_stream_position_compacted
-    )
-
     last_rebase_source_partition_locator = rebase_source_partition_locator or (
         round_completion_info.rebase_source_partition_locator
         if round_completion_info
         else None
     )
     new_round_completion_info = RoundCompletionInfo.of(
-        rci_high_watermark,
+        last_stream_position_compacted,
         new_compacted_delta_locator,
         PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
         bit_width_of_sort_keys,
         last_rebase_source_partition_locator,
+        manifest_entry_copied_by_reference_ratio,
    )
     rcf_source_partition_locator = (
         rebase_source_partition_locator
         if rebase_source_partition_locator
         else source_partition_locator
     )
-    round_completion_file_s3_url = rcf.write_round_completion_file(
-        compaction_artifact_s3_bucket,
-        rcf_source_partition_locator,
-        new_round_completion_info,
-    )
-
     logger.info(
         f"partition-{source_partition_locator.partition_values},"
         f"compacted at: {last_stream_position_compacted},"
@@ -500,5 +537,5 @@ def _execute_compaction_round(
     return (
         partition,
         new_round_completion_info,
-
+        rcf_source_partition_locator,
     )
```
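Net effect of these changes: `compact_partition` now returns the round completion file's S3 URL (written only after the compacted partition commits) instead of `None`, and the round logs what fraction of source manifest files were carried over untouched. A minimal sketch of the new ratio computation, with made-up counts:

```python
# Mirrors the ratio logic added above; the counts here are illustrative only.
total_count_of_src_dfl_not_touched = 45  # manifest files copied by reference
total_length_src_dfl = 60                # all source manifest files seen

manifest_entry_copied_by_reference_ratio = (
    round(total_count_of_src_dfl_not_touched / total_length_src_dfl, 4) * 100
    if total_length_src_dfl != 0
    else None
)
print(manifest_entry_copied_by_reference_ratio)  # 75.0
```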
deltacat/compute/compactor/model/delta_annotated.py
CHANGED
```diff
@@ -62,7 +62,7 @@ class DeltaAnnotated(Delta):
     @staticmethod
     def rebatch(
         annotated_deltas: List[DeltaAnnotated],
-        min_delta_bytes,
+        min_delta_bytes: float,
         min_file_counts: Optional[Union[int, float]] = float("inf"),
         estimation_function: Optional[Callable] = None,
     ) -> List[DeltaAnnotated]:
```
deltacat/compute/compactor/model/materialize_result.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.storage import Delta
@@ -10,12 +10,18 @@ from deltacat.storage import Delta
 class MaterializeResult(dict):
     @staticmethod
     def of(
-        delta: Delta,
+        delta: Delta,
+        task_index: int,
+        pyarrow_write_result: PyArrowWriteResult,
+        count_of_src_dfl_not_touched: Optional[int] = 0,
+        count_of_src_dfl: Optional[int] = 0,
     ) -> MaterializeResult:
         materialize_result = MaterializeResult()
         materialize_result["delta"] = delta
         materialize_result["taskIndex"] = task_index
         materialize_result["paWriteResult"] = pyarrow_write_result
+        materialize_result["countOfSrcFileNotTouched"] = count_of_src_dfl_not_touched
+        materialize_result["countOfSrcFile"] = count_of_src_dfl
         return materialize_result
 
     @property
@@ -35,3 +41,11 @@ class MaterializeResult(dict):
         if val is not None and not isinstance(val, PyArrowWriteResult):
             self["paWriteResult"] = val = PyArrowWriteResult(val)
         return val
+
+    @property
+    def count_of_src_dfl_not_touched(self) -> int:
+        return self["countOfSrcFileNotTouched"]
+
+    @property
+    def count_of_src_dfl(self) -> int:
+        return self["countOfSrcFile"]
```
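A hypothetical construction of the extended result, assuming `delta` and `write_result` objects already exist; the two new counters feed the copy-by-reference ratio shown earlier:

```python
result = MaterializeResult.of(
    delta=delta,                        # assumed to exist
    task_index=0,
    pyarrow_write_result=write_result,  # assumed to exist
    count_of_src_dfl_not_touched=3,     # source files copied by reference
    count_of_src_dfl=10,                # source files seen in total
)
assert result.count_of_src_dfl_not_touched == 3
assert result.count_of_src_dfl == 10
```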
deltacat/compute/compactor/model/round_completion_info.py
CHANGED
```diff
@@ -38,6 +38,7 @@ class RoundCompletionInfo(dict):
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
         rebase_source_partition_locator: Optional[PartitionLocator],
+        manifest_entry_copied_by_reference_ratio: Optional[float] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -46,6 +47,9 @@ class RoundCompletionInfo(dict):
         rci["compactedPyarrowWriteResult"] = compacted_pyarrow_write_result
         rci["sortKeysBitWidth"] = sort_keys_bit_width
         rci["rebaseSourcePartitionLocator"] = rebase_source_partition_locator
+        rci[
+            "manifestEntryCopiedByReferenceRatio"
+        ] = manifest_entry_copied_by_reference_ratio
         return rci
 
     @property
@@ -80,3 +84,7 @@ class RoundCompletionInfo(dict):
     @property
     def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
         return self.get("rebaseSourcePartitionLocator")
+
+    @property
+    def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
+        return self["manifestEntryCopiedByReferenceRatio"]
```
deltacat/compute/compactor/repartition_session.py
ADDED
```diff
@@ -0,0 +1,174 @@
+import ray
+import time
+import logging
+from deltacat import logs
+from deltacat.utils.common import ReadKwargsProvider
+import functools
+import itertools
+from deltacat.compute.compactor import (
+    RoundCompletionInfo,
+    SortKey,
+)
+from deltacat.types.media import ContentType
+from deltacat.compute.compactor import DeltaAnnotated
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    round_robin_options_provider,
+)
+
+from deltacat.compute.compactor.model.repartition_result import RepartitionResult
+from deltacat.utils.placement import PlacementGroupConfig
+from typing import List, Optional, Dict, Any
+from deltacat.utils.ray_utils.runtime import live_node_resource_keys
+from deltacat.compute.compactor.utils import io
+from deltacat.compute.compactor.utils import round_completion_file as rcf
+from deltacat.compute.compactor.steps import repartition as repar
+from deltacat.compute.compactor.steps.repartition import RepartitionType
+from deltacat.storage import (
+    Delta,
+    DeltaLocator,
+    PartitionLocator,
+    interface as unimplemented_deltacat_storage,
+)
+from deltacat.utils.metrics import MetricsConfig
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+# TODO: move this repartition function to a separate module under compute
+def repartition(
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: PartitionLocator,
+    repartition_args: Any,
+    repartition_completion_file_s3_url: str,
+    last_stream_position_to_compact: int,
+    repartition_type: RepartitionType = RepartitionType.RANGE,
+    sort_keys: List[SortKey] = None,
+    records_per_repartitioned_file: int = 4_000_000,
+    min_file_count: int = 1000,
+    min_delta_bytes: int = 200 * 2**20,
+    repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+    enable_profiler: bool = False,
+    metrics_config: Optional[MetricsConfig] = None,
+    pg_config: Optional[PlacementGroupConfig] = None,
+    list_deltas_kwargs: Optional[Dict[str, Any]] = None,
+    read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
+) -> Optional[str]:
+
+    node_resource_keys = None
+    if pg_config:  # use resource in each placement group
+        cluster_resources = pg_config.resource
+        cluster_cpus = cluster_resources["CPU"]
+    else:  # use all cluster resource
+        cluster_resources = ray.cluster_resources()
+        logger.info(f"Total cluster resources: {cluster_resources}")
+        logger.info(f"Available cluster resources: {ray.available_resources()}")
+        cluster_cpus = int(cluster_resources["CPU"])
+        logger.info(f"Total cluster CPUs: {cluster_cpus}")
+        node_resource_keys = live_node_resource_keys()
+        logger.info(
+            f"Found {len(node_resource_keys)} live cluster nodes: "
+            f"{node_resource_keys}"
+        )
+
+    # create a remote options provider to round-robin tasks across all nodes or allocated bundles
+    logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
+    round_robin_opt_provider = functools.partial(
+        round_robin_options_provider,
+        resource_keys=node_resource_keys,
+        pg_config=pg_config.opts if pg_config else None,
+    )
+
+    deltas = io._discover_deltas(
+        source_partition_locator,
+        None,
+        deltacat_storage.get_partition(
+            source_partition_locator.stream_locator,
+            source_partition_locator.partition_values,
+        ).stream_position,
+        deltacat_storage,
+        **list_deltas_kwargs,
+    )
+
+    uniform_deltas = []
+    for delta in deltas:
+        uniform_deltas_part = DeltaAnnotated.rebatch(
+            [DeltaAnnotated.of(delta)],
+            min_delta_bytes=min_delta_bytes,
+            min_file_counts=min_file_count,
+        )
+        uniform_deltas.extend(uniform_deltas_part)
+
+    logger.info(f"Retrieved a total of {len(uniform_deltas)} uniform deltas.")
+
+    max_parallelism = cluster_cpus
+    # create a new stream for this round
+    compacted_stream_locator = destination_partition_locator.stream_locator
+    stream = deltacat_storage.get_stream(
+        compacted_stream_locator.namespace,
+        compacted_stream_locator.table_name,
+        compacted_stream_locator.table_version,
+    )
+    partition = deltacat_storage.stage_partition(
+        stream,
+        destination_partition_locator.partition_values,
+    )
+    new_compacted_partition_locator = partition.locator
+    repar_start = time.time()
+    repar_tasks_pending = invoke_parallel(
+        items=uniform_deltas,
+        ray_task=repar.repartition,
+        max_parallelism=max_parallelism,
+        options_provider=round_robin_opt_provider,
+        repartition_type=repartition_type,
+        repartition_args=repartition_args,
+        max_records_per_output_file=records_per_repartitioned_file,
+        destination_partition=partition,
+        enable_profiler=enable_profiler,
+        metrics_config=metrics_config,
+        read_kwargs_provider=read_kwargs_provider,
+        repartitioned_file_content_type=repartitioned_file_content_type,
+        deltacat_storage=deltacat_storage,
+    )
+    logger.info(f"Getting {len(repar_tasks_pending)} task results...")
+    repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
+    repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
+    transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
+    ordered_deltas: List[Delta] = [
+        i for sublist in transposed for i in sublist if i is not None
+    ]
+    repar_end = time.time()
+    logger.info(f"repartition {repar_end - repar_start} seconds")
+    logger.info(f"Got {len(ordered_deltas)} task results.")
+    # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
+    merged_delta = Delta.merge_deltas(ordered_deltas)
+    compacted_delta = deltacat_storage.commit_delta(
+        merged_delta, properties=kwargs.get("properties", {})
+    )
+    deltacat_storage.commit_partition(partition)
+    logger.info(f"Committed final delta: {compacted_delta}")
+    logger.info(f"Job run completed successfully!")
+    new_compacted_delta_locator = DeltaLocator.of(
+        new_compacted_partition_locator,
+        compacted_delta.stream_position,
+    )
+    bit_width_of_sort_keys = SortKey.validate_sort_keys(
+        source_partition_locator,
+        sort_keys,
+        deltacat_storage,
+    )
+    repartition_completion_info = RoundCompletionInfo.of(
+        last_stream_position_to_compact,
+        new_compacted_delta_locator,
+        None,
+        bit_width_of_sort_keys,
+        None,
+    )
+    return rcf.write_round_completion_file(
+        None,
+        None,
+        repartition_completion_info,
+        repartition_completion_file_s3_url,
+    )
```
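The new repartition session fans each uniform delta out to a `repar.repartition` Ray task, then interleaves the per-task range deltas so that all first-range ("cold") deltas precede later ("hot") ones. A small sketch of that `zip_longest` transpose, with stand-in strings in place of real `Delta` objects:

```python
import itertools

# Each inner list stands in for one task's range_deltas: [cold, hot].
repar_results = [["cold1", "hot1"], ["cold2", "hot2"], ["cold3"]]

# Transpose across tasks, padding short lists, then flatten and drop padding.
transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
ordered = [i for sublist in transposed for i in sublist if i is not None]
print(ordered)  # ['cold1', 'cold2', 'cold3', 'hot1', 'hot2']
```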
deltacat/compute/compactor/steps/materialize.py
CHANGED
```diff
@@ -1,10 +1,11 @@
 import importlib
 import logging
 import time
+from uuid import uuid4
 from collections import defaultdict
 from contextlib import nullcontext
 from itertools import chain, repeat
-from typing import List, Optional, Tuple, Dict, Any
+from typing import List, Optional, Tuple, Dict, Any, Union
 import pyarrow as pa
 import ray
 from ray import cloudpickle
@@ -18,7 +19,18 @@ from deltacat.compute.compactor.steps.dedupe import (
     DedupeTaskIndexWithObjectId,
     DeltaFileLocatorToRecords,
 )
-from deltacat.storage import
+from deltacat.storage import (
+    Delta,
+    DeltaLocator,
+    DeltaType,
+    Partition,
+    PartitionLocator,
+    Manifest,
+    ManifestEntry,
+    LocalDataset,
+    LocalTable,
+    DistributedDataset,
+)
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
@@ -56,12 +68,44 @@ def materialize(
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
-)
+):
+    def _stage_delta_implementation(
+        data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
+        partition: Partition,
+        stage_delta_from_existing_manifest: Optional[bool],
+    ) -> Delta:
+        if stage_delta_from_existing_manifest:
+            delta = Delta.of(
+                locator=DeltaLocator.of(partition.locator),
+                delta_type=DeltaType.UPSERT,
+                meta=manifest.meta,
+                manifest=data,
+                previous_stream_position=partition.stream_position,
+                properties={},
+            )
+            return delta
+
+    def _stage_delta_from_manifest_entry_reference_list(
+        manifest_entry_list_reference: List[ManifestEntry],
+        partition: Partition,
+        delta_type: DeltaType = DeltaType.UPSERT,
+    ) -> Delta:
+        assert (
+            delta_type == DeltaType.UPSERT
+        ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
+        manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
+        delta = _stage_delta_implementation(
+            data=manifest,
+            partition=partition,
+            delta_type=delta_type,
+            stage_delta_from_existing_manifest=True,
+        )
+        return delta
+
     # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
     # https://github.com/ray-project/deltacat/issues/79
     def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
         compacted_table = pa.concat_tables(compacted_tables)
-
         if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
             # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
             # TODO (pdames): compare performance to pandas-native materialize path
@@ -92,11 +136,11 @@ def materialize(
             f"({len(compacted_table)})",
         )
         materialize_result = MaterializeResult.of(
-            delta,
-            mat_bucket_index,
+            delta=delta,
+            task_index=mat_bucket_index,
             # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
             # and in-memory-table-bytes instead of tight coupling to paBytes
-            PyArrowWriteResult.of(
+            pyarrow_write_result=PyArrowWriteResult.of(
                 len(manifest.entries),
                 TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
                 manifest.meta.content_length,
@@ -138,6 +182,9 @@ def materialize(
     manifest_cache = {}
     materialized_results: List[MaterializeResult] = []
     record_batch_tables = RecordBatchTables(max_records_per_output_file)
+    count_of_src_dfl = 0
+    manifest_entry_list_reference = []
+    referenced_pyarrow_write_results = []
     for src_dfl in sorted(all_src_file_records.keys()):
         record_numbers_dd_task_idx_tpl_list: List[
             Tuple[DeltaFileLocatorToRecords, repeat]
@@ -148,11 +195,13 @@ def materialize(
         is_src_partition_file_np = src_dfl.is_source_delta
         src_stream_position_np = src_dfl.stream_position
         src_file_idx_np = src_dfl.file_index
+        count_of_src_dfl += 1
         src_file_partition_locator = (
             source_partition_locator
             if is_src_partition_file_np
             else round_completion_info.compacted_delta_locator.partition_locator
         )
+
         delta_locator = DeltaLocator.of(
             src_file_partition_locator,
             src_stream_position_np.item(),
@@ -185,39 +234,79 @@ def materialize(
             f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
             f" is: {download_delta_manifest_entry_time}s"
         )
-        mask_pylist = list(repeat(False, len(pa_table)))
         record_numbers = chain.from_iterable(record_numbers_tpl)
-
-
+        record_numbers_length = 0
+        mask_pylist = list(repeat(False, len(pa_table)))
         for record_number in record_numbers:
+            record_numbers_length += 1
             mask_pylist[record_number] = True
-
-
-
-
-
-
+        if (
+            record_numbers_length == len(pa_table)
+            and src_file_partition_locator
+            == round_completion_info.compacted_delta_locator.partition_locator
+        ):
+            logger.debug(
+                f"Untouched manifest file found, "
+                f"record numbers length: {record_numbers_length} "
+                f"same as downloaded table length: {len(pa_table)}"
+            )
+            untouched_src_manifest_entry = manifest.entries[src_file_idx_np.item()]
+            manifest_entry_list_reference.append(untouched_src_manifest_entry)
+            referenced_pyarrow_write_result = PyArrowWriteResult.of(
+                len(untouched_src_manifest_entry.entries),
+                TABLE_CLASS_TO_SIZE_FUNC[type(pa_table)](pa_table),
+                manifest.meta.content_length,
+                len(pa_table),
+            )
+            referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
+        else:
+            mask = pa.array(mask_pylist)
+            pa_table = pa_table.filter(mask)
+            record_batch_tables.append(pa_table)
+            if record_batch_tables.has_batches():
+                batched_tables = record_batch_tables.evict()
+                materialized_results.append(_materialize(batched_tables))
 
     if record_batch_tables.has_remaining():
         materialized_results.append(_materialize(record_batch_tables.remaining))
 
-
-
-
-
+    logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
+
+    referenced_manifest_delta = (
+        _stage_delta_from_manifest_entry_reference_list(
+            manifest_entry_list_reference
+        )
+        if manifest_entry_list_reference
+        else None
+    )
+    if referenced_manifest_delta:
+        logger.info(
+            f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
+        )
+
+    merged_materialized_delta = [mr.delta for mr in materialized_results]
+    merged_materialized_delta.append(referenced_manifest_delta)
+    merged_delta = Delta.merge_deltas(
+        [d for d in merged_materialized_delta if d is not None]
+    )
+
+    write_results_union = referenced_pyarrow_write_results
+    if materialized_results:
+        for mr in materialized_results:
+            write_results_union.append(mr.pyarrow_write_result)
+    write_result = PyArrowWriteResult.union(write_results_union)
 
-    write_results = [mr.pyarrow_write_result for mr in materialized_results]
     logger.debug(
-        f"{len(
-        f" with records: {[wr.records for wr in
+        f"{len(write_results_union)} files written"
+        f" with records: {[wr.records for wr in write_results_union]}"
     )
     # Merge all new deltas into one for this materialize bucket index
     merged_materialize_result = MaterializeResult.of(
         merged_delta,
-
-
-
-
+        mat_bucket_index,
+        write_result,
+        len(manifest_entry_list_reference),
+        count_of_src_dfl,
     )
 
     logger.info(f"Finished materialize task...")
```
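The core of the `materialize.py` change: when every record of a downloaded source file survives dedupe and the file already lives in the compacted partition, its manifest entry is carried over by reference instead of being rewritten. A simplified sketch of that branch, with stand-in values for the locator comparison:

```python
import pyarrow as pa

# Stand-ins for one source file's downloaded table and its dedupe survivors.
pa_table = pa.table({"pk": [1, 2, 3]})
mask_pylist = [True, True, True]          # every record survived dedupe
record_numbers_length = sum(mask_pylist)

# Stand-in for: src_file_partition_locator == compacted partition locator.
file_already_in_compacted_partition = True

if record_numbers_length == len(pa_table) and file_already_in_compacted_partition:
    # Copy by reference: keep the existing manifest entry, skip the rewrite.
    copied_by_reference = True
else:
    # Rewrite only the surviving rows.
    pa_table = pa_table.filter(pa.array(mask_pylist))
    copied_by_reference = False

print(copied_by_reference)  # True
```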