deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,127 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from ray.types import ObjectRef
|
4
|
+
from typing import Dict, List, Optional, Any
|
5
|
+
from deltacat.utils.metrics import MetricsConfig
|
6
|
+
from deltacat.utils.common import ReadKwargsProvider
|
7
|
+
from deltacat.io.object_store import IObjectStore
|
8
|
+
from deltacat.storage import (
|
9
|
+
Partition,
|
10
|
+
SortKey,
|
11
|
+
interface as unimplemented_deltacat_storage,
|
12
|
+
)
|
13
|
+
from deltacat.types.media import ContentType
|
14
|
+
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
|
15
|
+
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
|
16
|
+
|
17
|
+
|
18
|
+
class MergeInput(Dict):
    """Dictionary-backed container for the arguments of a merge step.

    Instances are ordinary dicts filled in by :meth:`of`; each property below
    is a read-only view over the key that shares its name.
    """

    @staticmethod
    def of(
        dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
        write_to_partition: Partition,
        compacted_file_content_type: ContentType,
        primary_keys: List[str],
        hash_group_index: int,
        num_hash_groups: int,
        sort_keys: Optional[List[SortKey]] = None,
        merge_task_index: Optional[int] = 0,
        max_records_per_output_file: Optional[int] = 4_000_000,
        enable_profiler: Optional[bool] = False,
        metrics_config: Optional[MetricsConfig] = None,
        s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
        round_completion_info: Optional[RoundCompletionInfo] = None,
        object_store: Optional[IObjectStore] = None,
        deltacat_storage=unimplemented_deltacat_storage,
        deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    ) -> MergeInput:
        """Create a ``MergeInput`` holding the given arguments.

        Every argument is stored under a key equal to its parameter name.
        ``s3_table_writer_kwargs`` and ``deltacat_storage_kwargs`` are replaced
        by a new empty dict when they are falsy (``None`` or empty).

        Returns:
            The populated ``MergeInput``.
        """
        merge_input = MergeInput()
        # A single update with a dict literal keeps the key insertion order
        # identical to assigning the keys one by one.
        merge_input.update(
            {
                "dfe_groups_refs": dfe_groups_refs,
                "write_to_partition": write_to_partition,
                "compacted_file_content_type": compacted_file_content_type,
                "primary_keys": primary_keys,
                "hash_group_index": hash_group_index,
                "num_hash_groups": num_hash_groups,
                "sort_keys": sort_keys,
                "merge_task_index": merge_task_index,
                "max_records_per_output_file": max_records_per_output_file,
                "enable_profiler": enable_profiler,
                "metrics_config": metrics_config,
                "s3_table_writer_kwargs": s3_table_writer_kwargs or {},
                "read_kwargs_provider": read_kwargs_provider,
                "round_completion_info": round_completion_info,
                "object_store": object_store,
                "deltacat_storage": deltacat_storage,
                "deltacat_storage_kwargs": deltacat_storage_kwargs or {},
            }
        )
        return merge_input

    # --- Required entries: a missing key raises ``KeyError``. ---

    @property
    def dfe_groups_refs(self) -> List[ObjectRef[DeltaFileEnvelopeGroups]]:
        """Value stored under ``"dfe_groups_refs"``."""
        return self["dfe_groups_refs"]

    @property
    def write_to_partition(self) -> Partition:
        """Value stored under ``"write_to_partition"``."""
        return self["write_to_partition"]

    @property
    def compacted_file_content_type(self) -> ContentType:
        """Value stored under ``"compacted_file_content_type"``."""
        return self["compacted_file_content_type"]

    @property
    def primary_keys(self) -> List[str]:
        """Value stored under ``"primary_keys"``."""
        return self["primary_keys"]

    @property
    def hash_group_index(self) -> int:
        """Value stored under ``"hash_group_index"``."""
        return self["hash_group_index"]

    @property
    def num_hash_groups(self) -> int:
        """Value stored under ``"num_hash_groups"``."""
        return self["num_hash_groups"]

    # --- Optional entries: a missing key yields ``None``. ---

    @property
    def sort_keys(self) -> Optional[List[SortKey]]:
        """Value stored under ``"sort_keys"``, or ``None`` when absent."""
        return self.get("sort_keys")

    @property
    def merge_task_index(self) -> int:
        """Value stored under ``"merge_task_index"``, or ``None`` when absent."""
        return self.get("merge_task_index")

    @property
    def max_records_per_output_file(self) -> int:
        """Value stored under ``"max_records_per_output_file"``, or ``None`` when absent."""
        return self.get("max_records_per_output_file")

    @property
    def enable_profiler(self) -> bool:
        """Value stored under ``"enable_profiler"``, or ``None`` when absent."""
        return self.get("enable_profiler")

    @property
    def metrics_config(self) -> Optional[MetricsConfig]:
        """Value stored under ``"metrics_config"``, or ``None`` when absent."""
        return self.get("metrics_config")

    @property
    def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
        """Value stored under ``"s3_table_writer_kwargs"``, or ``None`` when absent."""
        return self.get("s3_table_writer_kwargs")

    @property
    def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
        """Value stored under ``"read_kwargs_provider"``, or ``None`` when absent."""
        return self.get("read_kwargs_provider")

    @property
    def round_completion_info(self) -> Optional[RoundCompletionInfo]:
        """Value stored under ``"round_completion_info"``, or ``None`` when absent."""
        return self.get("round_completion_info")

    @property
    def object_store(self) -> Optional[IObjectStore]:
        """Value stored under ``"object_store"``, or ``None`` when absent."""
        return self.get("object_store")

    @property
    def deltacat_storage(self) -> unimplemented_deltacat_storage:
        """Value stored under ``"deltacat_storage"``; raises ``KeyError`` if unset."""
        return self["deltacat_storage"]

    @property
    def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
        """Value stored under ``"deltacat_storage_kwargs"``, or ``None`` when absent."""
        return self.get("deltacat_storage_kwargs")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from typing import NamedTuple, List
|
2
|
+
from deltacat.compute.compactor.model.materialize_result import MaterializeResult
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
|
6
|
+
|
7
|
+
class MergeResult(NamedTuple):
    """Immutable, positionally ordered record returned by a merge task.

    Field order matters: the merge task rebuilds this tuple by position.
    """

    # Materialize results produced by the task.
    materialize_results: List[MaterializeResult]
    # Record count attributed to deduplication (presumably the number of
    # records dropped as duplicates -- confirm against the producer).
    deduped_record_count: np.int64
    # Peak memory usage reported for the task, in bytes.
    peak_memory_usage_bytes: np.double
    # The merge task fills this slot with its metrics-emission latency.
    telemetry_time_in_seconds: np.double
    # Completion timestamp (presumably epoch seconds -- confirm).
    task_completed_at: np.double
|
File without changes
|
@@ -0,0 +1,203 @@
|
|
1
|
+
import importlib
|
2
|
+
import logging
|
3
|
+
import time
|
4
|
+
from contextlib import nullcontext
|
5
|
+
from typing import List, Optional, Tuple
|
6
|
+
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
|
7
|
+
import numpy as np
|
8
|
+
import pyarrow as pa
|
9
|
+
import ray
|
10
|
+
from deltacat import logs
|
11
|
+
from deltacat.compute.compactor import (
|
12
|
+
DeltaAnnotated,
|
13
|
+
DeltaFileEnvelope,
|
14
|
+
)
|
15
|
+
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
|
16
|
+
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
|
17
|
+
from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
18
|
+
group_hash_bucket_indices,
|
19
|
+
group_by_pk_hash_bucket,
|
20
|
+
)
|
21
|
+
from deltacat.storage import interface as unimplemented_deltacat_storage
|
22
|
+
from deltacat.types.media import StorageType
|
23
|
+
from deltacat.utils.ray_utils.runtime import (
|
24
|
+
get_current_ray_task_id,
|
25
|
+
get_current_ray_worker_id,
|
26
|
+
)
|
27
|
+
from deltacat.utils.common import ReadKwargsProvider
|
28
|
+
from deltacat.utils.performance import timed_invocation
|
29
|
+
from deltacat.utils.metrics import emit_timer_metrics
|
30
|
+
from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
|
31
|
+
|
32
|
+
# `import importlib` on its own does not guarantee that the `importlib.util`
# submodule has been loaded, so import it explicitly before using find_spec.
import importlib.util

# memray is an optional profiling dependency: import it only when installed.
if importlib.util.find_spec("memray"):
    import memray
|
34
|
+
|
35
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
36
|
+
|
37
|
+
|
38
|
+
def _read_delta_file_envelopes(
    annotated_delta: DeltaAnnotated,
    read_kwargs_provider: Optional[ReadKwargsProvider],
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
    """Download an annotated delta and wrap it in a single delta file envelope.

    All downloaded tables are concatenated into one table, which is wrapped in
    one ``DeltaFileEnvelope`` carrying the stream position and delta type
    shared by every annotation.

    Args:
        annotated_delta: Delta to download; its ``annotations`` must all share
            one stream position and one delta type.
        read_kwargs_provider: Forwarded to ``download_delta`` as
            ``file_reader_kwargs_provider``.
        deltacat_storage: Storage implementation providing ``download_delta``.
        deltacat_storage_kwargs: Extra keyword arguments for
            ``download_delta``; ``None`` is treated as no extra arguments.

    Returns:
        ``(envelopes, record_count, size_bytes)``, where ``envelopes`` is a
        one-element list. ``(None, 0, 0)`` when nothing was downloaded.

    Raises:
        AssertionError: If the number of downloaded tables differs from the
            number of annotations, or if the annotations disagree on stream
            position or delta type.
    """
    # The declared default is None, which cannot be unpacked with `**`.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    tables = deltacat_storage.download_delta(
        annotated_delta,
        max_parallelism=1,
        file_reader_kwargs_provider=read_kwargs_provider,
        storage_type=StorageType.LOCAL,
        **deltacat_storage_kwargs,
    )
    annotations = annotated_delta.annotations
    # The condition and the message must be separated by the assert's own
    # comma. Wrapping both in one pair of parentheses builds a non-empty
    # tuple, which is always truthy and silently disables the check.
    assert len(tables) == len(annotations), (
        f"Unexpected Error: Length of downloaded delta manifest tables "
        f"({len(tables)}) doesn't match the length of delta manifest "
        f"annotations ({len(annotations)})."
    )
    if not tables:
        return None, 0, 0

    delta_stream_position = annotations[0].annotation_stream_position
    delta_type = annotations[0].annotation_delta_type

    # A single envelope is produced below, so every annotation must agree
    # with the first one on stream position and delta type.
    for annotation in annotations:
        assert annotation.annotation_stream_position == delta_stream_position, (
            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
            f"!= {delta_stream_position}"
        )
        assert annotation.annotation_delta_type == delta_type, (
            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
            f"!= {delta_type}"
        )

    table = pa.concat_tables(tables)
    total_record_count = len(table)
    total_size_bytes = int(table.nbytes)

    delta_file = DeltaFileEnvelope.of(
        stream_position=delta_stream_position,
        delta_type=delta_type,
        table=table,
    )
    return [delta_file], total_record_count, total_size_bytes
|
87
|
+
|
88
|
+
|
89
|
+
def _group_file_records_by_pk_hash_bucket(
    annotated_delta: DeltaAnnotated,
    num_hash_buckets: int,
    primary_keys: List[str],
    read_kwargs_provider: Optional[ReadKwargsProvider] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
    """Read a delta and split its records into per-hash-bucket envelopes.

    Args:
        annotated_delta: Delta whose files are read.
        num_hash_buckets: Length of the returned bucket array.
        primary_keys: Forwarded to ``group_by_pk_hash_bucket``.
        read_kwargs_provider: Forwarded to the delta reader.
        deltacat_storage: Storage implementation used to read the delta.
        deltacat_storage_kwargs: Extra keyword arguments for the storage call.

    Returns:
        ``(groups, record_count, size_bytes)``. ``groups`` is an object array
        of length ``num_hash_buckets`` whose slots hold either ``None`` or a
        list of ``DeltaFileEnvelope``. ``(None, 0, 0)`` when the reader
        produced no envelopes.
    """
    envelopes, record_count, size_bytes = _read_delta_file_envelopes(
        annotated_delta,
        read_kwargs_provider,
        deltacat_storage,
        deltacat_storage_kwargs,
    )
    if envelopes is None:
        return None, 0, 0

    logger.info(f"Read all delta file envelopes: {len(envelopes)}")

    # One slot per hash bucket; a slot stays None until a table lands in it.
    envelopes_by_bucket = np.empty([num_hash_buckets], dtype="object")
    for envelope in envelopes:
        logger.info("Grouping by pk hash bucket")
        started_at = time.monotonic()
        tables_by_bucket = group_by_pk_hash_bucket(
            envelope.table,
            num_hash_buckets,
            primary_keys,
        )
        finished_at = time.monotonic()
        logger.info(f"Grouping took: {finished_at - started_at}")

        for bucket_index, bucket_table in enumerate(tables_by_bucket):
            # Skip buckets for which the grouping produced nothing.
            if not bucket_table:
                continue
            if envelopes_by_bucket[bucket_index] is None:
                envelopes_by_bucket[bucket_index] = []
            bucket_envelope = DeltaFileEnvelope.of(
                stream_position=envelope.stream_position,
                file_index=envelope.file_index,
                delta_type=envelope.delta_type,
                table=bucket_table,
            )
            envelopes_by_bucket[bucket_index].append(bucket_envelope)

    return envelopes_by_bucket, record_count, size_bytes
|
139
|
+
|
140
|
+
|
141
|
+
def _timed_hash_bucket(input: HashBucketInput):
    """Run the hash bucket work for one task and package its result.

    Groups the input delta's records by primary key hash bucket, passes the
    groups to ``group_hash_bucket_indices`` together with the configured
    object store, and returns a ``HashBucketResult``. When
    ``input.enable_profiler`` is truthy the work runs under a
    ``memray.Tracker`` writing to ``hash_bucket_<worker_id>_<task_id>.bin``.

    Args:
        input: Parameters of the hash bucket task.

    Returns:
        A ``HashBucketResult`` whose fifth positional slot is the placeholder
        ``0.0``; ``hash_bucket`` later replaces it with the metrics latency.
    """
    task_id = get_current_ray_task_id()
    worker_id = get_current_ray_worker_id()

    if input.enable_profiler:
        profiling_context = memray.Tracker(f"hash_bucket_{worker_id}_{task_id}.bin")
    else:
        profiling_context = nullcontext()

    with profiling_context:
        grouped = _group_file_records_by_pk_hash_bucket(
            annotated_delta=input.annotated_delta,
            num_hash_buckets=input.num_hash_buckets,
            primary_keys=input.primary_keys,
            read_kwargs_provider=input.read_kwargs_provider,
            deltacat_storage=input.deltacat_storage,
            deltacat_storage_kwargs=input.deltacat_storage_kwargs,
        )
        envelope_groups, record_count, size_bytes = grouped

        group_to_obj_id_tuple = group_hash_bucket_indices(
            hash_bucket_object_groups=envelope_groups,
            num_buckets=input.num_hash_buckets,
            num_groups=input.num_hash_groups,
            object_store=input.object_store,
        )

        peak_memory = get_current_node_peak_memory_usage_in_bytes()
        return HashBucketResult(
            group_to_obj_id_tuple,
            np.int64(size_bytes),
            np.int64(record_count),
            np.double(peak_memory),
            np.double(0.0),
            np.double(time.time()),
        )
|
175
|
+
|
176
|
+
|
177
|
+
@ray.remote
def hash_bucket(input: HashBucketInput) -> HashBucketResult:
    """Ray remote entry point for the hash bucket step.

    Runs ``_timed_hash_bucket`` through ``timed_invocation``. When
    ``input.metrics_config`` is truthy, the measured duration is emitted as
    the ``hash_bucket`` timer metric.

    Args:
        input: Parameters of the hash bucket task.

    Returns:
        The inner result with its fifth positional slot replaced by the
        latency of the metrics emission (``0.0`` when nothing was emitted).
    """
    logger.info("Starting hash bucket task...")
    inner_result, elapsed = timed_invocation(func=_timed_hash_bucket, input=input)

    metrics_latency = 0.0
    if input.metrics_config:
        # Only the latency of the emission is needed, not its return value.
        _, metrics_latency = timed_invocation(
            func=emit_timer_metrics,
            metrics_name="hash_bucket",
            value=elapsed,
            metrics_config=input.metrics_config,
        )

    logger.info("Finished hash bucket task...")
    # Rebuild by position: slots 0-3 and 5 are carried over unchanged.
    return HashBucketResult(
        *inner_result[:4],
        np.double(metrics_latency),
        inner_result[5],
    )
|
@@ -0,0 +1,41 @@
|
|
1
|
+
import logging
|
2
|
+
from deltacat.compute.compactor_v2.model.merge_input import MergeInput
|
3
|
+
import numpy as np
|
4
|
+
import ray
|
5
|
+
from deltacat import logs
|
6
|
+
from deltacat.compute.compactor_v2.model.merge_result import MergeResult
|
7
|
+
from deltacat.utils.performance import timed_invocation
|
8
|
+
from deltacat.utils.metrics import emit_timer_metrics
|
9
|
+
|
10
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
11
|
+
|
12
|
+
|
13
|
+
def _timed_merge(input: MergeInput) -> MergeResult:
|
14
|
+
# TODO: Implementation goes here
|
15
|
+
pass
|
16
|
+
|
17
|
+
|
18
|
+
@ray.remote
def merge(input: MergeInput) -> MergeResult:
    """Ray remote entry point for the merge step.

    Runs ``_timed_merge`` through ``timed_invocation``. When
    ``input.metrics_config`` is truthy, the measured duration is emitted as
    the ``merge`` timer metric.

    Args:
        input: Parameters of the merge task.

    Returns:
        The inner result with its fourth positional slot replaced by the
        latency of the metrics emission (``0.0`` when nothing was emitted).
    """
    logger.info("Starting merge task...")
    inner_result, elapsed = timed_invocation(func=_timed_merge, input=input)

    metrics_latency = 0.0
    if input.metrics_config:
        # Only the latency of the emission is needed, not its return value.
        _, metrics_latency = timed_invocation(
            func=emit_timer_metrics,
            metrics_name="merge",
            value=elapsed,
            metrics_config=input.metrics_config,
        )

    logger.info("Finished merge task...")
    # Rebuild by position: slots 0-2 and 4 are carried over unchanged.
    return MergeResult(
        *inner_result[:3],
        np.double(metrics_latency),
        inner_result[4],
    )
|
File without changes
|
@@ -0,0 +1,37 @@
|
|
1
|
+
from deltacat.storage import (
|
2
|
+
Delta,
|
3
|
+
interface as unimplemented_deltacat_storage,
|
4
|
+
)
|
5
|
+
from typing import Dict, Optional, Any
|
6
|
+
from deltacat.types.media import TableType
|
7
|
+
from deltacat.types.media import ContentType
|
8
|
+
from deltacat.types.partial_download import PartialParquetParameters
|
9
|
+
|
10
|
+
|
11
|
+
def append_content_type_params(
    delta: Delta,
    entry_index: int,
    deltacat_storage: unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
):
    """Attach partial-download parameters to a Parquet manifest entry.

    For an entry whose content type is Parquet, the entry is downloaded as a
    ``TableType.PYARROW_PARQUET`` object and a ``PartialParquetParameters``
    built from its metadata is appended to the entry's content type
    parameters. Entries of any other content type are returned untouched.

    Args:
        delta: Delta whose manifest contains the entry. The entry is mutated
            in place.
        entry_index: Index of the entry within ``delta.manifest.entries``.
        deltacat_storage: Storage implementation providing
            ``download_delta_manifest_entry``.
        deltacat_storage_kwargs: Extra keyword arguments for the storage call;
            ``None`` means no extra arguments.

    Returns:
        The manifest entry at ``entry_index``.
    """
    # Use a fresh dict per call instead of a shared mutable default; this
    # also makes an explicit None safe to unpack with `**`.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    entry = delta.manifest.entries[entry_index]

    if entry.meta.content_type == ContentType.PARQUET:
        # Set partial download content type parameters to allow
        # row group level rebatching for parquet.
        pq_file = deltacat_storage.download_delta_manifest_entry(
            delta.locator,
            entry_index=entry_index,
            table_type=TableType.PYARROW_PARQUET,
            **deltacat_storage_kwargs,
        )

        if not entry.meta.content_type_parameters:
            entry.meta.content_type_parameters = []

        entry.meta.content_type_parameters.append(
            PartialParquetParameters.of(pq_metadata=pq_file.metadata)
        )

    return entry
|
@@ -0,0 +1,149 @@
|
|
1
|
+
import logging
|
2
|
+
import functools
|
3
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
4
|
+
from deltacat.storage import (
|
5
|
+
PartitionLocator,
|
6
|
+
Delta,
|
7
|
+
interface as unimplemented_deltacat_storage,
|
8
|
+
)
|
9
|
+
from deltacat import logs
|
10
|
+
from deltacat.compute.compactor.utils import io as io_v1
|
11
|
+
from deltacat.compute.compactor import DeltaAnnotated
|
12
|
+
from typing import Dict, List, Optional, Any
|
13
|
+
from deltacat.compute.compactor_v2.constants import (
|
14
|
+
MIN_FILES_IN_BATCH,
|
15
|
+
MIN_DELTA_BYTES_IN_BATCH,
|
16
|
+
)
|
17
|
+
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
18
|
+
CompactionSessionAuditInfo,
|
19
|
+
)
|
20
|
+
from deltacat.compute.compactor_v2.utils.task_options import (
|
21
|
+
estimate_manifest_entry_size_bytes,
|
22
|
+
)
|
23
|
+
from deltacat.compute.compactor_v2.utils.content_type_params import (
|
24
|
+
append_content_type_params,
|
25
|
+
)
|
26
|
+
|
27
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
28
|
+
|
29
|
+
|
30
|
+
def discover_deltas(
    source_partition_locator: PartitionLocator,
    last_stream_position_to_compact: int,
    rebase_source_partition_locator: Optional[PartitionLocator] = None,
    rebase_source_partition_high_watermark: Optional[int] = None,
    rcf_high_watermark: Optional[int] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    list_deltas_kwargs: Optional[Dict[str, Any]] = None,
) -> List[Delta]:
    """Collect the input deltas for a compaction round.

    Deltas are first discovered from the delta source, which is the rebase
    source partition when one is given and the source partition otherwise.
    When a rebase source is given, the deltas of the source partition are
    discovered as well, with no stream position bounds, and appended.

    Args:
        source_partition_locator: Locator of the source partition.
        last_stream_position_to_compact: Upper stream position bound for the
            delta source discovery.
        rebase_source_partition_locator: Optional locator to discover from
            instead of the source partition.
        rebase_source_partition_high_watermark: Lower bound used when truthy.
        rcf_high_watermark: Lower bound used when the rebase watermark is
            falsy.
        deltacat_storage: Storage implementation passed to the discovery
            helper.
        deltacat_storage_kwargs: Extra keyword arguments for the storage
            calls; ``None`` means no extra arguments.
        list_deltas_kwargs: Extra keyword arguments for delta listing;
            ``None`` means no extra arguments.

    Returns:
        The delta source deltas followed by any previously compacted deltas.
    """
    # Build fresh dicts per call rather than sharing mutable defaults; the
    # helper below still receives dicts, exactly as before.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    if list_deltas_kwargs is None:
        list_deltas_kwargs = {}

    previous_compacted_high_watermark = (
        rebase_source_partition_high_watermark or rcf_high_watermark
    )

    delta_source_partition_locator = (
        rebase_source_partition_locator or source_partition_locator
    )

    result = []

    delta_source_incremental_deltas = io_v1._discover_deltas(
        delta_source_partition_locator,
        previous_compacted_high_watermark,
        last_stream_position_to_compact,
        deltacat_storage,
        deltacat_storage_kwargs,
        list_deltas_kwargs,
    )

    result.extend(delta_source_incremental_deltas)

    logger.info(
        f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
        f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
    )

    if rebase_source_partition_locator:
        # During a rebase the source partition is read without any stream
        # position bounds.
        previous_compacted_deltas = io_v1._discover_deltas(
            source_partition_locator,
            None,
            None,
            deltacat_storage,
            deltacat_storage_kwargs,
            list_deltas_kwargs,
        )

        result.extend(previous_compacted_deltas)

        logger.info(
            f"Length of input deltas from previous compacted table is {len(previous_compacted_deltas)}"
            f" from ({None}, {None}]"
        )

    return result
|
85
|
+
|
86
|
+
|
87
|
+
def create_uniform_input_deltas(
    input_deltas: List[Delta],
    hash_bucket_count: int,
    compaction_audit: CompactionSessionAuditInfo,
    min_delta_bytes: Optional[float] = MIN_DELTA_BYTES_IN_BATCH,
    min_file_counts: Optional[float] = MIN_FILES_IN_BATCH,
    previous_inflation: Optional[float] = PYARROW_INFLATION_MULTIPLIER,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
) -> List[DeltaAnnotated]:
    """Annotate the input deltas and rebatch them.

    Every manifest entry of every delta is passed through
    ``append_content_type_params``. The deltas are then annotated and handed
    to ``DeltaAnnotated.rebatch`` with a size estimator bound to
    ``previous_inflation``. Input size, file count and estimated in-memory
    size are recorded on ``compaction_audit``.

    Args:
        input_deltas: Deltas to annotate and rebatch.
        hash_bucket_count: Only logged by this function.
        compaction_audit: Audit object that receives the input statistics.
        min_delta_bytes: Forwarded to ``DeltaAnnotated.rebatch``.
        min_file_counts: Forwarded to ``DeltaAnnotated.rebatch``.
        previous_inflation: Forwarded to ``estimate_manifest_entry_size_bytes``.
        deltacat_storage: Storage implementation used to read entries.
        deltacat_storage_kwargs: Extra keyword arguments for the storage
            calls; ``None`` means no extra arguments.

    Returns:
        The rebatched annotated deltas.

    Raises:
        RuntimeError: If ``input_deltas`` is empty.
    """
    # Use a fresh dict per call instead of a shared mutable default.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    delta_bytes = 0
    delta_manifest_entries = 0
    estimated_da_bytes = 0
    input_da_list = []

    for delta in input_deltas:
        manifest_entries = delta.manifest.entries
        delta_manifest_entries += len(manifest_entries)
        for entry_index in range(len(manifest_entries)):
            # Entries are addressed by index because the helper looks the
            # entry up on the delta itself and may mutate it in place.
            entry = append_content_type_params(
                delta=delta,
                entry_index=entry_index,
                deltacat_storage=deltacat_storage,
                deltacat_storage_kwargs=deltacat_storage_kwargs,
            )

            delta_bytes += entry.meta.content_length
            estimated_da_bytes += estimate_manifest_entry_size_bytes(
                entry=entry, previous_inflation=previous_inflation
            )

        delta_annotated = DeltaAnnotated.of(delta)
        input_da_list.append(delta_annotated)

    logger.info(f"Input deltas to compact this round: {len(input_da_list)}")
    logger.info(f"Input delta bytes to compact: {delta_bytes}")
    logger.info(f"Input delta files to compact: {delta_manifest_entries}")

    # Fail before touching the audit object when there is nothing to compact.
    if not input_da_list:
        raise RuntimeError("No input deltas to compact!")

    size_estimation_function = functools.partial(
        estimate_manifest_entry_size_bytes, previous_inflation=previous_inflation
    )

    rebatched_da_list = DeltaAnnotated.rebatch(
        input_da_list,
        min_delta_bytes=min_delta_bytes,
        min_file_counts=min_file_counts,
        estimation_function=size_estimation_function,
    )

    compaction_audit.set_input_size_bytes(delta_bytes)
    compaction_audit.set_input_file_count(delta_manifest_entries)
    compaction_audit.set_estimated_in_memory_size_bytes_during_discovery(
        estimated_da_bytes
    )

    logger.info(f"Hash bucket count: {hash_bucket_count}")
    logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")

    return rebatched_da_list
|