deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/compaction_session.py
@@ -0,0 +1,506 @@
+import importlib
+from contextlib import nullcontext
+import numpy as np
+import functools
+import logging
+import ray
+import time
+import json
+from deltacat.aws import s3u as s3_utils
+import deltacat
+from deltacat import logs
+from deltacat.compute.compactor import (
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
+from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.storage import (
+    Delta,
+    DeltaLocator,
+    Partition,
+)
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    task_resource_options_provider,
+)
+from deltacat.compute.compactor_v2.steps import merge as mg
+from deltacat.compute.compactor_v2.steps import hash_bucket as hb
+from deltacat.compute.compactor_v2.utils import io
+from deltacat.compute.compactor.utils import round_completion_file as rcf
+
+from typing import List, Optional, Tuple
+from collections import defaultdict
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
+from deltacat.utils.resources import (
+    get_current_node_peak_memory_usage_in_bytes,
+)
+from deltacat.compute.compactor_v2.utils.task_options import (
+    hash_bucket_resource_options_provider,
+    merge_resource_options_provider,
+)
+from deltacat.utils.resources import ClusterUtilizationOverTimeRange
+
+if importlib.util.find_spec("memray"):
+    import memray
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
+
+    assert (
+        params.hash_bucket_count is not None and params.hash_bucket_count >= 1
+    ), "hash_bucket_count is a required arg for compactor v2"
+
+    with memray.Tracker(
+        f"compaction_partition.bin"
+    ) if params.enable_profiler else nullcontext(), ClusterUtilizationOverTimeRange() as cluster_util:
+        (new_partition, new_rci, new_rcf_partition_locator,) = _execute_compaction(
+            params,
+            cluster_util=cluster_util,
+            **kwargs,
+        )
+
+    logger.info(
+        f"Partition-{params.source_partition_locator} -> "
+        f"Compaction session data processing completed"
+    )
+    round_completion_file_s3_url = None
+    if new_partition:
+        logger.info(f"Committing compacted partition to: {new_partition.locator}")
+        partition = params.deltacat_storage.commit_partition(
+            new_partition, **params.deltacat_storage_kwargs
+        )
+        logger.info(f"Committed compacted partition: {partition}")
+
+        round_completion_file_s3_url = rcf.write_round_completion_file(
+            params.compaction_artifact_s3_bucket,
+            new_rcf_partition_locator,
+            new_rci,
+            **params.s3_client_kwargs,
+        )
+    else:
+        logger.warn("No new partition was committed during compaction.")
+
+    logger.info(
+        f"Completed compaction session for: {params.source_partition_locator}"
+    )
+    return round_completion_file_s3_url
+
+
+def _execute_compaction(
+    params: CompactPartitionParams, **kwargs
+) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
+
+    rcf_source_partition_locator = (
+        params.rebase_source_partition_locator or params.source_partition_locator
+    )
+
+    base_audit_url = rcf_source_partition_locator.path(
+        f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
+    )
+    audit_url = f"{base_audit_url}.json"
+    logger.info(f"Compaction audit will be written to {audit_url}")
+    compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)
+
+    compaction_audit.set_hash_bucket_count(params.hash_bucket_count)
+
+    compaction_start = time.monotonic()
+
+    task_max_parallelism = params.task_max_parallelism
+
+    if params.pg_config:
+        logger.info(
+            "pg_config specified. Tasks will be scheduled in a placement group."
+        )
+        cluster_resources = params.pg_config.resource
+        cluster_cpus = cluster_resources["CPU"]
+        cluster_memory = cluster_resources["memory"]
+        task_max_parallelism = cluster_cpus
+        compaction_audit.set_cluster_cpu_max(cluster_cpus)
+        compaction_audit.set_total_cluster_memory_bytes(cluster_memory)
+
+    # read the results from any previously completed compaction round
+    round_completion_info = None
+    high_watermark = None
+    previous_compacted_delta = None
+
+    if not params.rebase_source_partition_locator:
+        round_completion_info = rcf.read_round_completion_file(
+            params.compaction_artifact_s3_bucket,
+            params.source_partition_locator,
+            **params.s3_client_kwargs,
+        )
+        if not round_completion_info:
+            logger.info(
+                f"Both rebase partition and round completion file not found. Performing an entire backfill on source."
+            )
+        else:
+            compacted_delta_locator = round_completion_info.compacted_delta_locator
+            previous_compacted_delta = params.deltacat_storage.get_delta(
+                namespace=compacted_delta_locator.namespace,
+                table_name=compacted_delta_locator.table_name,
+                table_version=compacted_delta_locator.table_version,
+                stream_position=compacted_delta_locator.stream_position,
+                include_manifest=True,
+                **params.deltacat_storage_kwargs,
+            )
+
+            high_watermark = round_completion_info.high_watermark
+            logger.info(f"Setting round completion high watermark: {high_watermark}")
+            assert (
+                params.hash_bucket_count == round_completion_info.hash_bucket_count
+            ), (
+                "The hash bucket count has changed. "
+                "Kindly run rebase compaction and trigger incremental again. "
+                f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
+                f"not equal to Hash bucket count in args={params.hash_bucket_count}."
+            )
+
+        logger.info(f"Round completion file: {round_completion_info}")
+
+    delta_discovery_start = time.monotonic()
+
+    input_deltas = io.discover_deltas(
+        params.source_partition_locator,
+        params.last_stream_position_to_compact,
+        params.rebase_source_partition_locator,
+        params.rebase_source_partition_high_watermark,
+        high_watermark,
+        params.deltacat_storage,
+        params.deltacat_storage_kwargs,
+        params.list_deltas_kwargs,
+    )
+
+    delta_discovery_end = time.monotonic()
+    compaction_audit.set_delta_discovery_time_in_seconds(
+        delta_discovery_end - delta_discovery_start
+    )
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+
+    if not input_deltas:
+        logger.info("No input deltas found to compact.")
+        return None, None, None
+
+    uniform_deltas = io.create_uniform_input_deltas(
+        input_deltas=input_deltas,
+        hash_bucket_count=params.hash_bucket_count,
+        compaction_audit=compaction_audit,
+        deltacat_storage=params.deltacat_storage,
+        previous_inflation=params.previous_inflation,
+        min_delta_bytes=params.min_delta_bytes_in_batch,
+        min_file_counts=params.min_files_in_batch,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+    )
+
+    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
+
+    hb_options_provider = functools.partial(
+        task_resource_options_provider,
+        pg_config=params.pg_config,
+        resource_amount_provider=hash_bucket_resource_options_provider,
+        previous_inflation=params.previous_inflation,
+        average_record_size_bytes=params.average_record_size_bytes,
+        primary_keys=params.primary_keys,
+    )
+
+    hb_start = time.monotonic()
+
+    hash_bucket_input_provider = lambda index, item: {
+        "input": HashBucketInput.of(
+            item,
+            primary_keys=params.primary_keys,
+            num_hash_buckets=params.hash_bucket_count,
+            num_hash_groups=params.hash_group_count,
+            enable_profiler=params.enable_profiler,
+            metrics_config=params.metrics_config,
+            read_kwargs_provider=params.read_kwargs_provider,
+            object_store=params.object_store,
+            deltacat_storage=params.deltacat_storage,
+            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        )
+    }
+
+    hb_tasks_pending = invoke_parallel(
+        items=uniform_deltas,
+        ray_task=hb.hash_bucket,
+        max_parallelism=task_max_parallelism,
+        options_provider=hb_options_provider,
+        kwargs_provider=hash_bucket_input_provider,
+    )
+
+    hb_invoke_end = time.monotonic()
+
+    logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
+    hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
+    logger.info(f"Got {len(hb_results)} hash bucket results.")
+    hb_end = time.monotonic()
+
+    # we use time.time() here because time.monotonic() has no reference point
+    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
+    # to compare time.time()s captured in different nodes.
+    hb_results_retrieved_at = time.time()
+
+    telemetry_time_hb = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
+        hb_results,
+        hb_results_retrieved_at,
+        hb_invoke_end - hb_start,
+        hb_end - hb_start,
+    )
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+
+    all_hash_group_idx_to_obj_id = defaultdict(list)
+    all_hash_group_idx_to_size_bytes = defaultdict(int)
+    all_hash_group_idx_to_num_rows = defaultdict(int)
+    hb_data_processed_size_bytes = np.int64(0)
+    total_hb_record_count = np.int64(0)
+
+    # initialize all hash groups
+    for hb_group in range(params.hash_group_count):
+        all_hash_group_idx_to_num_rows[hb_group] = 0
+        all_hash_group_idx_to_obj_id[hb_group] = []
+        all_hash_group_idx_to_size_bytes[hb_group] = 0
+
+    for hb_result in hb_results:
+        hb_data_processed_size_bytes += hb_result.hb_size_bytes
+        total_hb_record_count += hb_result.hb_record_count
+
+        for hash_group_index, object_id_size_tuple in enumerate(
+            hb_result.hash_bucket_group_to_obj_id_tuple
+        ):
+            if object_id_size_tuple:
+                all_hash_group_idx_to_obj_id[hash_group_index].append(
+                    object_id_size_tuple[0]
+                )
+                all_hash_group_idx_to_size_bytes[
+                    hash_group_index
+                ] += object_id_size_tuple[1].item()
+                all_hash_group_idx_to_num_rows[
+                    hash_group_index
+                ] += object_id_size_tuple[2].item()
+
+    logger.info(
+        f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
+    )
+
+    compaction_audit.set_input_records(total_hb_record_count.item())
+    compaction_audit.set_hash_bucket_processed_size_bytes(
+        hb_data_processed_size_bytes.item()
+    )
+
+    # create a new stream for this round
+    compacted_stream_locator = params.destination_partition_locator.stream_locator
+    compacted_stream = params.deltacat_storage.get_stream(
+        compacted_stream_locator.namespace,
+        compacted_stream_locator.table_name,
+        compacted_stream_locator.table_version,
+        **params.deltacat_storage_kwargs,
+    )
+    compacted_partition = params.deltacat_storage.stage_partition(
+        compacted_stream,
+        params.destination_partition_locator.partition_values,
+        **params.deltacat_storage_kwargs,
+    )
+
+    # BSP Step 2: Merge
+    merge_options_provider = functools.partial(
+        task_resource_options_provider,
+        pg_config=params.pg_config,
+        resource_amount_provider=merge_resource_options_provider,
+        num_hash_groups=params.hash_group_count,
+        hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
+        hash_group_num_rows=all_hash_group_idx_to_num_rows,
+        round_completion_info=round_completion_info,
+        compacted_delta=previous_compacted_delta,
+        primary_keys=params.primary_keys,
+        deltacat_storage=params.deltacat_storage,
+        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+    )
+
+    merge_input_provider = lambda index, item: {
+        "input": MergeInput.of(
+            dfe_groups_refs=item[1],
+            write_to_partition=compacted_partition,
+            compacted_file_content_type=params.compacted_file_content_type,
+            primary_keys=params.primary_keys,
+            sort_keys=params.sort_keys,
+            merge_task_index=index,
+            hash_group_index=item[0],
+            num_hash_groups=params.hash_group_count,
+            max_records_per_output_file=params.records_per_compacted_file,
+            enable_profiler=params.enable_profiler,
+            metrics_config=params.metrics_config,
+            s3_table_writer_kwargs=params.s3_table_writer_kwargs,
+            read_kwargs_provider=params.read_kwargs_provider,
+            round_completion_info=round_completion_info,
+            object_store=params.object_store,
+            deltacat_storage=params.deltacat_storage,
+            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+        )
+    }
+
+    merge_start = time.monotonic()
+
+    merge_tasks_pending = invoke_parallel(
+        items=all_hash_group_idx_to_obj_id.items(),
+        ray_task=mg.merge,
+        max_parallelism=task_max_parallelism,
+        options_provider=merge_options_provider,
+        kwargs_provider=merge_input_provider,
+    )
+
+    merge_invoke_end = time.monotonic()
+    logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
+    merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
+    logger.info(f"Got {len(merge_results)} merge results.")
+
+    merge_results_retrieved_at = time.time()
+    merge_end = time.monotonic()
+
+    total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
+    logger.info(f"Deduped {total_dd_record_count} records...")
+
+    telemetry_time_merge = compaction_audit.save_step_stats(
+        CompactionSessionAuditInfo.MERGE_STEP_NAME,
+        merge_results,
+        merge_results_retrieved_at,
+        merge_invoke_end - merge_start,
+        merge_end - merge_start,
+    )
+
+    compaction_audit.set_records_deduped(total_dd_record_count.item())
+
+    mat_results = []
+    for merge_result in merge_results:
+        mat_results.extend(merge_result.materialize_results)
+
+    mat_results: List[MaterializeResult] = sorted(
+        mat_results, key=lambda m: m.task_index
+    )
+
+    deltas = [m.delta for m in mat_results]
+
+    hb_id_to_entry_indices_range = {}
+    file_index = 0
+    previous_task_index = -1
+
+    for m in mat_results:
+        assert m.pyarrow_write_result.files >= 1, "Atleast file must be materialized"
+        assert m.task_index != previous_task_index, (
+            "Multiple materialize results found for a " f"hash bucket: {m.task_index}"
+        )
+
+        hb_id_to_entry_indices_range[str(m.task_index)] = (
+            file_index,
+            file_index + m.pyarrow_write_result.files - 1,
+        )
+
+        file_index += m.pyarrow_write_result.files
+        previous_task_index = m.task_index
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+
+    mat_results = sorted(mat_results, key=lambda m: m.task_index)
+    deltas = [m.delta for m in mat_results]
+
+    # Note: An appropriate last stream position must be set
+    # to avoid correctness issue.
+    merged_delta = Delta.merge_deltas(
+        deltas,
+        stream_position=params.last_stream_position_to_compact,
+    )
+
+    record_info_msg = (
+        f"Hash bucket records: {total_hb_record_count},"
+        f" Deduped records: {total_dd_record_count}, "
+        f" Materialized records: {merged_delta.meta.record_count}"
+    )
+    logger.info(record_info_msg)
+
+    compacted_delta = params.deltacat_storage.commit_delta(
+        merged_delta,
+        properties=kwargs.get("properties", {}),
+        **params.deltacat_storage_kwargs,
+    )
+
+    logger.info(f"Committed compacted delta: {compacted_delta}")
+
+    compaction_end = time.monotonic()
+    compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)
+
+    new_compacted_delta_locator = DeltaLocator.of(
+        compacted_partition.locator,
+        compacted_delta.stream_position,
+    )
+
+    pyarrow_write_result = PyArrowWriteResult.union(
+        [m.pyarrow_write_result for m in mat_results]
+    )
+
+    session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
+    compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
+        session_peak_memory
+    )
+
+    compaction_audit.save_round_completion_stats(
+        mat_results, telemetry_time_hb + telemetry_time_merge
+    )
+
+    cluster_util: ClusterUtilizationOverTimeRange = kwargs.get("cluster_util")
+
+    if cluster_util:
+        compaction_audit.set_total_cpu_seconds(cluster_util.total_vcpu_seconds)
+        compaction_audit.set_used_cpu_seconds(cluster_util.used_vcpu_seconds)
+
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **params.s3_client_kwargs,
+    )
+
+    new_round_completion_info = RoundCompletionInfo.of(
+        high_watermark=params.last_stream_position_to_compact,
+        compacted_delta_locator=new_compacted_delta_locator,
+        compacted_pyarrow_write_result=pyarrow_write_result,
+        sort_keys_bit_width=params.bit_width_of_sort_keys,
+        manifest_entry_copied_by_reference_ratio=compaction_audit.untouched_file_ratio,
+        compaction_audit_url=audit_url,
+        hash_bucket_count=params.hash_bucket_count,
+        hb_index_to_entry_range=hb_id_to_entry_indices_range,
+    )

+    logger.info(
+        f"partition-{params.source_partition_locator.partition_values},"
+        f"compacted at: {params.last_stream_position_to_compact},"
+    )
+
+    return (
+        compacted_partition,
+        new_round_completion_info,
+        rcf_source_partition_locator,
+    )
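In the session code above, the loop over mat_results records, for each merge task, the contiguous range of manifest entry indices it contributed to the merged delta (persisted in the round completion file as hb_index_to_entry_range). A small standalone sketch of that bookkeeping with toy numbers follows; the per-task file counts are hypothetical and not taken from the package.

# Toy illustration of the entry-index range bookkeeping above; each task
# contributes a contiguous block of manifest entries to the merged delta.
pyarrow_files_per_task = {0: 3, 1: 1, 2: 4}  # hypothetical task_index -> files written

hb_id_to_entry_indices_range = {}
file_index = 0
for task_index, num_files in sorted(pyarrow_files_per_task.items()):
    # inclusive (first_index, last_index) range, matching the session code above
    hb_id_to_entry_indices_range[str(task_index)] = (file_index, file_index + num_files - 1)
    file_index += num_files

print(hb_id_to_entry_indices_range)  # {'0': (0, 2), '1': (3, 3), '2': (4, 7)}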
deltacat/compute/compactor_v2/constants.py
@@ -0,0 +1,34 @@
+TOTAL_BYTES_IN_SHA1_HASH = 20
+
+PK_DELIMITER = "L6kl7u5f"
+
+MAX_RECORDS_PER_COMPACTED_FILE = 4_000_000
+
+# The maximum amount of delta bytes allowed in a batch.
+# A single task will not process more than these many bytes
+# unless a single manifest entry (non-parquet) or single row
+# group (parquet) is bigger than this size.
+MIN_DELTA_BYTES_IN_BATCH = 5_000_000_000
+
+# The total number of files that can be processed in a
+# batch. Hence, if there are tiny files, this value can be
+# limited so that enough parallelism can be attained.
+MIN_FILES_IN_BATCH = float("inf")
+
+# The average record size in a table.
+AVERAGE_RECORD_SIZE_BYTES = 1000
+
+# Maximum parallelism for the tasks at each BSP step.
+# Default is the number of vCPUs in about 168
+# r5.8xlarge EC2 instances.
+TASK_MAX_PARALLELISM = 5367
+
+# The percentage of memory that needs to be estimated
+# as buffer. This value will ensure the job doesn't run out
+# of memory by considering buffer for uncertainities.
+TOTAL_MEMORY_BUFFER_PERCENTAGE = 20
+
+# The total size of records that will be hash bucketed at once
+# Since, sorting is nlogn, we ensure that is not performed
+# on a very large dataset for best performance.
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
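TOTAL_MEMORY_BUFFER_PERCENTAGE above pads task memory estimates so scheduling leaves headroom for estimation error. A minimal sketch of how such a buffer could be applied follows; with_memory_buffer is a hypothetical helper written for illustration, not a function from the package.

# Hypothetical helper illustrating the 20% memory buffer described above.
TOTAL_MEMORY_BUFFER_PERCENTAGE = 20

def with_memory_buffer(estimated_bytes: float) -> float:
    # pad the raw estimate so a task is scheduled with headroom for uncertainty
    return estimated_bytes * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100)

print(with_memory_buffer(1_000_000_000))  # 1200000000.0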
File without changes
deltacat/compute/compactor_v2/model/hash_bucket_input.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from typing import Dict, List, Optional, Any
+from deltacat.utils.metrics import MetricsConfig
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat.io.object_store import IObjectStore
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.compute.compactor import DeltaAnnotated
+
+
+class HashBucketInput(Dict):
+    @staticmethod
+    def of(
+        annotated_delta: DeltaAnnotated,
+        primary_keys: List[str],
+        num_hash_buckets: int,
+        num_hash_groups: int,
+        enable_profiler: Optional[bool] = False,
+        metrics_config: Optional[MetricsConfig] = None,
+        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+        object_store: Optional[IObjectStore] = None,
+        deltacat_storage=unimplemented_deltacat_storage,
+        deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> HashBucketInput:
+
+        result = HashBucketInput()
+        result["annotated_delta"] = annotated_delta
+        result["primary_keys"] = primary_keys
+        result["num_hash_buckets"] = num_hash_buckets
+        result["num_hash_groups"] = num_hash_groups
+        result["enable_profiler"] = enable_profiler
+        result["metrics_config"] = metrics_config
+        result["read_kwargs_provider"] = read_kwargs_provider
+        result["object_store"] = object_store
+        result["deltacat_storage"] = deltacat_storage
+        result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+
+        return result
+
+    @property
+    def annotated_delta(self) -> DeltaAnnotated:
+        return self["annotated_delta"]
+
+    @property
+    def primary_keys(self) -> List[str]:
+        return self["primary_keys"]
+
+    @property
+    def num_hash_buckets(self) -> int:
+        return self["num_hash_buckets"]
+
+    @property
+    def num_hash_groups(self) -> int:
+        return self["num_hash_groups"]
+
+    @property
+    def enable_profiler(self) -> Optional[bool]:
+        return self.get("enable_profiler")
+
+    @property
+    def metrics_config(self) -> Optional[MetricsConfig]:
+        return self.get("metrics_config")
+
+    @property
+    def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+        return self.get("read_kwargs_provider")
+
+    @property
+    def object_store(self) -> Optional[IObjectStore]:
+        return self.get("object_store")
+
+    @property
+    def deltacat_storage(self) -> unimplemented_deltacat_storage:
+        return self.get("deltacat_storage")
+
+    @property
+    def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
+        return self.get("deltacat_storage_kwargs")
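HashBucketInput is a plain dict-backed parameter object: of() stores its keyword arguments under string keys and the properties read them back. A minimal usage sketch based on the signature above; None stands in for the DeltaAnnotated that the compaction session builds upstream, since of() stores values without validating them.

from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput

# None is only a placeholder for a real DeltaAnnotated produced by the
# session's create_uniform_input_deltas step.
hb_input = HashBucketInput.of(
    annotated_delta=None,
    primary_keys=["pk"],
    num_hash_buckets=16,
    num_hash_groups=4,
)
assert hb_input.num_hash_buckets == 16          # properties read back the dict entries
assert hb_input.deltacat_storage_kwargs == {}   # defaults to {} when not supplied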
deltacat/compute/compactor_v2/model/hash_bucket_result.py
@@ -0,0 +1,12 @@
+from typing import NamedTuple
+
+import numpy as np
+
+
+class HashBucketResult(NamedTuple):
+    hash_bucket_group_to_obj_id_tuple: np.ndarray
+    hb_size_bytes: np.int64
+    hb_record_count: np.int64
+    peak_memory_usage_bytes: np.double
+    telemetry_time_in_seconds: np.double
+    task_completed_at: np.double
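HashBucketResult is the value each hash bucket task returns to the compaction session, which sums hb_size_bytes and hb_record_count across tasks and fans the per-hash-group object references out to merge tasks. A toy construction with placeholder values (not taken from the package) is shown below.

import numpy as np
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult

# Placeholder values; in practice the hash bucket task fills one slot per hash group.
result = HashBucketResult(
    hash_bucket_group_to_obj_id_tuple=np.empty(4, dtype="object"),
    hb_size_bytes=np.int64(1_024),
    hb_record_count=np.int64(10),
    peak_memory_usage_bytes=np.double(2_048.0),
    telemetry_time_in_seconds=np.double(0.5),
    task_completed_at=np.double(1_700_000_000.0),  # epoch seconds, comparable across nodes
)
total_hb_record_count = np.int64(0) + result.hb_record_count  # mirrors the session's aggregation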