deltacat 1.1.12__py3-none-any.whl → 1.1.13__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +5 -0
- deltacat/compute/compactor_v2/compaction_session.py +97 -573
- deltacat/compute/compactor_v2/utils/task_options.py +0 -1
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +1 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +32 -0
- deltacat/tests/compute/compact_partition_test_cases.py +19 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +13 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +34 -0
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +12 -0
- deltacat/tests/compute/test_util_common.py +101 -0
- {deltacat-1.1.12.dist-info → deltacat-1.1.13.dist-info}/METADATA +15 -15
- {deltacat-1.1.12.dist-info → deltacat-1.1.13.dist-info}/RECORD +17 -17
- {deltacat-1.1.12.dist-info → deltacat-1.1.13.dist-info}/WHEEL +1 -1
- {deltacat-1.1.12.dist-info → deltacat-1.1.13.dist-info}/LICENSE +0 -0
- {deltacat-1.1.12.dist-info → deltacat-1.1.13.dist-info}/top_level.txt +0 -0
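The headline change is a refactor of `deltacat/compute/compactor_v2/compaction_session.py` (+97/-573): the hash-bucket, merge, audit-upload, and round-completion-file logic previously inlined in `compact_partition` and `_execute_compaction` moves into private helpers in the new `deltacat.compute.compactor_v2.private.compaction_utils` module, shrinking the session module by nearly 500 lines. As orientation before reading the diff, the sketch below condenses the refactored flow. It is paraphrased from the added lines in the hunks that follow; the function name `_execute_compaction_sketch` is ours, and arguments elided with `...` are placeholders, so treat it as illustrative rather than the exact 1.1.13 source.

```python
# Condensed sketch of the refactored 1.1.13 control flow, paraphrased from
# the added lines in this diff. Helper names and tuple shapes match the
# diff; `...` marks arguments elided here for brevity.
import time

from deltacat.compute.compactor_v2.utils import io
from deltacat.compute.compactor_v2.private.compaction_utils import (
    _fetch_compaction_metadata,
    _build_uniform_deltas,
    _run_hash_and_merge,
    _process_merge_results,
    _upload_compaction_audit,
    _write_new_round_completion_file,
)


def _execute_compaction_sketch(params, compaction_audit):
    # 1. Previous round's compacted manifest + round completion info (RCF).
    previous_manifest, round_completion_info = _fetch_compaction_metadata(params)

    # 2. Discover input deltas, then size them into uniform annotated deltas.
    delta_discovery_start = time.monotonic()
    input_deltas = io.discover_deltas(
        params.source_partition_locator,
        params.last_stream_position_to_compact,
        ...,  # remaining discovery args elided; see the hunks below
    )
    (uniform_deltas, delete_strategy, delete_file_envelopes) = _build_uniform_deltas(
        params, compaction_audit, input_deltas, delta_discovery_start
    )

    # 3. Hash bucketing + merging: the former ~350-line inline block.
    (merge_results, t_hb, t_merge, compacted_partition) = _run_hash_and_merge(
        params,
        uniform_deltas,
        round_completion_info,
        delete_strategy,
        delete_file_envelopes,
        compaction_audit,
        previous_manifest,
    )

    # 4. Fold materialize results into one merged delta, which gets committed.
    merged_delta, mat_results, entry_ranges = _process_merge_results(
        params, merge_results, compaction_audit
    )

    # 5. Persist the audit, then write the new round completion file.
    _upload_compaction_audit(params, compaction_audit, round_completion_info)
    return _write_new_round_completion_file(
        params, compaction_audit, compacted_partition, ...  # remaining args elided
    )
```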
deltacat/compute/compactor_v2/compaction_session.py
@@ -1,82 +1,58 @@
+import numpy as np
 import importlib
 from contextlib import nullcontext
-import numpy as np
-import functools
 import logging
-import ray
 import time
-import
-
-from deltacat.compute.compactor_v2.model.merge_file_group import (
-    RemoteMergeFileGroupsProvider,
-)
-from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
-
-from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+import ray
 
-from deltacat.aws import s3u as s3_utils
 import deltacat
-from deltacat import logs
 from deltacat.compute.compactor import (
-    HighWatermark,
     PyArrowWriteResult,
     RoundCompletionInfo,
 )
-from deltacat
-from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat import logs
 from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
     ExecutionCompactionResult,
 )
-from deltacat.compute.compactor.model.
-from deltacat.compute.
-    generate_local_merge_input,
-)
+from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor import DeltaAnnotated
-from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
 from deltacat.compute.compactor_v2.deletes.delete_strategy import (
     DeleteStrategy,
 )
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
    DeleteFileEnvelope,
 )
-from deltacat.compute.compactor_v2.deletes.utils import prepare_deletes
-
 from deltacat.storage import (
     Delta,
     DeltaLocator,
-    DeltaType,
     Manifest,
     Partition,
-    Stream,
-    StreamLocator,
 )
 from deltacat.compute.compactor.model.compact_partition_params import (
     CompactPartitionParams,
 )
-from deltacat.utils.
-
-
+from deltacat.utils.resources import (
+    get_current_process_peak_memory_usage_in_bytes,
+)
+from deltacat.compute.compactor_v2.private.compaction_utils import (
+    _fetch_compaction_metadata,
+    _build_uniform_deltas,
+    _run_hash_and_merge,
+    _process_merge_results,
+    _upload_compaction_audit,
+    _write_new_round_completion_file,
+    _commit_compaction_result,
 )
-from deltacat.compute.compactor_v2.steps import merge as mg
-from deltacat.compute.compactor_v2.steps import hash_bucket as hb
-from deltacat.compute.compactor_v2.utils import io
-from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.utils.metrics import metrics
-
-from typing import List, Optional
-from collections import defaultdict
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
-
-
-
-from deltacat.compute.compactor_v2.utils.task_options import (
-    hash_bucket_resource_options_provider,
-    merge_resource_options_provider,
-    local_merge_resource_options_provider,
-)
-from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+
+from typing import List, Optional
+from deltacat.compute.compactor_v2.utils import io
 from deltacat.exceptions import categorize_errors
 from deltacat.compute.compactor_v2.constants import COMPACT_PARTITION_METRIC_PREFIX
 
@@ -101,118 +77,50 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
             params,
             **kwargs,
         )
-        compaction_session_type: str = (
-            "INPLACE"
-            if execute_compaction_result.is_inplace_compacted
-            else "NON-INPLACE"
-        )
-        logger.info(
-            f"Partition-{params.source_partition_locator} -> "
-            f"{compaction_session_type} Compaction session data processing completed"
-        )
-        if execute_compaction_result.new_compacted_partition:
-            previous_partition: Optional[Partition] = None
-            if execute_compaction_result.is_inplace_compacted:
-                previous_partition: Optional[
-                    Partition
-                ] = params.deltacat_storage.get_partition(
-                    params.source_partition_locator.stream_locator,
-                    params.source_partition_locator.partition_values,
-                    **params.deltacat_storage_kwargs,
-                )
-            # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
-            logger.info(
-                f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
-                f"using previous partition: {previous_partition.locator if previous_partition else None}"
-            )
-            committed_partition: Partition = params.deltacat_storage.commit_partition(
-                execute_compaction_result.new_compacted_partition,
-                previous_partition,
-                **params.deltacat_storage_kwargs,
-            )
-            logger.info(f"Committed compacted partition: {committed_partition}")
-        else:
-            logger.warning("No new partition was committed during compaction.")
-
-        logger.info(
-            f"Completed compaction session for: {params.source_partition_locator}"
-        )
+        _commit_compaction_result(params, execute_compaction_result)
         return execute_compaction_result.round_completion_file_s3_url
 
 
 def _execute_compaction(
     params: CompactPartitionParams, **kwargs
 ) -> ExecutionCompactionResult:
-
-    rcf_source_partition_locator = (
+    compaction_start_time: float = time.monotonic()
+    # Fetch round completion info for previously compacted partition, if it exists
+    fetch_compaction_metadata_result: tuple[
+        Optional[Manifest], Optional[RoundCompletionInfo]
+    ] = _fetch_compaction_metadata(params)
+    (
+        previous_compacted_delta_manifest,
+        round_completion_info,
+    ) = fetch_compaction_metadata_result
+    rcf_source_partition_locator: rcf.PartitionLocator = (
         params.rebase_source_partition_locator or params.source_partition_locator
     )
 
-    base_audit_url = rcf_source_partition_locator.path(
+    base_audit_url: str = rcf_source_partition_locator.path(
         f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
     )
-    audit_url = f"{base_audit_url}.json"
+    audit_url: str = f"{base_audit_url}.json"
     logger.info(f"Compaction audit will be written to {audit_url}")
-    compaction_audit = (
+    compaction_audit: CompactionSessionAuditInfo = (
         CompactionSessionAuditInfo(deltacat.__version__, ray.__version__, audit_url)
         .set_hash_bucket_count(params.hash_bucket_count)
         .set_compactor_version(CompactorVersion.V2.value)
     )
 
-    compaction_start = time.monotonic()
-
-    task_max_parallelism: int = params.task_max_parallelism
-
     if params.pg_config:
         logger.info(
             "pg_config specified. Tasks will be scheduled in a placement group."
         )
         cluster_resources = params.pg_config.resource
-        cluster_cpus = cluster_resources["CPU"]
         cluster_memory = cluster_resources["memory"]
-        task_max_parallelism = cluster_cpus
         compaction_audit.set_total_cluster_memory_bytes(cluster_memory)
-
-
-
-
-
-
-    if not params.rebase_source_partition_locator:
-        round_completion_info = rcf.read_round_completion_file(
-            params.compaction_artifact_s3_bucket,
-            params.source_partition_locator,
-            params.destination_partition_locator,
-            **params.s3_client_kwargs,
-        )
-        if not round_completion_info:
-            logger.info(
-                "Both rebase partition and round completion file not found. Performing an entire backfill on source."
-            )
-        else:
-            compacted_delta_locator = round_completion_info.compacted_delta_locator
-
-            previous_compacted_delta_manifest = (
-                params.deltacat_storage.get_delta_manifest(
-                    compacted_delta_locator, **params.deltacat_storage_kwargs
-                )
-            )
-
-            high_watermark = round_completion_info.high_watermark
-            logger.info(f"Setting round completion high watermark: {high_watermark}")
-            assert (
-                params.hash_bucket_count == round_completion_info.hash_bucket_count
-            ), (
-                "The hash bucket count has changed. "
-                "Kindly run rebase compaction and trigger incremental again. "
-                f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
-                f"not equal to Hash bucket count in args={params.hash_bucket_count}."
-            )
-
-            logger.info(f"Round completion file: {round_completion_info}")
-
+    high_watermark = (
+        round_completion_info.high_watermark if round_completion_info else None
+    )
+    audit_url = compaction_audit.audit_url if compaction_audit else None
+    # discover and build uniform deltas
     delta_discovery_start = time.monotonic()
-
     input_deltas: List[Delta] = io.discover_deltas(
         params.source_partition_locator,
         params.last_stream_position_to_compact,
@@ -226,348 +134,46 @@ def _execute_compaction(
     if not input_deltas:
         logger.info("No input deltas found to compact.")
         return ExecutionCompactionResult(None, None, None, False)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-    delta_discovery_end = time.monotonic()
-
-    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))
-    compaction_audit.set_delta_discovery_time_in_seconds(
-        delta_discovery_end - delta_discovery_start
-    )
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    # create a new stream for this round
-    compacted_stream_locator: Optional[
-        StreamLocator
-    ] = params.destination_partition_locator.stream_locator
-    compacted_stream: Stream = params.deltacat_storage.get_stream(
-        compacted_stream_locator.namespace,
-        compacted_stream_locator.table_name,
-        compacted_stream_locator.table_version,
-        **params.deltacat_storage_kwargs,
-    )
-    compacted_partition: Partition = params.deltacat_storage.stage_partition(
-        compacted_stream,
-        params.destination_partition_locator.partition_values,
-        **params.deltacat_storage_kwargs,
-    )
-
-    hb_options_provider = functools.partial(
-        task_resource_options_provider,
-        pg_config=params.pg_config,
-        resource_amount_provider=hash_bucket_resource_options_provider,
-        previous_inflation=params.previous_inflation,
-        average_record_size_bytes=params.average_record_size_bytes,
-        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
-        primary_keys=params.primary_keys,
-        ray_custom_resources=params.ray_custom_resources,
-        memory_logs_enabled=params.memory_logs_enabled,
-    )
-
-    total_input_records_count = np.int64(0)
-    total_hb_record_count = np.int64(0)
-    telemetry_time_hb = 0
-    if params.hash_bucket_count == 1:
-        logger.info("Hash bucket count set to 1. Running local merge")
-        merge_start = time.monotonic()
-        local_merge_input = generate_local_merge_input(
-            params,
-            uniform_deltas,
-            compacted_partition,
-            round_completion_info,
-            delete_strategy,
-            delete_file_envelopes,
-        )
-        estimated_da_bytes = (
-            compaction_audit.estimated_in_memory_size_bytes_during_discovery
-        )
-        estimated_num_records = sum(
-            [
-                entry.meta.record_count
-                for delta in uniform_deltas
-                for entry in delta.manifest.entries
-            ]
-        )
-        local_merge_options = local_merge_resource_options_provider(
-            estimated_da_size=estimated_da_bytes,
-            estimated_num_rows=estimated_num_records,
-            total_memory_buffer_percentage=params.total_memory_buffer_percentage,
-            round_completion_info=round_completion_info,
-            compacted_delta_manifest=previous_compacted_delta_manifest,
-            ray_custom_resources=params.ray_custom_resources,
-            primary_keys=params.primary_keys,
-            memory_logs_enabled=params.memory_logs_enabled,
-        )
-        local_merge_result = ray.get(
-            mg.merge.options(**local_merge_options).remote(local_merge_input)
-        )
-        total_input_records_count += local_merge_result.input_record_count
-        merge_results = [local_merge_result]
-        merge_invoke_end = time.monotonic()
-    else:
-        hb_start = time.monotonic()
-
-        def hash_bucket_input_provider(index, item):
-            return {
-                "input": HashBucketInput.of(
-                    item,
-                    primary_keys=params.primary_keys,
-                    hb_task_index=index,
-                    num_hash_buckets=params.hash_bucket_count,
-                    num_hash_groups=params.hash_group_count,
-                    enable_profiler=params.enable_profiler,
-                    metrics_config=params.metrics_config,
-                    read_kwargs_provider=params.read_kwargs_provider,
-                    object_store=params.object_store,
-                    deltacat_storage=params.deltacat_storage,
-                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-                    memory_logs_enabled=params.memory_logs_enabled,
-                )
-            }
-
-        all_hash_group_idx_to_obj_id = defaultdict(list)
-        all_hash_group_idx_to_size_bytes = defaultdict(int)
-        all_hash_group_idx_to_num_rows = defaultdict(int)
-        hb_tasks_pending = invoke_parallel(
-            items=uniform_deltas,
-            ray_task=hb.hash_bucket,
-            max_parallelism=task_max_parallelism,
-            options_provider=hb_options_provider,
-            kwargs_provider=hash_bucket_input_provider,
-        )
-
-        hb_invoke_end = time.monotonic()
-
-        logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
-        hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
-        logger.info(f"Got {len(hb_results)} hash bucket results.")
-        hb_end = time.monotonic()
-
-        # we use time.time() here because time.monotonic() has no reference point
-        # whereas time.time() measures epoch seconds. Hence, it will be reasonable
-        # to compare time.time()s captured in different nodes.
-        hb_results_retrieved_at = time.time()
-
-        telemetry_time_hb = compaction_audit.save_step_stats(
-            CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
-            hb_results,
-            hb_results_retrieved_at,
-            hb_invoke_end - hb_start,
-            hb_end - hb_start,
-        )
-
-        s3_utils.upload(
-            compaction_audit.audit_url,
-            str(json.dumps(compaction_audit)),
-            **params.s3_client_kwargs,
-        )
-
-        hb_data_processed_size_bytes = np.int64(0)
-
-        # initialize all hash groups
-        for hb_group in range(params.hash_group_count):
-            all_hash_group_idx_to_num_rows[hb_group] = 0
-            all_hash_group_idx_to_obj_id[hb_group] = []
-            all_hash_group_idx_to_size_bytes[hb_group] = 0
-
-        for hb_result in hb_results:
-            hb_data_processed_size_bytes += hb_result.hb_size_bytes
-            total_input_records_count += hb_result.hb_record_count
-
-            for hash_group_index, object_id_size_tuple in enumerate(
-                hb_result.hash_bucket_group_to_obj_id_tuple
-            ):
-                if object_id_size_tuple:
-                    all_hash_group_idx_to_obj_id[hash_group_index].append(
-                        object_id_size_tuple[0],
-                    )
-                    all_hash_group_idx_to_size_bytes[
-                        hash_group_index
-                    ] += object_id_size_tuple[1].item()
-                    all_hash_group_idx_to_num_rows[
-                        hash_group_index
-                    ] += object_id_size_tuple[2].item()
-
-        logger.info(
-            f"Got {total_input_records_count} hash bucket records from hash bucketing step..."
-        )
-
-        total_hb_record_count = total_input_records_count
-        compaction_audit.set_hash_bucket_processed_size_bytes(
-            hb_data_processed_size_bytes.item()
-        )
-
-        # BSP Step 2: Merge
-        # NOTE: DELETE-type deltas are stored in Plasma object store
-        # in prepare_deletes and therefore don't need to included
-        # in merge task resource estimation
-        merge_options_provider = functools.partial(
-            task_resource_options_provider,
-            pg_config=params.pg_config,
-            resource_amount_provider=merge_resource_options_provider,
-            num_hash_groups=params.hash_group_count,
-            hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
-            hash_group_num_rows=all_hash_group_idx_to_num_rows,
-            total_memory_buffer_percentage=params.total_memory_buffer_percentage,
-            round_completion_info=round_completion_info,
-            compacted_delta_manifest=previous_compacted_delta_manifest,
-            primary_keys=params.primary_keys,
-            deltacat_storage=params.deltacat_storage,
-            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-            ray_custom_resources=params.ray_custom_resources,
-            memory_logs_enabled=params.memory_logs_enabled,
-        )
-
-        def merge_input_provider(index, item):
-            return {
-                "input": MergeInput.of(
-                    merge_file_groups_provider=RemoteMergeFileGroupsProvider(
-                        hash_group_index=item[0],
-                        dfe_groups_refs=item[1],
-                        hash_bucket_count=params.hash_bucket_count,
-                        num_hash_groups=params.hash_group_count,
-                        object_store=params.object_store,
-                    ),
-                    write_to_partition=compacted_partition,
-                    compacted_file_content_type=params.compacted_file_content_type,
-                    primary_keys=params.primary_keys,
-                    sort_keys=params.sort_keys,
-                    merge_task_index=index,
-                    drop_duplicates=params.drop_duplicates,
-                    max_records_per_output_file=params.records_per_compacted_file,
-                    enable_profiler=params.enable_profiler,
-                    metrics_config=params.metrics_config,
-                    s3_table_writer_kwargs=params.s3_table_writer_kwargs,
-                    read_kwargs_provider=params.read_kwargs_provider,
-                    round_completion_info=round_completion_info,
-                    object_store=params.object_store,
-                    deltacat_storage=params.deltacat_storage,
-                    deltacat_storage_kwargs=params.deltacat_storage_kwargs,
-                    delete_strategy=delete_strategy,
-                    delete_file_envelopes=delete_file_envelopes,
-                    memory_logs_enabled=params.memory_logs_enabled,
-                    disable_copy_by_reference=params.disable_copy_by_reference,
-                )
-            }
-
-        merge_start = time.monotonic()
-        merge_tasks_pending = invoke_parallel(
-            items=all_hash_group_idx_to_obj_id.items(),
-            ray_task=mg.merge,
-            max_parallelism=task_max_parallelism,
-            options_provider=merge_options_provider,
-            kwargs_provider=merge_input_provider,
-        )
-        merge_invoke_end = time.monotonic()
-        logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
-        merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
-
-    logger.info(f"Got {len(merge_results)} merge results.")
-
-    merge_results_retrieved_at = time.time()
-    merge_end = time.monotonic()
-
-    total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
-    total_deleted_record_count = sum(
-        [ddr.deleted_record_count for ddr in merge_results]
-    )
-    logger.info(
-        f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
-    )
-
-    compaction_audit.set_input_records(total_input_records_count.item())
-
-    telemetry_time_merge = compaction_audit.save_step_stats(
-        CompactionSessionAuditInfo.MERGE_STEP_NAME,
+    build_uniform_deltas_result: tuple[
+        List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition
+    ] = _build_uniform_deltas(
+        params, compaction_audit, input_deltas, delta_discovery_start
+    )
+    (
+        uniform_deltas,
+        delete_strategy,
+        delete_file_envelopes,
+    ) = build_uniform_deltas_result
+
+    # run merge
+    _run_hash_and_merge_result: tuple[
+        Optional[List[MergeResult]],
+        np.float64,
+        np.float64,
+        Partition,
+    ] = _run_hash_and_merge(
+        params,
+        uniform_deltas,
+        round_completion_info,
+        delete_strategy,
+        delete_file_envelopes,
+        compaction_audit,
+        previous_compacted_delta_manifest,
+    )
+    (
         merge_results,
-
-
-
-    )
-
-
-
-
-
-
-
-    mat_results: List[MaterializeResult] = sorted(
-        mat_results, key=lambda m: m.task_index
-    )
-
-    hb_id_to_entry_indices_range = {}
-    file_index = 0
-    previous_task_index = -1
-
-    for mat_result in mat_results:
-        assert (
-            mat_result.pyarrow_write_result.files >= 1
-        ), "Atleast one file must be materialized"
-        assert (
-            mat_result.task_index != previous_task_index
-        ), f"Multiple materialize results found for a hash bucket: {mat_result.task_index}"
-
-        hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
-            file_index,
-            file_index + mat_result.pyarrow_write_result.files,
-        )
-
-        file_index += mat_result.pyarrow_write_result.files
-        previous_task_index = mat_result.task_index
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    deltas = [m.delta for m in mat_results]
-
-    # Note: An appropriate last stream position must be set
-    # to avoid correctness issue.
-    merged_delta: Delta = Delta.merge_deltas(
-        deltas,
-        stream_position=params.last_stream_position_to_compact,
-    )
-
-    record_info_msg = (
-        f"Hash bucket records: {total_hb_record_count},"
-        f" Deduped records: {total_dd_record_count}, "
-        f" Deleted records: {total_deleted_record_count}, "
-        f" Materialized records: {merged_delta.meta.record_count}"
-    )
+        telemetry_time_hb,
+        telemetry_time_merge,
+        compacted_partition,
+    ) = _run_hash_and_merge_result
+    # process merge results
+    process_merge_results: tuple[
+        Delta, list[MaterializeResult], dict
+    ] = _process_merge_results(params, merge_results, compaction_audit)
+    merged_delta, mat_results, hb_id_to_entry_indices_range = process_merge_results
+    # Record information, logging, and return ExecutionCompactionResult
+    record_info_msg: str = f" Materialized records: {merged_delta.meta.record_count}"
     logger.info(record_info_msg)
-
     compacted_delta: Delta = params.deltacat_storage.commit_delta(
         merged_delta,
         properties=kwargs.get("properties", {}),
@@ -575,16 +181,15 @@ def _execute_compaction(
     )
 
     logger.info(f"Committed compacted delta: {compacted_delta}")
-
-
-
-
-    new_compacted_delta_locator = DeltaLocator.of(
+    compaction_end_time: float = time.monotonic()
+    compaction_audit.set_compaction_time_in_seconds(
+        compaction_end_time - compaction_start_time
+    )
+    new_compacted_delta_locator: DeltaLocator = DeltaLocator.of(
         compacted_partition.locator,
         compacted_delta.stream_position,
     )
-
-    pyarrow_write_result = PyArrowWriteResult.union(
+    pyarrow_write_result: PyArrowWriteResult = PyArrowWriteResult.union(
         [m.pyarrow_write_result for m in mat_results]
     )
 
@@ -597,100 +202,19 @@ def _execute_compaction(
         mat_results, telemetry_time_hb + telemetry_time_merge
     )
 
-
-
-
-    if (
-        compaction_audit.input_size_bytes
-        and compaction_audit.hash_bucket_processed_size_bytes
-    ):
-        input_inflation = (
-            compaction_audit.hash_bucket_processed_size_bytes
-            / compaction_audit.input_size_bytes
-        )
-
-    if (
-        compaction_audit.hash_bucket_processed_size_bytes
-        and compaction_audit.input_records
-    ):
-        input_average_record_size_bytes = (
-            compaction_audit.hash_bucket_processed_size_bytes
-            / compaction_audit.input_records
-        )
-
-    logger.info(
-        f"The inflation of input deltas={input_inflation}"
-        f" and average record size={input_average_record_size_bytes}"
+    _upload_compaction_audit(
+        params,
+        compaction_audit,
+        round_completion_info,
     )
-
-
-
-    if round_completion_info:
-        compaction_audit.set_input_file_count(
-            (compaction_audit.input_file_count or 0)
-            + round_completion_info.compacted_pyarrow_write_result.files
-        )
-        compaction_audit.set_input_size_bytes(
-            (compaction_audit.input_size_bytes or 0.0)
-            + round_completion_info.compacted_pyarrow_write_result.file_bytes
-        )
-        compaction_audit.set_input_records(
-            (compaction_audit.input_records or 0)
-            + round_completion_info.compacted_pyarrow_write_result.records
-        )
-
-    s3_utils.upload(
-        compaction_audit.audit_url,
-        str(json.dumps(compaction_audit)),
-        **params.s3_client_kwargs,
-    )
-
-    new_round_completion_info = RoundCompletionInfo.of(
-        high_watermark=params.last_stream_position_to_compact,
-        compacted_delta_locator=new_compacted_delta_locator,
-        compacted_pyarrow_write_result=pyarrow_write_result,
-        sort_keys_bit_width=params.bit_width_of_sort_keys,
-        manifest_entry_copied_by_reference_ratio=compaction_audit.untouched_file_ratio,
-        compaction_audit_url=audit_url,
-        hash_bucket_count=params.hash_bucket_count,
-        hb_index_to_entry_range=hb_id_to_entry_indices_range,
-        compactor_version=CompactorVersion.V2.value,
-        input_inflation=input_inflation,
-        input_average_record_size_bytes=input_average_record_size_bytes,
-    )
-
-    logger.info(
-        f"Partition-{params.source_partition_locator.partition_values},"
-        f"compacted at: {params.last_stream_position_to_compact},"
-    )
-    logger.info(
-        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
-    )
-    is_inplace_compacted: bool = (
-        rcf_source_partition_locator.partition_values
-        == params.destination_partition_locator.partition_values
-        and rcf_source_partition_locator.stream_id
-        == params.destination_partition_locator.stream_id
-    )
-    if is_inplace_compacted:
-        logger.info(
-            "Overriding round completion file source partition locator as in-place compacted. "
-            + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
-            f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
-        )
-        rcf_source_partition_locator = compacted_partition.locator
-
-    round_completion_file_s3_url = rcf.write_round_completion_file(
-        params.compaction_artifact_s3_bucket,
-        rcf_source_partition_locator,
-        compacted_partition.locator,
-        new_round_completion_info,
-        **params.s3_client_kwargs,
-    )
-
-    return ExecutionCompactionResult(
+    compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
+        params,
+        compaction_audit,
         compacted_partition,
-
-
-
+        audit_url,
+        hb_id_to_entry_indices_range,
+        rcf_source_partition_locator,
+        new_compacted_delta_locator,
+        pyarrow_write_result,
     )
+    return compaction_result
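The public entry point keeps its shape through the refactor: `compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]` still returns the S3 URL of the new round completion file, and returns None when no input deltas are found. A minimal usage sketch follows; the `CompactPartitionParams.of` dictionary keys are inferred from the `params.*` attribute accesses visible in this diff rather than from 1.1.13 documentation, so verify them against the installed package.

```python
# Minimal usage sketch of the refactored entry point. The dictionary keys
# below are inferred from params.* accesses in this diff, not a verified
# 1.1.13 API reference.
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)
from deltacat.compute.compactor_v2.compaction_session import compact_partition

# Placeholders: build these from your catalog/storage layer.
source_locator = ...        # PartitionLocator of the partition to compact
destination_locator = ...   # PartitionLocator for the compacted output
last_stream_position = ...  # stream position up to which to compact

params = CompactPartitionParams.of(
    {
        "source_partition_locator": source_locator,
        "destination_partition_locator": destination_locator,
        "last_stream_position_to_compact": last_stream_position,
        "compaction_artifact_s3_bucket": "my-compaction-artifacts",
        "hash_bucket_count": 8,
    }
)

# Returns the S3 URL of the new round completion file, or None when no
# input deltas are found to compact.
rcf_s3_url = compact_partition(params)
```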