deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +47 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
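The one-line change to deltacat/__init__.py above is most likely the version bump itself. A quick, hypothetical way to confirm which side of this diff an environment is running, assuming (as is typical, and as that one-line change suggests) that the package exposes a __version__ attribute from its top-level __init__.py:

# Hypothetical check, not part of the diff: confirm the installed deltacat version.
# Assumes deltacat exposes __version__ from deltacat/__init__.py.
import deltacat

print(deltacat.__version__)  # expect "0.2.9" before the upgrade, "1.0.0" after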
deltacat/compute/compactor_v2/steps/hash_bucket.py
@@ -5,7 +5,6 @@ from contextlib import nullcontext
 from typing import List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
 import numpy as np
-import pyarrow as pa
 import ray
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -14,12 +13,12 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor_v2.utils.delta import read_delta_file_envelopes
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     group_hash_bucket_indices,
     group_by_pk_hash_bucket,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
-from deltacat.types.media import StorageType
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
     get_current_ray_worker_id,
@@ -39,57 +38,6 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


-def _read_delta_file_envelopes(
-    annotated_delta: DeltaAnnotated,
-    read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[dict] = None,
-) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
-
-    tables = deltacat_storage.download_delta(
-        annotated_delta,
-        max_parallelism=1,
-        file_reader_kwargs_provider=read_kwargs_provider,
-        storage_type=StorageType.LOCAL,
-        **deltacat_storage_kwargs,
-    )
-    annotations = annotated_delta.annotations
-    assert (
-        len(tables) == len(annotations),
-        f"Unexpected Error: Length of downloaded delta manifest tables "
-        f"({len(tables)}) doesn't match the length of delta manifest "
-        f"annotations ({len(annotations)}).",
-    )
-    if not tables:
-        return None, 0, 0
-
-    delta_stream_position = annotations[0].annotation_stream_position
-    delta_type = annotations[0].annotation_delta_type
-
-    for annotation in annotations:
-        assert annotation.annotation_stream_position == delta_stream_position, (
-            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
-            f"!= {delta_stream_position}"
-        )
-        assert annotation.annotation_delta_type == delta_type, (
-            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
-            f"!= {delta_type}"
-        )
-
-    delta_file_envelopes = []
-    table = pa.concat_tables(tables)
-    total_record_count = len(table)
-    total_size_bytes = int(table.nbytes)
-
-    delta_file = DeltaFileEnvelope.of(
-        stream_position=delta_stream_position,
-        delta_type=delta_type,
-        table=table,
-    )
-    delta_file_envelopes.append(delta_file)
-    return delta_file_envelopes, total_record_count, total_size_bytes
-
-
 def _group_file_records_by_pk_hash_bucket(
     annotated_delta: DeltaAnnotated,
     num_hash_buckets: int,
@@ -103,7 +51,7 @@ def _group_file_records_by_pk_hash_bucket(
         delta_file_envelopes,
         total_record_count,
         total_size_bytes,
-    ) = _read_delta_file_envelopes(
+    ) = read_delta_file_envelopes(
         annotated_delta,
         read_kwargs_provider,
         deltacat_storage,
@@ -187,7 +135,7 @@ def _timed_hash_bucket(input: HashBucketInput):
 @ray.remote
 def hash_bucket(input: HashBucketInput) -> HashBucketResult:
     with ProcessUtilizationOverTimeRange() as process_util:
-        logger.info(f"Starting hash bucket task...")
+        logger.info(f"Starting hash bucket task {input.hb_task_index}...")

         # Log node peak memory utilization every 10 seconds
         def log_peak_memory():
@@ -212,7 +160,7 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
             )
             emit_metrics_time = latency

-        logger.info(f"Finished hash bucket task...")
+        logger.info(f"Finished hash bucket task {input.hb_task_index}...")
         return HashBucketResult(
             hash_bucket_result[0],
             hash_bucket_result[1],
deltacat/compute/compactor_v2/steps/merge.py
@@ -6,28 +6,21 @@ import pyarrow as pa
 import ray
 import time
 import pyarrow.compute as pc
+import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
-from collections import defaultdict
 from deltacat import logs
-from typing import List, Optional
-from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+from typing import List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
-from deltacat.compute.compactor import (
-    RoundCompletionInfo,
-    DeltaFileEnvelope,
-)
+from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope
 from deltacat.utils.common import ReadKwargsProvider
-
 from contextlib import nullcontext
-from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.ray_utils.runtime import (
     get_current_ray_task_id,
     get_current_ray_worker_id,
 )
 from deltacat.compute.compactor.utils import system_columns as sc
-
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics
 from deltacat.utils.resources import (
@@ -36,7 +29,6 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
-    hash_group_index_to_hash_bucket_indices,
 )
 from deltacat.storage import (
     Delta,
@@ -77,14 +69,9 @@ def _drop_delta_type_rows(table: pa.Table, delta_type: DeltaType) -> pa.Table:


 def _build_incremental_table(
-    hash_bucket_index: int,
     df_envelopes_list: List[List[DeltaFileEnvelope]],
 ) -> pa.Table:

-    logger.info(
-        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
-        f"{len(df_envelopes_list)} delta file envelope lists..."
-    )
     hb_tables = []
     # sort by delta file stream position now instead of sorting every row later
     df_envelopes = [d for dfe_list in df_envelopes_list for d in dfe_list]
@@ -270,174 +257,120 @@ def _copy_all_manifest_files_from_old_hash_buckets(
     return materialize_result_list


[... 27 removed lines (old 273-299) are not fully preserved in this rendering ...]
+def _compact_tables(
+    input: MergeInput, dfe_list: List[List[DeltaFileEnvelope]], hb_idx: int
+) -> Tuple[pa.Table, int, int]:
+    logger.info(
+        f"[Hash bucket index {hb_idx}] Reading dedupe input for "
+        f"{len(dfe_list)} delta file envelope lists..."
+    )
+    table = _build_incremental_table(dfe_list)
+
+    incremental_len = len(table)
+    logger.info(
+        f"[Hash bucket index {hb_idx}] Got the incremental table of length {incremental_len}"
+    )
+
+    if input.sort_keys:
+        # Incremental is sorted and merged, as sorting
+        # on non event based sort key does not produce consistent
+        # compaction results. E.g., compaction(delta1, delta2, delta3)
+        # will not be equal to compaction(compaction(delta1, delta2), delta3).
+        table = table.sort_by(input.sort_keys)
+
+    compacted_table = None
+
+    if (
+        input.round_completion_info
+        and input.round_completion_info.hb_index_to_entry_range
+        and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
+        is not None
+    ):
+        compacted_table = _download_compacted_table(
+            hb_index=hb_idx,
+            rcf=input.round_completion_info,
+            read_kwargs_provider=input.read_kwargs_provider,
+            deltacat_storage=input.deltacat_storage,
+            deltacat_storage_kwargs=input.deltacat_storage_kwargs,
         )
[... 6 removed lines (old 301-306) are not preserved in this rendering ...]
+
+    hb_table_record_count = len(table) + (
+        len(compacted_table) if compacted_table else 0
+    )
+
+    table, merge_time = timed_invocation(
+        func=_merge_tables,
+        table=table,
+        primary_keys=input.primary_keys,
+        can_drop_duplicates=input.drop_duplicates,
+        compacted_table=compacted_table,
+    )
+    total_deduped_records = hb_table_record_count - len(table)
+
+    logger.info(
+        f"[Merge task index {input.merge_task_index}] Merged "
+        f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
+    )
+
+    return table, incremental_len, total_deduped_records
+
+
+def _copy_manifests_from_hash_bucketing(
+    input: MergeInput, hb_index_copy_by_reference_ids: List[int]
+) -> List[MaterializeResult]:
+    materialized_results: List[MaterializeResult] = []
+
+    if input.round_completion_info:
+        referenced_materialized_results = (
+            _copy_all_manifest_files_from_old_hash_buckets(
+                hb_index_copy_by_reference_ids,
+                input.round_completion_info,
+                input.write_to_partition,
+                input.deltacat_storage,
+                input.deltacat_storage_kwargs,
+            )
         )
-
-
-            task_index=hash_bucket_index,
-            # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
-            # and in-memory-table-bytes instead of tight coupling to paBytes
-            pyarrow_write_result=PyArrowWriteResult.of(
-                len(manifest.entries),
-                TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
-                manifest.meta.content_length,
-                len(compacted_table),
-            ),
+        logger.info(
+            f"Copying {len(referenced_materialized_results)} manifest files by reference..."
         )
-
-        return materialize_result
+        materialized_results.extend(referenced_materialized_results)

+    return materialized_results
+
+
+def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"merge_{worker_id}_{task_id}.bin"
     ) if input.enable_profiler else nullcontext():
-
-
-
-
-            f"[Merge task {input.merge_task_index}] Getting delta file envelope "
-            f"groups for {len(input.dfe_groups_refs)} object refs..."
-        )
-
-        delta_file_envelope_groups_list = input.object_store.get_many(
-            input.dfe_groups_refs
-        )
-        hb_index_to_delta_file_envelopes_list = defaultdict(list)
-        for delta_file_envelope_groups in delta_file_envelope_groups_list:
-            assert input.hash_bucket_count == len(delta_file_envelope_groups), (
-                f"The hash bucket count must match the dfe size as {input.hash_bucket_count}"
-                f" != {len(delta_file_envelope_groups)}"
-            )
-
-            for hb_idx, dfes in enumerate(delta_file_envelope_groups):
-                if dfes:
-                    hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
-
-        valid_hb_indices_iterable = hash_group_index_to_hash_bucket_indices(
-            input.hash_group_index, input.hash_bucket_count, input.num_hash_groups
-        )
+        total_input_records, total_deduped_records = 0, 0
+        materialized_results: List[MaterializeResult] = []
+        merge_file_groups = input.merge_file_groups_provider.create()
+        hb_index_copy_by_ref_ids = []

-
-
+        for merge_file_group in merge_file_groups:
+            if not merge_file_group.dfe_groups:
+                hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
+                continue

-
-
-        for hb_idx in valid_hb_indices_iterable:
-            dfe_list = hb_index_to_delta_file_envelopes_list.get(hb_idx)
-
-            if dfe_list:
-                total_dfes_found += 1
-                table = _build_incremental_table(hb_idx, dfe_list)
-
-                incremental_len = len(table)
-                logger.info(
-                    f"Got the incremental table of length {incremental_len} for hash bucket {hb_idx}"
-                )
-
-                if input.sort_keys:
-                    # Incremental is sorted and merged, as sorting
-                    # on non event based sort key does not produce consistent
-                    # compaction results. E.g., compaction(delta1, delta2, delta3)
-                    # will not be equal to compaction(compaction(delta1, delta2), delta3).
-                    table = table.sort_by(input.sort_keys)
-
-                compacted_table = None
-                if (
-                    input.round_completion_info
-                    and input.round_completion_info.hb_index_to_entry_range
-                    and input.round_completion_info.hb_index_to_entry_range.get(
-                        str(hb_idx)
-                    )
-                    is not None
-                ):
-
-                    compacted_table = _download_compacted_table(
-                        hb_index=hb_idx,
-                        rcf=input.round_completion_info,
-                        read_kwargs_provider=input.read_kwargs_provider,
-                        deltacat_storage=input.deltacat_storage,
-                        deltacat_storage_kwargs=input.deltacat_storage_kwargs,
-                    )
-
-                hb_table_record_count = len(table) + (
-                    len(compacted_table) if compacted_table else 0
-                )
-
-                table, merge_time = timed_invocation(
-                    func=_merge_tables,
-                    table=table,
-                    primary_keys=input.primary_keys,
-                    can_drop_duplicates=input.drop_duplicates,
-                    compacted_table=compacted_table,
-                )
-                total_deduped_records += hb_table_record_count - len(table)
-
-                logger.info(
-                    f"[Merge task index {input.merge_task_index}] Merged "
-                    f"record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
-                )
-
-                materialized_results.append(_materialize(hb_idx, [table]))
-            else:
-                hb_index_copy_by_reference.append(hb_idx)
-
-        if input.round_completion_info and hb_index_copy_by_reference:
-            referenced_materialized_results = (
-                _copy_all_manifest_files_from_old_hash_buckets(
-                    hb_index_copy_by_reference,
-                    input.round_completion_info,
-                    input.write_to_partition,
-                    input.deltacat_storage,
-                    input.deltacat_storage_kwargs,
-                )
+            table, input_records, deduped_records = _compact_tables(
+                input, merge_file_group.dfe_groups, merge_file_group.hb_index
             )
-
-
+            total_input_records += input_records
+            total_deduped_records += deduped_records
+            materialized_results.append(
+                merge_utils.materialize(input, merge_file_group.hb_index, [table])
             )
-            materialized_results.extend(referenced_materialized_results)

-
-
-
-
+        if hb_index_copy_by_ref_ids:
+            materialized_results.extend(
+                _copy_manifests_from_hash_bucketing(input, hb_index_copy_by_ref_ids)
+            )

-
[... 2 removed lines (old 439-440) are not fully preserved in this rendering ...]
+        logger.info(
+            f"[Hash group index: {input.merge_file_groups_provider.hash_group_index}]"
+            f" Total number of materialized results produced: {len(materialized_results)} "
         )

     peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
@@ -447,6 +380,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:

     return MergeResult(
         materialized_results,
+        np.int64(total_input_records),
         np.int64(total_deduped_records),
         np.double(peak_memory_usage_bytes),
         np.double(0.0),
@@ -457,7 +391,7 @@ def _timed_merge(input: MergeInput) -> MergeResult:
 @ray.remote
 def merge(input: MergeInput) -> MergeResult:
     with ProcessUtilizationOverTimeRange() as process_util:
-        logger.info(f"Starting merge task...")
+        logger.info(f"Starting merge task {input.merge_task_index}...")

         # Log node peak memory utilization every 10 seconds
         def log_peak_memory():
@@ -480,11 +414,12 @@ def merge(input: MergeInput) -> MergeResult:
             )
             emit_metrics_time = latency

-        logger.info(f"Finished merge task...")
+        logger.info(f"Finished merge task {input.merge_task_index}...")
         return MergeResult(
             merge_result[0],
             merge_result[1],
             merge_result[2],
+            merge_result[3],
             np.double(emit_metrics_time),
             merge_result[4],
         )
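The record accounting in the new _compact_tables above is: rows going into the merge (the incremental table plus any previously compacted table) minus rows coming out equals the number of deduplicated records. A minimal, self-contained sketch of that idea in plain pyarrow; this is an illustration only, not DeltaCat's _merge_tables, and the merge_on_primary_key helper below is invented for the example:

import pyarrow as pa


def merge_on_primary_key(compacted: pa.Table, incremental: pa.Table, pk: str) -> pa.Table:
    # Append incremental rows after the previously compacted rows, then keep
    # only the last occurrence of each primary key so newer rows win.
    combined = pa.concat_tables([compacted, incremental])
    last_index_per_key = {key: i for i, key in enumerate(combined.column(pk).to_pylist())}
    return combined.take(sorted(last_index_per_key.values()))


compacted = pa.table({"pk": [1, 2, 3], "val": ["a", "b", "c"]})
incremental = pa.table({"pk": [2, 3, 4], "val": ["b2", "c2", "d"]})

merged = merge_on_primary_key(compacted, incremental, "pk")
records_in = len(compacted) + len(incremental)  # 6, the hb_table_record_count analogue
deduped = records_in - len(merged)              # 2 rows superseded by newer versions
print(merged.to_pydict(), deduped)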
deltacat/compute/compactor_v2/utils/delta.py (new file)
@@ -0,0 +1,97 @@
+import time
+from typing import List, Optional, Tuple
+
+from deltacat.compute.compactor import (
+    DeltaAnnotated,
+    DeltaFileEnvelope,
+)
+
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.types.media import StorageType
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat import logs
+
+import pyarrow as pa
+import logging
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def read_delta_file_envelopes(
+    annotated_delta: DeltaAnnotated,
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[dict] = None,
+) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
+    tables = deltacat_storage.download_delta(
+        annotated_delta,
+        max_parallelism=1,
+        file_reader_kwargs_provider=read_kwargs_provider,
+        storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
+    )
+    annotations = annotated_delta.annotations
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
+    if not tables:
+        return None, 0, 0
+
+    delta_stream_position = annotations[0].annotation_stream_position
+    delta_type = annotations[0].annotation_delta_type
+
+    for annotation in annotations:
+        assert annotation.annotation_stream_position == delta_stream_position, (
+            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
+            f"!= {delta_stream_position}"
+        )
+        assert annotation.annotation_delta_type == delta_type, (
+            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
+            f"!= {delta_type}"
+        )
+
+    delta_file_envelopes = []
+    table = pa.concat_tables(tables)
+    total_record_count = len(table)
+    total_size_bytes = int(table.nbytes)
+
+    delta_file = DeltaFileEnvelope.of(
+        stream_position=delta_stream_position,
+        delta_type=delta_type,
+        table=table,
+    )
+    delta_file_envelopes.append(delta_file)
+    return delta_file_envelopes, total_record_count, total_size_bytes
+
+
+def get_local_delta_file_envelopes(
+    uniform_deltas: List[DeltaAnnotated],
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[dict] = None,
+) -> Tuple[List[DeltaFileEnvelope], int]:
+    local_dfe_list = []
+    input_records_count = 0
+    logger.info(f"Getting {len(uniform_deltas)} DFE Tasks.")
+    dfe_start = time.monotonic()
+    for annotated_delta in uniform_deltas:
+        (
+            delta_file_envelopes,
+            total_record_count,
+            total_size_bytes,
+        ) = read_delta_file_envelopes(
+            annotated_delta,
+            read_kwargs_provider,
+            deltacat_storage,
+            deltacat_storage_kwargs,
+        )
+        if delta_file_envelopes:
+            local_dfe_list.extend(delta_file_envelopes)
+            input_records_count += total_record_count
+    dfe_end = time.monotonic()
+    logger.info(f"Retrieved {len(local_dfe_list)} DFE Tasks in {dfe_end - dfe_start}s.")
+    return local_dfe_list, input_records_count