deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff compares two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in those public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +297 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
- deltacat/compute/compactor/model/delta_annotated.py +95 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +4 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +22 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +509 -0
- deltacat/compute/compactor_v2/constants.py +37 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +143 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +469 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +152 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
- deltacat/compute/compactor_v2/utils/task_options.py +221 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
- deltacat/tests/compute/testcases.py +395 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +49 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +83 -0
- deltacat/types/tables.py +6 -0
- deltacat/utils/arguments.py +25 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +218 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/model/compaction_session_audit_info.py

@@ -1,5 +1,6 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
+import pyarrow as pa
 import logging
 from deltacat import logs
 from typing import List, Union
@@ -18,6 +19,7 @@ class CompactionSessionAuditInfo(dict):
     DEDUPE_STEP_NAME = "dedupe"
     MATERIALIZE_STEP_NAME = "materialize"
     HASH_BUCKET_STEP_NAME = "hashBucket"
+    MERGE_STEP_NAME = "merge"

     def __init__(self, deltacat_version: str, audit_url: str):
         self.set_deltacat_version(deltacat_version)
@@ -52,7 +54,7 @@ class CompactionSessionAuditInfo(dict):
     @property
     def uniform_deltas_created(self) -> int:
         """
-        The total number of
+        The total number of uniform deltas fed into the hash bucket step.
         """
         return self.get("uniformDeltasCreated")

@@ -68,7 +70,7 @@ class CompactionSessionAuditInfo(dict):
     @property
     def input_size_bytes(self) -> float:
         """
-        The on-disk size in bytes of the input.
+        The on-disk size in bytes of the input. Analogous to bytes scanned
         """
         return self.get("inputSizeBytes")

@@ -142,6 +144,15 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializeTaskPeakMemoryUsedBytes")

+    @property
+    def peak_memory_used_bytes_per_merge_task(self) -> float:
+        """
+        The peak memory used by a single merge python process. Note
+        that results may be max of merge, and hash bucketing as
+        processes are reused by Ray to run all compaction steps.
+        """
+        return self.get("mergeTaskPeakMemoryUsedBytes")
+
     @property
     def hash_bucket_post_object_store_memory_used_bytes(self) -> float:
         """
@@ -164,6 +175,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializePostObjectStoreMemoryUsedBytes")

+    @property
+    def merge_post_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used after merge step.
+        """
+        return self.get("mergePostObjectStoreMemoryUsedBytes")
+
     @property
     def materialize_buckets(self) -> int:
         """
@@ -233,11 +251,33 @@ class CompactionSessionAuditInfo(dict):
     @property
     def materialize_result_wait_time_in_seconds(self) -> float:
         """
-        The time it takes ray.get() to resolve after the last
+        The time it takes ray.get() to resolve after the last materialize task has completed.
         This value may not be accurate at less than 1 second precision.
         """
         return self.get("materializeResultWaitTimeInSeconds")

+    @property
+    def merge_result_wait_time_in_seconds(self) -> float:
+        """
+        The time it takes ray.get() to resolve after the last task has completed.
+        This value may not be accurate at less than 1 second precision.
+        """
+        return self.get("mergeResultWaitTimeInSeconds")
+
+    @property
+    def merge_time_in_seconds(self) -> float:
+        """
+        The time taken by merge step. This includes all merge tasks.
+        """
+        return self.get("mergeTimeInSeconds")
+
+    @property
+    def merge_invoke_time_in_seconds(self) -> float:
+        """
+        The time taken to invoke all merge tasks.
+        """
+        return self.get("mergeInvokeTimeInSeconds")
+
     @property
     def delta_discovery_time_in_seconds(self) -> float:
         """
@@ -337,6 +377,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializeResultSize")

+    @property
+    def merge_result_size(self) -> float:
+        """
+        The size of the results returned by merge step.
+        """
+        return self.get("mergeResultSize")
+
     @property
     def peak_memory_used_bytes_by_compaction_session_process(self) -> float:
         """
@@ -344,6 +391,42 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("peakMemoryUsedBytesCompactionSessionProcess")

+    @property
+    def estimated_in_memory_size_bytes_during_discovery(self) -> float:
+        """
+        The estimated in-memory size during the discovery. This can be used
+        to determine the accuracy of memory estimation logic.
+        """
+        return self.get("estimatedInMemorySizeBytesDuringDiscovery")
+
+    @property
+    def hash_bucket_processed_size_bytes(self) -> int:
+        """
+        The total size of the input data processed during hash bucket
+        """
+        return self.get("hashBucketProcessedSizeBytes")
+
+    @property
+    def total_cpu_seconds(self) -> float:
+        """
+        Total number of vCPUs provisioned in the cluster weighted over time.
+        """
+        return self.get("totalCPUSeconds")
+
+    @property
+    def used_cpu_seconds(self) -> float:
+        """
+        Total used vCPU in the cluster weighted over time.
+        """
+        return self.get("usedCPUSeconds")
+
+    @property
+    def pyarrow_version(self) -> str:
+        """
+        The version of PyArrow used.
+        """
+        return self.get("pyarrowVersion")
+
     # Setters follow

     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -428,6 +511,12 @@ class CompactionSessionAuditInfo(dict):
         ] = peak_memory_used_bytes_per_materialize_task
         return self

+    def set_peak_memory_used_bytes_per_merge_task(
+        self, peak_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeTaskPeakMemoryUsedBytes"] = peak_memory_used_bytes
+        return self
+
     def set_hash_bucket_post_object_store_memory_used_bytes(
         self, object_store_memory_used_bytes_by_hb: float
     ) -> CompactionSessionAuditInfo:
@@ -452,6 +541,12 @@ class CompactionSessionAuditInfo(dict):
         ] = object_store_memory_used_bytes_by_dedupe
         return self

+    def set_merge_post_object_store_memory_used_bytes(
+        self, object_store_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergePostObjectStoreMemoryUsedBytes"] = object_store_memory_used_bytes
+        return self
+
     def set_materialize_buckets(
         self, materialize_buckets: int
     ) -> CompactionSessionAuditInfo:
@@ -512,6 +607,24 @@ class CompactionSessionAuditInfo(dict):
         self.get["materializeResultWaitTimeInSeconds"] = wait_time
         return self

+    def set_merge_time_in_seconds(
+        self, time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeTimeInSeconds"] = time_in_seconds
+        return self
+
+    def set_merge_invoke_time_in_seconds(
+        self, invoke_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeInvokeTimeInSeconds"] = invoke_time
+        return self
+
+    def set_merge_result_wait_time_in_seconds(
+        self, wait_time: float
+    ) -> CompactionSessionAuditInfo:
+        self.get["mergeResultWaitTimeInSeconds"] = wait_time
+        return self
+
     def set_delta_discovery_time_in_seconds(
         self, delta_discovery_time_in_seconds: float
     ) -> CompactionSessionAuditInfo:
@@ -598,12 +711,42 @@ class CompactionSessionAuditInfo(dict):
         self["materializeResultSize"] = materialize_result_size_bytes
         return self

+    def set_merge_result_size_bytes(
+        self, merge_result_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeResultSize"] = merge_result_size_bytes
+        return self
+
     def set_peak_memory_used_bytes_by_compaction_session_process(
         self, peak_memory: float
     ) -> CompactionSessionAuditInfo:
         self["peakMemoryUsedBytesCompactionSessionProcess"] = peak_memory
         return self

+    def set_estimated_in_memory_size_bytes_during_discovery(
+        self, memory: float
+    ) -> CompactionSessionAuditInfo:
+        self["estimatedInMemorySizeBytesDuringDiscovery"] = memory
+        return self
+
+    def set_hash_bucket_processed_size_bytes(
+        self, size: int
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketProcessedSizeBytes"] = size
+        return self
+
+    def set_total_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["totalCPUSeconds"] = value
+        return self
+
+    def set_used_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["usedCPUSeconds"] = value
+        return self
+
+    def set_pyarrow_version(self, value: str) -> CompactionSessionAuditInfo:
+        self["pyarrowVersion"] = value
+        return self
+
     # High level methods to save stats
     def save_step_stats(
         self,
@@ -673,7 +816,10 @@ class CompactionSessionAuditInfo(dict):
         )

         total_count_of_src_dfl_not_touched = sum(
-            m.referenced_pyarrow_write_result.files
+            m.referenced_pyarrow_write_result.files
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )

         logger.info(
@@ -697,10 +843,16 @@ class CompactionSessionAuditInfo(dict):
         )

         untouched_file_record_count = sum(
-            m.referenced_pyarrow_write_result.records
+            m.referenced_pyarrow_write_result.records
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )
         untouched_file_size_bytes = sum(
-            m.referenced_pyarrow_write_result.file_bytes
+            m.referenced_pyarrow_write_result.file_bytes
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )

         self.set_untouched_file_count(total_count_of_src_dfl_not_touched)
@@ -715,11 +867,13 @@ class CompactionSessionAuditInfo(dict):
         self.set_peak_memory_used_bytes_per_task(
             max(
                 [
-                    self.peak_memory_used_bytes_per_hash_bucket_task,
-                    self.peak_memory_used_bytes_per_dedupe_task,
-                    self.peak_memory_used_bytes_per_materialize_task,
+                    self.peak_memory_used_bytes_per_hash_bucket_task or 0,
+                    self.peak_memory_used_bytes_per_dedupe_task or 0,
+                    self.peak_memory_used_bytes_per_materialize_task or 0,
+                    self.peak_memory_used_bytes_per_merge_task or 0,
                 ]
             )
         )

+        self.set_pyarrow_version(pa.__version__)
         self.set_telemetry_time_in_seconds(total_telemetry_time)
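The new merge-related fields follow the same pattern as the existing hash bucket, dedupe, and materialize fields: camelCase keys on a plain `dict` subclass with fluent setters and `get()`-backed properties. A minimal usage sketch, assuming only the constructor and setters visible in the hunks above (the audit URL and all numbers are illustrative):

```python
from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

# Illustrative values only; the constructor and setters are the ones shown above.
audit = CompactionSessionAuditInfo("0.1.18b16", "s3://my-bucket/audit/session.json")
audit.set_merge_time_in_seconds(42.0).set_merge_invoke_time_in_seconds(
    1.5
).set_peak_memory_used_bytes_per_merge_task(2 * 1024**3)

print(audit.merge_time_in_seconds)            # 42.0, read back from "mergeTimeInSeconds"
print(audit["mergeTaskPeakMemoryUsedBytes"])  # 2147483648
```

Because every setter returns `self`, the calls chain the same way the pre-existing setters do.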
deltacat/compute/compactor/model/delta_annotated.py

@@ -2,7 +2,9 @@
 from __future__ import annotations

 import logging
-from types import FunctionType
+import copy
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.types.partial_download import PartialParquetParameters
 from typing import Callable, List, Optional, Union

 from deltacat import logs
@@ -64,7 +66,9 @@ class DeltaAnnotated(Delta):
         annotated_deltas: List[DeltaAnnotated],
         min_delta_bytes: float,
         min_file_counts: Optional[Union[int, float]] = float("inf"),
-        estimation_function: Optional[
+        estimation_function: Optional[
+            Callable[[ManifestEntry], float]
+        ] = lambda entry: entry.meta.content_length,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -76,11 +80,21 @@ class DeltaAnnotated(Delta):
         of bytes at rest for the associated object. Returns the list of annotated
         delta groups.
         """
-        groups = []
+        split_annotated_deltas: List[DeltaAnnotated] = []
+        groups: List[DeltaAnnotated] = []
         new_da = DeltaAnnotated()
         new_da_bytes = 0
         da_group_entry_count = 0
-        for src_da in annotated_deltas:
+
+        for delta_annotated in annotated_deltas:
+            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+
+        logger.info(
+            f"Split the {len(annotated_deltas)} annotated deltas "
+            f"into {len(split_annotated_deltas)} groups."
+        )
+
+        for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
             assert (
@@ -105,11 +119,7 @@ class DeltaAnnotated(Delta):
                     src_da, new_da, src_entry, src_da_annotations[i]
                 )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes = (
-                    estimation_function(src_entry.meta.content_length)
-                    if type(estimation_function) is FunctionType
-                    else src_entry.meta.content_length
-                )
+                estimated_new_da_bytes = estimation_function(src_entry)
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
                 if (
@@ -132,6 +142,7 @@ class DeltaAnnotated(Delta):
                     da_group_entry_count = 0
         if new_da:
             groups.append(new_da)
+
         return groups

     @staticmethod
@@ -207,3 +218,78 @@ class DeltaAnnotated(Delta):
             dst_da.type = None
         entries.append(src_entry)
         dst_da.annotations.append(src_annotation)
+
+    @staticmethod
+    def _split_single(delta_annotated: DeltaAnnotated) -> List[DeltaAnnotated]:
+        """
+        Split a single delta annotated into multiple granular
+        annotated entries. Note that split is not always guaranteed.
+
+        Note: Currently we are only able to split the Parquet File downloads.
+        """
+
+        result = []
+
+        if (
+            delta_annotated.meta
+            and delta_annotated.manifest
+            and delta_annotated.meta.content_type == ContentType.PARQUET
+            and delta_annotated.meta.content_encoding == ContentEncoding.IDENTITY
+        ):
+            # we split by row groups
+            for entry_index, entry in enumerate(delta_annotated.manifest.entries):
+                input_split_params = None
+                if entry.meta and entry.meta.content_type_parameters:
+                    for type_params in entry.meta.content_type_parameters:
+                        if (
+                            isinstance(type_params, PartialParquetParameters)
+                            and type_params.num_row_groups > 1
+                            and type_params.pq_metadata
+                        ):
+                            input_split_params = type_params
+                            break
+
+                if input_split_params:
+                    logger.info(
+                        f"Splitting input file with URI: {entry.uri} into "
+                        f"different {input_split_params.num_row_groups} entries"
+                    )
+
+                    for rg in input_split_params.row_groups_to_download:
+                        new_da = DeltaAnnotated()
+                        new_entry_dict = copy.deepcopy(entry)
+                        new_entry = ManifestEntry(new_entry_dict)
+
+                        row_group_meta = input_split_params.pq_metadata.row_group(rg)
+
+                        new_partial_params = PartialParquetParameters.of(
+                            row_groups_to_download=[rg],
+                            num_row_groups=1,
+                            num_rows=row_group_meta.num_rows,
+                            in_memory_size_bytes=row_group_meta.total_byte_size,
+                            pq_metadata=input_split_params.pq_metadata,
+                        )
+
+                        new_entry.meta.content_type_parameters = [new_partial_params]
+                        for type_params in entry.meta.content_type_parameters:
+                            if not isinstance(type_params, PartialParquetParameters):
+                                new_entry.meta.content_type_parameters.append(
+                                    type_params
+                                )
+
+                        DeltaAnnotated._append_annotated_entry(
+                            delta_annotated,
+                            new_da,
+                            new_entry,
+                            delta_annotated.annotations[entry_index],
+                        )
+
+                        result.append(new_da)
+                else:
+                    return [delta_annotated]

+        logger.info(
+            f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+        )
+
+        return [delta_annotated]
deltacat/compute/compactor/model/delta_file_envelope.py

@@ -2,6 +2,7 @@
 from __future__ import annotations

 import numpy as np
+import pyarrow as pa

 from deltacat.storage import DeltaType, LocalTable

@@ -37,8 +38,6 @@ class DeltaFileEnvelope(dict):
         """
         if stream_position is None:
             raise ValueError("Missing delta file envelope stream position.")
-        if file_index is None:
-            raise ValueError("Missing delta file envelope file index.")
         if delta_type is None:
             raise ValueError("Missing Delta file envelope delta type.")
         if table is None:
@@ -75,3 +74,16 @@ class DeltaFileEnvelope(dict):
     @property
     def file_record_count(self) -> int:
         return self["file_record_count"]
+
+    @property
+    def table_size_bytes(self) -> int:
+        if isinstance(self.table, pa.Table):
+            return self.table.nbytes
+        else:
+            raise ValueError(
+                f"Table type: {type(self.table)} not for supported for size method."
+            )
+
+    @property
+    def table_num_rows(self) -> int:
+        return len(self.table)
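The new `table_size_bytes` and `table_num_rows` properties only wrap standard PyArrow accessors, so their behavior can be checked directly against a plain `pa.Table` (this is pure PyArrow, not the deltacat model itself):

```python
import pyarrow as pa

table = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})

# What table_size_bytes returns for a pyarrow-backed envelope:
# the in-memory buffer size of the table.
print(table.nbytes)

# What table_num_rows returns: len() of a pa.Table is its row count.
print(len(table))
```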
deltacat/compute/compactor/model/round_completion_info.py

@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

+from typing import Tuple
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -40,9 +41,11 @@ class RoundCompletionInfo(dict):
         compacted_delta_locator: DeltaLocator,
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
-        rebase_source_partition_locator: Optional[PartitionLocator],
+        rebase_source_partition_locator: Optional[PartitionLocator] = None,
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
         compaction_audit_url: Optional[str] = None,
+        hash_bucket_count: Optional[int] = None,
+        hb_index_to_entry_range: Optional[Dict[int, Tuple[int, int]]] = None,
     ) -> RoundCompletionInfo:

         rci = RoundCompletionInfo()
@@ -55,6 +58,8 @@ class RoundCompletionInfo(dict):
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
         rci["compactionAuditUrl"] = compaction_audit_url
+        rci["hashBucketCount"] = hash_bucket_count
+        rci["hbIndexToEntryRange"] = hb_index_to_entry_range
         return rci

     @property
@@ -97,3 +102,14 @@ class RoundCompletionInfo(dict):
     @property
     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
         return self["manifestEntryCopiedByReferenceRatio"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hashBucketCount"]
+
+    @property
+    def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
+        """
+        The start index is inclusive and end index is exclusive by default.
+        """
+        return self["hbIndexToEntryRange"]
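A sketch of the mapping shape the new `hbIndexToEntryRange` field carries: hash-bucket index to a `(start, end)` pair of entry positions, with start inclusive and end exclusive as the docstring above states. The values below are hypothetical, purely to illustrate how such a mapping would be consumed:

```python
from typing import Dict, Tuple

# Hypothetical ranges: hash bucket index -> (start, end) manifest entry positions.
hb_index_to_entry_range: Dict[int, Tuple[int, int]] = {
    0: (0, 3),  # entries 0, 1, 2 were produced by hash bucket 0
    1: (3, 5),  # entries 3, 4 were produced by hash bucket 1
}

def entry_indices_for_bucket(hb_index: int) -> range:
    start, end = hb_index_to_entry_range[hb_index]
    return range(start, end)  # start inclusive, end exclusive

print(list(entry_indices_for_bucket(0)))  # [0, 1, 2]
```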
deltacat/compute/compactor/repartition_session.py

@@ -54,6 +54,7 @@ def repartition(
     pg_config: Optional[PlacementGroupConfig] = None,
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
@@ -91,7 +92,7 @@ def repartition(
             source_partition_locator.partition_values,
         ).stream_position,
         deltacat_storage,
-
+        list_deltas_kwargs,
     )

     uniform_deltas = []
@@ -131,6 +132,7 @@ def repartition(
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        s3_table_writer_kwargs=s3_table_writer_kwargs,
         repartitioned_file_content_type=repartitioned_file_content_type,
         deltacat_storage=deltacat_storage,
     )
@@ -162,6 +164,7 @@ def repartition(
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs={},
     )
     repartition_completion_info = RoundCompletionInfo.of(
         last_stream_position_to_compact,
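The repartition changes are parameter plumbing: a new optional `s3_table_writer_kwargs` dict is accepted and forwarded to the repartition step, and `deltacat_storage_kwargs` is defaulted to an empty dict at one call site. A minimal sketch of that forwarding pattern in isolation (the function and argument names below are illustrative, not the deltacat API):

```python
from typing import Any, Dict, Optional

def write_table(path: str, **writer_kwargs: Any) -> None:
    # Stand-in for an S3 table writer; just show what it would receive.
    print(f"writing {path} with {writer_kwargs}")

def repartition_range(
    path: str,
    s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    # Default the optional kwargs dict before forwarding, the same way the
    # hunk above passes deltacat_storage_kwargs={} explicitly.
    s3_table_writer_kwargs = s3_table_writer_kwargs or {}
    write_table(path, **s3_table_writer_kwargs)

repartition_range("s3://bucket/key", s3_table_writer_kwargs={"flavor": "spark"})
```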
deltacat/compute/compactor/steps/dedupe.py

@@ -107,20 +107,21 @@ def _timed_dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        # TODO (pdames): mitigate risk of running out of memory here in cases of
-        # severe skew of primary key updates in deltas
+        # TODO (pdames): mitigate risk of running out of memory here in cases of severe skew of primary key updates in deltas
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
             f"groups for {len(object_ids)} object refs..."
         )
-
-
+        delta_file_envelope_groups_list: List[object] = object_store.get_many(
+            object_ids
+        )
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -171,7 +172,8 @@ def _timed_dedupe(

             hb_table_record_count = len(table)
             table, drop_time = timed_invocation(
-                func=_drop_duplicates_by_primary_key_hash,
+                func=_drop_duplicates_by_primary_key_hash,
+                table=table,
             )
             deduped_record_count = hb_table_record_count - len(table)
             total_deduped_records += deduped_record_count
@@ -227,7 +229,6 @@ def _timed_dedupe(
         )

         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
-
         return DedupeResult(
             mat_bucket_to_dd_idx_obj_id,
             np.int64(total_deduped_records),
@@ -246,6 +247,7 @@ def dedupe(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -256,6 +258,7 @@ def dedupe(
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
         object_store=object_store,
+        **kwargs,
     )

     emit_metrics_time = 0.0
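Both dedupe entry points now accept `**kwargs` and forward them through `timed_invocation`, which is called above with the target function plus its keyword arguments and unpacked into a `(result, elapsed_time)` pair. A small stand-in with the same call shape (this mirrors the usage shown in the hunks; it is not the deltacat helper itself):

```python
import time
from typing import Any, Callable, Tuple

def timed_invocation(func: Callable[..., Any], **kwargs: Any) -> Tuple[Any, float]:
    # Same call shape as the helper used above: run func(**kwargs), report latency.
    start = time.monotonic()
    result = func(**kwargs)
    return result, time.monotonic() - start

def drop_duplicates(table: list) -> list:
    # Toy stand-in for _drop_duplicates_by_primary_key_hash.
    return list(dict.fromkeys(table))

deduped, drop_time = timed_invocation(func=drop_duplicates, table=[1, 1, 2, 3, 3])
print(deduped, f"{drop_time:.6f}s")
```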