deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,9 @@
 from __future__ import annotations
 
 import logging
-
+import copy
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.types.partial_download import PartialParquetParameters
 from typing import Callable, List, Optional, Union
 
 from deltacat import logs
@@ -64,7 +66,9 @@ class DeltaAnnotated(Delta):
         annotated_deltas: List[DeltaAnnotated],
         min_delta_bytes: float,
         min_file_counts: Optional[Union[int, float]] = float("inf"),
-        estimation_function: Optional[
+        estimation_function: Optional[
+            Callable[[ManifestEntry], float]
+        ] = lambda entry: entry.meta.content_length,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -76,11 +80,16 @@ class DeltaAnnotated(Delta):
         of bytes at rest for the associated object. Returns the list of annotated
         delta groups.
         """
-
+        split_annotated_deltas: List[DeltaAnnotated] = []
+        groups: List[DeltaAnnotated] = []
         new_da = DeltaAnnotated()
         new_da_bytes = 0
         da_group_entry_count = 0
-
+
+        for delta_annotated in annotated_deltas:
+            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+
+        for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
             assert (
@@ -105,11 +114,7 @@ class DeltaAnnotated(Delta):
                     src_da, new_da, src_entry, src_da_annotations[i]
                 )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes = (
-                    estimation_function(src_entry.meta.content_length)
-                    if type(estimation_function) is FunctionType
-                    else src_entry.meta.content_length
-                )
+                estimated_new_da_bytes = estimation_function(src_entry)
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
                 if (
@@ -132,6 +137,7 @@ class DeltaAnnotated(Delta):
                     da_group_entry_count = 0
         if new_da:
             groups.append(new_da)
+
         return groups
 
     @staticmethod
@@ -207,3 +213,79 @@ class DeltaAnnotated(Delta):
                 dst_da.type = None
             entries.append(src_entry)
             dst_da.annotations.append(src_annotation)
+
+    @staticmethod
+    def _split_single(delta_annotated: DeltaAnnotated) -> List[DeltaAnnotated]:
+        """
+        Split a single delta annotated into multiple granular
+        annotated entries. Note that split is not always guaranteed.
+
+        Note: Currently we are only able to split the Parquet File downloads.
+        """
+
+        result = []
+
+        if (
+            delta_annotated.meta
+            and delta_annotated.manifest
+            and delta_annotated.meta.content_type == ContentType.PARQUET
+            and delta_annotated.meta.content_encoding == ContentEncoding.IDENTITY
+        ):
+            # we split by row groups
+            for entry_index, entry in enumerate(delta_annotated.manifest.entries):
+                input_split_params = None
+                if entry.meta and entry.meta.content_type_parameters:
+                    for type_params in entry.meta.content_type_parameters:
+                        if (
+                            isinstance(type_params, PartialParquetParameters)
+                            and type_params.num_row_groups > 1
+                            and type_params.pq_metadata
+                        ):
+                            input_split_params = type_params
+                            break
+
+                if input_split_params:
+                    logger.info(
+                        f"Splitting input file with URI: {entry.uri} into "
+                        f"different {input_split_params.num_row_groups} entries"
+                    )
+
+                    for rg in input_split_params.row_groups_to_download:
+                        new_da = DeltaAnnotated()
+                        new_entry_dict = copy.deepcopy(entry)
+                        new_entry = ManifestEntry(new_entry_dict)
+
+                        row_group_meta = input_split_params.pq_metadata.row_group(rg)
+
+                        new_partial_params = PartialParquetParameters.of(
+                            row_groups_to_download=[rg],
+                            num_row_groups=1,
+                            num_rows=row_group_meta.num_rows,
+                            in_memory_size_bytes=row_group_meta.total_byte_size,
+                            pq_metadata=input_split_params.pq_metadata,
+                        )
+
+                        new_entry.meta.content_type_parameters = [new_partial_params]
+                        for type_params in entry.meta.content_type_parameters:
+                            if not isinstance(type_params, PartialParquetParameters):
+                                new_entry.meta.content_type_parameters.append(
+                                    type_params
+                                )
+
+                        DeltaAnnotated._append_annotated_entry(
+                            delta_annotated,
+                            new_da,
+                            new_entry,
+                            delta_annotated.annotations[entry_index],
+                        )
+
+                        result.append(new_da)
+
+        if result:
+            return result
+        else:
+            logger.info(
+                f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+            )
+
+        return [delta_annotated]
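The hunks above change the delta-batching size callback so that it receives the full `ManifestEntry` instead of a raw content length, defaulting to `entry.meta.content_length`. Below is a minimal usage sketch; the method name `rebatch` and the inflation multiplier are assumptions, since the hunks only show the parameter list and the call site.

```python
# Hedged sketch: supply a custom size estimator when grouping annotated deltas.
# `DeltaAnnotated.rebatch` is assumed to be the method whose signature changed
# above; the 4x inflation factor is an arbitrary illustrative value.
from deltacat.compute.compactor import DeltaAnnotated

PARQUET_INFLATION_GUESS = 4.0  # assumed decompression/decoding multiplier


def estimate_in_memory_bytes(entry) -> float:
    # The callback now receives the whole ManifestEntry, so any metadata on it
    # (here, bytes at rest) can feed the estimate.
    return entry.meta.content_length * PARQUET_INFLATION_GUESS


annotated_deltas = []  # in practice, DeltaAnnotated objects from discovered deltas
uniform_deltas = DeltaAnnotated.rebatch(
    annotated_deltas,
    min_delta_bytes=2 * 1024**3,  # target roughly 2 GiB per group
    estimation_function=estimate_in_memory_bytes,
)
```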
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import numpy as np
+import pyarrow as pa
 
 from deltacat.storage import DeltaType, LocalTable
 
@@ -14,9 +15,9 @@ class DeltaFileEnvelope(dict):
     @staticmethod
     def of(
         stream_position: int,
-        file_index: int,
         delta_type: DeltaType,
         table: LocalTable,
+        file_index: int = None,
         is_src_delta: np.bool_ = True,
         file_record_count: Optional[int] = None,
     ) -> DeltaFileEnvelope:
@@ -37,8 +38,6 @@ class DeltaFileEnvelope(dict):
         """
         if stream_position is None:
             raise ValueError("Missing delta file envelope stream position.")
-        if file_index is None:
-            raise ValueError("Missing delta file envelope file index.")
         if delta_type is None:
             raise ValueError("Missing Delta file envelope delta type.")
         if table is None:
@@ -75,3 +74,16 @@ class DeltaFileEnvelope(dict):
     @property
     def file_record_count(self) -> int:
         return self["file_record_count"]
+
+    @property
+    def table_size_bytes(self) -> int:
+        if isinstance(self.table, pa.Table):
+            return self.table.nbytes
+        else:
+            raise ValueError(
+                f"Table type: {type(self.table)} not for supported for size method."
+            )
+
+    @property
+    def table_num_rows(self) -> int:
+        return len(self.table)
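As a quick orientation to the reordered `DeltaFileEnvelope.of` signature and the two new properties, here is a minimal sketch; the local PyArrow table and the `DeltaType.UPSERT` member are assumptions that do not appear in the hunks above.

```python
# Minimal sketch of the updated DeltaFileEnvelope API: file_index is now
# optional, and table_size_bytes/table_num_rows report on the wrapped table.
import pyarrow as pa

from deltacat.compute.compactor import DeltaFileEnvelope
from deltacat.storage import DeltaType

table = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})

dfe = DeltaFileEnvelope.of(
    stream_position=1,
    delta_type=DeltaType.UPSERT,  # assumed member name
    table=table,
    # file_index may now be omitted; it defaults to None
)

print(dfe.table_num_rows)    # 3
print(dfe.table_size_bytes)  # pa.Table.nbytes; other table types raise ValueError
```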
@@ -4,7 +4,7 @@ from __future__ import annotations
 from typing import Any, Dict, List
 from uuid import uuid4
 
-from deltacat.
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.storage import Locator, PartitionLocator
 from deltacat.utils.common import sha1_hexdigest
 
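This one-line change, together with matching edits in the compaction step modules below, moves `SortKey` (and `SortOrder`) from `deltacat.compute.compactor` to `deltacat.storage.model.sort_key` (see the new `deltacat/storage/model/sort_key.py` and the removed `deltacat/compute/compactor/model/sort_key.py` in the file list). A hedged snippet of the updated import for downstream code; only the import path is confirmed by the hunks, while the `SortKey.of` keyword names and the `SortOrder.ASCENDING` member are assumptions.

```python
# Assumed usage of the relocated sort key model.
from deltacat.storage.model.sort_key import SortKey, SortOrder

sort_keys = [SortKey.of(key_name="last_updated_ts", sort_order=SortOrder.ASCENDING)]
```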
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Tuple
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -40,9 +41,11 @@ class RoundCompletionInfo(dict):
         compacted_delta_locator: DeltaLocator,
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
-        rebase_source_partition_locator: Optional[PartitionLocator],
+        rebase_source_partition_locator: Optional[PartitionLocator] = None,
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
         compaction_audit_url: Optional[str] = None,
+        hash_bucket_count: Optional[int] = None,
+        hb_index_to_entry_range: Optional[Dict[int, Tuple[int, int]]] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -55,6 +58,8 @@ class RoundCompletionInfo(dict):
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
         rci["compactionAuditUrl"] = compaction_audit_url
+        rci["hashBucketCount"] = hash_bucket_count
+        rci["hbIndexToEntryRange"] = hb_index_to_entry_range
         return rci
 
     @property
@@ -97,3 +102,14 @@ class RoundCompletionInfo(dict):
     @property
     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
         return self["manifestEntryCopiedByReferenceRatio"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hashBucketCount"]
+
+    @property
+    def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
+        """
+        The start index is inclusive and end index is exclusive by default.
+        """
+        return self["hbIndexToEntryRange"]
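A small, self-contained sketch of how the two new round completion fields read back; the values are hand-built for illustration, whereas real instances are produced by `RoundCompletionInfo.of(...)` during compaction.

```python
from deltacat.compute.compactor import RoundCompletionInfo

# Hand-built for illustration; real values come from RoundCompletionInfo.of(...).
rci = RoundCompletionInfo()
rci["hashBucketCount"] = 2
rci["hbIndexToEntryRange"] = {0: (0, 3), 1: (3, 5)}

assert rci.hash_bucket_count == 2
for hb_index, (start, end) in rci.hb_index_to_entry_range.items():
    # start is inclusive, end is exclusive, per the property docstring above
    print(f"hash bucket {hb_index} owns compacted manifest entries [{start}, {end})")
```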
@@ -7,8 +7,8 @@ import functools
 import itertools
 from deltacat.compute.compactor import (
     RoundCompletionInfo,
-    SortKey,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor import DeltaAnnotated
 from deltacat.utils.ray_utils.concurrency import (
@@ -31,6 +31,7 @@ from deltacat.storage import (
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.utils.metrics import MetricsConfig
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -90,7 +91,7 @@ def repartition(
             source_partition_locator.partition_values,
         ).stream_position,
         deltacat_storage,
-
+        list_deltas_kwargs,
     )
 
     uniform_deltas = []
@@ -157,10 +158,11 @@ def repartition(
         new_compacted_partition_locator,
         compacted_delta.stream_position,
     )
-    bit_width_of_sort_keys =
+    bit_width_of_sort_keys = validate_sort_keys(
        source_partition_locator,
        sort_keys,
        deltacat_storage,
+       deltacat_storage_kwargs={},
    )
    repartition_completion_info = RoundCompletionInfo.of(
        last_stream_position_to_compact,
@@ -12,11 +12,10 @@ import ray
 
 from deltacat import logs
 from deltacat.compute.compactor import (
-    SortKey,
-    SortOrder,
     DeltaFileEnvelope,
     DeltaFileLocator,
 )
+from deltacat.storage.model.sort_key import SortKey, SortOrder
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.utils.ray_utils.runtime import (
@@ -108,20 +107,21 @@ def _timed_dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        # TODO (pdames): mitigate risk of running out of memory here in cases of
-        # severe skew of primary key updates in deltas
+        # TODO (pdames): mitigate risk of running out of memory here in cases of severe skew of primary key updates in deltas
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
             f"groups for {len(object_ids)} object refs..."
         )
-
-
+        delta_file_envelope_groups_list: List[object] = object_store.get_many(
+            object_ids
+        )
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -172,7 +172,8 @@ def _timed_dedupe(
 
         hb_table_record_count = len(table)
         table, drop_time = timed_invocation(
-            func=_drop_duplicates_by_primary_key_hash,
+            func=_drop_duplicates_by_primary_key_hash,
+            table=table,
         )
         deduped_record_count = hb_table_record_count - len(table)
         total_deduped_records += deduped_record_count
@@ -228,7 +229,6 @@ def _timed_dedupe(
         )
 
         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
-
         return DedupeResult(
             mat_bucket_to_dd_idx_obj_id,
             np.int64(total_deduped_records),
@@ -247,6 +247,7 @@ def dedupe(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -257,6 +258,7 @@ def dedupe(
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
         object_store=object_store,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
@@ -3,7 +3,7 @@ import logging
 import time
 from contextlib import nullcontext
 from itertools import chain
-from typing import Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
 import numpy as np
 import pyarrow as pa
 import ray
@@ -11,9 +11,9 @@ from deltacat import logs
 from deltacat.compute.compactor import (
     DeltaAnnotated,
     DeltaFileEnvelope,
-    SortKey,
     RoundCompletionInfo,
 )
+from deltacat.storage.model.sort_key import SortKey
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
 from deltacat.compute.compactor.utils import system_columns as sc
@@ -91,7 +91,11 @@ def _group_file_records_by_pk_hash_bucket(
     is_src_delta: np.bool_ = True,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # read input parquet s3 objects into a list of delta file envelopes
     delta_file_envelopes, total_record_count = _read_delta_file_envelopes(
         annotated_delta,
@@ -99,6 +103,8 @@ def _group_file_records_by_pk_hash_bucket(
         sort_key_names,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     if delta_file_envelopes is None:
         return None, 0
@@ -134,8 +140,11 @@ def _read_delta_file_envelopes(
     sort_key_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     columns_to_read = list(chain(primary_keys, sort_key_names))
     # TODO (rootliu) compare performance of column read from unpartitioned vs partitioned file
     # https://arrow.apache.org/docs/python/parquet.html#writing-to-partitioned-datasets
@@ -145,6 +154,7 @@ def _read_delta_file_envelopes(
         columns=columns_to_read,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
     assert (
@@ -182,7 +192,11 @@ def _timed_hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -207,6 +221,8 @@ def _timed_hash_bucket(
         is_src_delta,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
         delta_file_envelope_groups, num_buckets, num_groups, object_store
@@ -235,8 +251,11 @@ def hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> HashBucketResult:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting hash bucket task...")
     hash_bucket_result, duration = timed_invocation(
         func=_timed_hash_bucket,
@@ -250,6 +269,8 @@ def hash_bucket(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
@@ -69,7 +69,11 @@ def materialize(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -105,6 +109,7 @@ def materialize(
             max_records_per_entry=max_records_per_output_file,
             content_type=compacted_file_content_type,
             s3_table_writer_kwargs=s3_table_writer_kwargs,
+            **deltacat_storage_kwargs,
         )
         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
             compacted_table
@@ -116,11 +121,10 @@ def materialize(
         )
         manifest = delta.manifest
         manifest_records = manifest.meta.record_count
-        assert (
-            manifest_records == len(compacted_table),
+        assert manifest_records == len(compacted_table), (
             f"Unexpected Error: Materialized delta manifest record count "
             f"({manifest_records}) does not equal compacted table record count "
-            f"({len(compacted_table)})"
+            f"({len(compacted_table)})"
         )
         materialize_result = MaterializeResult.of(
             delta=delta,
@@ -187,10 +191,11 @@ def materialize(
             src_stream_position_np.item(),
         )
         dl_digest = delta_locator.digest()
-
         manifest = manifest_cache.setdefault(
             dl_digest,
-            deltacat_storage.get_delta_manifest(
+            deltacat_storage.get_delta_manifest(
+                delta_locator, **deltacat_storage_kwargs
+            ),
         )
 
         if read_kwargs_provider is None:
@@ -236,6 +241,7 @@ def materialize(
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
             file_reader_kwargs_provider=read_kwargs_provider,
+            **deltacat_storage_kwargs,
         )
         logger.debug(
             f"Time taken for materialize task"
@@ -253,7 +259,6 @@ def materialize(
     materialized_results.append(_materialize(record_batch_tables.remaining))
 
     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
-
     referenced_manifest_delta = (
         _stage_delta_from_manifest_entry_reference_list(
             manifest_entry_list_reference, partition
@@ -4,7 +4,7 @@ from contextlib import nullcontext
 import pyarrow.compute as pc
 from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 from deltacat.types.media import StorageType, ContentType
 import ray
 from deltacat import logs
@@ -58,6 +58,8 @@ def repartition_range(
     max_records_per_output_file: int,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
     """
     Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
@@ -85,6 +87,8 @@ def repartition_range(
     in the tables, an error will be raised. For each partition range, a new file is created. This could result in
     more output files than input files.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     column: str = repartition_args["column"]
     partition_ranges: List = repartition_args["ranges"]
     if len(partition_ranges) == 0:
@@ -141,6 +145,7 @@ def repartition_range(
             destination_partition,
             max_records_per_entry=max_records_per_output_file,
             content_type=repartitioned_file_content_type,
+            **deltacat_storage_kwargs,
         )
         partition_deltas.append(partition_delta)
 
@@ -163,7 +168,11 @@ def _timed_repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -182,6 +191,7 @@ def _timed_repartition(
             max_records_per_output_file=max_records_per_output_file,
             repartitioned_file_content_type=repartitioned_file_content_type,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
         )
     else:
         raise NotImplementedError(
@@ -201,7 +211,11 @@ def repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting repartition task...")
     repartition_result, duration = timed_invocation(
         func=_timed_repartition,
@@ -214,6 +228,7 @@ def repartition(
         read_kwargs_provider=read_kwargs_provider,
         repartitioned_file_content_type=repartitioned_file_content_type,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
     if metrics_config:
         emit_timer_metrics(
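The hash bucket, materialize, and repartition hunks above all thread the same new parameter through: each task accepts an optional `deltacat_storage_kwargs` dict, defaults it to `{}` inside the function body (avoiding a mutable default argument), and fans it out into each `deltacat_storage` call with `**deltacat_storage_kwargs`. A toy sketch of that pattern, using made-up function names rather than deltacat's actual storage interface:

```python
from typing import Any, Dict, Optional


def fake_storage_call(table: str, *, db_uri: str = "memory://") -> str:
    # Stand-in for a deltacat_storage method; the real calls and their keyword
    # arguments are implementation-specific and not shown in this diff.
    return f"read {table} from {db_uri}"


def compaction_task(
    table: str,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> str:
    if deltacat_storage_kwargs is None:  # default in the body, not the signature
        deltacat_storage_kwargs = {}
    return fake_storage_call(table, **deltacat_storage_kwargs)


print(compaction_task("events"))
print(compaction_task("events", deltacat_storage_kwargs={"db_uri": "sqlite:///test.db"}))
```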