deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +2 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/model/delta_annotated.py (+91 -9)

```diff
@@ -2,7 +2,9 @@
 from __future__ import annotations
 
 import logging
-
+import copy
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.types.partial_download import PartialParquetParameters
 from typing import Callable, List, Optional, Union
 
 from deltacat import logs
@@ -64,7 +66,9 @@ class DeltaAnnotated(Delta):
         annotated_deltas: List[DeltaAnnotated],
         min_delta_bytes: float,
         min_file_counts: Optional[Union[int, float]] = float("inf"),
-        estimation_function: Optional[
+        estimation_function: Optional[
+            Callable[[ManifestEntry], float]
+        ] = lambda entry: entry.meta.content_length,
     ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
@@ -76,11 +80,16 @@ class DeltaAnnotated(Delta):
         of bytes at rest for the associated object. Returns the list of annotated
         delta groups.
         """
-
+        split_annotated_deltas: List[DeltaAnnotated] = []
+        groups: List[DeltaAnnotated] = []
         new_da = DeltaAnnotated()
         new_da_bytes = 0
         da_group_entry_count = 0
-
+
+        for delta_annotated in annotated_deltas:
+            split_annotated_deltas.extend(DeltaAnnotated._split_single(delta_annotated))
+
+        for src_da in split_annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
             assert (
@@ -105,11 +114,7 @@ class DeltaAnnotated(Delta):
                     src_da, new_da, src_entry, src_da_annotations[i]
                 )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes = (
-                    estimation_function(src_entry.meta.content_length)
-                    if type(estimation_function) is FunctionType
-                    else src_entry.meta.content_length
-                )
+                estimated_new_da_bytes = estimation_function(src_entry)
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
                 if (
@@ -132,6 +137,7 @@ class DeltaAnnotated(Delta):
                 da_group_entry_count = 0
         if new_da:
             groups.append(new_da)
+
         return groups
 
     @staticmethod
```
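The `estimation_function` parameter above now receives the whole `ManifestEntry` rather than a raw content length, and defaults to `lambda entry: entry.meta.content_length`. A minimal sketch of a custom estimator matching the new `Callable[[ManifestEntry], float]` contract; the inflation factor is a hypothetical illustration, not part of this release:

```python
from deltacat.storage import ManifestEntry

# Hypothetical inflation factor (not from this diff): scales on-disk parquet
# bytes up to an estimated in-memory Arrow size.
PARQUET_INFLATION_GUESS = 4.0

def estimate_in_memory_size(entry: ManifestEntry) -> float:
    # Mirrors the new default (entry.meta.content_length) but applies the
    # inflation multiplier before delta groups are sized.
    return entry.meta.content_length * PARQUET_INFLATION_GUESS
```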
deltacat/compute/compactor/model/delta_annotated.py (continued)

```diff
@@ -207,3 +213,79 @@ class DeltaAnnotated(Delta):
             dst_da.type = None
         entries.append(src_entry)
         dst_da.annotations.append(src_annotation)
+
+    @staticmethod
+    def _split_single(delta_annotated: DeltaAnnotated) -> List[DeltaAnnotated]:
+        """
+        Split a single delta annotated into multiple granular
+        annotated entries. Note that split is not always guaranteed.
+
+        Note: Currently we are only able to split the Parquet File downloads.
+        """
+
+        result = []
+
+        if (
+            delta_annotated.meta
+            and delta_annotated.manifest
+            and delta_annotated.meta.content_type == ContentType.PARQUET
+            and delta_annotated.meta.content_encoding == ContentEncoding.IDENTITY
+        ):
+            # we split by row groups
+            for entry_index, entry in enumerate(delta_annotated.manifest.entries):
+                input_split_params = None
+                if entry.meta and entry.meta.content_type_parameters:
+                    for type_params in entry.meta.content_type_parameters:
+                        if (
+                            isinstance(type_params, PartialParquetParameters)
+                            and type_params.num_row_groups > 1
+                            and type_params.pq_metadata
+                        ):
+                            input_split_params = type_params
+                            break
+
+                if input_split_params:
+                    logger.info(
+                        f"Splitting input file with URI: {entry.uri} into "
+                        f"different {input_split_params.num_row_groups} entries"
+                    )
+
+                    for rg in input_split_params.row_groups_to_download:
+                        new_da = DeltaAnnotated()
+                        new_entry_dict = copy.deepcopy(entry)
+                        new_entry = ManifestEntry(new_entry_dict)
+
+                        row_group_meta = input_split_params.pq_metadata.row_group(rg)
+
+                        new_partial_params = PartialParquetParameters.of(
+                            row_groups_to_download=[rg],
+                            num_row_groups=1,
+                            num_rows=row_group_meta.num_rows,
+                            in_memory_size_bytes=row_group_meta.total_byte_size,
+                            pq_metadata=input_split_params.pq_metadata,
+                        )
+
+                        new_entry.meta.content_type_parameters = [new_partial_params]
+                        for type_params in entry.meta.content_type_parameters:
+                            if not isinstance(type_params, PartialParquetParameters):
+                                new_entry.meta.content_type_parameters.append(
+                                    type_params
+                                )
+
+                        DeltaAnnotated._append_annotated_entry(
+                            delta_annotated,
+                            new_da,
+                            new_entry,
+                            delta_annotated.annotations[entry_index],
+                        )
+
+                        result.append(new_da)
+
+        if result:
+            return result
+        else:
+            logger.info(
+                f"Split was not performed on the delta with locator: {delta_annotated.locator}"
+            )
+
+        return [delta_annotated]
```
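`_split_single` above fans a multi-row-group parquet manifest entry out into one annotated entry per row group, each carrying a single-row-group `PartialParquetParameters`. A hedged sketch of deriving such per-row-group parameters from parquet footer metadata, using only the `PartialParquetParameters.of(...)` keywords shown above (`example.parquet` is a placeholder path):

```python
import pyarrow.parquet as pq
from deltacat.types.partial_download import PartialParquetParameters

# Read only the footer metadata; no table data is downloaded here.
pq_metadata = pq.read_metadata("example.parquet")

# One single-row-group parameter object per row group, mirroring the
# per-entry loop in _split_single above.
per_row_group = [
    PartialParquetParameters.of(
        row_groups_to_download=[rg],
        num_row_groups=1,
        num_rows=pq_metadata.row_group(rg).num_rows,
        in_memory_size_bytes=pq_metadata.row_group(rg).total_byte_size,
        pq_metadata=pq_metadata,
    )
    for rg in range(pq_metadata.num_row_groups)
]
```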
deltacat/compute/compactor/model/delta_file_envelope.py (+14 -2)

```diff
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import numpy as np
+import pyarrow as pa
 
 from deltacat.storage import DeltaType, LocalTable
 
@@ -37,8 +38,6 @@ class DeltaFileEnvelope(dict):
         """
         if stream_position is None:
             raise ValueError("Missing delta file envelope stream position.")
-        if file_index is None:
-            raise ValueError("Missing delta file envelope file index.")
         if delta_type is None:
             raise ValueError("Missing Delta file envelope delta type.")
         if table is None:
@@ -75,3 +74,16 @@ class DeltaFileEnvelope(dict):
     @property
     def file_record_count(self) -> int:
         return self["file_record_count"]
+
+    @property
+    def table_size_bytes(self) -> int:
+        if isinstance(self.table, pa.Table):
+            return self.table.nbytes
+        else:
+            raise ValueError(
+                f"Table type: {type(self.table)} not for supported for size method."
+            )
+
+    @property
+    def table_num_rows(self) -> int:
+        return len(self.table)
```
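For pyarrow-backed envelopes, the new `table_size_bytes` property delegates to `pyarrow.Table.nbytes`, the total size of the buffers the table consumes; `table_num_rows` is just `len(table)`. A quick illustration:

```python
import pyarrow as pa

table = pa.table({"pk": ["a", "b", "c"], "value": [1, 2, 3]})
print(table.nbytes)  # bytes held by the table's buffers (table_size_bytes)
print(len(table))    # row count (table_num_rows)
```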
deltacat/compute/compactor/model/round_completion_info.py (+17 -1)

```diff
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Tuple
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -40,9 +41,11 @@ class RoundCompletionInfo(dict):
         compacted_delta_locator: DeltaLocator,
         compacted_pyarrow_write_result: PyArrowWriteResult,
         sort_keys_bit_width: int,
-        rebase_source_partition_locator: Optional[PartitionLocator],
+        rebase_source_partition_locator: Optional[PartitionLocator] = None,
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
         compaction_audit_url: Optional[str] = None,
+        hash_bucket_count: Optional[int] = None,
+        hb_index_to_entry_range: Optional[Dict[int, Tuple[int, int]]] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -55,6 +58,8 @@ class RoundCompletionInfo(dict):
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
         rci["compactionAuditUrl"] = compaction_audit_url
+        rci["hashBucketCount"] = hash_bucket_count
+        rci["hbIndexToEntryRange"] = hb_index_to_entry_range
         return rci
 
     @property
@@ -97,3 +102,14 @@ class RoundCompletionInfo(dict):
     @property
     def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
         return self["manifestEntryCopiedByReferenceRatio"]
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self["hashBucketCount"]
+
+    @property
+    def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
+        """
+        The start index is inclusive and end index is exclusive by default.
+        """
+        return self["hbIndexToEntryRange"]
```
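The new `hb_index_to_entry_range` property maps a hash bucket index to its slice of the compacted delta's manifest entries, with the start inclusive and the end exclusive. A hypothetical illustration of consuming that contract (the mapping values are made up):

```python
# Hash bucket index -> (start, end) offsets into the compacted delta's
# manifest entries; start inclusive, end exclusive per the docstring above.
hb_index_to_entry_range = {0: (0, 3), 1: (3, 7)}

def entries_for_bucket(manifest_entries, hb_index):
    start, end = hb_index_to_entry_range[hb_index]
    return manifest_entries[start:end]  # half-open slice matches the contract

assert entries_for_bucket(list(range(7)), 1) == [3, 4, 5, 6]
```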
deltacat/compute/compactor/repartition_session.py (+2 -1)

```diff
@@ -91,7 +91,7 @@ def repartition(
             source_partition_locator.partition_values,
         ).stream_position,
         deltacat_storage,
-
+        list_deltas_kwargs,
     )
 
     uniform_deltas = []
@@ -162,6 +162,7 @@
         source_partition_locator,
         sort_keys,
         deltacat_storage,
+        deltacat_storage_kwargs={},
     )
     repartition_completion_info = RoundCompletionInfo.of(
         last_stream_position_to_compact,
```
deltacat/compute/compactor/steps/dedupe.py (+9 -6)

```diff
@@ -107,20 +107,21 @@ def _timed_dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
-        # TODO (pdames): mitigate risk of running out of memory here in cases of
-        # severe skew of primary key updates in deltas
+        # TODO (pdames): mitigate risk of running out of memory here in cases of severe skew of primary key updates in deltas
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
             f"groups for {len(object_ids)} object refs..."
         )
-
-
+        delta_file_envelope_groups_list: List[object] = object_store.get_many(
+            object_ids
+        )
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -171,7 +172,8 @@ def _timed_dedupe(
 
             hb_table_record_count = len(table)
             table, drop_time = timed_invocation(
-                func=_drop_duplicates_by_primary_key_hash,
+                func=_drop_duplicates_by_primary_key_hash,
+                table=table,
             )
             deduped_record_count = hb_table_record_count - len(table)
             total_deduped_records += deduped_record_count
@@ -227,7 +229,6 @@ def _timed_dedupe(
         )
 
         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
-
         return DedupeResult(
             mat_bucket_to_dd_idx_obj_id,
             np.int64(total_deduped_records),
@@ -246,6 +247,7 @@ def dedupe(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     object_store: Optional[IObjectStore],
+    **kwargs,
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -256,6 +258,7 @@
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
         object_store=object_store,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
```
deltacat/compute/compactor/steps/hash_bucket.py (+24 -3)

```diff
@@ -3,7 +3,7 @@ import logging
 import time
 from contextlib import nullcontext
 from itertools import chain
-from typing import Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
 import numpy as np
 import pyarrow as pa
 import ray
@@ -91,7 +91,11 @@ def _group_file_records_by_pk_hash_bucket(
     is_src_delta: np.bool_ = True,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # read input parquet s3 objects into a list of delta file envelopes
     delta_file_envelopes, total_record_count = _read_delta_file_envelopes(
         annotated_delta,
@@ -99,6 +103,8 @@
         sort_key_names,
         read_kwargs_provider,
         deltacat_storage,
+        deltacat_storage_kwargs,
+        **kwargs,
     )
     if delta_file_envelopes is None:
         return None, 0
@@ -134,8 +140,11 @@ def _read_delta_file_envelopes(
     sort_key_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     columns_to_read = list(chain(primary_keys, sort_key_names))
     # TODO (rootliu) compare performance of column read from unpartitioned vs partitioned file
     # https://arrow.apache.org/docs/python/parquet.html#writing-to-partitioned-datasets
@@ -145,6 +154,7 @@
         columns=columns_to_read,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
     assert (
@@ -182,7 +192,11 @@ def _timed_hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -207,6 +221,8 @@ def _timed_hash_bucket(
             is_src_delta,
             read_kwargs_provider,
             deltacat_storage,
+            deltacat_storage_kwargs,
+            **kwargs,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
             delta_file_envelope_groups, num_buckets, num_groups, object_store
@@ -235,8 +251,11 @@ def hash_bucket(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> HashBucketResult:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting hash bucket task...")
     hash_bucket_result, duration = timed_invocation(
         func=_timed_hash_bucket,
@@ -250,6 +269,8 @@ def hash_bucket(
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
+        **kwargs,
     )
 
     emit_metrics_time = 0.0
```
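Nearly every signature in this file (and in `materialize.py`, `repartition.py`, and `utils/io.py` below) gains a `deltacat_storage_kwargs: Optional[Dict[str, Any]] = None` parameter that is normalized to `{}` in the body. A minimal sketch of why a `None` default plus an in-body check is used rather than a `{}` default (the key and value here are placeholders):

```python
def risky(storage_kwargs={}):  # one dict object shared by every call
    storage_kwargs.setdefault("region", "us-east-1")
    return storage_kwargs

def safe(storage_kwargs=None):  # same normalization pattern as the diff
    if storage_kwargs is None:
        storage_kwargs = {}
    storage_kwargs.setdefault("region", "us-east-1")
    return storage_kwargs

assert risky() is risky()    # mutations leak across calls via the shared default
assert safe() is not safe()  # each call gets a fresh, independent dict
```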
deltacat/compute/compactor/steps/materialize.py (+11 -6)

```diff
@@ -69,7 +69,11 @@ def materialize(
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
     object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ):
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -105,6 +109,7 @@ def materialize(
             max_records_per_entry=max_records_per_output_file,
             content_type=compacted_file_content_type,
             s3_table_writer_kwargs=s3_table_writer_kwargs,
+            **deltacat_storage_kwargs,
         )
         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
             compacted_table
@@ -116,11 +121,10 @@ def materialize(
         )
         manifest = delta.manifest
         manifest_records = manifest.meta.record_count
-        assert (
-            manifest_records == len(compacted_table),
+        assert manifest_records == len(compacted_table), (
             f"Unexpected Error: Materialized delta manifest record count "
             f"({manifest_records}) does not equal compacted table record count "
-            f"({len(compacted_table)})"
+            f"({len(compacted_table)})"
         )
         materialize_result = MaterializeResult.of(
             delta=delta,
@@ -187,10 +191,11 @@ def materialize(
             src_stream_position_np.item(),
         )
         dl_digest = delta_locator.digest()
-
         manifest = manifest_cache.setdefault(
             dl_digest,
-            deltacat_storage.get_delta_manifest(
+            deltacat_storage.get_delta_manifest(
+                delta_locator, **deltacat_storage_kwargs
+            ),
         )
 
         if read_kwargs_provider is None:
@@ -236,6 +241,7 @@ def materialize(
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
             file_reader_kwargs_provider=read_kwargs_provider,
+            **deltacat_storage_kwargs,
         )
         logger.debug(
             f"Time taken for materialize task"
@@ -253,7 +259,6 @@ def materialize(
     materialized_results.append(_materialize(record_batch_tables.remaining))
 
     logger.info(f"Got {count_of_src_dfl} source delta files during materialize")
-
     referenced_manifest_delta = (
         _stage_delta_from_manifest_entry_reference_list(
             manifest_entry_list_reference, partition
```
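The assert rewrite above is a genuine bug fix, not just a reformat: `assert (condition, message)` asserts a non-empty tuple, which is always truthy, so the old check could never fail. A minimal reproduction:

```python
# Always passes: the operand is the 2-tuple (False, "..."), which is truthy.
# (Recent CPython emits a SyntaxWarning for exactly this mistake.)
assert (1 == 2, "this message is never shown")

# Correct form: the condition and the failure message are separate operands.
assert 1 == 1, "raised only when the condition is false"
```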
deltacat/compute/compactor/steps/repartition.py (+16 -1)

```diff
@@ -4,7 +4,7 @@ from contextlib import nullcontext
 import pyarrow.compute as pc
 from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 from deltacat.types.media import StorageType, ContentType
 import ray
 from deltacat import logs
@@ -58,6 +58,8 @@ def repartition_range(
     max_records_per_output_file: int,
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ):
     """
     Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
@@ -85,6 +87,8 @@
     in the tables, an error will be raised. For each partition range, a new file is created. This could result in
     more output files than input files.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     column: str = repartition_args["column"]
     partition_ranges: List = repartition_args["ranges"]
     if len(partition_ranges) == 0:
@@ -141,6 +145,7 @@
             destination_partition,
             max_records_per_entry=max_records_per_output_file,
             content_type=repartitioned_file_content_type,
+            **deltacat_storage_kwargs,
         )
         partition_deltas.append(partition_delta)
 
@@ -163,7 +168,11 @@ def _timed_repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -182,6 +191,7 @@ def _timed_repartition(
             max_records_per_output_file=max_records_per_output_file,
             repartitioned_file_content_type=repartitioned_file_content_type,
             deltacat_storage=deltacat_storage,
+            deltacat_storage_kwargs=deltacat_storage_kwargs,
         )
     else:
         raise NotImplementedError(
@@ -201,7 +211,11 @@ def repartition(
     read_kwargs_provider: Optional[ReadKwargsProvider],
     repartitioned_file_content_type: ContentType = ContentType.PARQUET,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> RepartitionResult:
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     logger.info(f"Starting repartition task...")
     repartition_result, duration = timed_invocation(
         func=_timed_repartition,
@@ -214,6 +228,7 @@
         read_kwargs_provider=read_kwargs_provider,
         repartitioned_file_content_type=repartitioned_file_content_type,
         deltacat_storage=deltacat_storage,
+        deltacat_storage_kwargs=deltacat_storage_kwargs,
     )
     if metrics_config:
         emit_timer_metrics(
```
deltacat/compute/compactor/utils/io.py (+40 -23)

```diff
@@ -10,11 +10,12 @@ from deltacat.constants import (
 from deltacat.storage import (
     PartitionLocator,
     Delta,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union, Any
 from deltacat.compute.compactor import HighWatermark
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
@@ -31,23 +32,30 @@ def discover_deltas(
     rebase_source_partition_locator: Optional[PartitionLocator],
     rebase_source_partition_high_watermark: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
-
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> Tuple[List[Delta], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
-
-        source_partition_locator,
+    start_position_exclusive = (
         high_watermark.get(source_partition_locator)
         if isinstance(high_watermark, dict)
-        else high_watermark
+        else high_watermark
+    )
+    input_deltas = _discover_deltas(
+        source_partition_locator,
+        start_position_exclusive,
         last_stream_position_to_compact
         if not rebase_source_partition_locator
         else deltacat_storage.get_partition(
             source_partition_locator.stream_locator,
             source_partition_locator.partition_values,
+            **deltacat_storage_kwargs,
         ).stream_position,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
 
     # Source Two: delta from compacted table for incremental compaction or new deltas from uncompacted table for rebase
@@ -56,6 +64,7 @@ def discover_deltas(
     compacted_partition = deltacat_storage.get_partition(
         compacted_partition_locator.stream_locator,
         compacted_partition_locator.partition_values,
+        **deltacat_storage_kwargs,
     )
     previous_last_stream_position_compacted = (
         compacted_partition.stream_position if compacted_partition else -1
@@ -67,7 +76,8 @@ def discover_deltas(
         None,
         previous_last_stream_position_compacted,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
     logger.info(
         f"Length of input deltas from uncompacted table {len(input_deltas)} up to {last_stream_position_to_compact},"
@@ -80,7 +90,8 @@ def discover_deltas(
         rebase_source_partition_high_watermark,
         last_stream_position_to_compact,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
     logger.info(
         f"Length of input deltas from uncompacted table {len(input_deltas_new)} up to {last_stream_position_to_compact},"
@@ -99,6 +110,8 @@ def limit_input_deltas(
     input_deltas_stats: Dict[int, DeltaStats],
     compaction_audit: CompactionSessionAuditInfo,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     # TODO (pdames): when row counts are available in metadata, use them
     # instead of bytes - memory consumption depends more on number of
@@ -108,6 +121,8 @@ def limit_input_deltas(
     # this assumption could be removed, but we'd still need to know the max
     # resources we COULD get for this cluster, and the amount of memory
     # available per CPU should remain fixed across the cluster.
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     worker_cpus = int(cluster_resources["CPU"])
     worker_obj_store_mem = float(cluster_resources["object_store_memory"])
     logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
@@ -135,7 +150,7 @@
         for stream_pos, delta_stats in input_deltas_stats.items()
     }
     for delta in input_deltas:
-        manifest = deltacat_storage.get_delta_manifest(delta)
+        manifest = deltacat_storage.get_delta_manifest(delta, **deltacat_storage_kwargs)
         delta.manifest = manifest
         position = delta.stream_position
         delta_stats = input_deltas_stats.get(delta.stream_position, DeltaStats())
@@ -258,6 +273,8 @@ def fit_input_deltas(
     compaction_audit: CompactionSessionAuditInfo,
     hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     """
     This method tries to fit all the input deltas to run into the existing cluster. Contrary to
@@ -277,6 +294,8 @@ def fit_input_deltas(
     Tuple of list of annotated deltas, recommended hash bucket count, high watermark,
     and whether multiple rounds are required (which is always False)
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     worker_cpus = int(cluster_resources["CPU"])
     total_memory = float(cluster_resources["memory"])
     high_watermark = HighWatermark()
@@ -306,8 +325,8 @@ def fit_input_deltas(
     # We assume that the cluster is capable of distributing all tasks
     # correctly. Hence, the correct in-memory size will be in the ratio of
     # in-disk size.
-    def estimate_size(
-        return (content_length * 1.0 / delta_bytes) * total_memory
+    def estimate_size(manifest_entry: ManifestEntry):
+        return (manifest_entry.meta.content_length * 1.0 / delta_bytes) * total_memory
 
     # Assuming each CPU consumes equal amount of memory
     min_delta_bytes = total_memory / worker_cpus
@@ -341,18 +360,16 @@ def _discover_deltas(
     start_position_exclusive: Optional[int],
     end_position_inclusive: int,
     deltacat_storage=unimplemented_deltacat_storage,
-
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[Delta]:
-
-
-
-
-
-    deltas_list_result = deltacat_storage.
-
-        table_name=table_name,
-        partition_values=partition_values,
-        table_version=table_version,
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
+    kwargs = {**deltacat_storage_kwargs, **list_deltas_kwargs}
+
+    deltas_list_result = deltacat_storage.list_partition_deltas(
+        partition_like=source_partition_locator,
         first_stream_position=start_position_exclusive,
         last_stream_position=end_position_inclusive,
         ascending_order=True,
```