deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/primary_key_index.py
@@ -0,0 +1,308 @@
+import logging
+from typing import List, Optional, Iterable
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+import uuid
+import hashlib
+from deltacat.compute.compactor_v2.constants import (
+    TOTAL_BYTES_IN_SHA1_HASH,
+    PK_DELIMITER,
+    MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+)
+import time
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
+from deltacat import logs
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.performance import timed_invocation
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Table:
+    hash_column_np = hash_column.to_numpy()
+
+    result = []
+    for hash_value in hash_column_np:
+        result.append(hashlib.sha1(hash_value.encode("utf-8")).hexdigest())
+
+    return sc.append_pk_hash_string_column(table, result)
+
+
+def _is_sha1_desired(hash_column: pa.Array) -> bool:
+    return hash_column.nbytes > TOTAL_BYTES_IN_SHA1_HASH * len(hash_column)
+
+
+def _append_table_by_hash_bucket(
+    pki_table: pa.Table, hash_bucket_to_table: np.ndarray
+) -> int:
+
+    hb_pk_table, sort_latency = timed_invocation(
+        lambda: pki_table.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
+    )
+    logger.info(f"Sorting a pk table of length {len(pki_table)} took {sort_latency}s")
+
+    hb_pk_grouped_by, groupby_latency = timed_invocation(
+        lambda: hb_pk_table.group_by(sc._HASH_BUCKET_IDX_COLUMN_NAME).aggregate(
+            [(sc._HASH_BUCKET_IDX_COLUMN_NAME, "count")]
+        )
+    )
+
+    logger.info(
+        f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
+    )
+
+    group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
+    hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
+
+    result_len = 0
+    for i, group_count in enumerate(group_count_array):
+        hb_idx = hb_group_array[i].as_py()
+        pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
+        pyarrow_table = pyarrow_table.drop([sc._HASH_BUCKET_IDX_COLUMN_NAME])
+        if hash_bucket_to_table[hb_idx] is None:
+            hash_bucket_to_table[hb_idx] = []
+        hash_bucket_to_table[hb_idx].append(pyarrow_table)
+        result_len += len(pyarrow_table)
+
+    return result_len
+
+
+def _optimized_group_record_batches_by_hash_bucket(
+    pki_table: pa.Table, num_buckets: int
+):
+
+    input_table_len = len(pki_table)
+
+    hash_bucket_to_tables = np.empty([num_buckets], dtype="object")
+    hb_to_table = np.empty([num_buckets], dtype="object")
+
+    # This split will ensure that the sort is not performed on a very huge table
+    # resulting in ArrowInvalid: offset overflow while concatenating arrays
+    # Known issue with Arrow: https://github.com/apache/arrow/issues/25822
+    table_batches, to_batches_latency = timed_invocation(lambda: pki_table.to_batches())
+
+    logger.info(f"to_batches took {to_batches_latency} for {len(pki_table)} rows")
+
+    current_bytes = 0
+    record_batches = []
+    result_len = 0
+    for record_batch in table_batches:
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+            logger.info(
+                f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
+                f"is {len(record_batches)} and size {current_bytes}"
+            )
+            appended_len, append_latency = timed_invocation(
+                _append_table_by_hash_bucket,
+                pa.Table.from_batches(record_batches),
+                hash_bucket_to_tables,
+            )
+            logger.info(
+                f"Appended the hash bucketed batch of {appended_len} in {append_latency}s"
+            )
+
+            result_len += appended_len
+            current_bytes = 0
+            record_batches.clear()
+
+    if record_batches:
+        appended_len, append_latency = timed_invocation(
+            _append_table_by_hash_bucket,
+            pa.Table.from_batches(record_batches),
+            hash_bucket_to_tables,
+        )
+        result_len += appended_len
+        current_bytes = 0
+        record_batches.clear()
+
+    concat_start = time.monotonic()
+    for hb, tables in enumerate(hash_bucket_to_tables):
+        if tables:
+            assert hb_to_table[hb] is None, f"The HB index is repeated {hb}"
+            hb_to_table[hb] = pa.concat_tables(tables)
+
+    concat_end = time.monotonic()
+    logger.info(
+        f"Total time taken to concat all record batches with length "
+        f"{input_table_len}: {concat_end - concat_start}s"
+    )
+
+    assert (
+        input_table_len == result_len
+    ), f"Grouping has resulted in record loss as {result_len} != {input_table_len}"
+
+    return hb_to_table
+
+
+def group_by_pk_hash_bucket(
+    table: pa.Table, num_buckets: int, primary_keys: List[str]
+) -> np.ndarray:
+    table = generate_pk_hash_column(table, primary_keys, requires_sha1=True)
+
+    # group hash bucket record indices
+    result = group_record_indices_by_hash_bucket(
+        table,
+        num_buckets,
+    )
+
+    return result
+
+
+def generate_pk_hash_column(
+    table: pa.Table,
+    primary_keys: Optional[List[str]] = None,
+    requires_sha1: bool = False,
+) -> pa.Table:
+    """
+    Returns a new table after generating the primary key hash if desired.
+
+    1. If there are no primary keys, each hash will be unique uuid/sha1 hex
+    2. If there are more than 0 primary keys, returns a table with new columns appended.
+    """
+
+    start = time.monotonic()
+
+    can_sha1 = False
+    if primary_keys:
+        pk_columns = []
+        for pk_name in primary_keys:
+            pk_columns.append(pc.cast(table[pk_name], pa.string()))
+
+        pk_columns.append(PK_DELIMITER)
+        hash_column = pc.binary_join_element_wise(*pk_columns)
+
+        can_sha1 = requires_sha1 or _is_sha1_desired(hash_column)
+    else:
+        hash_column = pa.array(
+            [uuid.uuid4().hex for _ in range(len(table))], pa.string()
+        )
+
+    logger.info(
+        f"can_generate_sha1={can_sha1} for the table with hash column size"
+        f"={hash_column.nbytes} bytes, num_rows={len(hash_column)}, "
+        f"and requires_sha1={requires_sha1}"
+    )
+
+    if can_sha1:
+        table = _append_sha1_hash_to_table(table, hash_column)
+    else:
+        table = table.append_column(sc._PK_HASH_STRING_COLUMN_FIELD, hash_column)
+
+    end = time.monotonic()
+
+    logger.info(
+        f"Took {end - start}s to generate pk hash of len: {len(hash_column)}"
+        f" and size: {hash_column.nbytes} bytes"
+    )
+
+    return table
+
+
+def group_record_indices_by_hash_bucket(
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
+    """
+    Groups the record indices by it's corresponding hash bucket. Hence, this method may
+    create num_buckets tables as a result.
+    """
+
+    input_table_len = len(pki_table)
+
+    hash_bucket_id_col_list = np.empty([input_table_len], dtype="int32")
+    bucketing_start_time = time.monotonic()
+
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+        hash_bucket_id_col_list[index] = hash_bucket
+
+    pki_table = sc.append_hash_bucket_idx_col(pki_table, hash_bucket_id_col_list)
+    bucketing_end_time = time.monotonic()
+
+    logger.info(
+        f"Took {bucketing_end_time - bucketing_start_time}s to generate the "
+        f"hb index for {len(pki_table)} rows"
+    )
+
+    result, group_latency = timed_invocation(
+        _optimized_group_record_batches_by_hash_bucket,
+        pki_table=pki_table,
+        num_buckets=num_buckets,
+    )
+
+    logger.info(
+        f"Final grouping of table with {input_table_len} records took: {group_latency}s"
+    )
+
+    return result
+
+
+def group_hash_bucket_indices(
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
+) -> np.ndarray:
+    """
+    This method persists all tables for a given hash bucket into the object store
+    and returns the object references for each hash group.
+    """
+
+    hash_bucket_group_to_obj_id_size_tuple = np.empty([num_groups], dtype="object")
+
+    if hash_bucket_object_groups is None:
+        return hash_bucket_group_to_obj_id_size_tuple
+
+    hb_group_to_object = np.empty([num_groups], dtype="object")
+    hash_group_to_size = np.empty([num_groups], dtype="int64")
+    hash_group_to_num_rows = np.empty([num_groups], dtype="int64")
+
+    for hb_index, obj in enumerate(hash_bucket_object_groups):
+        if obj:
+            hb_group = hash_bucket_index_to_hash_group_index(hb_index, num_groups)
+            if hb_group_to_object[hb_group] is None:
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
+                hash_group_to_size[hb_group] = np.int64(0)
+                hash_group_to_num_rows[hb_group] = np.int64(0)
+            hb_group_to_object[hb_group][hb_index] = obj
+            for dfe in obj:
+                casted_dfe: DeltaFileEnvelope = dfe
+                hash_group_to_size[hb_group] += casted_dfe.table_size_bytes
+                hash_group_to_num_rows[hb_group] += casted_dfe.table_num_rows
+
+    for hb_group, obj in enumerate(hb_group_to_object):
+        if obj is None:
+            continue
+        object_ref = object_store.put(obj)
+        hash_bucket_group_to_obj_id_size_tuple[hb_group] = (
+            object_ref,
+            hash_group_to_size[hb_group],
+            hash_group_to_num_rows[hb_group],
+        )
+        del object_ref
+    return hash_bucket_group_to_obj_id_size_tuple
+
+
+def hash_bucket_index_to_hash_group_index(hb_index: int, num_groups: int) -> int:
+    return hb_index % num_groups
+
+
+def hash_group_index_to_hash_bucket_indices(
+    hb_group: int, num_buckets: int, num_groups: int
+) -> Iterable[int]:
+
+    if hb_group > num_buckets:
+        return []
+
+    return range(hb_group, num_groups, num_buckets)
+
+
+def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
+    """
+    Generates the hash bucket index from the given digest.
+    """
+    return int(digest, 16) % num_buckets
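Note: the bucketing arithmetic in the new module above reduces to two modulo operations. The following minimal sketch (with arbitrary example values for num_buckets and num_groups, which are not package defaults) shows how a SHA-1 primary key digest maps to a hash bucket and how a bucket maps to a hash group:

import hashlib

# Arbitrary example values; the real values come from compaction parameters.
num_buckets = 8
num_groups = 3

# Same arithmetic as pk_digest_to_hash_bucket_index: interpret the SHA-1 hex
# digest as an integer and take it modulo the number of hash buckets.
digest = hashlib.sha1("pk_value_1|pk_value_2".encode("utf-8")).hexdigest()
hash_bucket = int(digest, 16) % num_buckets

# Same arithmetic as hash_bucket_index_to_hash_group_index: buckets are
# distributed round-robin across hash groups.
hash_group = hash_bucket % num_groups

print(hash_bucket, hash_group)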
deltacat/compute/compactor_v2/utils/task_options.py
@@ -0,0 +1,228 @@
+from typing import Dict, Optional, List, Tuple
+from deltacat.types.media import ContentEncoding, ContentType
+from deltacat.types.partial_download import PartialParquetParameters
+from deltacat.storage import (
+    Delta,
+    ManifestEntry,
+    interface as unimplemented_deltacat_storage,
+)
+from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
+from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    hash_group_index_to_hash_bucket_indices,
+)
+from deltacat.compute.compactor_v2.utils.content_type_params import (
+    append_content_type_params,
+)
+from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE
+
+
+def _get_parquet_type_params_if_exist(
+    entry: ManifestEntry,
+) -> Optional[PartialParquetParameters]:
+    if (
+        entry.meta
+        and entry.meta.content_type == ContentType.PARQUET
+        and entry.meta.content_encoding == ContentEncoding.IDENTITY
+    ):
+        for type_params in entry.meta.content_type_parameters:
+            if isinstance(type_params, PartialParquetParameters):
+                return type_params
+    return None
+
+
+def _calculate_parquet_column_size(
+    type_params: PartialParquetParameters, columns: List[str]
+):
+    column_size = 0.0
+    for rg in type_params.row_groups_to_download:
+        columns_found = 0
+        row_group_meta = type_params.pq_metadata.row_group(rg)
+        for col in range(row_group_meta.num_columns):
+            column_meta = row_group_meta.column(col)
+            if column_meta.path_in_schema in columns:
+                columns_found += 1
+                column_size += column_meta.total_uncompressed_size
+        assert columns_found == len(columns), (
+            "Columns not found in the parquet data as "
+            f"{columns_found} != {len(columns)}"
+        )
+    return column_size
+
+
+def estimate_manifest_entry_size_bytes(
+    entry: ManifestEntry, previous_inflation: float, **kwargs
+) -> float:
+    if entry.meta.source_content_length:
+        return entry.meta.source_content_length
+
+    type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+    if type_params:
+        return type_params.in_memory_size_bytes
+
+    return entry.meta.content_length * previous_inflation
+
+
+def estimate_manifest_entry_num_rows(
+    entry: ManifestEntry,
+    average_record_size_bytes: float,
+    previous_inflation: float,
+    **kwargs,
+) -> int:
+    if entry.meta.record_count:
+        return entry.meta.record_count
+
+    type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+    if type_params:
+        return type_params.num_rows
+
+    total_size_bytes = estimate_manifest_entry_size_bytes(
+        entry=entry, previous_inflation=previous_inflation, **kwargs
+    )
+
+    return int(total_size_bytes / average_record_size_bytes)
+
+
+def estimate_manifest_entry_column_size_bytes(
+    entry: ManifestEntry, columns: Optional[List[str]] = None
+) -> Optional[float]:
+    if not columns:
+        return 0
+
+    type_params = _get_parquet_type_params_if_exist(entry=entry)
+
+    if type_params.pq_metadata:
+        return _calculate_parquet_column_size(type_params=type_params, columns=columns)
+
+    return None
+
+
+def hash_bucket_resource_options_provider(
+    index: int,
+    item: DeltaAnnotated,
+    previous_inflation: float,
+    average_record_size_bytes: float,
+    primary_keys: List[str] = None,
+    **kwargs,
+) -> Dict:
+    size_bytes = 0.0
+    num_rows = 0
+    total_pk_size = 0
+
+    if not item.manifest or not item.manifest.entries:
+        return {"CPU": 0.01}
+
+    for entry in item.manifest.entries:
+        entry_size = estimate_manifest_entry_size_bytes(
+            entry=entry, previous_inflation=previous_inflation
+        )
+        num_rows += estimate_manifest_entry_num_rows(
+            entry=entry,
+            previous_inflation=previous_inflation,
+            average_record_size_bytes=average_record_size_bytes,
+        )
+        size_bytes += entry_size
+
+        if primary_keys:
+            pk_size = estimate_manifest_entry_column_size_bytes(
+                entry=entry,
+                columns=primary_keys,
+            )
+
+            if pk_size is None:
+                total_pk_size += entry_size
+            else:
+                total_pk_size += pk_size
+
+    # total size + pk size + pk hash column + hash bucket index column
+    # Refer to hash_bucket step for more details.
+    total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
+
+    # Consider buffer
+    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+
+    return {"num_cpus": 0.01, "memory": total_memory}
+
+
+def merge_resource_options_provider(
+    index: int,
+    item: Tuple[int, List],
+    num_hash_groups: int,
+    hash_group_size_bytes: Dict[int, int],
+    hash_group_num_rows: Dict[int, int],
+    round_completion_info: Optional[RoundCompletionInfo] = None,
+    compacted_delta: Optional[Delta] = None,
+    primary_keys: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict] = {},
+    **kwargs,
+) -> Dict:
+    hb_group_idx = item[0]
+
+    data_size = hash_group_size_bytes.get(hb_group_idx, 0)
+    num_rows = hash_group_num_rows.get(hb_group_idx, 0)
+
+    pk_size_bytes = 0
+
+    if (
+        round_completion_info
+        and compacted_delta
+        and round_completion_info.hb_index_to_entry_range_both_inclusive
+    ):
+
+        previous_inflation = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.file_bytes
+        )
+        average_record_size = (
+            round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+            / round_completion_info.compacted_pyarrow_write_result.records
+        )
+
+        iterable = hash_group_index_to_hash_bucket_indices(
+            hb_group_idx, round_completion_info.hash_bucket_count, num_hash_groups
+        )
+
+        for hb_idx in iterable:
+            entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
+                hb_idx
+            ]
+            for entry_index in range(entry_start, entry_end):
+                entry = append_content_type_params(
+                    compacted_delta,
+                    entry_index=entry_index,
+                    deltacat_storage=deltacat_storage,
+                    deltacat_storage_kwargs=deltacat_storage_kwargs,
+                )
+
+                current_entry_size = estimate_manifest_entry_size_bytes(
+                    entry=entry, previous_inflation=previous_inflation
+                )
+                current_entry_rows = estimate_manifest_entry_num_rows(
+                    entry=entry,
+                    average_record_size_bytes=average_record_size,
+                    previous_inflation=previous_inflation,
+                )
+
+                data_size += current_entry_size
+                num_rows += current_entry_rows
+
+                if primary_keys:
+                    pk_size = estimate_manifest_entry_column_size_bytes(
+                        entry=entry,
+                        columns=primary_keys,
+                    )
+
+                    if pk_size is None:
+                        pk_size_bytes += current_entry_size
+                    else:
+                        pk_size_bytes += pk_size
+
+    # total data downloaded + primary key hash column + primary key column + dict size for merge
+    total_memory = data_size + pk_size_bytes + num_rows * 20 + num_rows * 20
+
+    total_memory = total_memory * (1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0)
+
+    return {"num_cpus": 0.01, "memory": total_memory}
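Note: the resource estimation in the new module above is a simple linear formula. The sketch below walks through hash_bucket_resource_options_provider's memory estimate with made-up input sizes and an assumed 30% value for TOTAL_MEMORY_BUFFER_PERCENTAGE (the real constant lives in deltacat/compute/compactor_v2/constants.py and may differ):

# Made-up inputs for illustration only.
size_bytes = 512 * 1024**2      # estimated in-memory size of the annotated delta
total_pk_size = 64 * 1024**2    # estimated size of the primary key columns
num_rows = 10_000_000
TOTAL_MEMORY_BUFFER_PERCENTAGE = 30  # assumed value for this sketch

# Mirrors the formula above: data size + pk column size + 20 bytes/row for the
# pk hash column + 4 bytes/row for the hash bucket index column, plus a buffer.
total_memory = size_bytes + total_pk_size + num_rows * 20 + num_rows * 4
total_memory *= 1 + TOTAL_MEMORY_BUFFER_PERCENTAGE / 100.0

# The provider hands these back as Ray task options.
print({"num_cpus": 0.01, "memory": total_memory})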
deltacat/compute/metastats/meta_stats.py
@@ -5,7 +5,7 @@ import functools
 import logging
 import os
 import pathlib
-from typing import Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set

 import ray
 from ray.types import ObjectRef
@@ -118,10 +118,12 @@ def collect_from_partition(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     *args,
     **kwargs,
 ) -> ObjectRef[Dict[int, DeltaStats]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not columns:
         columns = deltacat_storage.get_table_version_column_names(
             source_partition_locator.namespace,
deltacat/compute/metastats/stats.py
@@ -33,6 +33,7 @@ def start_stats_collection(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> Dict[str, List[DeltaStats]]:
     """Collects statistics on deltas, given a set of delta stream position ranges.
     Example:
deltacat/compute/metastats/utils/io.py
@@ -171,6 +171,7 @@ def collect_stats_by_columns(
     delta_annotated: DeltaAnnotated,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

@@ -182,6 +183,8 @@ def collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     total_tables_size = 0

     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
@@ -198,6 +201,7 @@ def collect_stats_by_columns(
                 TableType.PYARROW,
                 columns_to_compute,
                 equivalent_table_types="uncompacted",
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/compute/stats/utils/io.py
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 import pyarrow
 import ray
@@ -83,6 +83,7 @@ def get_delta_stats(
     delta_locator: DeltaLocator,
     columns: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Ray distributed task to compute and collect stats for a requested delta.
     If no columns are requested, stats will be computed for all columns.
@@ -93,10 +94,15 @@ def get_delta_stats(
     Returns:
         A delta wide stats container
     """
-
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+    manifest = deltacat_storage.get_delta_manifest(
+        delta_locator, **deltacat_storage_kwargs
+    )
     delta = Delta.of(delta_locator, None, None, None, manifest)
-    return _collect_stats_by_columns(
+    return _collect_stats_by_columns(
+        delta, columns, deltacat_storage, deltacat_storage_kwargs
+    )


 @ray.remote
@@ -105,6 +111,7 @@ def get_deltas_from_range(
     start_position_inclusive: DeltaRange,
     end_position_inclusive: DeltaRange,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> List[Delta]:
     """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.

@@ -137,6 +144,7 @@ def get_deltas_from_range(
         end_position_inclusive,
         ascending_order=True,
         include_manifest=False,
+        **kwargs,
     )
     return deltas_list_result.all_items()

@@ -145,6 +153,7 @@ def _collect_stats_by_columns(
     delta: Delta,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
     Args:
@@ -154,6 +163,8 @@ def _collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         delta.manifest is not None
     ), f"Manifest should not be missing from delta for stats calculation: {delta}"
@@ -167,7 +178,11 @@ def _collect_stats_by_columns(
     for file_idx, manifest in enumerate(delta.manifest.entries):
         entry_pyarrow_table: LocalTable = (
             deltacat_storage.download_delta_manifest_entry(
-                delta,
+                delta,
+                file_idx,
+                TableType.PYARROW,
+                columns_to_compute,
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
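Note: the stats changes above repeatedly introduce a deltacat_storage_kwargs parameter that defaults to None and is replaced with a fresh dict inside the function body. A minimal standalone sketch of that pattern, using a hypothetical storage function rather than the real deltacat storage interface:

from typing import Any, Dict, Optional


def get_delta_manifest(delta_locator: str, **kwargs) -> str:
    # Hypothetical stand-in for a deltacat storage implementation.
    return f"manifest({delta_locator}, {kwargs})"


def collect_stats(
    delta_locator: str,
    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
) -> str:
    # Default to None and build a fresh dict per call: a `={}` default would be
    # shared across calls and could accumulate state between invocations.
    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}
    return get_delta_manifest(delta_locator, **deltacat_storage_kwargs)


print(collect_stats("my_delta", {"catalog": "local"}))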
|