deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/dedupe.py
@@ -1,7 +1,5 @@
 import logging
-import time
 from collections import defaultdict
-from itertools import repeat
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -10,7 +8,6 @@ import pyarrow.compute as pc
 import ray
 from ray import cloudpickle
 from ray.types import ObjectRef
-from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,15 +22,7 @@ from deltacat.compute.compactor import (
 from deltacat.compute.compactor.utils import primary_key_index as pki
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
-from deltacat.storage import DeltaType
-from deltacat.compute.compactor import SortKey, SortOrder, \
-    RoundCompletionInfo, PrimaryKeyIndexVersionLocator, DeltaFileEnvelope, \
-    DeltaFileLocator, PyArrowWriteResult
-from deltacat.compute.compactor.utils import system_columns as sc, \
-    primary_key_index as pki
 from deltacat.utils.performance import timed_invocation
-
-from typing import Any, Dict, List, Optional, Tuple
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -46,18 +35,21 @@ DedupeTaskIndexWithObjectId = Tuple[DedupeTaskIndex, PickledObjectRef]
 DedupeResult = Tuple[
     Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId],
     List[ObjectRef[DeltaFileLocatorToRecords]],
-    PyArrowWriteResult
+    PyArrowWriteResult,
 ]
 
 
 def _union_primary_key_indices(
-
-
-
-
-
-
-
+    s3_bucket: str,
+    round_completion_info: RoundCompletionInfo,
+    hash_bucket_index: int,
+    df_envelopes_list: List[List[DeltaFileEnvelope]],
+) -> pa.Table:
+
+    logger.info(
+        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
+        f"{len(df_envelopes_list)} delta file envelope lists..."
+    )
     # read compacted input parquet files first
     # (which implicitly have older stream positions than deltas)
     hb_tables = []
@@ -67,12 +59,16 @@ def _union_primary_key_indices(
             hash_bucket_index,
             round_completion_info.primary_key_index_version_locator,
             # Enforce consistent column ordering by reading from a schema, to prevent schema mismatch errors
-            file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+            file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+                schema=get_minimal_hb_schema()
+            ),
         )
         if tables:
             prior_pk_index_table = pa.concat_tables(tables)
-            logger.info(
-
+            logger.info(
+                f"Number of records in prior primary index for hash bucket"
+                f" {hash_bucket_index}: {prior_pk_index_table.num_rows}"
+            )
             hb_tables.append(prior_pk_index_table)
 
     # sort by delta file stream position now instead of sorting every row later
@@ -87,43 +83,53 @@ def _union_primary_key_indices(
 
     hb_table = pa.concat_tables(hb_tables)
 
-    logger.info(
+    logger.info(
+        f"Total records in hash bucket {hash_bucket_index} is {hb_table.num_rows}"
+    )
     return hb_table
 
 
 def _drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
     value_to_last_row_idx = {}
-
-
-
-
+
+    pk_hash_np = sc.pk_hash_column_np(table)
+    op_type_np = sc.delta_type_column_np(table)
+
+    assert len(pk_hash_np) == len(op_type_np), (
+        f"Primary key digest column length ({len(pk_hash_np)}) doesn't "
+        f"match delta type column length ({len(op_type_np)})."
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # TODO(raghumdani): move the dedupe to C++ using arrow methods or similar.
+    row_idx = 0
+    pk_op_val_iter = zip(pk_hash_np, op_type_np)
+    for (pk_val, op_val) in pk_op_val_iter:
+
+        # operation type is True for `UPSERT` and False for `DELETE`
+        if op_val:
+            # UPSERT this row
+            value_to_last_row_idx[pk_val] = row_idx
+        else:
+            # DELETE this row
+            value_to_last_row_idx.pop(pk_val, None)
+
+        row_idx += 1
+
     return table.take(list(value_to_last_row_idx.values()))
 
 
 def _write_new_primary_key_index(
-
-
-
-
-
-
-
-
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_rows_per_index_file: int,
+    dedupe_task_index: int,
+    deduped_tables: List[Tuple[int, pa.Table]],
+) -> PyArrowWriteResult:
+
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Writing new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}"
+    )
 
     pki_results = []
     for hb_index, table in deduped_tables:
@@ -137,36 +143,43 @@ def _write_new_primary_key_index(
         pki_results.append(hb_pki_result)
 
     result = PyArrowWriteResult.union(pki_results)
-    logger.info(
-
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Wrote new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}. Result: {result}"
+    )
    return result
 
 
 def delta_file_locator_to_mat_bucket_index(
-
-
+    df_locator: DeltaFileLocator, materialize_bucket_count: int
+) -> int:
     digest = df_locator.digest()
     return int.from_bytes(digest, "big") % materialize_bucket_count
 
+
 @ray.remote(num_returns=3)
 def dedupe(
-
-
-
-
-
-
-
-
-
+    compaction_artifact_s3_bucket: str,
+    round_completion_info: Optional[RoundCompletionInfo],
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    object_ids: List[Any],
+    sort_keys: List[SortKey],
+    max_records_per_index_file: int,
+    num_materialize_buckets: int,
+    dedupe_task_index: int,
+    delete_old_primary_key_index: bool,
+) -> DedupeResult:
 
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     # TODO (pdames): mitigate risk of running out of memory here in cases of
     # severe skew of primary key updates in deltas
     src_file_records_obj_refs = [
-        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-
-
+        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
+    ]
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
+        f"groups for {len(src_file_records_obj_refs)} object refs..."
+    )
 
     delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
     hb_index_to_delta_file_envelopes_list = defaultdict(list)
@@ -176,8 +189,10 @@ def dedupe(
                 hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
     src_file_id_to_row_indices = defaultdict(list)
     deduped_tables = []
-    logger.info(
-
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Running {len(hb_index_to_delta_file_envelopes_list)} "
+        f"dedupe rounds..."
+    )
     for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
         logger.info(f"{dedupe_task_index}: union primary keys for hb_index: {hb_idx}")
 
@@ -186,32 +201,39 @@
             s3_bucket=compaction_artifact_s3_bucket,
             round_completion_info=round_completion_info,
             hash_bucket_index=hb_idx,
-            df_envelopes_list=dfe_list
-
-
+            df_envelopes_list=dfe_list,
+        )
+        logger.info(
+            f"[Dedupe {dedupe_task_index}] Dedupe round input "
+            f"record count: {len(table)}, took {union_time}s"
+        )
 
         # sort by sort keys
         if len(sort_keys):
             # TODO (pdames): convert to O(N) dedupe w/ sort keys
-            sort_keys.extend(
-
-
-
-
-
-
-
-                ),
-            ])
+            sort_keys.extend(
+                [
+                    SortKey.of(
+                        sc._PARTITION_STREAM_POSITION_COLUMN_NAME, SortOrder.ASCENDING
+                    ),
+                    SortKey.of(sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING),
+                ]
+            )
             table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
 
         # drop duplicates by primary key hash column
-        logger.info(
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dropping duplicates for {hb_idx}"
+        )
 
-        table, drop_time = timed_invocation(
+        table, drop_time = timed_invocation(
+            func=_drop_duplicates_by_primary_key_hash, table=table
+        )
 
-        logger.info(
-
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dedupe round output "
+            f"record count: {len(table)}, took: {drop_time}s"
+        )
 
         deduped_tables.append((hb_idx, table))
 
@@ -230,7 +252,9 @@ def dedupe(
 
     logger.info(f"Finished all dedupe rounds...")
     mat_bucket_to_src_file_record_count = defaultdict(dict)
-    mat_bucket_to_src_file_records: Dict[
+    mat_bucket_to_src_file_records: Dict[
+        MaterializeBucketIndex, DeltaFileLocatorToRecords
+    ] = defaultdict(dict)
     for src_dfl, src_row_indices in src_file_id_to_row_indices.items():
         mat_bucket = delta_file_locator_to_mat_bucket_index(
             src_dfl,
@@ -239,10 +263,11 @@
         mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
             src_row_indices,
         )
-        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = \
-            len(src_row_indices)
+        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(src_row_indices)
 
-    mat_bucket_to_dd_idx_obj_id: Dict[
+    mat_bucket_to_dd_idx_obj_id: Dict[
+        MaterializeBucketIndex, DedupeTaskIndexWithObjectId
+    ] = {}
     src_file_records_obj_refs: List[ObjectRef[DeltaFileLocatorToRecords]] = []
     for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
         object_ref = ray.put(src_file_records)
@@ -254,15 +279,17 @@
         )
         del object_ref
         del pickled_object_ref
-    logger.info(
-
+    logger.info(
+        f"Count of materialize buckets with object refs: "
+        f"{len(mat_bucket_to_dd_idx_obj_id)}"
+    )
 
     write_pki_result: PyArrowWriteResult = _write_new_primary_key_index(
         compaction_artifact_s3_bucket,
         new_primary_key_index_version_locator,
         max_records_per_index_file,
         dedupe_task_index,
-        deduped_tables
+        deduped_tables,
     )
 
     if delete_old_primary_key_index:
@@ -271,6 +298,4 @@ def dedupe(
             round_completion_info.primary_key_index_version_locator,
         )
     logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
-    return mat_bucket_to_dd_idx_obj_id, \
-        src_file_records_obj_refs, \
-        write_pki_result
+    return mat_bucket_to_dd_idx_obj_id, src_file_records_obj_refs, write_pki_result
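The new `_drop_duplicates_by_primary_key_hash` above applies last-writer-wins dedupe: it walks the primary-key-hash and delta-type columns in stream-position order, remembers the last row index seen for each key on UPSERT, forgets it on DELETE, and finally takes the surviving rows. A minimal sketch of that pattern, assuming simplified stand-in columns (`pk_hash`, `is_upsert`) in place of deltacat's system columns:

import pyarrow as pa


def drop_duplicates_by_pk_hash(table: pa.Table) -> pa.Table:
    # Stand-in columns: the real code reads these via the compactor's
    # system_columns helpers (sc.pk_hash_column_np / sc.delta_type_column_np).
    pk_hash = table.column("pk_hash").to_pylist()
    is_upsert = table.column("is_upsert").to_pylist()
    value_to_last_row_idx = {}
    for row_idx, (pk_val, upsert) in enumerate(zip(pk_hash, is_upsert)):
        if upsert:
            # UPSERT: remember the latest row index seen for this primary key
            value_to_last_row_idx[pk_val] = row_idx
        else:
            # DELETE: forget any previously seen row for this primary key
            value_to_last_row_idx.pop(pk_val, None)
    return table.take(list(value_to_last_row_idx.values()))


table = pa.table(
    {
        "pk_hash": ["a", "b", "a", "b"],
        "is_upsert": [True, True, True, False],
        "value": [1, 2, 3, 4],
    }
)
# Keeps only the last UPSERT for "a" (value 3); "b" is deleted by its final row.
print(drop_duplicates_by_pk_hash(table).to_pydict())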
deltacat/compute/compactor/steps/hash_bucket.py
@@ -1,37 +1,37 @@
-import ray
-import pyarrow as pa
-import numpy as np
 import logging
-
-from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from itertools import chain
+from typing import Generator, List, Optional, Tuple
+
+import numpy as np
+import pyarrow as pa
+import ray
+from ray.types import ObjectRef
 
 from deltacat import logs
-from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope,
-
-from deltacat.compute.compactor.utils
-
+from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, SortKey
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.primary_key_index import (
+    group_hash_bucket_indices,
+    group_record_indices_by_hash_bucket,
+)
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import StorageType
 from deltacat.utils.common import sha1_digest
-from deltacat.compute.compactor.utils import system_columns as sc
-
-from typing import List, Optional, Generator, Tuple
-
-from ray.types import ObjectRef
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-_PK_BYTES_DELIMITER = b
+_PK_BYTES_DELIMITER = b"L6kl7u5f"
 
 HashBucketGroupToObjectId = np.ndarray
-HashBucketResult = Tuple[
+HashBucketResult = Tuple[
+    HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]
+]
 
 
 def _group_by_pk_hash_bucket(
-
-
-        primary_keys: List[str]) -> np.ndarray:
+    table: pa.Table, num_buckets: int, primary_keys: List[str]
+) -> np.ndarray:
 
     # generate the primary key digest column
     all_pk_column_fields = []
@@ -66,19 +66,17 @@ def _hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
     for field_index in range(len(all_column_fields[0])):
        bytes_to_join = []
         for column_fields in all_column_fields:
-            bytes_to_join.append(
-                bytes(str(column_fields[field_index]), "utf-8")
-            )
+            bytes_to_join.append(bytes(str(column_fields[field_index]), "utf-8"))
         yield sha1_digest(_PK_BYTES_DELIMITER.join(bytes_to_join))
 
 
 def _group_file_records_by_pk_hash_bucket(
-
-
-
-
-
-
+    annotated_delta: DeltaAnnotated,
+    num_hash_buckets: int,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[DeltaFileEnvelopeGroups]:
 
     # read input parquet s3 objects into a list of delta file envelopes
     delta_file_envelopes = _read_delta_file_envelopes(
@@ -104,18 +102,18 @@ def _group_file_records_by_pk_hash_bucket(
                 hb_to_delta_file_envelopes[hb] = []
             hb_to_delta_file_envelopes[hb].append(
                 DeltaFileEnvelope.of(
-                    dfe.stream_position,
-
-
-                    table))
+                    dfe.stream_position, dfe.file_index, dfe.delta_type, table
+                )
+            )
     return hb_to_delta_file_envelopes
 
+
 def _read_delta_file_envelopes(
-
-
-
-
-
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[List[DeltaFileEnvelope]]:
 
     columns_to_read = list(chain(primary_keys, sort_key_names))
     tables = deltacat_storage.download_delta(
@@ -125,10 +123,12 @@ def _read_delta_file_envelopes(
         storage_type=StorageType.LOCAL,
     )
     annotations = annotated_delta.annotations
-    assert
-
-
-
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
     if not tables:
         return None
 
@@ -146,12 +146,13 @@
 
 @ray.remote(num_returns=2)
 def hash_bucket(
-
-
-
-
-
-
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_keys: List[SortKey],
+    num_buckets: int,
+    num_groups: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> HashBucketResult:
 
     logger.info(f"Starting hash bucket task...")
     sort_key_names = [key.key_name for key in sort_keys]