deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +188 -218
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +259 -316
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +152 -259
- deltacat/compute/compactor/steps/hash_bucket.py +57 -73
- deltacat/compute/compactor/steps/materialize.py +138 -99
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -231
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +38 -32
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat/autoscaler/events/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/cluster.py +0 -82
- deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
- deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
- deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
- deltacat/autoscaler/events/compaction/input.py +0 -27
- deltacat/autoscaler/events/compaction/process.py +0 -25
- deltacat/autoscaler/events/compaction/session_manager.py +0 -13
- deltacat/autoscaler/events/compaction/utils.py +0 -216
- deltacat/autoscaler/events/compaction/workflow.py +0 -303
- deltacat/autoscaler/events/dispatcher.py +0 -95
- deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
- deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
- deltacat/autoscaler/events/event_store.py +0 -55
- deltacat/autoscaler/events/exceptions.py +0 -6
- deltacat/autoscaler/events/processor.py +0 -177
- deltacat/autoscaler/events/session_manager.py +0 -25
- deltacat/autoscaler/events/states.py +0 -88
- deltacat/autoscaler/events/workflow.py +0 -54
- deltacat/autoscaler/node_group.py +0 -230
- deltacat/autoscaler/utils.py +0 -69
- deltacat-0.1.8.dist-info/RECORD +0 -131
- /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/dedupe.py

@@ -1,26 +1,29 @@
 import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
 import pyarrow as pa
-import ray
-import time
 import pyarrow.compute as pc
-import
-from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
+import ray
 from ray import cloudpickle
 from ray.types import ObjectRef
 
 from deltacat import logs
-from
-
-
-
-
-
-
-
-
-from
-from
+from deltacat.compute.compactor import (
+    DeltaFileEnvelope,
+    DeltaFileLocator,
+    PrimaryKeyIndexVersionLocator,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+    SortKey,
+    SortOrder,
+)
+from deltacat.compute.compactor.utils import primary_key_index as pki
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -32,18 +35,21 @@ DedupeTaskIndexWithObjectId = Tuple[DedupeTaskIndex, PickledObjectRef]
 DedupeResult = Tuple[
     Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId],
     List[ObjectRef[DeltaFileLocatorToRecords]],
-    PyArrowWriteResult
+    PyArrowWriteResult,
 ]
 
 
-def
-
-
-
-
+def _union_primary_key_indices(
+    s3_bucket: str,
+    round_completion_info: RoundCompletionInfo,
+    hash_bucket_index: int,
+    df_envelopes_list: List[List[DeltaFileEnvelope]],
+) -> pa.Table:
 
-    logger.info(
-
+    logger.info(
+        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
+        f"{len(df_envelopes_list)} delta file envelope lists..."
+    )
     # read compacted input parquet files first
     # (which implicitly have older stream positions than deltas)
     hb_tables = []
@@ -53,37 +59,15 @@ def union_primary_key_indices(
         hash_bucket_index,
         round_completion_info.primary_key_index_version_locator,
         # Enforce consistent column ordering by reading from a schema, to prevent schema mismatch errors
-        file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+        file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+            schema=get_minimal_hb_schema()
+        ),
     )
     if tables:
-        prev_compacted_delta_stream_pos = round_completion_info\
-            .compacted_delta_locator \
-            .stream_position
-        if prev_compacted_delta_stream_pos is None:
-            raise ValueError(f"Unexpected Error: No previous compacted "
-                             f"delta stream position found in round "
-                             f"completion info: {round_completion_info}")
         prior_pk_index_table = pa.concat_tables(tables)
-
-
-
-                prev_compacted_delta_stream_pos,
-                len(prior_pk_index_table),
-            ),
-        )
-        prior_pk_index_table = sc.append_delta_type_col(
-            prior_pk_index_table,
-            repeat(
-                sc.delta_type_to_field(DeltaType.UPSERT),
-                len(prior_pk_index_table),
-            )
-        )
-        prior_pk_index_table = sc.append_is_source_col(
-            prior_pk_index_table,
-            repeat(
-                False,
-                len(prior_pk_index_table),
-            )
+        logger.info(
+            f"Number of records in prior primary index for hash bucket"
+            f" {hash_bucket_index}: {prior_pk_index_table.num_rows}"
         )
         hb_tables.append(prior_pk_index_table)
 
@@ -99,114 +83,56 @@ def union_primary_key_indices(
 
     hb_table = pa.concat_tables(hb_tables)
 
+    logger.info(
+        f"Total records in hash bucket {hash_bucket_index} is {hb_table.num_rows}"
+    )
     return hb_table
 
 
-def
+def _drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
     value_to_last_row_idx = {}
-
-
-
-
+
+    pk_hash_np = sc.pk_hash_column_np(table)
+    op_type_np = sc.delta_type_column_np(table)
+
+    assert len(pk_hash_np) == len(op_type_np), (
+        f"Primary key digest column length ({len(pk_hash_np)}) doesn't "
+        f"match delta type column length ({len(op_type_np)})."
     )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # TODO(raghumdani): move the dedupe to C++ using arrow methods or similar.
+    row_idx = 0
+    pk_op_val_iter = zip(pk_hash_np, op_type_np)
+    for (pk_val, op_val) in pk_op_val_iter:
+
+        # operation type is True for `UPSERT` and False for `DELETE`
+        if op_val:
+            # UPSERT this row
+            value_to_last_row_idx[pk_val] = row_idx
+        else:
+            # DELETE this row
+            value_to_last_row_idx.pop(pk_val, None)
+
+        row_idx += 1
+
     return table.take(list(value_to_last_row_idx.values()))
 
 
-def
-
-
-
-
-
-
-
-
-
-
-    logger.info(f"Writing new deduped primary key index: "
-                f"{new_primary_key_index_version_locator}")
-    # TODO (pdames): move to RecordCountsPendingMaterialize.finalize()?
-    file_idx = 0
-    prev_file_idx = 0
-    dest_file_indices = defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(int)
-        )
-    )
-    dest_file_row_indices = defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(int)
-        )
+def _write_new_primary_key_index(
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_rows_per_index_file: int,
+    dedupe_task_index: int,
+    deduped_tables: List[Tuple[int, pa.Table]],
+) -> PyArrowWriteResult:
+
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Writing new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}"
     )
-    for mat_bucket in sorted(row_counts.keys()):
-        mat_bucket_row_idx = 0
-        sorted_src_dfls = sorted(row_counts[mat_bucket].keys())
-        for src_dfl in sorted_src_dfls:
-            sorted_dd_tasks = sorted(row_counts[mat_bucket][src_dfl].keys())
-            for dd_task_idx in sorted_dd_tasks:
-                dest_file_row_indices[mat_bucket][src_dfl][dd_task_idx] = \
-                    mat_bucket_row_idx % max_rows_per_mat_file
-                file_idx = prev_file_idx + int(
-                    mat_bucket_row_idx / max_rows_per_mat_file
-                )
-                dest_file_indices[mat_bucket][src_dfl][dd_task_idx] = file_idx
-                row_count = row_counts[mat_bucket][src_dfl][dd_task_idx]
-                mat_bucket_row_idx += row_count
-            prev_file_idx = file_idx + 1
 
     pki_results = []
-    src_dfl_row_counts = defaultdict(int)
     for hb_index, table in deduped_tables:
-        is_source_col = sc.is_source_column_np(table)
-        stream_pos_col = sc.stream_position_column_np(table)
-        file_idx_col = sc.file_index_column_np(table)
-        dest_file_idx_col = []
-        dest_file_row_idx_col = []
-        for row_idx in range(len(table)):
-            src_dfl = DeltaFileLocator.of(
-                is_source_col[row_idx],
-                stream_pos_col[row_idx],
-                file_idx_col[row_idx],
-            )
-            mat_bucket = delta_file_locator_to_mat_bucket_index(
-                src_dfl,
-                num_materialize_buckets,
-            )
-            dest_file_start_idx = \
-                dest_file_indices[mat_bucket][src_dfl][dedupe_task_index]
-            dest_file_row_idx_offset = src_dfl_row_counts[src_dfl] + \
-                dest_file_row_indices[mat_bucket][src_dfl][dedupe_task_index]
-            dest_file_idx_offset = int(
-                dest_file_row_idx_offset / max_rows_per_mat_file
-            )
-            dest_file_idx = dest_file_start_idx + dest_file_idx_offset
-            dest_file_idx_col.append(dest_file_idx)
-            dest_file_row_idx = dest_file_row_idx_offset % max_rows_per_mat_file
-            dest_file_row_idx_col.append(dest_file_row_idx)
-            src_dfl_row_counts[src_dfl] += 1
-        table = table.drop([
-            sc._IS_SOURCE_COLUMN_NAME,
-            sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
-            sc._ORDERED_FILE_IDX_COLUMN_NAME,
-            sc._ORDERED_RECORD_IDX_COLUMN_NAME,
-        ])
-        table = sc.append_file_idx_column(table, dest_file_idx_col)
-        table = sc.append_record_idx_col(table, dest_file_row_idx_col)
-
         hb_pki_result = pki.write_primary_key_index_files(
             table,
             new_primary_key_index_version_locator,
@@ -217,77 +143,44 @@ def write_new_primary_key_index(
         pki_results.append(hb_pki_result)
 
     result = PyArrowWriteResult.union(pki_results)
-    logger.info(
-
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Wrote new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}. Result: {result}"
+    )
     return result
 
 
 def delta_file_locator_to_mat_bucket_index(
-
-
+    df_locator: DeltaFileLocator, materialize_bucket_count: int
+) -> int:
     digest = df_locator.digest()
     return int.from_bytes(digest, "big") % materialize_bucket_count
 
 
-@ray.remote(num_cpus=0.1)
-class RecordCountsPendingMaterialize:
-    def __init__(self, expected_result_count: int):
-        # materialize_bucket -> src_file_id
-        self.record_counts = defaultdict(
-            # delta_file_locator -> dedupe task index
-            lambda: defaultdict(
-                # dedupe task index -> row count
-                lambda: defaultdict(int)
-            )
-        )
-        self.expected_result_count = expected_result_count
-        self.actual_result_count = 0
-
-    def add_record_counts(
-            self,
-            result_idx: int,
-            record_counts:
-            Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32], int]]) -> None:
-        for mat_bucket, df_locator_rows in record_counts.items():
-            for df_locator, rows in df_locator_rows.items():
-                self.record_counts[mat_bucket][df_locator][result_idx] += rows
-        self.actual_result_count += 1
-
-    def get_record_counts(self) -> \
-            Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32],
-                           Dict[int, int]]]:
-        return self.record_counts
-
-    def get_expected_result_count(self) -> int:
-        return self.expected_result_count
-
-    def get_actual_result_count(self) -> int:
-        return self.actual_result_count
-
-    def is_finalized(self) -> bool:
-        return self.actual_result_count == self.expected_result_count
-
-
 @ray.remote(num_returns=3)
 def dedupe(
-
-
-
-
-
-
-
-
-
-
-
-
-    logger.info(f"Starting dedupe task...")
+    compaction_artifact_s3_bucket: str,
+    round_completion_info: Optional[RoundCompletionInfo],
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    object_ids: List[Any],
+    sort_keys: List[SortKey],
+    max_records_per_index_file: int,
+    num_materialize_buckets: int,
+    dedupe_task_index: int,
+    delete_old_primary_key_index: bool,
+) -> DedupeResult:
+
+    logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     # TODO (pdames): mitigate risk of running out of memory here in cases of
     # severe skew of primary key updates in deltas
     src_file_records_obj_refs = [
-        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-
+        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
+    ]
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
+        f"groups for {len(src_file_records_obj_refs)} object refs..."
+    )
+
     delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
     hb_index_to_delta_file_envelopes_list = defaultdict(list)
     for delta_file_envelope_groups in delta_file_envelope_groups_list:
@@ -296,36 +189,51 @@ def dedupe(
                 hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
     src_file_id_to_row_indices = defaultdict(list)
     deduped_tables = []
-    logger.info(
-
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Running {len(hb_index_to_delta_file_envelopes_list)} "
+        f"dedupe rounds..."
+    )
     for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
-
-
-
-
-
+        logger.info(f"{dedupe_task_index}: union primary keys for hb_index: {hb_idx}")
+
+        table, union_time = timed_invocation(
+            func=_union_primary_key_indices,
+            s3_bucket=compaction_artifact_s3_bucket,
+            round_completion_info=round_completion_info,
+            hash_bucket_index=hb_idx,
+            df_envelopes_list=dfe_list,
+        )
+        logger.info(
+            f"[Dedupe {dedupe_task_index}] Dedupe round input "
+            f"record count: {len(table)}, took {union_time}s"
         )
-        logger.info(f"Dedupe round input record count: {len(table)}")
 
         # sort by sort keys
         if len(sort_keys):
             # TODO (pdames): convert to O(N) dedupe w/ sort keys
-            sort_keys.extend(
-
-
-
-
-
-
-
-                ),
-            ])
+            sort_keys.extend(
+                [
+                    SortKey.of(
+                        sc._PARTITION_STREAM_POSITION_COLUMN_NAME, SortOrder.ASCENDING
+                    ),
+                    SortKey.of(sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING),
+                ]
+            )
             table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
 
         # drop duplicates by primary key hash column
-
-
-
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dropping duplicates for {hb_idx}"
+        )
+
+        table, drop_time = timed_invocation(
+            func=_drop_duplicates_by_primary_key_hash, table=table
+        )
+
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dedupe round output "
+            f"record count: {len(table)}, took: {drop_time}s"
        )
 
         deduped_tables.append((hb_idx, table))
 
@@ -344,7 +252,9 @@ def dedupe(
 
     logger.info(f"Finished all dedupe rounds...")
     mat_bucket_to_src_file_record_count = defaultdict(dict)
-    mat_bucket_to_src_file_records: Dict[
+    mat_bucket_to_src_file_records: Dict[
+        MaterializeBucketIndex, DeltaFileLocatorToRecords
+    ] = defaultdict(dict)
     for src_dfl, src_row_indices in src_file_id_to_row_indices.items():
         mat_bucket = delta_file_locator_to_mat_bucket_index(
             src_dfl,
@@ -353,48 +263,33 @@ def dedupe(
         mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
             src_row_indices,
         )
-        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] =
-            len(src_row_indices)
+        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(src_row_indices)
 
-    mat_bucket_to_dd_idx_obj_id: Dict[
+    mat_bucket_to_dd_idx_obj_id: Dict[
+        MaterializeBucketIndex, DedupeTaskIndexWithObjectId
+    ] = {}
     src_file_records_obj_refs: List[ObjectRef[DeltaFileLocatorToRecords]] = []
     for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
         object_ref = ray.put(src_file_records)
-
+        pickled_object_ref = cloudpickle.dumps(object_ref)
+        src_file_records_obj_refs.append(pickled_object_ref)
         mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
             dedupe_task_index,
-
+            pickled_object_ref,
         )
-
-
-
-    record_counts_pending_materialize.add_record_counts.remote(
-        dedupe_task_index,
-        mat_bucket_to_src_file_record_count,
-    )
-
-    # wait for all dedupe tasks to reach this point before continuing
+        del object_ref
+        del pickled_object_ref
     logger.info(
-        f"
-
-    while not finalized:
-        finalized = ray.get(
-            record_counts_pending_materialize.is_finalized.remote()
-        )
-        time.sleep(0.25)
-    record_counts = ray.get(
-        record_counts_pending_materialize.get_record_counts.remote()
+        f"Count of materialize buckets with object refs: "
+        f"{len(mat_bucket_to_dd_idx_obj_id)}"
     )
 
-    write_pki_result: PyArrowWriteResult =
+    write_pki_result: PyArrowWriteResult = _write_new_primary_key_index(
         compaction_artifact_s3_bucket,
         new_primary_key_index_version_locator,
         max_records_per_index_file,
-        max_records_per_materialized_file,
-        num_materialize_buckets,
         dedupe_task_index,
         deduped_tables,
-        record_counts,
     )
 
     if delete_old_primary_key_index:
@@ -402,7 +297,5 @@ def dedupe(
             compaction_artifact_s3_bucket,
             round_completion_info.primary_key_index_version_locator,
         )
-    logger.info(f"Finished dedupe task...")
-    return mat_bucket_to_dd_idx_obj_id,
-        src_file_records_obj_refs, \
-        write_pki_result
+    logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
+    return mat_bucket_to_dd_idx_obj_id, src_file_records_obj_refs, write_pki_result