deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/compaction_session.py

@@ -1,51 +1,66 @@
-import logging
 import functools
-import
-
+import logging
 from collections import defaultdict
+from typing import Dict, List, Optional, Set, Tuple
+
+import pyarrow as pa
+import ray
 
 from deltacat import logs
+from deltacat.compute.compactor import (
+    PrimaryKeyIndexLocator,
+    PrimaryKeyIndexMeta,
+    PrimaryKeyIndexVersionLocator,
+    PrimaryKeyIndexVersionMeta,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+    SortKey,
+)
+from deltacat.compute.compactor.steps import dedupe as dd
+from deltacat.compute.compactor.steps import hash_bucket as hb
+from deltacat.compute.compactor.steps import materialize as mat
+from deltacat.compute.compactor.utils import io
+from deltacat.compute.compactor.utils import primary_key_index as pki
+from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.stats.models.delta_stats import DeltaStats
-from deltacat.storage import Delta, DeltaLocator, Partition,
-
-from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-    round_robin_options_provider
-from deltacat.utils.ray_utils.runtime import live_node_resource_keys
-from deltacat.compute.compactor.steps import hash_bucket as hb, dedupe as dd, \
-    materialize as mat
-from deltacat.compute.compactor import SortKey, PrimaryKeyIndexMeta, \
-    PrimaryKeyIndexLocator, PrimaryKeyIndexVersionMeta, \
-    PrimaryKeyIndexVersionLocator, RoundCompletionInfo, \
-    PyArrowWriteResult
-from deltacat.compute.compactor.utils import round_completion_file as rcf, io, \
-    primary_key_index as pki
+from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import ContentType
 from deltacat.utils.placement import PlacementGroupConfig
-from
+from deltacat.utils.ray_utils.concurrency import (
+    invoke_parallel,
+    round_robin_options_provider,
+)
+from deltacat.utils.ray_utils.runtime import live_node_resource_keys
 
-import pyarrow as pa
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 _PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"
 
 
 def check_preconditions(
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    compacted_partition_locator: PartitionLocator,
+    sort_keys: List[SortKey],
+    max_records_per_output_file: int,
+    new_hash_bucket_count: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> int:
+
+    assert (
+        source_partition_locator.partition_values
+        == compacted_partition_locator.partition_values
+    ), (
+        "In-place compaction must use the same partition values for the "
         "source and destination."
-
-
+    )
+    assert (
+        max_records_per_output_file >= 1
+    ), "Max records per output file must be a positive value"
     if new_hash_bucket_count is not None:
-        assert
-
+        assert (
+            new_hash_bucket_count >= 1
+        ), "New hash bucket count must be a positive value"
     return SortKey.validate_sort_keys(
         source_partition_locator,
         sort_keys,
@@ -54,95 +69,110 @@ def check_preconditions(
 
 
 def compact_partition(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    destination_partition_locator: PartitionLocator,
+    primary_keys: Set[str],
+    compaction_artifact_s3_bucket: str,
+    last_stream_position_to_compact: int,
+    *,
+    hash_bucket_count: Optional[int] = None,
+    sort_keys: List[SortKey] = None,
+    records_per_primary_key_index_file: int = 38_000_000,
+    records_per_compacted_file: int = 4_000_000,
+    input_deltas_stats: Dict[int, DeltaStats] = None,
+    min_pk_index_pa_bytes: int = 0,
+    min_hash_bucket_chunk_size: int = 0,
+    compacted_file_content_type: ContentType = ContentType.PARQUET,
+    delete_prev_primary_key_index: bool = False,
+    pg_config: Optional[PlacementGroupConfig] = None,
+    schema_on_read: Optional[
+        pa.schema
+    ] = None,  # TODO (ricmiyam): Remove this and retrieve schema from storage API
+    rebase_source_partition_locator: Optional[PartitionLocator] = None,
+    rebase_source_partition_high_watermark: Optional[int] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[str]:
 
     logger.info(f"Starting compaction session for: {source_partition_locator}")
     partition = None
     compaction_rounds_executed = 0
     has_next_compaction_round = True
+    new_rcf_s3_url = None
     while has_next_compaction_round:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        (
+            has_next_compaction_round,
+            new_partition,
+            new_rci,
+            new_rcf_s3_url,
+        ) = _execute_compaction_round(
+            source_partition_locator,
+            destination_partition_locator,
+            primary_keys,
+            compaction_artifact_s3_bucket,
+            last_stream_position_to_compact,
+            hash_bucket_count,
+            sort_keys,
+            records_per_primary_key_index_file,
+            records_per_compacted_file,
+            input_deltas_stats,
+            min_pk_index_pa_bytes,
+            min_hash_bucket_chunk_size,
+            compacted_file_content_type,
+            delete_prev_primary_key_index,
+            pg_config,
+            schema_on_read,
+            rebase_source_partition_locator,
+            rebase_source_partition_high_watermark,
+            deltacat_storage,
+        )
         if new_partition:
             partition = new_partition
-
+            destination_partition_locator = new_partition.locator
             compaction_rounds_executed += 1
         # Take new primary key index sizes into account for subsequent compaction rounds and their dedupe steps
         if new_rci:
             min_pk_index_pa_bytes = new_rci.pk_index_pyarrow_write_result.pyarrow_bytes
 
-    logger.info(
-
+    logger.info(
+        f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
+        f"{compaction_rounds_executed} rounds."
+    )
     if partition:
         logger.info(f"Committing compacted partition to: {partition.locator}")
         partition = deltacat_storage.commit_partition(partition)
         logger.info(f"Committed compacted partition: {partition}")
     logger.info(f"Completed compaction session for: {source_partition_locator}")
+    return new_rcf_s3_url
 
 
 def _execute_compaction_round(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    compacted_partition_locator: PartitionLocator,
+    primary_keys: Set[str],
+    compaction_artifact_s3_bucket: str,
+    last_stream_position_to_compact: int,
+    new_hash_bucket_count: Optional[int],
+    sort_keys: List[SortKey],
+    records_per_primary_key_index_file: int,
+    records_per_compacted_file: int,
+    input_deltas_stats: Dict[int, DeltaStats],
+    min_pk_index_pa_bytes: int,
+    min_hash_bucket_chunk_size: int,
+    compacted_file_content_type: ContentType,
+    delete_prev_primary_key_index: bool,
+    pg_config: Optional[PlacementGroupConfig],
+    schema_on_read: Optional[pa.schema],
+    rebase_source_partition_locator: Optional[PartitionLocator],
+    rebase_source_partition_high_watermark: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
 
     if not primary_keys:
         # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
         # with normalized manifest entry sizes
         raise NotImplementedError(
-            "Compaction only supports tables with 1 or more primary keys"
+            "Compaction only supports tables with 1 or more primary keys"
+        )
     if sort_keys is None:
         sort_keys = []
     # TODO (pdames): detect and handle schema evolution (at least ensure that
@@ -166,23 +196,25 @@ def _execute_compaction_round(
     cluster_resources = ray.cluster_resources()
     logger.info(f"Total cluster resources: {cluster_resources}")
     node_resource_keys = None
-    if pg_config:
+    if pg_config:  # use resource in each placement group
         cluster_resources = pg_config.resource
-        cluster_cpus = cluster_resources[
-    else:
+        cluster_cpus = cluster_resources["CPU"]
+    else:  # use all cluster resource
         logger.info(f"Available cluster resources: {ray.available_resources()}")
         cluster_cpus = int(cluster_resources["CPU"])
         logger.info(f"Total cluster CPUs: {cluster_cpus}")
         node_resource_keys = live_node_resource_keys()
-        logger.info(
-
+        logger.info(
+            f"Found {len(node_resource_keys)} live cluster nodes: "
+            f"{node_resource_keys}"
+        )
 
     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
     round_robin_opt_provider = functools.partial(
         round_robin_options_provider,
         resource_keys=node_resource_keys,
-        pg_config
+        pg_config=pg_config.opts if pg_config else None,
     )
 
     # assign a distinct index to each node in the cluster
@@ -206,16 +238,20 @@ def _execute_compaction_round(
         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
     )
     compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-        compatible_primary_key_index_meta
-
+        compatible_primary_key_index_meta
+    )
+    compatible_primary_key_index_root_path = (
         compatible_primary_key_index_locator.primary_key_index_root_path
+    )
 
     # read the results from any previously completed compaction round that used
     # a compatible primary key index
     round_completion_info = None
-    if
-        logger.info(
-
+    if not rebase_source_partition_locator:
+        logger.info(
+            f"Reading round completion file for compatible "
+            f"primary key index root path: {compatible_primary_key_index_root_path}"
+        )
         round_completion_info = rcf.read_round_completion_file(
             compaction_artifact_s3_bucket,
             source_partition_locator,
@@ -226,21 +262,34 @@ def _execute_compaction_round(
     # read the previous compaction round's hash bucket count, if any
     old_hash_bucket_count = None
     if round_completion_info:
-        old_pki_version_locator =
-            .primary_key_index_version_locator
-
-
-            .hash_bucket_count
-
+        old_pki_version_locator = (
+            round_completion_info.primary_key_index_version_locator
+        )
+        old_hash_bucket_count = (
+            old_pki_version_locator.primary_key_index_version_meta.hash_bucket_count
+        )
+        min_pk_index_pa_bytes = (
+            round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+        )
+    else:
+        logger.info(
+            f"No prior round info read. Source partition: "
+            f"{source_partition_locator}. Primary key index locator: "
+            f"{compatible_primary_key_index_locator}. Rebase source "
+            f"partition locator: {rebase_source_partition_locator}"
+        )
 
     # use the new hash bucket count if provided, or fall back to old count
-    hash_bucket_count =
-
+    hash_bucket_count = (
+        new_hash_bucket_count
+        if new_hash_bucket_count is not None
         else old_hash_bucket_count
+    )
 
     # discover input delta files
-    high_watermark =
-        if round_completion_info else None
+    high_watermark = (
+        round_completion_info.high_watermark if round_completion_info else None
+    )
 
     input_deltas = io.discover_deltas(
         source_partition_locator,
@@ -251,25 +300,29 @@ def _execute_compaction_round(
 
     if not input_deltas:
         logger.info("No input deltas found to compact.")
-        return False, None, None
+        return False, None, None, None
 
     # limit the input deltas to fit on this cluster and convert them to
    # annotated deltas of equivalent size for easy parallel distribution
 
-
-
-
-
-
-
-
-
-
-
+    (
+        uniform_deltas,
+        hash_bucket_count,
+        last_stream_position_compacted,
+    ) = io.limit_input_deltas(
+        input_deltas,
+        cluster_resources,
+        hash_bucket_count,
+        min_pk_index_pa_bytes,
+        min_hash_bucket_chunk_size,
+        input_deltas_stats=input_deltas_stats,
+        deltacat_storage=deltacat_storage,
+    )
 
-    assert hash_bucket_count is not None and hash_bucket_count > 0,
-        f"
-        f"
+    assert hash_bucket_count is not None and hash_bucket_count > 0, (
+        f"Expected hash bucket count to be a positive integer, but found "
+        f"`{hash_bucket_count}`"
+    )
 
     # rehash the primary key index if necessary
     if round_completion_info:
@@ -277,8 +330,8 @@ def _execute_compaction_round(
         # the previous primary key index is compatible with the current, but
         # will need to be rehashed if the hash bucket count has changed
         if hash_bucket_count != old_hash_bucket_count:
-            # TODO(draghave): manually test the path after prior primary key
-            #
+            # TODO(draghave): manually test the path after prior primary key
+            # index was already built
             round_completion_info = pki.rehash(
                 round_robin_opt_provider,
                 compaction_artifact_s3_bucket,
@@ -289,10 +342,6 @@ def _execute_compaction_round(
                 records_per_primary_key_index_file,
                 delete_prev_primary_key_index,
             )
-    else:
-        logger.info(f"No prior round completion file found. Source partition: "
-                    f"{source_partition_locator}. Primary key index locator: "
-                    f"{compatible_primary_key_index_locator}")
 
     # parallel step 1:
     # group like primary keys together by hashing them into buckets
@@ -315,7 +364,7 @@ def _execute_compaction_round(
     for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
         if object_id:
             all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-    hash_group_count =
+    hash_group_count = len(all_hash_group_idx_to_obj_id)
     logger.info(f"Hash bucket groups created: {hash_group_count}")
 
     # TODO (pdames): when resources are freed during the last round of hash
@@ -343,9 +392,11 @@ def _execute_compaction_round(
         _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
     )
     new_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-        new_primary_key_index_meta
-
-
+        new_primary_key_index_meta
+    )
+    new_primary_key_index_root_path = (
+        new_primary_key_index_locator.primary_key_index_root_path
+    )
 
     # generate a new primary key index version locator for this round
     new_primary_key_index_version_meta = PrimaryKeyIndexVersionMeta.of(
@@ -353,8 +404,8 @@ def _execute_compaction_round(
         hash_bucket_count,
     )
     new_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_primary_key_index_version_meta
-
+        new_primary_key_index_version_meta
+    )
 
     # parallel step 2:
     # discover records with duplicate primary keys in each hash bucket, and
@@ -366,30 +417,34 @@ def _execute_compaction_round(
         ray_task=dd.dedupe,
         max_parallelism=max_parallelism,
         options_provider=round_robin_opt_provider,
-        kwargs_provider=lambda index, item: {
-
+        kwargs_provider=lambda index, item: {
+            "dedupe_task_index": index,
+            "object_ids": item,
+        },
         compaction_artifact_s3_bucket=compaction_artifact_s3_bucket,
         round_completion_info=round_completion_info,
         new_primary_key_index_version_locator=new_pki_version_locator,
         sort_keys=sort_keys,
         max_records_per_index_file=records_per_primary_key_index_file,
         num_materialize_buckets=num_materialize_buckets,
-        delete_old_primary_key_index=delete_prev_primary_key_index
+        delete_old_primary_key_index=delete_prev_primary_key_index,
     )
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
     dd_results = ray.get([t[0] for t in dd_tasks_pending])
     logger.info(f"Got {len(dd_results)} dedupe results.")
     all_mat_buckets_to_obj_id = defaultdict(list)
     for mat_bucket_idx_to_obj_id in dd_results:
-        for
-
+        for (
+            bucket_idx,
+            dd_task_index_and_object_id_tuple,
+        ) in mat_bucket_idx_to_obj_id.items():
             all_mat_buckets_to_obj_id[bucket_idx].append(
-                dd_task_index_and_object_id_tuple
+                dd_task_index_and_object_id_tuple
+            )
     logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
     pki_stats = ray.get([t[2] for t in dd_tasks_pending])
     logger.info(f"Got {len(pki_stats)} dedupe result stat(s).")
-    logger.info(f"Materialize buckets created: "
-                f"{len(all_mat_buckets_to_obj_id)}")
+    logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
 
     # TODO(pdames): when resources are freed during the last round of deduping
     # start running materialize tasks that read materialization source file
@@ -408,9 +463,9 @@ def _execute_compaction_round(
         ray_task=mat.materialize,
         max_parallelism=max_parallelism,
         options_provider=round_robin_opt_provider,
-        kwargs_provider=lambda index,
-            "mat_bucket_index":
-            "dedupe_task_idx_and_obj_id_tuples":
+        kwargs_provider=lambda index, mat_bucket_index_to_obj_id: {
+            "mat_bucket_index": mat_bucket_index_to_obj_id[0],
+            "dedupe_task_idx_and_obj_id_tuples": mat_bucket_index_to_obj_id[1],
         },
         schema=schema_on_read,
         round_completion_info=round_completion_info,
@@ -435,24 +490,40 @@ def _execute_compaction_round(
         compacted_delta.stream_position,
     )
 
-
-
+    rci_high_watermark = (
+        rebase_source_partition_high_watermark
+        if rebase_source_partition_high_watermark
+        else last_stream_position_compacted
+    )
+    new_round_completion_info = RoundCompletionInfo.of(
+        rci_high_watermark,
         new_compacted_delta_locator,
-        PyArrowWriteResult.union([m.pyarrow_write_result
-                                  for m in mat_results]),
+        PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
         PyArrowWriteResult.union(pki_stats),
         bit_width_of_sort_keys,
         new_pki_version_locator,
+        rebase_source_partition_locator
+        or round_completion_info.rebase_source_partition_locator,
     )
-
+    rcf_source_partition_locator = (
+        rebase_source_partition_locator
+        if rebase_source_partition_locator
+        else source_partition_locator
+    )
+    round_completion_file_s3_url = rcf.write_round_completion_file(
         compaction_artifact_s3_bucket,
-
+        rcf_source_partition_locator,
         new_primary_key_index_root_path,
-
+        new_round_completion_info,
+    )
+    logger.info(
+        f"partition-{source_partition_locator.partition_values},"
+        f"compacted at: {last_stream_position_compacted},"
+        f"last position: {last_stream_position_to_compact}"
+    )
+    return (
+        (last_stream_position_compacted < last_stream_position_to_compact),
+        partition,
+        new_round_completion_info,
+        round_completion_file_s3_url,
     )
-    logger.info(f"partition-{source_partition_locator.partition_values},compacted at:{last_stream_position_compacted}, last position:{last_stream_position_to_compact}")
-    return \
-        (last_stream_position_compacted < last_stream_position_to_compact), \
-        partition, \
-        round_completion_info
-