deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +2 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/sort_key.py

```diff
@@ -9,6 +9,8 @@ def validate_sort_keys(
     source_partition_locator: PartitionLocator,
     sort_keys: List[SortKey],
     deltacat_storage,
+    deltacat_storage_kwargs,
+    **kwargs,
 ) -> int:
     """
     Validates the input sort keys to ensure that they are unique, are using
@@ -16,6 +18,8 @@ def validate_sort_keys(
     sum of bit widths across sort key data types is less-than-or-equal-to
     256. Returns the sum of bit widths across all sort keys.
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     total_sort_keys_bit_width = 0
     if sort_keys:
         sort_key_names = [key.key_name for key in sort_keys]
@@ -27,6 +31,7 @@ def validate_sort_keys(
             stream_locator.namespace,
             stream_locator.table_name,
             stream_locator.table_version,
+            **deltacat_storage_kwargs,
         )
         if isinstance(table_version_schema, pa.Schema):
             for sort_key_name in sort_key_names:
```
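For context, a minimal sketch of how a caller might thread the new `deltacat_storage_kwargs` argument through `validate_sort_keys`. The locator, sort key list, storage module, and kwargs are hypothetical placeholders, not values taken from this diff.

```python
# Hedged sketch: all concrete values here are illustrative, not from this release.
from deltacat.compute.compactor.utils import sort_key as sk

bit_width = sk.validate_sort_keys(
    source_partition_locator=source_partition_locator,  # hypothetical PartitionLocator
    sort_keys=sort_keys,                                 # hypothetical List[SortKey]
    deltacat_storage=deltacat_storage,                   # module implementing deltacat/storage/interface.py
    deltacat_storage_kwargs={"db_file_path": "/tmp/deltacat.db"},  # assumed example; forwarded to storage calls
)
print(f"Total sort key bit width: {bit_width}")
```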
deltacat/compute/compactor/utils/system_columns.py

```diff
@@ -22,6 +22,13 @@ _PK_HASH_COLUMN_FIELD = pa.field(
     _PK_HASH_COLUMN_TYPE,
 )
 
+_PK_HASH_STRING_COLUMN_NAME = _get_sys_col_name("hash_str")
+_PK_HASH_STRING_COLUMN_TYPE = pa.string()
+_PK_HASH_STRING_COLUMN_FIELD = pa.field(
+    _PK_HASH_STRING_COLUMN_NAME,
+    _PK_HASH_STRING_COLUMN_TYPE,
+)
+
 _DEDUPE_TASK_IDX_COLUMN_NAME = _get_sys_col_name("dedupe_task_idx")
 _DEDUPE_TASK_IDX_COLUMN_TYPE = pa.int32()
 _DEDUPE_TASK_IDX_COLUMN_FIELD = pa.field(
@@ -36,6 +43,12 @@ _PARTITION_STREAM_POSITION_COLUMN_FIELD = pa.field(
     _PARTITION_STREAM_POSITION_COLUMN_TYPE,
 )
 
+_HASH_BUCKET_IDX_COLUMN_NAME = _get_sys_col_name("hash_bucket_idx")
+_HASH_BUCKET_IDX_COLUMN_TYPE = pa.int32()
+_HASH_BUCKET_IDX_COLUMN_FIELD = pa.field(
+    _HASH_BUCKET_IDX_COLUMN_NAME, _HASH_BUCKET_IDX_COLUMN_TYPE
+)
+
 _ORDERED_FILE_IDX_COLUMN_NAME = _get_sys_col_name("file_index")
 _ORDERED_FILE_IDX_COLUMN_TYPE = pa.int32()
 _ORDERED_FILE_IDX_COLUMN_FIELD = pa.field(
@@ -76,10 +89,18 @@ def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(obj, _PK_HASH_COLUMN_TYPE)
 
 
+def get_pk_hash_string_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(obj, _PK_HASH_STRING_COLUMN_TYPE)
+
+
 def pk_hash_column_np(table: pa.Table) -> np.ndarray:
     return table[_PK_HASH_COLUMN_NAME].to_numpy()
 
 
+def pk_hash_string_column_np(table: pa.Table) -> np.ndarray:
+    return table[_PK_HASH_STRING_COLUMN_NAME].to_numpy()
+
+
 def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_PK_HASH_COLUMN_NAME]
 
@@ -143,6 +164,10 @@ def get_delta_type_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     )
 
 
+def get_hash_bucket_idx_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(obj, _HASH_BUCKET_IDX_COLUMN_TYPE)
+
+
 def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
@@ -232,6 +257,24 @@ def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
     return table
 
 
+def append_pk_hash_string_column(table: pa.Table, pk_hashes) -> pa.Table:
+
+    table = table.append_column(
+        _PK_HASH_STRING_COLUMN_FIELD, get_pk_hash_string_column_array(pk_hashes)
+    )
+    return table
+
+
+def append_hash_bucket_idx_col(table: pa.Table, hash_bucket_indexes) -> pa.Table:
+
+    table = table.append_column(
+        _HASH_BUCKET_IDX_COLUMN_FIELD,
+        get_hash_bucket_idx_column_array(hash_bucket_indexes),
+    )
+
+    return table
+
+
 def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
     table = table.append_column(
```
deltacat/compute/compactor_v2/compaction_session.py (new file, +506 lines)

```python
import importlib
from contextlib import nullcontext
import numpy as np
import functools
import logging
import ray
import time
import json
from deltacat.aws import s3u as s3_utils
import deltacat
from deltacat import logs
from deltacat.compute.compactor import (
    PyArrowWriteResult,
    RoundCompletionInfo,
)
from deltacat.compute.compactor_v2.model.merge_input import MergeInput
from deltacat.compute.compactor_v2.model.merge_result import MergeResult
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
from deltacat.compute.compactor.model.materialize_result import MaterializeResult
from deltacat.storage import (
    Delta,
    DeltaLocator,
    Partition,
)
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)
from deltacat.utils.ray_utils.concurrency import (
    invoke_parallel,
    task_resource_options_provider,
)
from deltacat.compute.compactor_v2.steps import merge as mg
from deltacat.compute.compactor_v2.steps import hash_bucket as hb
from deltacat.compute.compactor_v2.utils import io
from deltacat.compute.compactor.utils import round_completion_file as rcf

from typing import List, Optional, Tuple
from collections import defaultdict
from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)
from deltacat.utils.resources import (
    get_current_node_peak_memory_usage_in_bytes,
)
from deltacat.compute.compactor_v2.utils.task_options import (
    hash_bucket_resource_options_provider,
    merge_resource_options_provider,
)
from deltacat.utils.resources import ClusterUtilizationOverTimeRange

if importlib.util.find_spec("memray"):
    import memray


logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:

    assert (
        params.hash_bucket_count is not None and params.hash_bucket_count >= 1
    ), "hash_bucket_count is a required arg for compactor v2"

    with memray.Tracker(
        f"compaction_partition.bin"
    ) if params.enable_profiler else nullcontext(), ClusterUtilizationOverTimeRange() as cluster_util:
        (new_partition, new_rci, new_rcf_partition_locator,) = _execute_compaction(
            params,
            cluster_util=cluster_util,
            **kwargs,
        )

        logger.info(
            f"Partition-{params.source_partition_locator} -> "
            f"Compaction session data processing completed"
        )
        round_completion_file_s3_url = None
        if new_partition:
            logger.info(f"Committing compacted partition to: {new_partition.locator}")
            partition = params.deltacat_storage.commit_partition(
                new_partition, **params.deltacat_storage_kwargs
            )
            logger.info(f"Committed compacted partition: {partition}")

            round_completion_file_s3_url = rcf.write_round_completion_file(
                params.compaction_artifact_s3_bucket,
                new_rcf_partition_locator,
                new_rci,
                **params.s3_client_kwargs,
            )
        else:
            logger.warn("No new partition was committed during compaction.")

        logger.info(
            f"Completed compaction session for: {params.source_partition_locator}"
        )
        return round_completion_file_s3_url


def _execute_compaction(
    params: CompactPartitionParams, **kwargs
) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

    rcf_source_partition_locator = (
        params.rebase_source_partition_locator or params.source_partition_locator
    )

    base_audit_url = rcf_source_partition_locator.path(
        f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
    )
    audit_url = f"{base_audit_url}.json"
    logger.info(f"Compaction audit will be written to {audit_url}")
    compaction_audit = CompactionSessionAuditInfo(deltacat.__version__, audit_url)

    compaction_audit.set_hash_bucket_count(params.hash_bucket_count)

    compaction_start = time.monotonic()

    task_max_parallelism = params.task_max_parallelism

    if params.pg_config:
        logger.info(
            "pg_config specified. Tasks will be scheduled in a placement group."
        )
        cluster_resources = params.pg_config.resource
        cluster_cpus = cluster_resources["CPU"]
        cluster_memory = cluster_resources["memory"]
        task_max_parallelism = cluster_cpus
        compaction_audit.set_cluster_cpu_max(cluster_cpus)
        compaction_audit.set_total_cluster_memory_bytes(cluster_memory)

    # read the results from any previously completed compaction round
    round_completion_info = None
    high_watermark = None
    previous_compacted_delta = None

    if not params.rebase_source_partition_locator:
        round_completion_info = rcf.read_round_completion_file(
            params.compaction_artifact_s3_bucket,
            params.source_partition_locator,
            **params.s3_client_kwargs,
        )
        if not round_completion_info:
            logger.info(
                f"Both rebase partition and round completion file not found. Performing an entire backfill on source."
            )
        else:
            compacted_delta_locator = round_completion_info.compacted_delta_locator
            previous_compacted_delta = params.deltacat_storage.get_delta(
                namespace=compacted_delta_locator.namespace,
                table_name=compacted_delta_locator.table_name,
                table_version=compacted_delta_locator.table_version,
                stream_position=compacted_delta_locator.stream_position,
                include_manifest=True,
                **params.deltacat_storage_kwargs,
            )

            high_watermark = round_completion_info.high_watermark
            logger.info(f"Setting round completion high watermark: {high_watermark}")
            assert (
                params.hash_bucket_count == round_completion_info.hash_bucket_count
            ), (
                "The hash bucket count has changed. "
                "Kindly run rebase compaction and trigger incremental again. "
                f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
                f"not equal to Hash bucket count in args={params.hash_bucket_count}."
            )

        logger.info(f"Round completion file: {round_completion_info}")

    delta_discovery_start = time.monotonic()

    input_deltas = io.discover_deltas(
        params.source_partition_locator,
        params.last_stream_position_to_compact,
        params.rebase_source_partition_locator,
        params.rebase_source_partition_high_watermark,
        high_watermark,
        params.deltacat_storage,
        params.deltacat_storage_kwargs,
        params.list_deltas_kwargs,
    )

    delta_discovery_end = time.monotonic()
    compaction_audit.set_delta_discovery_time_in_seconds(
        delta_discovery_end - delta_discovery_start
    )

    s3_utils.upload(
        compaction_audit.audit_url,
        str(json.dumps(compaction_audit)),
        **params.s3_client_kwargs,
    )

    if not input_deltas:
        logger.info("No input deltas found to compact.")
        return None, None, None

    uniform_deltas = io.create_uniform_input_deltas(
        input_deltas=input_deltas,
        hash_bucket_count=params.hash_bucket_count,
        compaction_audit=compaction_audit,
        deltacat_storage=params.deltacat_storage,
        previous_inflation=params.previous_inflation,
        min_delta_bytes=params.min_delta_bytes_in_batch,
        min_file_counts=params.min_files_in_batch,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
    )

    compaction_audit.set_uniform_deltas_created(len(uniform_deltas))

    hb_options_provider = functools.partial(
        task_resource_options_provider,
        pg_config=params.pg_config,
        resource_amount_provider=hash_bucket_resource_options_provider,
        previous_inflation=params.previous_inflation,
        average_record_size_bytes=params.average_record_size_bytes,
        primary_keys=params.primary_keys,
    )

    hb_start = time.monotonic()

    hash_bucket_input_provider = lambda index, item: {
        "input": HashBucketInput.of(
            item,
            primary_keys=params.primary_keys,
            num_hash_buckets=params.hash_bucket_count,
            num_hash_groups=params.hash_group_count,
            enable_profiler=params.enable_profiler,
            metrics_config=params.metrics_config,
            read_kwargs_provider=params.read_kwargs_provider,
            object_store=params.object_store,
            deltacat_storage=params.deltacat_storage,
            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
        )
    }

    hb_tasks_pending = invoke_parallel(
        items=uniform_deltas,
        ray_task=hb.hash_bucket,
        max_parallelism=task_max_parallelism,
        options_provider=hb_options_provider,
        kwargs_provider=hash_bucket_input_provider,
    )

    hb_invoke_end = time.monotonic()

    logger.info(f"Getting {len(hb_tasks_pending)} hash bucket results...")
    hb_results: List[HashBucketResult] = ray.get(hb_tasks_pending)
    logger.info(f"Got {len(hb_results)} hash bucket results.")
    hb_end = time.monotonic()

    # we use time.time() here because time.monotonic() has no reference point
    # whereas time.time() measures epoch seconds. Hence, it will be reasonable
    # to compare time.time()s captured in different nodes.
    hb_results_retrieved_at = time.time()

    telemetry_time_hb = compaction_audit.save_step_stats(
        CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
        hb_results,
        hb_results_retrieved_at,
        hb_invoke_end - hb_start,
        hb_end - hb_start,
    )

    s3_utils.upload(
        compaction_audit.audit_url,
        str(json.dumps(compaction_audit)),
        **params.s3_client_kwargs,
    )

    all_hash_group_idx_to_obj_id = defaultdict(list)
    all_hash_group_idx_to_size_bytes = defaultdict(int)
    all_hash_group_idx_to_num_rows = defaultdict(int)
    hb_data_processed_size_bytes = np.int64(0)
    total_hb_record_count = np.int64(0)

    # initialize all hash groups
    for hb_group in range(params.hash_group_count):
        all_hash_group_idx_to_num_rows[hb_group] = 0
        all_hash_group_idx_to_obj_id[hb_group] = []
        all_hash_group_idx_to_size_bytes[hb_group] = 0

    for hb_result in hb_results:
        hb_data_processed_size_bytes += hb_result.hb_size_bytes
        total_hb_record_count += hb_result.hb_record_count

        for hash_group_index, object_id_size_tuple in enumerate(
            hb_result.hash_bucket_group_to_obj_id_tuple
        ):
            if object_id_size_tuple:
                all_hash_group_idx_to_obj_id[hash_group_index].append(
                    object_id_size_tuple[0]
                )
                all_hash_group_idx_to_size_bytes[
                    hash_group_index
                ] += object_id_size_tuple[1].item()
                all_hash_group_idx_to_num_rows[
                    hash_group_index
                ] += object_id_size_tuple[2].item()

    logger.info(
        f"Got {total_hb_record_count} hash bucket records from hash bucketing step..."
    )

    compaction_audit.set_input_records(total_hb_record_count.item())
    compaction_audit.set_hash_bucket_processed_size_bytes(
        hb_data_processed_size_bytes.item()
    )

    # create a new stream for this round
    compacted_stream_locator = params.destination_partition_locator.stream_locator
    compacted_stream = params.deltacat_storage.get_stream(
        compacted_stream_locator.namespace,
        compacted_stream_locator.table_name,
        compacted_stream_locator.table_version,
        **params.deltacat_storage_kwargs,
    )
    compacted_partition = params.deltacat_storage.stage_partition(
        compacted_stream,
        params.destination_partition_locator.partition_values,
        **params.deltacat_storage_kwargs,
    )

    # BSP Step 2: Merge
    merge_options_provider = functools.partial(
        task_resource_options_provider,
        pg_config=params.pg_config,
        resource_amount_provider=merge_resource_options_provider,
        num_hash_groups=params.hash_group_count,
        hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
        hash_group_num_rows=all_hash_group_idx_to_num_rows,
        round_completion_info=round_completion_info,
        compacted_delta=previous_compacted_delta,
        primary_keys=params.primary_keys,
        deltacat_storage=params.deltacat_storage,
        deltacat_storage_kwargs=params.deltacat_storage_kwargs,
    )

    merge_input_provider = lambda index, item: {
        "input": MergeInput.of(
            dfe_groups_refs=item[1],
            write_to_partition=compacted_partition,
            compacted_file_content_type=params.compacted_file_content_type,
            primary_keys=params.primary_keys,
            sort_keys=params.sort_keys,
            merge_task_index=index,
            hash_group_index=item[0],
            num_hash_groups=params.hash_group_count,
            max_records_per_output_file=params.records_per_compacted_file,
            enable_profiler=params.enable_profiler,
            metrics_config=params.metrics_config,
            s3_table_writer_kwargs=params.s3_table_writer_kwargs,
            read_kwargs_provider=params.read_kwargs_provider,
            round_completion_info=round_completion_info,
            object_store=params.object_store,
            deltacat_storage=params.deltacat_storage,
            deltacat_storage_kwargs=params.deltacat_storage_kwargs,
        )
    }

    merge_start = time.monotonic()

    merge_tasks_pending = invoke_parallel(
        items=all_hash_group_idx_to_obj_id.items(),
        ray_task=mg.merge,
        max_parallelism=task_max_parallelism,
        options_provider=merge_options_provider,
        kwargs_provider=merge_input_provider,
    )

    merge_invoke_end = time.monotonic()
    logger.info(f"Getting {len(merge_tasks_pending)} merge results...")
    merge_results: List[MergeResult] = ray.get(merge_tasks_pending)
    logger.info(f"Got {len(merge_results)} merge results.")

    merge_results_retrieved_at = time.time()
    merge_end = time.monotonic()

    total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
    logger.info(f"Deduped {total_dd_record_count} records...")

    telemetry_time_merge = compaction_audit.save_step_stats(
        CompactionSessionAuditInfo.MERGE_STEP_NAME,
        merge_results,
        merge_results_retrieved_at,
        merge_invoke_end - merge_start,
        merge_end - merge_start,
    )

    compaction_audit.set_records_deduped(total_dd_record_count.item())

    mat_results = []
    for merge_result in merge_results:
        mat_results.extend(merge_result.materialize_results)

    mat_results: List[MaterializeResult] = sorted(
        mat_results, key=lambda m: m.task_index
    )

    deltas = [m.delta for m in mat_results]

    hb_id_to_entry_indices_range = {}
    file_index = 0
    previous_task_index = -1

    for m in mat_results:
        assert m.pyarrow_write_result.files >= 1, "Atleast file must be materialized"
        assert m.task_index != previous_task_index, (
            "Multiple materialize results found for a " f"hash bucket: {m.task_index}"
        )

        hb_id_to_entry_indices_range[str(m.task_index)] = (
            file_index,
            file_index + m.pyarrow_write_result.files - 1,
        )

        file_index += m.pyarrow_write_result.files
        previous_task_index = m.task_index

    s3_utils.upload(
        compaction_audit.audit_url,
        str(json.dumps(compaction_audit)),
        **params.s3_client_kwargs,
    )

    mat_results = sorted(mat_results, key=lambda m: m.task_index)
    deltas = [m.delta for m in mat_results]

    # Note: An appropriate last stream position must be set
    # to avoid correctness issue.
    merged_delta = Delta.merge_deltas(
        deltas,
        stream_position=params.last_stream_position_to_compact,
    )

    record_info_msg = (
        f"Hash bucket records: {total_hb_record_count},"
        f" Deduped records: {total_dd_record_count}, "
        f" Materialized records: {merged_delta.meta.record_count}"
    )
    logger.info(record_info_msg)

    compacted_delta = params.deltacat_storage.commit_delta(
        merged_delta,
        properties=kwargs.get("properties", {}),
        **params.deltacat_storage_kwargs,
    )

    logger.info(f"Committed compacted delta: {compacted_delta}")

    compaction_end = time.monotonic()
    compaction_audit.set_compaction_time_in_seconds(compaction_end - compaction_start)

    new_compacted_delta_locator = DeltaLocator.of(
        compacted_partition.locator,
        compacted_delta.stream_position,
    )

    pyarrow_write_result = PyArrowWriteResult.union(
        [m.pyarrow_write_result for m in mat_results]
    )

    session_peak_memory = get_current_node_peak_memory_usage_in_bytes()
    compaction_audit.set_peak_memory_used_bytes_by_compaction_session_process(
        session_peak_memory
    )

    compaction_audit.save_round_completion_stats(
        mat_results, telemetry_time_hb + telemetry_time_merge
    )

    cluster_util: ClusterUtilizationOverTimeRange = kwargs.get("cluster_util")

    if cluster_util:
        compaction_audit.set_total_cpu_seconds(cluster_util.total_vcpu_seconds)
        compaction_audit.set_used_cpu_seconds(cluster_util.used_vcpu_seconds)

    s3_utils.upload(
        compaction_audit.audit_url,
        str(json.dumps(compaction_audit)),
        **params.s3_client_kwargs,
    )

    new_round_completion_info = RoundCompletionInfo.of(
        high_watermark=params.last_stream_position_to_compact,
        compacted_delta_locator=new_compacted_delta_locator,
        compacted_pyarrow_write_result=pyarrow_write_result,
        sort_keys_bit_width=params.bit_width_of_sort_keys,
        manifest_entry_copied_by_reference_ratio=compaction_audit.untouched_file_ratio,
        compaction_audit_url=audit_url,
        hash_bucket_count=params.hash_bucket_count,
        hb_index_to_entry_range=hb_id_to_entry_indices_range,
    )

    logger.info(
        f"partition-{params.source_partition_locator.partition_values},"
        f"compacted at: {params.last_stream_position_to_compact},"
    )

    return (
        compacted_partition,
        new_round_completion_info,
        rcf_source_partition_locator,
    )
```
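A hedged sketch of driving the new v2 entry point. It assumes `CompactPartitionParams.of` accepts a dict keyed by the property names the session reads (`source_partition_locator`, `hash_bucket_count`, `deltacat_storage`, ...); every concrete value below is a hypothetical placeholder, not something taken from this diff.

```python
import ray
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)
from deltacat.compute.compactor_v2.compaction_session import compact_partition

ray.init()

# Assumed shape: a dict keyed by the param names read in _execute_compaction above.
params = CompactPartitionParams.of(
    {
        "source_partition_locator": source_partition_locator,            # hypothetical PartitionLocator
        "destination_partition_locator": destination_partition_locator,  # hypothetical PartitionLocator
        "primary_keys": {"pk"},                                          # hypothetical primary key set
        "hash_bucket_count": 4,                                          # required by compactor v2
        "last_stream_position_to_compact": last_stream_position,         # hypothetical stream position
        "compaction_artifact_s3_bucket": "my-compaction-artifacts",      # hypothetical bucket
        "deltacat_storage": deltacat_storage,                            # module implementing the storage interface
        "deltacat_storage_kwargs": {},
    }
)

# Returns the S3 URL of the round completion file, or None if nothing was compacted.
rcf_url = compact_partition(params)
print(f"Round completion file: {rcf_url}")
```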
deltacat/compute/compactor_v2/constants.py (new file, +34 lines)

```python
TOTAL_BYTES_IN_SHA1_HASH = 20

PK_DELIMITER = "L6kl7u5f"

MAX_RECORDS_PER_COMPACTED_FILE = 4_000_000

# The maximum amount of delta bytes allowed in a batch.
# A single task will not process more than these many bytes
# unless a single manifest entry (non-parquet) or single row
# group (parquet) is bigger than this size.
MIN_DELTA_BYTES_IN_BATCH = 5_000_000_000

# The total number of files that can be processed in a
# batch. Hence, if there are tiny files, this value can be
# limited so that enough parallelism can be attained.
MIN_FILES_IN_BATCH = float("inf")

# The average record size in a table.
AVERAGE_RECORD_SIZE_BYTES = 1000

# Maximum parallelism for the tasks at each BSP step.
# Default is the number of vCPUs in about 168
# r5.8xlarge EC2 instances.
TASK_MAX_PARALLELISM = 5367

# The percentage of memory that needs to be estimated
# as buffer. This value will ensure the job doesn't run out
# of memory by considering buffer for uncertainities.
TOTAL_MEMORY_BUFFER_PERCENTAGE = 20

# The total size of records that will be hash bucketed at once
# Since, sorting is nlogn, we ensure that is not performed
# on a very large dataset for best performance.
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
```