deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +2 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
```diff
--- a/deltacat/compute/compactor/model/compact_partition_params.py
+++ b/deltacat/compute/compactor/model/compact_partition_params.py
@@ -1,10 +1,28 @@
 from __future__ import annotations
-
+import importlib
 import copy
 import json
 from typing import Any, Dict, List, Optional
-
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import ContentType
+from deltacat.utils.placement import PlacementGroupConfig
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.storage import (
+    interface as unimplemented_deltacat_storage,
+    PartitionLocator,
+    SortKey,
+)
+from deltacat.compute.compactor_v2.constants import (
+    MAX_RECORDS_PER_COMPACTED_FILE,
+    MIN_DELTA_BYTES_IN_BATCH,
+    MIN_FILES_IN_BATCH,
+    AVERAGE_RECORD_SIZE_BYTES,
+    TASK_MAX_PARALLELISM,
+)
+from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
+from deltacat.utils.metrics import MetricsConfig


 class CompactPartitionParams(dict):
@@ -16,104 +34,315 @@ class CompactPartitionParams(dict):
     def of(params: Optional[Dict]) -> CompactPartitionParams:
         if params is None:
             params = {}
-
-
-            "destination_partition_locator"
-        )
-
-            "last_stream_position_to_compact"
+
+        assert (
+            params.get("destination_partition_locator") is not None
+        ), "destination_partition_locator is a required arg"
+        assert (
+            params.get("last_stream_position_to_compact") is not None
+        ), "last_stream_position_to_compact is a required arg"
+        assert (
+            params.get("source_partition_locator") is not None
+        ), "source_partition_locator is a required arg"
+        assert (
+            params.get("compaction_artifact_s3_bucket") is not None
+        ), "compaction_artifact_s3_bucket is a required arg"
+
+        result = CompactPartitionParams(params)
+
+        # TODO: move defaults to single file
+        result.records_per_compacted_file = params.get(
+            "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
         )
-
-            "
+        result.compacted_file_content_type = params.get(
+            "compacted_file_content_type", ContentType.PARQUET
         )
-
-
-
+        result.object_store = params.get("object_store", RayPlasmaObjectStore())
+        result.enable_profiler = params.get("enable_profiler", False)
+        result.deltacat_storage = params.get(
+            "deltacat_storage", unimplemented_deltacat_storage
         )
-
-
+        result.s3_client_kwargs = params.get("s3_client_kwargs", {})
+        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
+        result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
+        result.s3_table_writer_kwargs = params.get("s3_table_writer_kwargs", {})
+        result.bit_width_of_sort_keys = validate_sort_keys(
+            result.source_partition_locator,
+            result.sort_keys,
+            result.deltacat_storage,
+            result.deltacat_storage_kwargs,
         )
-
-
-        compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
-            "compaction_artifact_s3_bucket"
+        result.task_max_parallelism = params.get(
+            "task_max_parallelism", TASK_MAX_PARALLELISM
         )
-
-
-            "
+        result.min_files_in_batch = params.get("min_files_in_batch", MIN_FILES_IN_BATCH)
+        result.min_delta_bytes_in_batch = params.get(
+            "min_delta_bytes_in_batch", MIN_DELTA_BYTES_IN_BATCH
         )
-
-            "
+        result.previous_inflation = params.get(
+            "previous_inflation", PYARROW_INFLATION_MULTIPLIER
         )
-
-
-            "read_kwargs_provider"
+        result.average_record_size_bytes = params.get(
+            "average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
         )
-
-            "
+        result.hash_group_count = params.get(
+            "hash_group_count", result.hash_bucket_count
         )
-
+
+        if not importlib.util.find_spec("memray"):
+            result.enable_profiler = False
+
+        if result.primary_keys:
+            result.primary_keys = sorted(result.primary_keys)
+
+        # assertions
+        assert (
+            result.source_partition_locator.partition_values
+            == result.destination_partition_locator.partition_values
+        ), "Source and destination partitions values must be equal"
+
+        assert (
+            result.records_per_compacted_file and result.records_per_compacted_file >= 1
+        ), "Max records per output file must be a positive value"
+
+        return result

     @property
-    def destination_partition_locator(self) ->
-
+    def destination_partition_locator(self) -> PartitionLocator:
+        val = self["destination_partition_locator"]
+        if not isinstance(val, PartitionLocator):
+            val = PartitionLocator(val)
+
+        return val
+
+    @destination_partition_locator.setter
+    def destination_partition_locator(self, locator: PartitionLocator) -> None:
+        self["destination_partition_locator"] = locator

     @property
-    def last_stream_position_to_compact(self) ->
+    def last_stream_position_to_compact(self) -> int:
         return self["last_stream_position_to_compact"]

+    @last_stream_position_to_compact.setter
+    def last_stream_position_to_compact(self, stream_position: int) -> None:
+        self["last_stream_position_to_compact"] = stream_position
+
     @property
-    def source_partition_locator(self) ->
-
+    def source_partition_locator(self) -> PartitionLocator:
+        val = self["source_partition_locator"]
+        if not isinstance(val, PartitionLocator):
+            val = PartitionLocator(val)
+        return val
+
+    @source_partition_locator.setter
+    def source_partition_locator(self, locator: PartitionLocator) -> None:
+        self["source_partition_locator"] = locator

     @property
-    def
-        return
+    def compaction_artifact_s3_bucket(self) -> str:
+        return self["compaction_artifact_s3_bucket"]
+
+    @compaction_artifact_s3_bucket.setter
+    def compaction_artifact_s3_bucket(self, s3_bucket: str) -> None:
+        self["compaction_artifact_s3_bucket"] = s3_bucket

     @property
-    def
-        return self["
+    def deltacat_storage(self) -> unimplemented_deltacat_storage:
+        return self["deltacat_storage"]
+
+    @deltacat_storage.setter
+    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
+        self["deltacat_storage"] = storage

     @property
-    def
-        return self["
+    def object_store(self) -> IObjectStore:
+        return self["object_store"]
+
+    @object_store.setter
+    def object_store(self, obj_store: IObjectStore) -> None:
+        self["object_store"] = obj_store

     @property
-    def
-        return self["
+    def compacted_file_content_type(self) -> ContentType:
+        return self["compacted_file_content_type"]
+
+    @compacted_file_content_type.setter
+    def compacted_file_content_type(self, content_type: ContentType) -> None:
+        self["compacted_file_content_type"] = content_type

     @property
-    def
-        return self["
+    def task_max_parallelism(self) -> int:
+        return self["task_max_parallelism"]
+
+    @task_max_parallelism.setter
+    def task_max_parallelism(self, max_parallelism: int) -> None:
+        self["task_max_parallelism"] = max_parallelism

     @property
-    def
-        return self["
+    def average_record_size_bytes(self) -> float:
+        return self["average_record_size_bytes"]
+
+    @average_record_size_bytes.setter
+    def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
+        self["average_record_size_bytes"] = average_record_size_bytes

     @property
-    def
-        return self["
+    def min_files_in_batch(self) -> float:
+        return self["min_files_in_batch"]
+
+    @min_files_in_batch.setter
+    def min_files_in_batch(self, min_files_in_batch: float) -> None:
+        self["min_files_in_batch"] = min_files_in_batch

     @property
-    def
-        return self["
+    def min_delta_bytes_in_batch(self) -> float:
+        return self["min_files_in_batch"]
+
+    @min_delta_bytes_in_batch.setter
+    def min_delta_bytes_in_batch(self, min_delta_bytes_in_batch: float) -> None:
+        self["min_delta_bytes_in_batch"] = min_delta_bytes_in_batch

     @property
-    def
-        return self["
+    def previous_inflation(self) -> float:
+        return self["previous_inflation"]
+
+    @previous_inflation.setter
+    def previous_inflation(self, previous_inflation: float) -> None:
+        self["previous_inflation"] = previous_inflation

     @property
-    def
-        return self["
+    def enable_profiler(self) -> bool:
+        return self["enable_profiler"]
+
+    @enable_profiler.setter
+    def enable_profiler(self, value: bool) -> None:
+        self["enable_profiler"] = value

     @property
-    def
-        return self["
+    def list_deltas_kwargs(self) -> dict:
+        return self["list_deltas_kwargs"]
+
+    @list_deltas_kwargs.setter
+    def list_deltas_kwargs(self, kwargs: dict) -> None:
+        self["list_deltas_kwargs"] = kwargs

     @property
-    def s3_table_writer_kwargs(self) ->
+    def s3_table_writer_kwargs(self) -> dict:
         return self["s3_table_writer_kwargs"]

+    @s3_table_writer_kwargs.setter
+    def s3_table_writer_kwargs(self, kwargs: dict) -> None:
+        self["s3_table_writer_kwargs"] = kwargs
+
+    @property
+    def deltacat_storage_kwargs(self) -> dict:
+        return self["deltacat_storage_kwargs"]
+
+    @deltacat_storage_kwargs.setter
+    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
+        self["deltacat_storage_kwargs"] = kwargs
+
+    @property
+    def s3_client_kwargs(self) -> dict:
+        return self["s3_client_kwargs"]
+
+    @s3_client_kwargs.setter
+    def s3_client_kwargs(self, kwargs: dict) -> None:
+        self["s3_client_kwargs"] = kwargs
+
+    @property
+    def records_per_compacted_file(self) -> int:
+        return self["records_per_compacted_file"]
+
+    @records_per_compacted_file.setter
+    def records_per_compacted_file(self, count: int) -> None:
+        self["records_per_compacted_file"] = count
+
+    @property
+    def bit_width_of_sort_keys(self) -> int:
+        return self["bit_width_of_sort_keys"]
+
+    @bit_width_of_sort_keys.setter
+    def bit_width_of_sort_keys(self, width: int) -> None:
+        self["bit_width_of_sort_keys"] = width
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self.get("hash_bucket_count")
+
+    @hash_bucket_count.setter
+    def hash_bucket_count(self, count: int) -> None:
+        self["hash_bucket_count"] = count
+
+    @property
+    def hash_group_count(self) -> int:
+        return self["hash_group_count"]
+
+    @hash_group_count.setter
+    def hash_group_count(self, count: int) -> None:
+        self["hash_group_count"] = count
+
+    @property
+    def primary_keys(self) -> Optional[List[str]]:
+        return self.get("primary_keys")
+
+    @primary_keys.setter
+    def primary_keys(self, keys: List[str]) -> None:
+        self["primary_keys"] = keys
+
+    @property
+    def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
+        val = self.get("rebase_source_partition_locator")
+
+        if val and not isinstance(val, PartitionLocator):
+            val = PartitionLocator(val)
+
+        return val
+
+    @rebase_source_partition_locator.setter
+    def rebase_source_partition_locator(self, locator: PartitionLocator) -> None:
+        self["rebase_source_partition_locator"] = locator
+
+    @property
+    def rebase_source_partition_high_watermark(self) -> Optional[int]:
+        return self.get("rebase_source_partition_high_watermark")
+
+    @rebase_source_partition_high_watermark.setter
+    def rebase_source_partition_high_watermark(self, high_watermark: int) -> None:
+        self["rebase_source_partition_high_watermark"] = high_watermark
+
+    @property
+    def pg_config(self) -> Optional[PlacementGroupConfig]:
+        return self.get("pg_config")
+
+    @pg_config.setter
+    def pg_config(self, config: PlacementGroupConfig) -> None:
+        self["pg_config"] = config
+
+    @property
+    def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+        return self.get("read_kwargs_provider")
+
+    @read_kwargs_provider.setter
+    def read_kwargs_provider(self, kwargs_provider: ReadKwargsProvider) -> None:
+        self["read_kwargs_provider"] = kwargs_provider
+
+    @property
+    def sort_keys(self) -> Optional[List[SortKey]]:
+        return self.get("sort_keys")
+
+    @sort_keys.setter
+    def sort_keys(self, keys: List[SortKey]) -> None:
+        self["sort_keys"] = keys
+
+    @property
+    def metrics_config(self) -> Optional[MetricsConfig]:
+        return self.get("metrics_config")
+
+    @metrics_config.setter
+    def metrics_config(self, config: MetricsConfig) -> None:
+        self["metrics_config"] = config
+
     @staticmethod
     def json_handler_for_compact_partition_params(obj):
         """
```
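For orientation, `CompactPartitionParams.of()` now asserts its required inputs up front and backfills compactor_v2 defaults, instead of copying raw keys through one by one. Below is a minimal sketch of the new calling convention; the locator dicts, stream position, and bucket name are hypothetical placeholders, and it assumes that with no `sort_keys` supplied, `validate_sort_keys` returns without consulting the (unimplemented by default) `deltacat_storage` module:

```python
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)

# Hypothetical locators: in practice these are PartitionLocator instances (or
# their serialized dict form); the new property getters coerce dicts on read.
source_locator = {"streamLocator": None, "partitionValues": None, "partitionId": None}
destination_locator = dict(source_locator)

params = CompactPartitionParams.of(
    {
        # of() asserts that these four keys are non-None:
        "destination_partition_locator": destination_locator,
        "last_stream_position_to_compact": 1690000000000,  # hypothetical position
        "source_partition_locator": source_locator,
        "compaction_artifact_s3_bucket": "my-artifact-bucket",  # hypothetical
        # Optional keys fall back to compactor_v2 defaults, e.g.
        # records_per_compacted_file -> MAX_RECORDS_PER_COMPACTED_FILE.
        "primary_keys": ["pk"],
    }
)

assert params.records_per_compacted_file >= 1  # also enforced inside of()
```

One wrinkle visible in the hunk above: the `min_delta_bytes_in_batch` getter reads back `self["min_files_in_batch"]` in this release, so that property does not round-trip with its setter.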
```diff
--- a/deltacat/compute/compactor/model/compaction_session_audit_info.py
+++ b/deltacat/compute/compactor/model/compaction_session_audit_info.py
@@ -18,6 +18,7 @@ class CompactionSessionAuditInfo(dict):
     DEDUPE_STEP_NAME = "dedupe"
     MATERIALIZE_STEP_NAME = "materialize"
     HASH_BUCKET_STEP_NAME = "hashBucket"
+    MERGE_STEP_NAME = "merge"

     def __init__(self, deltacat_version: str, audit_url: str):
         self.set_deltacat_version(deltacat_version)
@@ -52,7 +53,7 @@ class CompactionSessionAuditInfo(dict):
     @property
     def uniform_deltas_created(self) -> int:
         """
-        The total number of
+        The total number of uniform deltas fed into the hash bucket step.
         """
         return self.get("uniformDeltasCreated")

@@ -68,7 +69,7 @@ class CompactionSessionAuditInfo(dict):
     @property
     def input_size_bytes(self) -> float:
         """
-        The on-disk size in bytes of the input.
+        The on-disk size in bytes of the input. Analogous to bytes scanned
         """
         return self.get("inputSizeBytes")

@@ -142,6 +143,15 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializeTaskPeakMemoryUsedBytes")

+    @property
+    def peak_memory_used_bytes_per_merge_task(self) -> float:
+        """
+        The peak memory used by a single merge python process. Note
+        that results may be max of merge, and hash bucketing as
+        processes are reused by Ray to run all compaction steps.
+        """
+        return self.get("mergeTaskPeakMemoryUsedBytes")
+
     @property
     def hash_bucket_post_object_store_memory_used_bytes(self) -> float:
         """
@@ -164,6 +174,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializePostObjectStoreMemoryUsedBytes")

+    @property
+    def merge_post_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used after merge step.
+        """
+        return self.get("mergePostObjectStoreMemoryUsedBytes")
+
     @property
     def materialize_buckets(self) -> int:
         """
@@ -233,11 +250,33 @@ class CompactionSessionAuditInfo(dict):
     @property
     def materialize_result_wait_time_in_seconds(self) -> float:
         """
-        The time it takes ray.get() to resolve after the last
+        The time it takes ray.get() to resolve after the last materialize task has completed.
         This value may not be accurate at less than 1 second precision.
         """
         return self.get("materializeResultWaitTimeInSeconds")

+    @property
+    def merge_result_wait_time_in_seconds(self) -> float:
+        """
+        The time it takes ray.get() to resolve after the last task has completed.
+        This value may not be accurate at less than 1 second precision.
+        """
+        return self.get("mergeResultWaitTimeInSeconds")
+
+    @property
+    def merge_time_in_seconds(self) -> float:
+        """
+        The time taken by merge step. This includes all merge tasks.
+        """
+        return self.get("mergeTimeInSeconds")
+
+    @property
+    def merge_invoke_time_in_seconds(self) -> float:
+        """
+        The time taken to invoke all merge tasks.
+        """
+        return self.get("mergeInvokeTimeInSeconds")
+
     @property
     def delta_discovery_time_in_seconds(self) -> float:
         """
@@ -337,6 +376,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializeResultSize")

+    @property
+    def merge_result_size(self) -> float:
+        """
+        The size of the results returned by merge step.
+        """
+        return self.get("mergeResultSize")
+
     @property
     def peak_memory_used_bytes_by_compaction_session_process(self) -> float:
         """
@@ -344,6 +390,35 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("peakMemoryUsedBytesCompactionSessionProcess")

+    @property
+    def estimated_in_memory_size_bytes_during_discovery(self) -> float:
+        """
+        The estimated in-memory size during the discovery. This can be used
+        to determine the accuracy of memory estimation logic.
+        """
+        return self.get("estimatedInMemorySizeBytesDuringDiscovery")
+
+    @property
+    def hash_bucket_processed_size_bytes(self) -> int:
+        """
+        The total size of the input data processed during hash bucket
+        """
+        return self.get("hashBucketProcessedSizeBytes")
+
+    @property
+    def total_cpu_seconds(self) -> float:
+        """
+        Total number of vCPUs provisioned in the cluster weighted over time.
+        """
+        return self.get("totalCPUSeconds")
+
+    @property
+    def used_cpu_seconds(self) -> float:
+        """
+        Total used vCPU in the cluster weighted over time.
+        """
+        return self.get("usedCPUSeconds")
+
     # Setters follow

     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -428,6 +503,12 @@ class CompactionSessionAuditInfo(dict):
         ] = peak_memory_used_bytes_per_materialize_task
         return self

+    def set_peak_memory_used_bytes_per_merge_task(
+        self, peak_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeTaskPeakMemoryUsedBytes"] = peak_memory_used_bytes
+        return self
+
     def set_hash_bucket_post_object_store_memory_used_bytes(
         self, object_store_memory_used_bytes_by_hb: float
     ) -> CompactionSessionAuditInfo:
@@ -452,6 +533,12 @@ class CompactionSessionAuditInfo(dict):
         ] = object_store_memory_used_bytes_by_dedupe
         return self

+    def set_merge_post_object_store_memory_used_bytes(
+        self, object_store_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergePostObjectStoreMemoryUsedBytes"] = object_store_memory_used_bytes
+        return self
+
     def set_materialize_buckets(
         self, materialize_buckets: int
     ) -> CompactionSessionAuditInfo:
@@ -512,6 +599,24 @@ class CompactionSessionAuditInfo(dict):
         self.get["materializeResultWaitTimeInSeconds"] = wait_time
         return self

+    def set_merge_time_in_seconds(
+        self, time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeTimeInSeconds"] = time_in_seconds
+        return self
+
+    def set_merge_invoke_time_in_seconds(
+        self, invoke_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeInvokeTimeInSeconds"] = invoke_time
+        return self
+
+    def set_merge_result_wait_time_in_seconds(
+        self, wait_time: float
+    ) -> CompactionSessionAuditInfo:
+        self.get["mergeResultWaitTimeInSeconds"] = wait_time
+        return self
+
     def set_delta_discovery_time_in_seconds(
         self, delta_discovery_time_in_seconds: float
     ) -> CompactionSessionAuditInfo:
@@ -598,12 +703,38 @@ class CompactionSessionAuditInfo(dict):
         self["materializeResultSize"] = materialize_result_size_bytes
         return self

+    def set_merge_result_size_bytes(
+        self, merge_result_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeResultSize"] = merge_result_size_bytes
+        return self
+
     def set_peak_memory_used_bytes_by_compaction_session_process(
         self, peak_memory: float
     ) -> CompactionSessionAuditInfo:
         self["peakMemoryUsedBytesCompactionSessionProcess"] = peak_memory
         return self

+    def set_estimated_in_memory_size_bytes_during_discovery(
+        self, memory: float
+    ) -> CompactionSessionAuditInfo:
+        self["estimatedInMemorySizeBytesDuringDiscovery"] = memory
+        return self
+
+    def set_hash_bucket_processed_size_bytes(
+        self, size: int
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketProcessedSizeBytes"] = size
+        return self
+
+    def set_total_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["totalCPUSeconds"] = value
+        return self
+
+    def set_used_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["usedCPUSeconds"] = value
+        return self
+
     # High level methods to save stats
     def save_step_stats(
         self,
@@ -673,7 +804,10 @@ class CompactionSessionAuditInfo(dict):
         )

         total_count_of_src_dfl_not_touched = sum(
-            m.referenced_pyarrow_write_result.files
+            m.referenced_pyarrow_write_result.files
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )

         logger.info(
@@ -697,10 +831,16 @@ class CompactionSessionAuditInfo(dict):
         )

         untouched_file_record_count = sum(
-            m.referenced_pyarrow_write_result.records
+            m.referenced_pyarrow_write_result.records
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )
         untouched_file_size_bytes = sum(
-            m.referenced_pyarrow_write_result.file_bytes
+            m.referenced_pyarrow_write_result.file_bytes
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )

         self.set_untouched_file_count(total_count_of_src_dfl_not_touched)
@@ -715,9 +855,10 @@ class CompactionSessionAuditInfo(dict):
         self.set_peak_memory_used_bytes_per_task(
             max(
                 [
-                    self.peak_memory_used_bytes_per_hash_bucket_task,
-                    self.peak_memory_used_bytes_per_dedupe_task,
-                    self.peak_memory_used_bytes_per_materialize_task,
+                    self.peak_memory_used_bytes_per_hash_bucket_task or 0,
+                    self.peak_memory_used_bytes_per_dedupe_task or 0,
+                    self.peak_memory_used_bytes_per_materialize_task or 0,
+                    self.peak_memory_used_bytes_per_merge_task or 0,
                 ]
             )
         )
```