deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +184 -29
- deltacat/compute/compactor/model/compact_partition_params.py +153 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
- deltacat/compute/compactor/model/dedupe_result.py +3 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
- deltacat/compute/compactor/model/delta_file_locator.py +11 -6
- deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
- deltacat/compute/compactor/model/materialize_result.py +27 -6
- deltacat/compute/compactor/model/round_completion_info.py +9 -0
- deltacat/compute/compactor/steps/dedupe.py +35 -19
- deltacat/compute/compactor/steps/hash_bucket.py +41 -16
- deltacat/compute/compactor/steps/materialize.py +73 -70
- deltacat/compute/compactor/utils/io.py +15 -0
- deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat/compute/compactor/utils/round_completion_file.py +13 -4
- deltacat/compute/compactor/utils/system_columns.py +32 -0
- deltacat/io/__init__.py +0 -7
- deltacat/io/file_object_store.py +48 -0
- deltacat/io/memcached_object_store.py +121 -0
- deltacat/io/object_store.py +51 -0
- deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat/io/redis_object_store.py +114 -0
- deltacat/io/s3_object_store.py +44 -0
- deltacat/storage/model/delta.py +2 -1
- deltacat/tests/compactor/test_compact_partition_params.py +237 -0
- deltacat/tests/compactor/utils/test_io.py +27 -5
- deltacat/tests/io/__init__.py +0 -0
- deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat/tests/io/test_s3_object_store.py +59 -0
- deltacat/tests/utils/test_record_batch_tables.py +1 -1
- deltacat/tests/utils/test_resources.py +9 -0
- deltacat/utils/ray_utils/concurrency.py +0 -2
- deltacat/utils/resources.py +30 -18
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/model/compaction_session_audit_info.py
@@ -0,0 +1,725 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+import logging
+from deltacat import logs
+from typing import List, Union
+from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.compute.compactor.model.dedupe_result import DedupeResult
+from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.resources import ClusterUtilization, get_size_of_object_in_bytes
+from deltacat.compute.compactor import PyArrowWriteResult
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class CompactionSessionAuditInfo(dict):
+
+    DEDUPE_STEP_NAME = "dedupe"
+    MATERIALIZE_STEP_NAME = "materialize"
+    HASH_BUCKET_STEP_NAME = "hashBucket"
+
+    def __init__(self, deltacat_version: str, audit_url: str):
+        self.set_deltacat_version(deltacat_version)
+        self.set_audit_url(audit_url)
+
+    @property
+    def audit_url(self) -> str:
+        return self.get("auditUrl")
+
+    @property
+    def deltacat_version(self) -> str:
+        """
+        The deltacat version used to run the compaction job.
+        """
+        return self.get("deltacatVersion")
+
+    @property
+    def input_records(self) -> int:
+        """
+        The total number of records from input deltas that need to be compacted
+        (before deduplication).
+        """
+        return self.get("inputRecords")
+
+    @property
+    def input_file_count(self) -> int:
+        """
+        The total number of input files that need to be compacted.
+        """
+        return self.get("inputFileCount")
+
+    @property
+    def uniform_deltas_created(self) -> int:
+        """
+        The total number of uniform deltas fed into the hash bucket step.
+        """
+        return self.get("uniformDeltasCreated")
+
+    @property
+    def records_deduped(self) -> int:
+        """
+        The total number of records that were deduplicated. For example,
+        if there are 100 records with a particular primary key, 99 records
+        will be deduplicated.
+        """
+        return self.get("recordsDeduped")
+
+    @property
+    def input_size_bytes(self) -> float:
+        """
+        The on-disk size in bytes of the input.
+        """
+        return self.get("inputSizeBytes")
+
+    @property
+    def hash_bucket_count(self) -> int:
+        """
+        Total number of hash buckets used during compaction.
+        """
+        return self.get("hashBucketCount")
+
+    @property
+    def cluster_cpu_max(self) -> float:
+        """
+        Total cluster CPU allocated for the compaction job. For an autoscaling
+        cluster, the maximum CPU at any point in time is reported.
+        """
+        return self.get("clusterCpuMax")
+
+    @property
+    def compaction_time_in_seconds(self) -> float:
+        """
+        The total time taken by the compaction session to complete.
+        """
+        return self.get("compactionTimeInSeconds")
+
+    @property
+    def total_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used by the compaction session across all
+        nodes in the entire cluster.
+        """
+        return self.get("totalObjectStoreMemoryUsedBytes")
+
+    @property
+    def peak_memory_used_bytes_per_task(self) -> float:
+        """
+        The peak memory used by a single process in the compaction job. Note that
+        Ray creates a single process to run each hash bucketing, dedupe and
+        materialize task and the process is reused. Hence, you may see
+        monotonically increasing values. Peak memory is important because
+        the cluster must be scaled to handle the peak memory per node even
+        though average memory usage is low.
+        """
+        return self.get("peakMemoryUsedBytesPerTask")
+
+    @property
+    def peak_memory_used_bytes_per_hash_bucket_task(self) -> float:
+        """
+        The peak memory used by a single hash bucketing process. For example,
+        if the peak usage of a hash bucketing process is 40GB, it is not safe to
+        run more than 3 hash bucketing tasks on a node with 120GB of memory, to
+        avoid crashing due to memory overflow.
+        """
+        return self.get("hashBucketTaskPeakMemoryUsedBytes")
+
+    @property
+    def peak_memory_used_bytes_per_dedupe_task(self) -> float:
+        """
+        The peak memory used by a single dedupe Python process. Note that
+        this may reflect the max of dedupe and hash bucketing, as processes
+        are reused by Ray to run both dedupe and hash bucketing.
+        """
+        return self.get("dedupeTaskPeakMemoryUsedBytes")
+
+    @property
+    def peak_memory_used_bytes_per_materialize_task(self) -> float:
+        """
+        The peak memory used by a single materialize Python process. Note
+        that this may reflect the max of materialize, dedupe and hash bucketing,
+        as processes are reused by Ray to run all compaction steps.
+        """
+        return self.get("materializeTaskPeakMemoryUsedBytes")
+
+    @property
+    def hash_bucket_post_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used by the hash bucketing step across the
+        cluster, before dedupe is run.
+        """
+        return self.get("hashBucketPostObjectStoreMemoryUsedBytes")
+
+    @property
+    def dedupe_post_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used after the dedupe step, before materialize is run.
+        """
+        return self.get("dedupePostObjectStoreMemoryUsedBytes")
+
+    @property
+    def materialize_post_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used after the materialize step.
+        """
+        return self.get("materializePostObjectStoreMemoryUsedBytes")
+
+    @property
+    def materialize_buckets(self) -> int:
+        """
+        The total number of materialize buckets created.
+        """
+        return self.get("materializeBuckets")
+
+    @property
+    def hash_bucket_time_in_seconds(self) -> float:
+        """
+        The time taken by the hash bucketing step. This includes all hash bucket
+        tasks as well as the invoke time.
+        """
+        return self.get("hashBucketTimeInSeconds")
+
+    @property
+    def hash_bucket_invoke_time_in_seconds(self) -> float:
+        """
+        The time taken to invoke and create all hash bucketing tasks.
+        """
+        return self.get("hashBucketInvokeTimeInSeconds")
+
+    @property
+    def hash_bucket_result_wait_time_in_seconds(self) -> float:
+        """
+        The time it takes ray.get() to resolve after the last hash bucket task has completed.
+        This value may not be accurate at less than 1 second precision.
+        """
+        return self.get("hashBucketResultWaitTimeInSeconds")
+
+    @property
+    def dedupe_time_in_seconds(self) -> float:
+        """
+        The time taken by the dedupe step. This includes all dedupe tasks.
+        """
+        return self.get("dedupeTimeInSeconds")
+
+    @property
+    def dedupe_invoke_time_in_seconds(self) -> float:
+        """
+        The time taken to invoke all dedupe tasks.
+        """
+        return self.get("dedupeInvokeTimeInSeconds")
+
+    @property
+    def dedupe_result_wait_time_in_seconds(self) -> float:
+        """
+        The time it takes ray.get() to resolve after the last dedupe task has completed.
+        This value may not be accurate at less than 1 second precision.
+        """
+        return self.get("dedupeResultWaitTimeInSeconds")
+
+    @property
+    def materialize_time_in_seconds(self) -> float:
+        """
+        The time taken by the materialize step. This includes all materialize tasks.
+        """
+        return self.get("materializeTimeInSeconds")
+
+    @property
+    def materialize_invoke_time_in_seconds(self) -> float:
+        """
+        The time taken to invoke all materialize tasks.
+        """
+        return self.get("materializeInvokeTimeInSeconds")
+
+    @property
+    def materialize_result_wait_time_in_seconds(self) -> float:
+        """
+        The time it takes ray.get() to resolve after the last materialize task has completed.
+        This value may not be accurate at less than 1 second precision.
+        """
+        return self.get("materializeResultWaitTimeInSeconds")
+
+    @property
+    def delta_discovery_time_in_seconds(self) -> float:
+        """
+        The time taken by the delta discovery step, which mostly runs before hash bucketing starts.
+        """
+        return self.get("deltaDiscoveryTimeInSeconds")
+
+    @property
+    def output_file_count(self) -> int:
+        """
+        The total number of files in the compacted output (includes untouched files).
+        """
+        return self.get("outputFileCount")
+
+    @property
+    def output_size_bytes(self) -> float:
+        """
+        The on-disk size of the compacted output, including any untouched files.
+        """
+        return self.get("outputSizeBytes")
+
+    @property
+    def output_size_pyarrow_bytes(self) -> float:
+        """
+        The pyarrow in-memory size of the compacted output, including any untouched files.
+        """
+        return self.get("outputSizePyarrowBytes")
+
+    @property
+    def total_cluster_memory_bytes(self) -> float:
+        """
+        The total memory allocated to the cluster.
+        """
+        return self.get("totalClusterMemoryBytes")
+
+    @property
+    def total_cluster_object_store_memory_bytes(self) -> float:
+        """
+        The total object store memory allocated to the cluster.
+        """
+        return self.get("totalClusterObjectStoreMemoryBytes")
+
+    @property
+    def untouched_file_count(self) -> int:
+        """
+        The total number of files that were untouched by the materialize step.
+        """
+        return self.get("untouchedFileCount")
+
+    @property
+    def untouched_file_ratio(self) -> float:
+        """
+        The ratio of untouched files to the total number of files in the compacted output.
+        """
+        return self.get("untouchedFileRatio")
+
+    @property
+    def untouched_record_count(self) -> int:
+        """
+        The total number of records untouched during materialization.
+        """
+        return self.get("untouchedRecordCount")
+
+    @property
+    def untouched_size_bytes(self) -> float:
+        """
+        The on-disk size of the data untouched during materialization.
+        """
+        return self.get("untouchedSizeBytes")
+
+    @property
+    def telemetry_time_in_seconds(self) -> float:
+        """
+        The total time taken by all telemetry activity across the nodes in the cluster. This includes
+        collecting cluster resource information, emitting metrics, etc.
+        """
+        return self.get("telemetryTimeInSeconds")
+
+    @property
+    def hash_bucket_result_size_bytes(self) -> float:
+        """
+        The size of the results returned by the hash bucket step.
+        """
+        return self.get("hashBucketResultSize")
+
+    @property
+    def dedupe_result_size_bytes(self) -> float:
+        """
+        The size of the results returned by the dedupe step.
+        """
+        return self.get("dedupeResultSize")
+
+    @property
+    def materialize_result_size(self) -> float:
+        """
+        The size of the results returned by the materialize step.
+        """
+        return self.get("materializeResultSize")
+
+    @property
+    def peak_memory_used_bytes_by_compaction_session_process(self) -> float:
+        """
+        The peak memory used by the compaction_session entrypoint process.
+        """
+        return self.get("peakMemoryUsedBytesCompactionSessionProcess")
+
+    # Setters follow
+
+    def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
+        self["auditUrl"] = audit_url
+        return self
+
+    def set_deltacat_version(self, version: str) -> CompactionSessionAuditInfo:
+        self["deltacatVersion"] = version
+        return self
+
+    def set_input_records(self, input_records: int) -> CompactionSessionAuditInfo:
+        self["inputRecords"] = input_records
+        return self
+
+    def set_input_file_count(self, input_file_count: int) -> CompactionSessionAuditInfo:
+        self["inputFileCount"] = input_file_count
+        return self
+
+    def set_uniform_deltas_created(
+        self, uniform_deltas_created: int
+    ) -> CompactionSessionAuditInfo:
+        self["uniformDeltasCreated"] = uniform_deltas_created
+        return self
+
+    def set_records_deduped(self, records_deduped: int) -> CompactionSessionAuditInfo:
+        self["recordsDeduped"] = records_deduped
+        return self
+
+    def set_input_size_bytes(
+        self, input_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["inputSizeBytes"] = input_size_bytes
+        return self
+
+    def set_hash_bucket_count(
+        self, hash_bucket_count: int
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketCount"] = hash_bucket_count
+        return self
+
+    def set_cluster_cpu_max(self, cluster_cpu_max: float) -> CompactionSessionAuditInfo:
+        self["clusterCpuMax"] = cluster_cpu_max
+        return self
+
+    def set_compaction_time_in_seconds(
+        self, compaction_time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["compactionTimeInSeconds"] = compaction_time_in_seconds
+        return self
+
+    def set_total_object_store_memory_used_bytes(
+        self, total_object_store_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["totalObjectStoreMemoryUsedBytes"] = total_object_store_memory_used_bytes
+        return self
+
+    def set_peak_memory_used_bytes_per_task(
+        self, peak_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["peakMemoryUsedBytesPerTask"] = peak_memory_used_bytes
+        return self
+
+    def set_peak_memory_used_bytes_per_hash_bucket_task(
+        self, peak_memory_used_bytes_per_hash_bucket_task: float
+    ) -> CompactionSessionAuditInfo:
+        self[
+            "hashBucketTaskPeakMemoryUsedBytes"
+        ] = peak_memory_used_bytes_per_hash_bucket_task
+        return self
+
+    def set_peak_memory_used_bytes_per_dedupe_task(
+        self, peak_memory_used_bytes_per_dedupe_task: float
+    ) -> CompactionSessionAuditInfo:
+        self["dedupeTaskPeakMemoryUsedBytes"] = peak_memory_used_bytes_per_dedupe_task
+        return self
+
+    def set_peak_memory_used_bytes_per_materialize_task(
+        self, peak_memory_used_bytes_per_materialize_task: float
+    ) -> CompactionSessionAuditInfo:
+        self[
+            "materializeTaskPeakMemoryUsedBytes"
+        ] = peak_memory_used_bytes_per_materialize_task
+        return self
+
+    def set_hash_bucket_post_object_store_memory_used_bytes(
+        self, object_store_memory_used_bytes_by_hb: float
+    ) -> CompactionSessionAuditInfo:
+        self[
+            "hashBucketPostObjectStoreMemoryUsedBytes"
+        ] = object_store_memory_used_bytes_by_hb
+        return self
+
+    def set_dedupe_post_object_store_memory_used_bytes(
+        self, object_store_memory_used_bytes_by_dedupe: float
+    ) -> CompactionSessionAuditInfo:
+        self[
+            "dedupePostObjectStoreMemoryUsedBytes"
+        ] = object_store_memory_used_bytes_by_dedupe
+        return self
+
+    def set_materialize_post_object_store_memory_used_bytes(
+        self, object_store_memory_used_bytes_by_materialize: float
+    ) -> CompactionSessionAuditInfo:
+        self[
+            "materializePostObjectStoreMemoryUsedBytes"
+        ] = object_store_memory_used_bytes_by_materialize
+        return self
+
+    def set_materialize_buckets(
+        self, materialize_buckets: int
+    ) -> CompactionSessionAuditInfo:
+        self["materializeBuckets"] = materialize_buckets
+        return self
+
+    def set_hash_bucket_time_in_seconds(
+        self, hash_bucket_time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketTimeInSeconds"] = hash_bucket_time_in_seconds
+        return self
+
+    def set_hash_bucket_invoke_time_in_seconds(
+        self, hash_bucket_invoke_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketInvokeTimeInSeconds"] = hash_bucket_invoke_time
+        return self
+
+    def set_hash_bucket_result_wait_time_in_seconds(
+        self, wait_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketResultWaitTimeInSeconds"] = wait_time
+        return self
+
+    def set_dedupe_time_in_seconds(
+        self, dedupe_time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["dedupeTimeInSeconds"] = dedupe_time_in_seconds
+        return self
+
+    def set_dedupe_invoke_time_in_seconds(
+        self, dedupe_invoke_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["dedupeInvokeTimeInSeconds"] = dedupe_invoke_time
+        return self
+
+    def set_dedupe_result_wait_time_in_seconds(
+        self, wait_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["dedupeResultWaitTimeInSeconds"] = wait_time
+        return self
+
+    def set_materialize_time_in_seconds(
+        self, materialize_time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["materializeTimeInSeconds"] = materialize_time_in_seconds
+        return self
+
+    def set_materialize_invoke_time_in_seconds(
+        self, materialize_invoke_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["materializeInvokeTimeInSeconds"] = materialize_invoke_time
+        return self
+
+    def set_materialize_result_wait_time_in_seconds(
+        self, wait_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["materializeResultWaitTimeInSeconds"] = wait_time
+        return self
+
+    def set_delta_discovery_time_in_seconds(
+        self, delta_discovery_time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["deltaDiscoveryTimeInSeconds"] = delta_discovery_time_in_seconds
+        return self
+
+    def set_output_file_count(
+        self, output_file_count: int
+    ) -> CompactionSessionAuditInfo:
+        self["outputFileCount"] = output_file_count
+        return self
+
+    def set_output_size_bytes(
+        self, output_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["outputSizeBytes"] = output_size_bytes
+        return self
+
+    def set_output_size_pyarrow_bytes(
+        self, output_size_pyarrow_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["outputSizePyarrowBytes"] = output_size_pyarrow_bytes
+        return self
+
+    def set_total_cluster_memory_bytes(
+        self, total_cluster_memory_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["totalClusterMemoryBytes"] = total_cluster_memory_bytes
+        return self
+
+    def set_total_cluster_object_store_memory_bytes(
+        self, total_cluster_object_store_memory_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self[
+            "totalClusterObjectStoreMemoryBytes"
+        ] = total_cluster_object_store_memory_bytes
+        return self
+
+    def set_untouched_file_count(
+        self, untouched_file_count: int
+    ) -> CompactionSessionAuditInfo:
+        self["untouchedFileCount"] = untouched_file_count
+        return self
+
+    def set_untouched_file_ratio(
+        self, untouched_file_ratio: float
+    ) -> CompactionSessionAuditInfo:
+        self["untouchedFileRatio"] = untouched_file_ratio
+        return self
+
+    def set_untouched_record_count(
+        self, untouched_record_count: int
+    ) -> CompactionSessionAuditInfo:
+        self["untouchedRecordCount"] = untouched_record_count
+        return self
+
+    def set_untouched_size_bytes(
+        self, untouched_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["untouchedSizeBytes"] = untouched_size_bytes
+        return self
+
+    def set_telemetry_time_in_seconds(
+        self, telemetry_time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["telemetryTimeInSeconds"] = telemetry_time_in_seconds
+        return self
+
+    def set_hash_bucket_result_size_bytes(
+        self, hash_bucket_result_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketResultSize"] = hash_bucket_result_size_bytes
+        return self
+
+    def set_dedupe_result_size_bytes(
+        self, dedupe_result_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["dedupeResultSize"] = dedupe_result_size_bytes
+        return self
+
+    def set_materialize_result_size_bytes(
+        self, materialize_result_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["materializeResultSize"] = materialize_result_size_bytes
+        return self
+
+    def set_peak_memory_used_bytes_by_compaction_session_process(
+        self, peak_memory: float
+    ) -> CompactionSessionAuditInfo:
+        self["peakMemoryUsedBytesCompactionSessionProcess"] = peak_memory
+        return self
+
+    # High level methods to save stats
+    def save_step_stats(
+        self,
+        step_name: str,
+        task_results: Union[
+            List[HashBucketResult], List[DedupeResult], List[MaterializeResult]
+        ],
+        task_results_retrieved_at: float,
+        invoke_time_in_seconds: float,
+        task_time_in_seconds: float,
+    ) -> float:
+        """
+        Saves the stats by calling individual setters and returns the cluster telemetry time.
+        """
+
+        last_task_completed_at = max(
+            result.task_completed_at for result in task_results
+        )
+
+        self[f"{step_name}ResultWaitTimeInSeconds"] = (
+            task_results_retrieved_at - last_task_completed_at.item()
+        )
+        self[f"{step_name}TimeInSeconds"] = task_time_in_seconds
+        self[f"{step_name}InvokeTimeInSeconds"] = invoke_time_in_seconds
+
+        self[f"{step_name}ResultSize"] = get_size_of_object_in_bytes(task_results)
+
+        (
+            cluster_utilization_after_task,
+            cluster_util_after_task_latency,
+        ) = timed_invocation(ClusterUtilization.get_current_cluster_utilization)
+
+        self.set_total_cluster_object_store_memory_bytes(
+            cluster_utilization_after_task.total_object_store_memory_bytes
+        )
+        self.set_total_cluster_memory_bytes(
+            cluster_utilization_after_task.total_memory_bytes
+        )
+        self.set_total_object_store_memory_used_bytes(
+            cluster_utilization_after_task.used_object_store_memory_bytes
+        )
+
+        self[
+            f"{step_name}PostObjectStoreMemoryUsedBytes"
+        ] = cluster_utilization_after_task.used_object_store_memory_bytes
+
+        peak_task_memory = max(
+            result.peak_memory_usage_bytes for result in task_results
+        )
+
+        telemetry_time = sum(
+            result.telemetry_time_in_seconds for result in task_results
+        )
+
+        self[f"{step_name}TaskPeakMemoryUsedBytes"] = peak_task_memory.item()
+
+        return cluster_util_after_task_latency + telemetry_time
+
+    def save_round_completion_stats(
+        self, mat_results: List[MaterializeResult], total_telemetry_time: float
+    ) -> None:
+        """
+        This method saves all the relevant stats after all the steps are completed.
+        """
+        pyarrow_write_result = PyArrowWriteResult.union(
+            [m.pyarrow_write_result for m in mat_results]
+        )
+
+        total_count_of_src_dfl_not_touched = sum(
+            m.referenced_pyarrow_write_result.files for m in mat_results
+        )
+
+        logger.info(
+            f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
+        )
+        logger.info(
+            f"Got total of {pyarrow_write_result.files} manifest files during compaction."
+        )
+        manifest_entry_copied_by_reference_ratio = (
+            (
+                round(
+                    total_count_of_src_dfl_not_touched / pyarrow_write_result.files, 4
+                )
+                * 100
+            )
+            if pyarrow_write_result.files != 0
+            else None
+        )
+        logger.info(
+            f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
+        )
+
+        untouched_file_record_count = sum(
+            m.referenced_pyarrow_write_result.records for m in mat_results
+        )
+        untouched_file_size_bytes = sum(
+            m.referenced_pyarrow_write_result.file_bytes for m in mat_results
+        )
+
+        self.set_untouched_file_count(total_count_of_src_dfl_not_touched)
+        self.set_untouched_file_ratio(manifest_entry_copied_by_reference_ratio)
+        self.set_untouched_record_count(untouched_file_record_count)
+        self.set_untouched_size_bytes(untouched_file_size_bytes)
+
+        self.set_output_file_count(pyarrow_write_result.files)
+        self.set_output_size_bytes(pyarrow_write_result.file_bytes)
+        self.set_output_size_pyarrow_bytes(pyarrow_write_result.pyarrow_bytes)
+
+        self.set_peak_memory_used_bytes_per_task(
+            max(
+                [
+                    self.peak_memory_used_bytes_per_hash_bucket_task,
+                    self.peak_memory_used_bytes_per_dedupe_task,
+                    self.peak_memory_used_bytes_per_materialize_task,
+                ]
+            )
+        )
+
+        self.set_telemetry_time_in_seconds(total_telemetry_time)
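For orientation, here is a minimal usage sketch of the new audit class shown in the hunk above. It only exercises the constructor, the chained setters, and the getter properties that appear in the diff; the values (version string, audit URL, counts) are hypothetical and the snippet is not taken from the deltacat package itself.

from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

# Hypothetical values, for illustration only.
audit = CompactionSessionAuditInfo(
    deltacat_version="0.1.18b7",
    audit_url="s3://example-bucket/audit/compaction.json",
)

# Setters return self, so they can be chained.
audit.set_input_records(1_000_000).set_input_file_count(120).set_hash_bucket_count(64)

# The class subclasses dict and stores camelCase keys, so values can be read back
# either through the properties or as a plain mapping (e.g. for JSON export).
assert audit.input_records == 1_000_000
assert audit["inputFileCount"] == 120

Per the code above, save_step_stats expects each task result to expose task_completed_at, peak_memory_usage_bytes, and telemetry_time_in_seconds (the first two appear to be numpy scalars, given the .item() calls), and save_round_completion_stats derives the untouched-file ratio as round(untouched / total, 4) * 100, so 30 untouched files out of 120 total would be reported as 25.0.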