deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +184 -29
  3. deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  5. deltacat/compute/compactor/model/dedupe_result.py +3 -0
  6. deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  7. deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  8. deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  9. deltacat/compute/compactor/model/materialize_result.py +27 -6
  10. deltacat/compute/compactor/model/round_completion_info.py +9 -0
  11. deltacat/compute/compactor/steps/dedupe.py +35 -19
  12. deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  13. deltacat/compute/compactor/steps/materialize.py +73 -70
  14. deltacat/compute/compactor/utils/io.py +15 -0
  15. deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  16. deltacat/compute/compactor/utils/round_completion_file.py +13 -4
  17. deltacat/compute/compactor/utils/system_columns.py +32 -0
  18. deltacat/io/__init__.py +0 -7
  19. deltacat/io/file_object_store.py +48 -0
  20. deltacat/io/memcached_object_store.py +121 -0
  21. deltacat/io/object_store.py +51 -0
  22. deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat/io/redis_object_store.py +114 -0
  24. deltacat/io/s3_object_store.py +44 -0
  25. deltacat/storage/model/delta.py +2 -1
  26. deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat/tests/io/__init__.py +0 -0
  29. deltacat/tests/io/test_file_object_store.py +86 -0
  30. deltacat/tests/io/test_memcached_object_store.py +158 -0
  31. deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  32. deltacat/tests/io/test_redis_object_store.py +103 -0
  33. deltacat/tests/io/test_s3_object_store.py +59 -0
  34. deltacat/tests/utils/test_record_batch_tables.py +1 -1
  35. deltacat/tests/utils/test_resources.py +9 -0
  36. deltacat/utils/ray_utils/concurrency.py +0 -2
  37. deltacat/utils/resources.py +30 -18
  38. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
  39. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
  40. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
  41. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
  42. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/model/compaction_session_audit_info.py
@@ -0,0 +1,725 @@
+ # Allow classes to use self-referencing Type hints in Python 3.7.
+ from __future__ import annotations
+ import logging
+ from deltacat import logs
+ from typing import List, Union
+ from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+ from deltacat.compute.compactor.model.dedupe_result import DedupeResult
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.resources import ClusterUtilization, get_size_of_object_in_bytes
+ from deltacat.compute.compactor import PyArrowWriteResult
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class CompactionSessionAuditInfo(dict):
+
+     DEDUPE_STEP_NAME = "dedupe"
+     MATERIALIZE_STEP_NAME = "materialize"
+     HASH_BUCKET_STEP_NAME = "hashBucket"
+
+     def __init__(self, deltacat_version: str, audit_url: str):
+         self.set_deltacat_version(deltacat_version)
+         self.set_audit_url(audit_url)
+
+     @property
+     def audit_url(self) -> str:
+         return self.get("auditUrl")
+
+     @property
+     def deltacat_version(self) -> str:
+         """
+         The deltacat version used to run the compaction job.
+         """
+         return self.get("deltacatVersion")
+
+     @property
+     def input_records(self) -> int:
+         """
+         The total number of records from input deltas that need to be compacted
+         (before deduplication).
+         """
+         return self.get("inputRecords")
+
+     @property
+     def input_file_count(self) -> int:
+         """
+         The total number of input files that need to be compacted.
+         """
+         return self.get("inputFileCount")
+
+     @property
+     def uniform_deltas_created(self) -> int:
+         """
+         The total number of uniform deltas fed into the hash bucket step.
+         """
+         return self.get("uniformDeltasCreated")
+
+     @property
+     def records_deduped(self) -> int:
+         """
+         The total number of records that were deduplicated. For example,
+         if there are 100 records with a particular primary key, 99 records
+         will be deduplicated.
+         """
+         return self.get("recordsDeduped")
+
+     @property
+     def input_size_bytes(self) -> float:
+         """
+         The on-disk size in bytes of the input.
+         """
+         return self.get("inputSizeBytes")
+
+     @property
+     def hash_bucket_count(self) -> int:
+         """
+         Total number of hash buckets used during compaction.
+         """
+         return self.get("hashBucketCount")
+
+     @property
+     def cluster_cpu_max(self) -> float:
+         """
+         Total cluster CPU allocated for the compaction job. For an
+         autoscaling cluster, the maximum CPU at any time is reported.
+         """
+         return self.get("clusterCpuMax")
+
+     @property
+     def compaction_time_in_seconds(self) -> float:
+         """
+         The total time taken by the compaction session to complete.
+         """
+         return self.get("compactionTimeInSeconds")
+
+     @property
+     def total_object_store_memory_used_bytes(self) -> float:
+         """
+         The total object store memory used by the compaction session across all
+         nodes in the entire cluster.
+         """
+         return self.get("totalObjectStoreMemoryUsedBytes")
+
+     @property
+     def peak_memory_used_bytes_per_task(self) -> float:
+         """
+         The peak memory used by a single process in the compaction job. Note that
+         Ray creates a single process to run each hash bucketing, dedupe, and
+         materialize task, and the process is reused. Hence, you may see
+         monotonically increasing values. Peak memory is important because
+         the cluster must be scaled to handle the peak memory per node even
+         though average memory usage is low.
+         """
+         return self.get("peakMemoryUsedBytesPerTask")
+
+     @property
+     def peak_memory_used_bytes_per_hash_bucket_task(self) -> float:
+         """
+         The peak memory used by a single hash bucketing process. For example,
+         if the peak usage of a hash bucketing process is 40GB, it is not safe
+         to run more than 3 hash bucketing tasks on a node with 120GB of memory,
+         as doing so risks crashing due to memory overflow.
+         """
+         return self.get("hashBucketTaskPeakMemoryUsedBytes")
+
+     @property
+     def peak_memory_used_bytes_per_dedupe_task(self) -> float:
+         """
+         The peak memory used by a single dedupe python process. Note that
+         the result may be the max of dedupe and hash bucketing, as processes
+         are reused by Ray to run both steps.
+         """
+         return self.get("dedupeTaskPeakMemoryUsedBytes")
+
+     @property
+     def peak_memory_used_bytes_per_materialize_task(self) -> float:
+         """
+         The peak memory used by a single materialize python process. Note
+         that the result may be the max of materialize, dedupe, and hash
+         bucketing, as processes are reused by Ray to run all compaction steps.
+         """
+         return self.get("materializeTaskPeakMemoryUsedBytes")
+
+     @property
+     def hash_bucket_post_object_store_memory_used_bytes(self) -> float:
+         """
+         The total object store memory used by the hash bucketing step across
+         the cluster, before dedupe is run.
+         """
+         return self.get("hashBucketPostObjectStoreMemoryUsedBytes")
+
+     @property
+     def dedupe_post_object_store_memory_used_bytes(self) -> float:
+         """
+         The total object store memory used after the dedupe step, before
+         materialize is run.
+         """
+         return self.get("dedupePostObjectStoreMemoryUsedBytes")
+
+     @property
+     def materialize_post_object_store_memory_used_bytes(self) -> float:
+         """
+         The total object store memory used after the materialize step.
+         """
+         return self.get("materializePostObjectStoreMemoryUsedBytes")
+
+     @property
+     def materialize_buckets(self) -> int:
+         """
+         The total number of materialize buckets created.
+         """
+         return self.get("materializeBuckets")
+
+     @property
+     def hash_bucket_time_in_seconds(self) -> float:
+         """
+         The time taken by the hash bucketing step. This includes all hash
+         bucket tasks as well as invoke time.
+         """
+         return self.get("hashBucketTimeInSeconds")
+
+     @property
+     def hash_bucket_invoke_time_in_seconds(self) -> float:
+         """
+         The time taken to invoke and create all hash bucketing tasks.
+         """
+         return self.get("hashBucketInvokeTimeInSeconds")
+
+     @property
+     def hash_bucket_result_wait_time_in_seconds(self) -> float:
+         """
+         The time it takes ray.get() to resolve after the last hash bucket task
+         has completed. This value may not be accurate at less than 1 second
+         precision.
+         """
+         return self.get("hashBucketResultWaitTimeInSeconds")
+
+     @property
+     def dedupe_time_in_seconds(self) -> float:
+         """
+         The time taken by the dedupe step. This includes all dedupe tasks.
+         """
+         return self.get("dedupeTimeInSeconds")
+
+     @property
+     def dedupe_invoke_time_in_seconds(self) -> float:
+         """
+         The time taken to invoke all dedupe tasks.
+         """
+         return self.get("dedupeInvokeTimeInSeconds")
+
+     @property
+     def dedupe_result_wait_time_in_seconds(self) -> float:
+         """
+         The time it takes ray.get() to resolve after the last dedupe task has
+         completed. This value may not be accurate at less than 1 second
+         precision.
+         """
+         return self.get("dedupeResultWaitTimeInSeconds")
+
+     @property
+     def materialize_time_in_seconds(self) -> float:
+         """
+         The time taken by the materialize step. This includes all materialize
+         tasks.
+         """
+         return self.get("materializeTimeInSeconds")
+
+     @property
+     def materialize_invoke_time_in_seconds(self) -> float:
+         """
+         The time taken to invoke all materialize tasks.
+         """
+         return self.get("materializeInvokeTimeInSeconds")
+
+     @property
+     def materialize_result_wait_time_in_seconds(self) -> float:
+         """
+         The time it takes ray.get() to resolve after the last materialize task
+         has completed. This value may not be accurate at less than 1 second
+         precision.
+         """
+         return self.get("materializeResultWaitTimeInSeconds")
+
+     @property
+     def delta_discovery_time_in_seconds(self) -> float:
+         """
+         The time taken by the delta discovery step, which mostly runs before
+         hash bucketing starts.
+         """
+         return self.get("deltaDiscoveryTimeInSeconds")
+
+     @property
+     def output_file_count(self) -> int:
+         """
+         The total number of files in the compacted output (includes untouched
+         files).
+         """
+         return self.get("outputFileCount")
+
+     @property
+     def output_size_bytes(self) -> float:
+         """
+         The on-disk size of the compacted output, including any untouched files.
+         """
+         return self.get("outputSizeBytes")
+
+     @property
+     def output_size_pyarrow_bytes(self) -> float:
+         """
+         The pyarrow in-memory size of the compacted output, including any
+         untouched files.
+         """
+         return self.get("outputSizePyarrowBytes")
+
+     @property
+     def total_cluster_memory_bytes(self) -> float:
+         """
+         The total memory allocated to the cluster.
+         """
+         return self.get("totalClusterMemoryBytes")
+
+     @property
+     def total_cluster_object_store_memory_bytes(self) -> float:
+         """
+         The total object store memory allocated to the cluster.
+         """
+         return self.get("totalClusterObjectStoreMemoryBytes")
+
+     @property
+     def untouched_file_count(self) -> int:
+         """
+         The total number of files that were untouched by the materialize step.
+         """
+         return self.get("untouchedFileCount")
+
+     @property
+     def untouched_file_ratio(self) -> float:
+         """
+         The ratio between the total number of untouched files and the total
+         number of files in the compacted output.
+         """
+         return self.get("untouchedFileRatio")
+
+     @property
+     def untouched_record_count(self) -> int:
+         """
+         The total number of records untouched during materialization.
+         """
+         return self.get("untouchedRecordCount")
+
+     @property
+     def untouched_size_bytes(self) -> float:
+         """
+         The on-disk size of the data untouched during materialization.
+         """
+         return self.get("untouchedSizeBytes")
+
+     @property
+     def telemetry_time_in_seconds(self) -> float:
+         """
+         The total time taken by all telemetry activity across the nodes in the
+         cluster. This includes collecting cluster resources information,
+         emitting metrics, etc.
+         """
+         return self.get("telemetryTimeInSeconds")
+
+     @property
+     def hash_bucket_result_size_bytes(self) -> float:
+         """
+         The size of the results returned by the hash bucket step.
+         """
+         return self.get("hashBucketResultSize")
+
+     @property
+     def dedupe_result_size_bytes(self) -> float:
+         """
+         The size of the results returned by the dedupe step.
+         """
+         return self.get("dedupeResultSize")
+
+     @property
+     def materialize_result_size(self) -> float:
+         """
+         The size of the results returned by the materialize step.
+         """
+         return self.get("materializeResultSize")
+
+     @property
+     def peak_memory_used_bytes_by_compaction_session_process(self) -> float:
+         """
+         The peak memory used by the entrypoint process for the compaction
+         session.
+         """
+         return self.get("peakMemoryUsedBytesCompactionSessionProcess")
+
+     # Setters follow
+
+     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
+         self["auditUrl"] = audit_url
+         return self
+
+     def set_deltacat_version(self, version: str) -> CompactionSessionAuditInfo:
+         self["deltacatVersion"] = version
+         return self
+
+     def set_input_records(self, input_records: int) -> CompactionSessionAuditInfo:
+         self["inputRecords"] = input_records
+         return self
+
+     def set_input_file_count(self, input_file_count: int) -> CompactionSessionAuditInfo:
+         self["inputFileCount"] = input_file_count
+         return self
+
+     def set_uniform_deltas_created(
+         self, uniform_deltas_created: int
+     ) -> CompactionSessionAuditInfo:
+         self["uniformDeltasCreated"] = uniform_deltas_created
+         return self
+
+     def set_records_deduped(self, records_deduped: int) -> CompactionSessionAuditInfo:
+         self["recordsDeduped"] = records_deduped
+         return self
+
+     def set_input_size_bytes(
+         self, input_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["inputSizeBytes"] = input_size_bytes
+         return self
+
+     def set_hash_bucket_count(
+         self, hash_bucket_count: int
+     ) -> CompactionSessionAuditInfo:
+         self["hashBucketCount"] = hash_bucket_count
+         return self
+
+     def set_cluster_cpu_max(self, cluster_cpu_max: float) -> CompactionSessionAuditInfo:
+         self["clusterCpuMax"] = cluster_cpu_max
+         return self
+
+     def set_compaction_time_in_seconds(
+         self, compaction_time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["compactionTimeInSeconds"] = compaction_time_in_seconds
+         return self
+
+     def set_total_object_store_memory_used_bytes(
+         self, total_object_store_memory_used_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["totalObjectStoreMemoryUsedBytes"] = total_object_store_memory_used_bytes
+         return self
+
+     def set_peak_memory_used_bytes_per_task(
+         self, peak_memory_used_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["peakMemoryUsedBytesPerTask"] = peak_memory_used_bytes
+         return self
+
+     def set_peak_memory_used_bytes_per_hash_bucket_task(
+         self, peak_memory_used_bytes_per_hash_bucket_task: float
+     ) -> CompactionSessionAuditInfo:
+         self[
+             "hashBucketTaskPeakMemoryUsedBytes"
+         ] = peak_memory_used_bytes_per_hash_bucket_task
+         return self
+
+     def set_peak_memory_used_bytes_per_dedupe_task(
+         self, peak_memory_used_bytes_per_dedupe_task: float
+     ) -> CompactionSessionAuditInfo:
+         self["dedupeTaskPeakMemoryUsedBytes"] = peak_memory_used_bytes_per_dedupe_task
+         return self
+
+     def set_peak_memory_used_bytes_per_materialize_task(
+         self, peak_memory_used_bytes_per_materialize_task: float
+     ) -> CompactionSessionAuditInfo:
+         self[
+             "materializeTaskPeakMemoryUsedBytes"
+         ] = peak_memory_used_bytes_per_materialize_task
+         return self
+
+     def set_hash_bucket_post_object_store_memory_used_bytes(
+         self, object_store_memory_used_bytes_by_hb: float
+     ) -> CompactionSessionAuditInfo:
+         self[
+             "hashBucketPostObjectStoreMemoryUsedBytes"
+         ] = object_store_memory_used_bytes_by_hb
+         return self
+
+     def set_dedupe_post_object_store_memory_used_bytes(
+         self, object_store_memory_used_bytes_by_dedupe: float
+     ) -> CompactionSessionAuditInfo:
+         self[
+             "dedupePostObjectStoreMemoryUsedBytes"
+         ] = object_store_memory_used_bytes_by_dedupe
+         return self
+
+     def set_materialize_post_object_store_memory_used_bytes(
+         self, object_store_memory_used_bytes_by_materialize: float
+     ) -> CompactionSessionAuditInfo:
+         self[
+             "materializePostObjectStoreMemoryUsedBytes"
+         ] = object_store_memory_used_bytes_by_materialize
+         return self
+
+     def set_materialize_buckets(
+         self, materialize_buckets: int
+     ) -> CompactionSessionAuditInfo:
+         self["materializeBuckets"] = materialize_buckets
+         return self
+
+     def set_hash_bucket_time_in_seconds(
+         self, hash_bucket_time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["hashBucketTimeInSeconds"] = hash_bucket_time_in_seconds
+         return self
+
+     def set_hash_bucket_invoke_time_in_seconds(
+         self, hash_bucket_invoke_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["hashBucketInvokeTimeInSeconds"] = hash_bucket_invoke_time
+         return self
+
+     def set_hash_bucket_result_wait_time_in_seconds(
+         self, wait_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["hashBucketResultWaitTimeInSeconds"] = wait_time
+         return self
+
+     def set_dedupe_time_in_seconds(
+         self, dedupe_time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["dedupeTimeInSeconds"] = dedupe_time_in_seconds
+         return self
+
+     def set_dedupe_invoke_time_in_seconds(
+         self, dedupe_invoke_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["dedupeInvokeTimeInSeconds"] = dedupe_invoke_time
+         return self
+
+     def set_dedupe_result_wait_time_in_seconds(
+         self, wait_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["dedupeResultWaitTimeInSeconds"] = wait_time
+         return self
+
+     def set_materialize_time_in_seconds(
+         self, materialize_time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["materializeTimeInSeconds"] = materialize_time_in_seconds
+         return self
+
+     def set_materialize_invoke_time_in_seconds(
+         self, materialize_invoke_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["materializeInvokeTimeInSeconds"] = materialize_invoke_time
+         return self
+
+     def set_materialize_result_wait_time_in_seconds(
+         self, wait_time: float
+     ) -> CompactionSessionAuditInfo:
+         self["materializeResultWaitTimeInSeconds"] = wait_time
+         return self
+
+     def set_delta_discovery_time_in_seconds(
+         self, delta_discovery_time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["deltaDiscoveryTimeInSeconds"] = delta_discovery_time_in_seconds
+         return self
+
+     def set_output_file_count(
+         self, output_file_count: int
+     ) -> CompactionSessionAuditInfo:
+         self["outputFileCount"] = output_file_count
+         return self
+
+     def set_output_size_bytes(
+         self, output_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["outputSizeBytes"] = output_size_bytes
+         return self
+
+     def set_output_size_pyarrow_bytes(
+         self, output_size_pyarrow_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["outputSizePyarrowBytes"] = output_size_pyarrow_bytes
+         return self
+
+     def set_total_cluster_memory_bytes(
+         self, total_cluster_memory_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["totalClusterMemoryBytes"] = total_cluster_memory_bytes
+         return self
+
+     def set_total_cluster_object_store_memory_bytes(
+         self, total_cluster_object_store_memory_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self[
+             "totalClusterObjectStoreMemoryBytes"
+         ] = total_cluster_object_store_memory_bytes
+         return self
+
+     def set_untouched_file_count(
+         self, untouched_file_count: int
+     ) -> CompactionSessionAuditInfo:
+         self["untouchedFileCount"] = untouched_file_count
+         return self
+
+     def set_untouched_file_ratio(
+         self, untouched_file_ratio: float
+     ) -> CompactionSessionAuditInfo:
+         self["untouchedFileRatio"] = untouched_file_ratio
+         return self
+
+     def set_untouched_record_count(
+         self, untouched_record_count: int
+     ) -> CompactionSessionAuditInfo:
+         self["untouchedRecordCount"] = untouched_record_count
+         return self
+
+     def set_untouched_size_bytes(
+         self, untouched_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["untouchedSizeBytes"] = untouched_size_bytes
+         return self
+
+     def set_telemetry_time_in_seconds(
+         self, telemetry_time_in_seconds: float
+     ) -> CompactionSessionAuditInfo:
+         self["telemetryTimeInSeconds"] = telemetry_time_in_seconds
+         return self
+
+     def set_hash_bucket_result_size_bytes(
+         self, hash_bucket_result_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["hashBucketResultSize"] = hash_bucket_result_size_bytes
+         return self
+
+     def set_dedupe_result_size_bytes(
+         self, dedupe_result_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["dedupeResultSize"] = dedupe_result_size_bytes
+         return self
+
+     def set_materialize_result_size_bytes(
+         self, materialize_result_size_bytes: float
+     ) -> CompactionSessionAuditInfo:
+         self["materializeResultSize"] = materialize_result_size_bytes
+         return self
+
+     def set_peak_memory_used_bytes_by_compaction_session_process(
+         self, peak_memory: float
+     ) -> CompactionSessionAuditInfo:
+         self["peakMemoryUsedBytesCompactionSessionProcess"] = peak_memory
+         return self
+
+     # High level methods to save stats
+     def save_step_stats(
+         self,
+         step_name: str,
+         task_results: Union[
+             List[HashBucketResult], List[DedupeResult], List[MaterializeResult]
+         ],
+         task_results_retrieved_at: float,
+         invoke_time_in_seconds: float,
+         task_time_in_seconds: float,
+     ) -> float:
+         """
+         Saves the stats by calling individual setters and returns the cluster
+         telemetry time.
+         """
+
+         # task_completed_at values are numpy scalars; .item() converts the max
+         # to a native Python number before the subtraction below.
+         last_task_completed_at = max(
+             result.task_completed_at for result in task_results
+         )
+
+         self[f"{step_name}ResultWaitTimeInSeconds"] = (
+             task_results_retrieved_at - last_task_completed_at.item()
+         )
+         self[f"{step_name}TimeInSeconds"] = task_time_in_seconds
+         self[f"{step_name}InvokeTimeInSeconds"] = invoke_time_in_seconds
+
+         self[f"{step_name}ResultSize"] = get_size_of_object_in_bytes(task_results)
+
+         # Snapshot cluster utilization after the step, tracking the latency of
+         # the snapshot itself as telemetry overhead.
+         (
+             cluster_utilization_after_task,
+             cluster_util_after_task_latency,
+         ) = timed_invocation(ClusterUtilization.get_current_cluster_utilization)
+
+         self.set_total_cluster_object_store_memory_bytes(
+             cluster_utilization_after_task.total_object_store_memory_bytes
+         )
+         self.set_total_cluster_memory_bytes(
+             cluster_utilization_after_task.total_memory_bytes
+         )
+         self.set_total_object_store_memory_used_bytes(
+             cluster_utilization_after_task.used_object_store_memory_bytes
+         )
+
+         self[
+             f"{step_name}PostObjectStoreMemoryUsedBytes"
+         ] = cluster_utilization_after_task.used_object_store_memory_bytes
+
+         peak_task_memory = max(
+             result.peak_memory_usage_bytes for result in task_results
+         )
+
+         telemetry_time = sum(
+             result.telemetry_time_in_seconds for result in task_results
+         )
+
+         self[f"{step_name}TaskPeakMemoryUsedBytes"] = peak_task_memory.item()
+
+         return cluster_util_after_task_latency + telemetry_time
+
+     def save_round_completion_stats(
+         self, mat_results: List[MaterializeResult], total_telemetry_time: float
+     ) -> None:
+         """
+         This method saves all the relevant stats after all the steps are completed.
+         """
+         pyarrow_write_result = PyArrowWriteResult.union(
+             [m.pyarrow_write_result for m in mat_results]
+         )
+
+         total_count_of_src_dfl_not_touched = sum(
+             m.referenced_pyarrow_write_result.files for m in mat_results
+         )
+
+         logger.info(
+             f"Got total of {total_count_of_src_dfl_not_touched} manifest files not touched."
+         )
+         logger.info(
+             f"Got total of {pyarrow_write_result.files} manifest files during compaction."
+         )
+         manifest_entry_copied_by_reference_ratio = (
+             (
+                 round(
+                     total_count_of_src_dfl_not_touched / pyarrow_write_result.files, 4
+                 )
+                 * 100
+             )
+             if pyarrow_write_result.files != 0
+             else None
+         )
+         logger.info(
+             f"{manifest_entry_copied_by_reference_ratio} percent of manifest files are copied by reference during materialize."
+         )
+
+         untouched_file_record_count = sum(
+             m.referenced_pyarrow_write_result.records for m in mat_results
+         )
+         untouched_file_size_bytes = sum(
+             m.referenced_pyarrow_write_result.file_bytes for m in mat_results
+         )
+
+         self.set_untouched_file_count(total_count_of_src_dfl_not_touched)
+         self.set_untouched_file_ratio(manifest_entry_copied_by_reference_ratio)
+         self.set_untouched_record_count(untouched_file_record_count)
+         self.set_untouched_size_bytes(untouched_file_size_bytes)
+
+         self.set_output_file_count(pyarrow_write_result.files)
+         self.set_output_size_bytes(pyarrow_write_result.file_bytes)
+         self.set_output_size_pyarrow_bytes(pyarrow_write_result.pyarrow_bytes)
+
+         self.set_peak_memory_used_bytes_per_task(
+             max(
+                 [
+                     self.peak_memory_used_bytes_per_hash_bucket_task,
+                     self.peak_memory_used_bytes_per_dedupe_task,
+                     self.peak_memory_used_bytes_per_materialize_task,
+                 ]
+             )
+         )
+
+         self.set_telemetry_time_in_seconds(total_telemetry_time)
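Usage note: because CompactionSessionAuditInfo subclasses dict, stores camelCase keys, and has setters that return self, audit stats can be recorded as a fluent chain and serialized straight to JSON for upload to the audit URL. A minimal sketch under those assumptions (the version string, counts, and audit URL below are illustrative, not taken from the package):

import json

from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

# Illustrative values; the compaction session normally supplies these.
audit = CompactionSessionAuditInfo(
    deltacat_version="0.1.18b7",
    audit_url="s3://example-bucket/audit/compaction.json",
)

# Setters return self, so stats chain fluently.
audit.set_input_records(1_000_000).set_input_file_count(42).set_hash_bucket_count(64)

# snake_case properties read back the camelCase dict keys.
assert audit.input_records == 1_000_000

# As a plain dict, the audit serializes directly to JSON.
print(json.dumps(audit, indent=2))

During a real compaction run, save_step_stats and save_round_completion_stats would populate the per-step timing and memory fields shown in the diff above; the sketch only exercises the low-level setter and property API.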