deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +2 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +16 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  20. deltacat/compute/compactor_v2/constants.py +34 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  30. deltacat/compute/compactor_v2/utils/io.py +149 -0
  31. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  32. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  33. deltacat/compute/metastats/meta_stats.py +4 -2
  34. deltacat/compute/metastats/stats.py +1 -0
  35. deltacat/compute/metastats/utils/io.py +4 -0
  36. deltacat/compute/stats/utils/io.py +20 -5
  37. deltacat/exceptions.py +4 -0
  38. deltacat/io/memcached_object_store.py +37 -14
  39. deltacat/logs.py +4 -3
  40. deltacat/storage/interface.py +8 -1
  41. deltacat/storage/model/types.py +2 -1
  42. deltacat/tests/aws/test_clients.py +16 -3
  43. deltacat/tests/compute/__init__.py +0 -0
  44. deltacat/tests/compute/common.py +96 -0
  45. deltacat/tests/compute/compactor/__init__.py +0 -0
  46. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  47. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  48. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  49. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  50. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  51. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  52. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  53. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  54. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  55. deltacat/tests/compute/testcases.py +390 -0
  56. deltacat/tests/io/test_memcached_object_store.py +5 -4
  57. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  58. deltacat/tests/test_utils/pyarrow.py +32 -0
  59. deltacat/tests/test_utils/utils.py +13 -0
  60. deltacat/tests/utils/data/__init__.py +0 -0
  61. deltacat/tests/utils/test_daft.py +76 -0
  62. deltacat/tests/utils/test_pyarrow.py +133 -0
  63. deltacat/tests/utils/test_resources.py +23 -20
  64. deltacat/types/media.py +1 -0
  65. deltacat/types/partial_download.py +82 -0
  66. deltacat/types/tables.py +1 -0
  67. deltacat/utils/arguments.py +26 -0
  68. deltacat/utils/daft.py +87 -0
  69. deltacat/utils/placement.py +20 -3
  70. deltacat/utils/pyarrow.py +213 -1
  71. deltacat/utils/ray_utils/concurrency.py +26 -1
  72. deltacat/utils/resources.py +72 -1
  73. deltacat/utils/s3fs.py +21 -0
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
  76. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  77. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  78. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  80. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
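
Most of this release is the new deltacat/compute/compactor_v2 package (a v2 compaction session plus hash bucket and merge steps with supporting utils), together with the reworked CompactPartitionParams model and the new merge metrics in the audit info, shown in the hunks below. For orientation only, a hypothetical end-to-end wiring might look like the following sketch; the compact_partition entry-point name and signature are assumptions, since this page reproduces hunks for just two of the 80 files changed:

from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)

# Assumption: the new compactor_v2 compaction_session module (added in this
# release, +506 lines) exposes a compact_partition() entry point that consumes
# CompactPartitionParams. Treat this as orientation, not API documentation.
from deltacat.compute.compactor_v2.compaction_session import compact_partition


def run_v2_compaction(params_dict: dict) -> None:
    # CompactPartitionParams.of() validates required args and applies the v2
    # defaults (see the compact_partition_params.py hunks below).
    compact_partition(CompactPartitionParams.of(params_dict))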
deltacat/compute/compactor/model/compact_partition_params.py

@@ -1,10 +1,28 @@
 from __future__ import annotations
-
+import importlib
 import copy
 import json
 from typing import Any, Dict, List, Optional
-
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import ContentType
+from deltacat.utils.placement import PlacementGroupConfig
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
+from deltacat.storage import (
+    interface as unimplemented_deltacat_storage,
+    PartitionLocator,
+    SortKey,
+)
+from deltacat.compute.compactor_v2.constants import (
+    MAX_RECORDS_PER_COMPACTED_FILE,
+    MIN_DELTA_BYTES_IN_BATCH,
+    MIN_FILES_IN_BATCH,
+    AVERAGE_RECORD_SIZE_BYTES,
+    TASK_MAX_PARALLELISM,
+)
+from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
+from deltacat.utils.metrics import MetricsConfig


 class CompactPartitionParams(dict):
@@ -16,104 +34,315 @@ class CompactPartitionParams(dict):
     def of(params: Optional[Dict]) -> CompactPartitionParams:
         if params is None:
             params = {}
-        compact_partition_params = CompactPartitionParams()
-        compact_partition_params["destination_partition_locator"] = params.get(
-            "destination_partition_locator"
-        )
-        compact_partition_params["last_stream_position_to_compact"] = params.get(
-            "last_stream_position_to_compact"
+
+        assert (
+            params.get("destination_partition_locator") is not None
+        ), "destination_partition_locator is a required arg"
+        assert (
+            params.get("last_stream_position_to_compact") is not None
+        ), "last_stream_position_to_compact is a required arg"
+        assert (
+            params.get("source_partition_locator") is not None
+        ), "source_partition_locator is a required arg"
+        assert (
+            params.get("compaction_artifact_s3_bucket") is not None
+        ), "compaction_artifact_s3_bucket is a required arg"
+
+        result = CompactPartitionParams(params)
+
+        # TODO: move defaults to single file
+        result.records_per_compacted_file = params.get(
+            "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
         )
-        compact_partition_params["source_partition_locator"] = params.get(
-            "source_partition_locator"
+        result.compacted_file_content_type = params.get(
+            "compacted_file_content_type", ContentType.PARQUET
         )
-        compact_partition_params["primary_keys"] = params.get("primary_keys")
-        compact_partition_params["rebase_source_partition_locator"] = params.get(
-            "rebase_source_partition_locator"
+        result.object_store = params.get("object_store", RayPlasmaObjectStore())
+        result.enable_profiler = params.get("enable_profiler", False)
+        result.deltacat_storage = params.get(
+            "deltacat_storage", unimplemented_deltacat_storage
         )
-        compact_partition_params["rebase_source_partition_high_watermark"] = params.get(
-            "rebase_source_partition_high_watermark"
+        result.s3_client_kwargs = params.get("s3_client_kwargs", {})
+        result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
+        result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
+        result.s3_table_writer_kwargs = params.get("s3_table_writer_kwargs", {})
+        result.bit_width_of_sort_keys = validate_sort_keys(
+            result.source_partition_locator,
+            result.sort_keys,
+            result.deltacat_storage,
+            result.deltacat_storage_kwargs,
         )
-        compact_partition_params["hash_bucket_count"] = params.get("hash_bucket_count")
-        compact_partition_params["deltacat_storage"] = params.get("deltacat_storage")
-        compact_partition_params["compaction_artifact_s3_bucket"] = params.get(
-            "compaction_artifact_s3_bucket"
+        result.task_max_parallelism = params.get(
+            "task_max_parallelism", TASK_MAX_PARALLELISM
         )
-        compact_partition_params["properties"] = params.get("properties")
-        compact_partition_params["compacted_file_content_type"] = params.get(
-            "compacted_file_content_type"
+        result.min_files_in_batch = params.get("min_files_in_batch", MIN_FILES_IN_BATCH)
+        result.min_delta_bytes_in_batch = params.get(
+            "min_delta_bytes_in_batch", MIN_DELTA_BYTES_IN_BATCH
         )
-        compact_partition_params["list_deltas_kwargs"] = params.get(
-            "list_deltas_kwargs"
+        result.previous_inflation = params.get(
+            "previous_inflation", PYARROW_INFLATION_MULTIPLIER
         )
-        compact_partition_params["pg_config"] = params.get("pg_config")
-        compact_partition_params["read_kwargs_provider"] = params.get(
-            "read_kwargs_provider"
+        result.average_record_size_bytes = params.get(
+            "average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
        )
-        compact_partition_params["s3_table_writer_kwargs"] = params.get(
-            "s3_table_writer_kwargs"
+        result.hash_group_count = params.get(
+            "hash_group_count", result.hash_bucket_count
         )
-        return compact_partition_params
+
+        if not importlib.util.find_spec("memray"):
+            result.enable_profiler = False
+
+        if result.primary_keys:
+            result.primary_keys = sorted(result.primary_keys)
+
+        # assertions
+        assert (
+            result.source_partition_locator.partition_values
+            == result.destination_partition_locator.partition_values
+        ), "Source and destination partitions values must be equal"
+
+        assert (
+            result.records_per_compacted_file and result.records_per_compacted_file >= 1
+        ), "Max records per output file must be a positive value"
+
+        return result

     @property
-    def destination_partition_locator(self) -> Optional[dict]:
-        return self["destination_partition_locator"]
+    def destination_partition_locator(self) -> PartitionLocator:
+        val = self["destination_partition_locator"]
+        if not isinstance(val, PartitionLocator):
+            val = PartitionLocator(val)
+
+        return val
+
+    @destination_partition_locator.setter
+    def destination_partition_locator(self, locator: PartitionLocator) -> None:
+        self["destination_partition_locator"] = locator

     @property
-    def last_stream_position_to_compact(self) -> Optional[int]:
+    def last_stream_position_to_compact(self) -> int:
         return self["last_stream_position_to_compact"]

+    @last_stream_position_to_compact.setter
+    def last_stream_position_to_compact(self, stream_position: int) -> None:
+        self["last_stream_position_to_compact"] = stream_position
+
     @property
-    def source_partition_locator(self) -> Optional[dict]:
-        return self["source_partition_locator"]
+    def source_partition_locator(self) -> PartitionLocator:
+        val = self["source_partition_locator"]
+        if not isinstance(val, PartitionLocator):
+            val = PartitionLocator(val)
+        return val
+
+    @source_partition_locator.setter
+    def source_partition_locator(self, locator: PartitionLocator) -> None:
+        self["source_partition_locator"] = locator

     @property
-    def primary_keys(self) -> Optional[List[str]]:
-        return list(self["primary_keys"])
+    def compaction_artifact_s3_bucket(self) -> str:
+        return self["compaction_artifact_s3_bucket"]
+
+    @compaction_artifact_s3_bucket.setter
+    def compaction_artifact_s3_bucket(self, s3_bucket: str) -> None:
+        self["compaction_artifact_s3_bucket"] = s3_bucket

     @property
-    def rebase_source_partition_locator(self) -> Optional[dict]:
-        return self["rebase_source_partition_locator"]
+    def deltacat_storage(self) -> unimplemented_deltacat_storage:
+        return self["deltacat_storage"]
+
+    @deltacat_storage.setter
+    def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
+        self["deltacat_storage"] = storage

     @property
-    def rebase_source_partition_high_watermark(self) -> Optional[int]:
-        return self["rebase_source_partition_high_watermark"]
+    def object_store(self) -> IObjectStore:
+        return self["object_store"]
+
+    @object_store.setter
+    def object_store(self, obj_store: IObjectStore) -> None:
+        self["object_store"] = obj_store

     @property
-    def hash_bucket_count(self) -> Optional[int]:
-        return self["hash_bucket_count"]
+    def compacted_file_content_type(self) -> ContentType:
+        return self["compacted_file_content_type"]
+
+    @compacted_file_content_type.setter
+    def compacted_file_content_type(self, content_type: ContentType) -> None:
+        self["compacted_file_content_type"] = content_type

     @property
-    def deltacat_storage(self) -> Optional[str]:
-        return self["deltacat_storage"]
+    def task_max_parallelism(self) -> int:
+        return self["task_max_parallelism"]
+
+    @task_max_parallelism.setter
+    def task_max_parallelism(self, max_parallelism: int) -> None:
+        self["task_max_parallelism"] = max_parallelism

     @property
-    def compaction_artifact_s3_bucket(self) -> Optional[str]:
-        return self["compaction_artifact_s3_bucket"]
+    def average_record_size_bytes(self) -> float:
+        return self["average_record_size_bytes"]
+
+    @average_record_size_bytes.setter
+    def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
+        self["average_record_size_bytes"] = average_record_size_bytes

     @property
-    def properties(self) -> Optional[Dict[str, str]]:
-        return self["properties"]
+    def min_files_in_batch(self) -> float:
+        return self["min_files_in_batch"]
+
+    @min_files_in_batch.setter
+    def min_files_in_batch(self, min_files_in_batch: float) -> None:
+        self["min_files_in_batch"] = min_files_in_batch

     @property
-    def compacted_file_content_type(self) -> Optional[ContentType]:
-        return self["compacted_file_content_type"]
+    def min_delta_bytes_in_batch(self) -> float:
+        return self["min_delta_bytes_in_batch"]
+
+    @min_delta_bytes_in_batch.setter
+    def min_delta_bytes_in_batch(self, min_delta_bytes_in_batch: float) -> None:
+        self["min_delta_bytes_in_batch"] = min_delta_bytes_in_batch

     @property
-    def list_deltas_kwargs(self) -> Optional[dict]:
-        return self["list_deltas_kwargs"]
+    def previous_inflation(self) -> float:
+        return self["previous_inflation"]
+
+    @previous_inflation.setter
+    def previous_inflation(self, previous_inflation: float) -> None:
+        self["previous_inflation"] = previous_inflation

     @property
-    def pg_config(self) -> Optional[Any]:
-        return self["pg_config"]
+    def enable_profiler(self) -> bool:
+        return self["enable_profiler"]
+
+    @enable_profiler.setter
+    def enable_profiler(self, value: bool) -> None:
+        self["enable_profiler"] = value

     @property
-    def read_kwargs_provider(self) -> Optional[Any]:
-        return self["read_kwargs_provider"]
+    def list_deltas_kwargs(self) -> dict:
+        return self["list_deltas_kwargs"]
+
+    @list_deltas_kwargs.setter
+    def list_deltas_kwargs(self, kwargs: dict) -> None:
+        self["list_deltas_kwargs"] = kwargs

     @property
-    def s3_table_writer_kwargs(self) -> Optional[Any]:
+    def s3_table_writer_kwargs(self) -> dict:
         return self["s3_table_writer_kwargs"]

+    @s3_table_writer_kwargs.setter
+    def s3_table_writer_kwargs(self, kwargs: dict) -> None:
+        self["s3_table_writer_kwargs"] = kwargs
+
+    @property
+    def deltacat_storage_kwargs(self) -> dict:
+        return self["deltacat_storage_kwargs"]
+
+    @deltacat_storage_kwargs.setter
+    def deltacat_storage_kwargs(self, kwargs: dict) -> None:
+        self["deltacat_storage_kwargs"] = kwargs
+
+    @property
+    def s3_client_kwargs(self) -> dict:
+        return self["s3_client_kwargs"]
+
+    @s3_client_kwargs.setter
+    def s3_client_kwargs(self, kwargs: dict) -> None:
+        self["s3_client_kwargs"] = kwargs
+
+    @property
+    def records_per_compacted_file(self) -> int:
+        return self["records_per_compacted_file"]
+
+    @records_per_compacted_file.setter
+    def records_per_compacted_file(self, count: int) -> None:
+        self["records_per_compacted_file"] = count
+
+    @property
+    def bit_width_of_sort_keys(self) -> int:
+        return self["bit_width_of_sort_keys"]
+
+    @bit_width_of_sort_keys.setter
+    def bit_width_of_sort_keys(self, width: int) -> None:
+        self["bit_width_of_sort_keys"] = width
+
+    @property
+    def hash_bucket_count(self) -> Optional[int]:
+        return self.get("hash_bucket_count")
+
+    @hash_bucket_count.setter
+    def hash_bucket_count(self, count: int) -> None:
+        self["hash_bucket_count"] = count
+
+    @property
+    def hash_group_count(self) -> int:
+        return self["hash_group_count"]
+
+    @hash_group_count.setter
+    def hash_group_count(self, count: int) -> None:
+        self["hash_group_count"] = count
+
+    @property
+    def primary_keys(self) -> Optional[List[str]]:
+        return self.get("primary_keys")
+
+    @primary_keys.setter
+    def primary_keys(self, keys: List[str]) -> None:
+        self["primary_keys"] = keys
+
+    @property
+    def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
+        val = self.get("rebase_source_partition_locator")
+
+        if val and not isinstance(val, PartitionLocator):
+            val = PartitionLocator(val)
+
+        return val
+
+    @rebase_source_partition_locator.setter
+    def rebase_source_partition_locator(self, locator: PartitionLocator) -> None:
+        self["rebase_source_partition_locator"] = locator
+
+    @property
+    def rebase_source_partition_high_watermark(self) -> Optional[int]:
+        return self.get("rebase_source_partition_high_watermark")
+
+    @rebase_source_partition_high_watermark.setter
+    def rebase_source_partition_high_watermark(self, high_watermark: int) -> None:
+        self["rebase_source_partition_high_watermark"] = high_watermark
+
+    @property
+    def pg_config(self) -> Optional[PlacementGroupConfig]:
+        return self.get("pg_config")
+
+    @pg_config.setter
+    def pg_config(self, config: PlacementGroupConfig) -> None:
+        self["pg_config"] = config
+
+    @property
+    def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
+        return self.get("read_kwargs_provider")
+
+    @read_kwargs_provider.setter
+    def read_kwargs_provider(self, kwargs_provider: ReadKwargsProvider) -> None:
+        self["read_kwargs_provider"] = kwargs_provider
+
+    @property
+    def sort_keys(self) -> Optional[List[SortKey]]:
+        return self.get("sort_keys")
+
+    @sort_keys.setter
+    def sort_keys(self, keys: List[SortKey]) -> None:
+        self["sort_keys"] = keys
+
+    @property
+    def metrics_config(self) -> Optional[MetricsConfig]:
+        return self.get("metrics_config")
+
+    @metrics_config.setter
+    def metrics_config(self, config: MetricsConfig) -> None:
+        self["metrics_config"] = config
+
     @staticmethod
     def json_handler_for_compact_partition_params(obj):
         """
deltacat/compute/compactor/model/compaction_session_audit_info.py

@@ -18,6 +18,7 @@ class CompactionSessionAuditInfo(dict):
     DEDUPE_STEP_NAME = "dedupe"
     MATERIALIZE_STEP_NAME = "materialize"
     HASH_BUCKET_STEP_NAME = "hashBucket"
+    MERGE_STEP_NAME = "merge"

     def __init__(self, deltacat_version: str, audit_url: str):
         self.set_deltacat_version(deltacat_version)
@@ -52,7 +53,7 @@ class CompactionSessionAuditInfo(dict):
     @property
     def uniform_deltas_created(self) -> int:
         """
-        The total number of unitform deltas fed into the hash bucket step.
+        The total number of uniform deltas fed into the hash bucket step.
         """
         return self.get("uniformDeltasCreated")

@@ -68,7 +69,7 @@ class CompactionSessionAuditInfo(dict):
     @property
     def input_size_bytes(self) -> float:
         """
-        The on-disk size in bytes of the input.
+        The on-disk size in bytes of the input. Analogous to bytes scanned.
         """
         return self.get("inputSizeBytes")

@@ -142,6 +143,15 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializeTaskPeakMemoryUsedBytes")

+    @property
+    def peak_memory_used_bytes_per_merge_task(self) -> float:
+        """
+        The peak memory used by a single merge python process. Note that
+        the result may be the max of merge and hash bucketing, as processes
+        are reused by Ray to run all compaction steps.
+        """
+        return self.get("mergeTaskPeakMemoryUsedBytes")
+
     @property
     def hash_bucket_post_object_store_memory_used_bytes(self) -> float:
         """
@@ -164,6 +174,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializePostObjectStoreMemoryUsedBytes")

+    @property
+    def merge_post_object_store_memory_used_bytes(self) -> float:
+        """
+        The total object store memory used after the merge step.
+        """
+        return self.get("mergePostObjectStoreMemoryUsedBytes")
+
     @property
     def materialize_buckets(self) -> int:
         """
@@ -233,11 +250,33 @@ class CompactionSessionAuditInfo(dict):
     @property
     def materialize_result_wait_time_in_seconds(self) -> float:
         """
-        The time it takes ray.get() to resolve after the last hash bucket task has completed.
+        The time it takes ray.get() to resolve after the last materialize task has completed.
         This value may not be accurate at less than 1 second precision.
         """
         return self.get("materializeResultWaitTimeInSeconds")

+    @property
+    def merge_result_wait_time_in_seconds(self) -> float:
+        """
+        The time it takes ray.get() to resolve after the last merge task has completed.
+        This value may not be accurate at less than 1 second precision.
+        """
+        return self.get("mergeResultWaitTimeInSeconds")
+
+    @property
+    def merge_time_in_seconds(self) -> float:
+        """
+        The time taken by the merge step. This includes all merge tasks.
+        """
+        return self.get("mergeTimeInSeconds")
+
+    @property
+    def merge_invoke_time_in_seconds(self) -> float:
+        """
+        The time taken to invoke all merge tasks.
+        """
+        return self.get("mergeInvokeTimeInSeconds")
+
     @property
     def delta_discovery_time_in_seconds(self) -> float:
         """
@@ -337,6 +376,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("materializeResultSize")

+    @property
+    def merge_result_size(self) -> float:
+        """
+        The size of the results returned by the merge step.
+        """
+        return self.get("mergeResultSize")
+
     @property
     def peak_memory_used_bytes_by_compaction_session_process(self) -> float:
         """
@@ -344,6 +390,35 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("peakMemoryUsedBytesCompactionSessionProcess")

+    @property
+    def estimated_in_memory_size_bytes_during_discovery(self) -> float:
+        """
+        The estimated in-memory size during discovery. This can be used
+        to determine the accuracy of the memory estimation logic.
+        """
+        return self.get("estimatedInMemorySizeBytesDuringDiscovery")
+
+    @property
+    def hash_bucket_processed_size_bytes(self) -> int:
+        """
+        The total size of the input data processed during the hash bucket step.
+        """
+        return self.get("hashBucketProcessedSizeBytes")
+
+    @property
+    def total_cpu_seconds(self) -> float:
+        """
+        Total number of vCPUs provisioned in the cluster weighted over time.
+        """
+        return self.get("totalCPUSeconds")
+
+    @property
+    def used_cpu_seconds(self) -> float:
+        """
+        Total vCPUs used in the cluster weighted over time.
+        """
+        return self.get("usedCPUSeconds")
+
     # Setters follow

     def set_audit_url(self, audit_url: str) -> CompactionSessionAuditInfo:
@@ -428,6 +503,12 @@ class CompactionSessionAuditInfo(dict):
         ] = peak_memory_used_bytes_per_materialize_task
         return self

+    def set_peak_memory_used_bytes_per_merge_task(
+        self, peak_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeTaskPeakMemoryUsedBytes"] = peak_memory_used_bytes
+        return self
+
     def set_hash_bucket_post_object_store_memory_used_bytes(
         self, object_store_memory_used_bytes_by_hb: float
     ) -> CompactionSessionAuditInfo:
@@ -452,6 +533,12 @@ class CompactionSessionAuditInfo(dict):
         ] = object_store_memory_used_bytes_by_dedupe
         return self

+    def set_merge_post_object_store_memory_used_bytes(
+        self, object_store_memory_used_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergePostObjectStoreMemoryUsedBytes"] = object_store_memory_used_bytes
+        return self
+
     def set_materialize_buckets(
         self, materialize_buckets: int
     ) -> CompactionSessionAuditInfo:
@@ -512,6 +599,24 @@ class CompactionSessionAuditInfo(dict):
         self.get["materializeResultWaitTimeInSeconds"] = wait_time
         return self

+    def set_merge_time_in_seconds(
+        self, time_in_seconds: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeTimeInSeconds"] = time_in_seconds
+        return self
+
+    def set_merge_invoke_time_in_seconds(
+        self, invoke_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeInvokeTimeInSeconds"] = invoke_time
+        return self
+
+    def set_merge_result_wait_time_in_seconds(
+        self, wait_time: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeResultWaitTimeInSeconds"] = wait_time
+        return self
+
     def set_delta_discovery_time_in_seconds(
         self, delta_discovery_time_in_seconds: float
     ) -> CompactionSessionAuditInfo:
@@ -598,12 +703,38 @@ class CompactionSessionAuditInfo(dict):
         self["materializeResultSize"] = materialize_result_size_bytes
         return self

+    def set_merge_result_size_bytes(
+        self, merge_result_size_bytes: float
+    ) -> CompactionSessionAuditInfo:
+        self["mergeResultSize"] = merge_result_size_bytes
+        return self
+
     def set_peak_memory_used_bytes_by_compaction_session_process(
         self, peak_memory: float
     ) -> CompactionSessionAuditInfo:
         self["peakMemoryUsedBytesCompactionSessionProcess"] = peak_memory
         return self

+    def set_estimated_in_memory_size_bytes_during_discovery(
+        self, memory: float
+    ) -> CompactionSessionAuditInfo:
+        self["estimatedInMemorySizeBytesDuringDiscovery"] = memory
+        return self
+
+    def set_hash_bucket_processed_size_bytes(
+        self, size: int
+    ) -> CompactionSessionAuditInfo:
+        self["hashBucketProcessedSizeBytes"] = size
+        return self
+
+    def set_total_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["totalCPUSeconds"] = value
+        return self
+
+    def set_used_cpu_seconds(self, value: float) -> CompactionSessionAuditInfo:
+        self["usedCPUSeconds"] = value
+        return self
+
     # High level methods to save stats
     def save_step_stats(
         self,
@@ -673,7 +804,10 @@ class CompactionSessionAuditInfo(dict):
         )

         total_count_of_src_dfl_not_touched = sum(
-            m.referenced_pyarrow_write_result.files for m in mat_results
+            m.referenced_pyarrow_write_result.files
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )

         logger.info(
@@ -697,10 +831,16 @@ class CompactionSessionAuditInfo(dict):
         )

         untouched_file_record_count = sum(
-            m.referenced_pyarrow_write_result.records for m in mat_results
+            m.referenced_pyarrow_write_result.records
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )
         untouched_file_size_bytes = sum(
-            m.referenced_pyarrow_write_result.file_bytes for m in mat_results
+            m.referenced_pyarrow_write_result.file_bytes
+            if m.referenced_pyarrow_write_result
+            else 0
+            for m in mat_results
         )

         self.set_untouched_file_count(total_count_of_src_dfl_not_touched)
@@ -715,9 +855,10 @@ class CompactionSessionAuditInfo(dict):
         self.set_peak_memory_used_bytes_per_task(
             max(
                 [
-                    self.peak_memory_used_bytes_per_hash_bucket_task,
-                    self.peak_memory_used_bytes_per_dedupe_task,
-                    self.peak_memory_used_bytes_per_materialize_task,
+                    self.peak_memory_used_bytes_per_hash_bucket_task or 0,
+                    self.peak_memory_used_bytes_per_dedupe_task or 0,
+                    self.peak_memory_used_bytes_per_materialize_task or 0,
+                    self.peak_memory_used_bytes_per_merge_task or 0,
                 ]
             )
         )
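
The merge additions to CompactionSessionAuditInfo follow the class's existing fluent-setter convention: each setter writes a single camelCase key and returns self, so the new merge metrics chain like the old ones. A small sketch, assuming only the constructor signature shown above (the version and audit URL strings are illustrative placeholders):

from deltacat.compute.compactor.model.compaction_session_audit_info import (
    CompactionSessionAuditInfo,
)

audit = CompactionSessionAuditInfo("0.1.18b15", "s3://my-bucket/audit.json")
# Each setter returns self, so the calls chain.
audit.set_merge_invoke_time_in_seconds(1.5).set_merge_time_in_seconds(
    42.0
).set_merge_result_wait_time_in_seconds(0.8)
print(audit.merge_time_in_seconds)  # 42.0

Note also the "or 0" guards in the peak-memory rollup above: the session-wide peak is now the max across hash bucket, dedupe, materialize, and merge tasks, with steps that never ran (for example, dedupe and materialize in a merge-based v2 session) contributing 0 rather than None.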