deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +184 -29
  3. deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  5. deltacat/compute/compactor/model/dedupe_result.py +3 -0
  6. deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  7. deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  8. deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  9. deltacat/compute/compactor/model/materialize_result.py +27 -6
  10. deltacat/compute/compactor/model/round_completion_info.py +9 -0
  11. deltacat/compute/compactor/steps/dedupe.py +35 -19
  12. deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  13. deltacat/compute/compactor/steps/materialize.py +73 -70
  14. deltacat/compute/compactor/utils/io.py +15 -0
  15. deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  16. deltacat/compute/compactor/utils/round_completion_file.py +13 -4
  17. deltacat/compute/compactor/utils/system_columns.py +32 -0
  18. deltacat/io/__init__.py +0 -7
  19. deltacat/io/file_object_store.py +48 -0
  20. deltacat/io/memcached_object_store.py +121 -0
  21. deltacat/io/object_store.py +51 -0
  22. deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat/io/redis_object_store.py +114 -0
  24. deltacat/io/s3_object_store.py +44 -0
  25. deltacat/storage/model/delta.py +2 -1
  26. deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat/tests/io/__init__.py +0 -0
  29. deltacat/tests/io/test_file_object_store.py +86 -0
  30. deltacat/tests/io/test_memcached_object_store.py +158 -0
  31. deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  32. deltacat/tests/io/test_redis_object_store.py +103 -0
  33. deltacat/tests/io/test_s3_object_store.py +59 -0
  34. deltacat/tests/utils/test_record_batch_tables.py +1 -1
  35. deltacat/tests/utils/test_resources.py +9 -0
  36. deltacat/utils/ray_utils/concurrency.py +0 -2
  37. deltacat/utils/resources.py +30 -18
  38. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
  39. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
  40. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
  41. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
  42. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0

deltacat/compute/compactor/model/dedupe_result.py
@@ -6,3 +6,6 @@ import numpy as np
 class DedupeResult(NamedTuple):
     mat_bucket_idx_to_obj_id: Dict[int, Tuple]
     deduped_record_count: np.int64
+    peak_memory_usage_bytes: np.double
+    telemetry_time_in_seconds: np.double
+    task_completed_at: np.double
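
The new trailing fields on DedupeResult (and the matching HashBucketResult below) carry per-task telemetry back to the compaction session. A minimal, self-contained sketch of how a result carrying them might be built and read; the NamedTuple shape mirrors this diff, while the values are illustrative:

    import time
    from typing import Dict, NamedTuple, Tuple

    import numpy as np


    class DedupeResult(NamedTuple):
        # Shape mirrors deltacat/compute/compactor/model/dedupe_result.py after this diff.
        mat_bucket_idx_to_obj_id: Dict[int, Tuple]
        deduped_record_count: np.int64
        peak_memory_usage_bytes: np.double
        telemetry_time_in_seconds: np.double
        task_completed_at: np.double


    result = DedupeResult(
        mat_bucket_idx_to_obj_id={0: (0, "object-ref")},  # illustrative object ref
        deduped_record_count=np.int64(128),
        peak_memory_usage_bytes=np.double(512 * 1024 * 1024),
        telemetry_time_in_seconds=np.double(0.02),
        task_completed_at=np.double(time.time()),
    )
    print(result.deduped_record_count, result.peak_memory_usage_bytes)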

deltacat/compute/compactor/model/delta_file_envelope.py
@@ -5,6 +5,8 @@ import numpy as np
 
 from deltacat.storage import DeltaType, LocalTable
 
+from typing import Optional
+
 DeltaFileEnvelopeGroups = np.ndarray
 
 
@@ -16,6 +18,7 @@ class DeltaFileEnvelope(dict):
         delta_type: DeltaType,
         table: LocalTable,
         is_src_delta: np.bool_ = True,
+        file_record_count: Optional[int] = None,
     ) -> DeltaFileEnvelope:
         """Static factory builder for a Delta File Envelope
         `
@@ -46,6 +49,7 @@ class DeltaFileEnvelope(dict):
         delta_file_envelope["deltaType"] = delta_type.value
         delta_file_envelope["table"] = table
         delta_file_envelope["is_src_delta"] = is_src_delta
+        delta_file_envelope["file_record_count"] = file_record_count
         return delta_file_envelope
 
     @property
@@ -67,3 +71,7 @@ class DeltaFileEnvelope(dict):
     @property
     def is_src_delta(self) -> np.bool_:
         return self["is_src_delta"]
+
+    @property
+    def file_record_count(self) -> int:
+        return self["file_record_count"]
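
A hedged sketch of the widened factory call, assuming deltacat 0.1.18b7 and pyarrow are installed; the import path follows the module shown in this diff and the table contents are illustrative:

    import pyarrow as pa

    from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
    from deltacat.storage import DeltaType

    table = pa.table({"pk": [1, 2, 3]})
    dfe = DeltaFileEnvelope.of(
        stream_position=1,
        file_index=0,
        delta_type=DeltaType.UPSERT,
        table=table,
        file_record_count=len(table),  # new optional field added in this diff
    )
    assert dfe.file_record_count == 3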

deltacat/compute/compactor/model/delta_file_locator.py
@@ -5,11 +5,16 @@ import numpy as np
 
 from deltacat.storage import Locator
 
+from typing import Optional
+
 
 class DeltaFileLocator(Locator, tuple):
     @staticmethod
     def of(
-        is_src_delta: np.bool_, stream_position: np.int64, file_index: np.int32
+        is_src_delta: np.bool_,
+        stream_position: np.int64,
+        file_index: np.int32,
+        file_record_count: Optional[np.int64] = None,
     ) -> DeltaFileLocator:
         """
         Create a Delta File Locator tuple that can be used to uniquely identify
@@ -31,11 +36,7 @@ class DeltaFileLocator(Locator, tuple):
             (is_source_delta, stream_position, file_index).
         """
         return DeltaFileLocator(
-            (
-                is_src_delta,
-                stream_position,
-                file_index,
-            )
+            (is_src_delta, stream_position, file_index, file_record_count)
         )
 
     @property
@@ -50,6 +51,10 @@ class DeltaFileLocator(Locator, tuple):
     def file_index(self) -> np.int32:
         return self[2]
 
+    @property
+    def file_record_count(self) -> np.int64:
+        return self[3]
+
     def canonical_string(self) -> str:
         """
         Returns a unique string for the given locator that can be used
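
A small sketch of the extended locator tuple, assuming deltacat 0.1.18b7; the numeric values are illustrative:

    import numpy as np

    from deltacat.compute.compactor.model.delta_file_locator import DeltaFileLocator

    dfl = DeltaFileLocator.of(
        np.bool_(True),   # is_src_delta
        np.int64(100),    # stream_position
        np.int32(0),      # file_index
        np.int64(1024),   # file_record_count (new fourth element, defaults to None)
    )
    assert dfl.file_record_count == 1024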

deltacat/compute/compactor/model/hash_bucket_result.py
@@ -6,3 +6,6 @@ import numpy as np
 class HashBucketResult(NamedTuple):
     hash_bucket_group_to_obj_id: np.ndarray
     hb_record_count: np.int64
+    peak_memory_usage_bytes: np.double
+    telemetry_time_in_seconds: np.double
+    task_completed_at: np.double

deltacat/compute/compactor/model/materialize_result.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 from typing import Any, Dict, Optional
+import numpy as np
 
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.storage import Delta
@@ -13,15 +14,19 @@ class MaterializeResult(dict):
         delta: Delta,
         task_index: int,
         pyarrow_write_result: PyArrowWriteResult,
-        count_of_src_dfl_not_touched: Optional[int] = 0,
-        count_of_src_dfl: Optional[int] = 0,
+        referenced_pyarrow_write_result: Optional[PyArrowWriteResult] = None,
+        peak_memory_usage_bytes: Optional[np.double] = None,
+        telemetry_time_in_seconds: Optional[np.double] = None,
+        task_completed_at: Optional[np.double] = None,
     ) -> MaterializeResult:
         materialize_result = MaterializeResult()
         materialize_result["delta"] = delta
         materialize_result["taskIndex"] = task_index
         materialize_result["paWriteResult"] = pyarrow_write_result
-        materialize_result["countOfSrcFileNotTouched"] = count_of_src_dfl_not_touched
-        materialize_result["countOfSrcFile"] = count_of_src_dfl
+        materialize_result["referencedPaWriteResult"] = referenced_pyarrow_write_result
+        materialize_result["peakMemoryUsageBytes"] = peak_memory_usage_bytes
+        materialize_result["telemetryTimeInSeconds"] = telemetry_time_in_seconds
+        materialize_result["taskCompletedAt"] = task_completed_at
         return materialize_result
 
     @property
@@ -35,6 +40,14 @@ class MaterializeResult(dict):
     def task_index(self) -> int:
         return self["taskIndex"]
 
+    @property
+    def peak_memory_usage_bytes(self) -> Optional[np.double]:
+        return self["peakMemoryUsageBytes"]
+
+    @property
+    def telemetry_time_in_seconds(self) -> Optional[np.double]:
+        return self["telemetryTimeInSeconds"]
+
     @property
     def pyarrow_write_result(self) -> PyArrowWriteResult:
         val: Dict[str, Any] = self.get("paWriteResult")
@@ -47,5 +60,13 @@
         return self["countOfSrcFileNotTouched"]
 
     @property
-    def count_of_src_dfl(self) -> int:
-        return self["countOfSrcFile"]
+    def referenced_pyarrow_write_result(self) -> PyArrowWriteResult:
+        val: Dict[str, Any] = self.get("referencedPaWriteResult")
+        if val is not None and not isinstance(val, PyArrowWriteResult):
+            self["referencedPaWriteResult"] = val = PyArrowWriteResult(val)
+
+        return val
+
+    @property
+    def task_completed_at(self) -> Optional[np.double]:
+        return self["taskCompletedAt"]
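
Because MaterializeResult is a dict subclass, the new referenced_pyarrow_write_result property re-wraps a value that round-tripped through plain-dict serialization, mirroring the existing paWriteResult behavior. A sketch under that assumption; the keys passed to PyArrowWriteResult here are illustrative:

    from deltacat.compute.compactor.model.materialize_result import MaterializeResult
    from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult

    # A result whose nested write stats came back as a plain dict (illustrative keys).
    mr = MaterializeResult({"referencedPaWriteResult": {"files": 1, "records": 10}})

    ref = mr.referenced_pyarrow_write_result
    assert isinstance(ref, PyArrowWriteResult)  # wrapped lazily on first access
    assert isinstance(mr["referencedPaWriteResult"], PyArrowWriteResult)  # cached back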

deltacat/compute/compactor/model/round_completion_info.py
@@ -3,6 +3,9 @@ from __future__ import annotations
 
 from deltacat.storage import DeltaLocator, PartitionLocator
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)
 from typing import Any, Dict, Optional
 
 
@@ -39,6 +42,7 @@ class RoundCompletionInfo(dict):
         sort_keys_bit_width: int,
         rebase_source_partition_locator: Optional[PartitionLocator],
         manifest_entry_copied_by_reference_ratio: Optional[float] = None,
+        compaction_audit_url: Optional[str] = None,
     ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
@@ -50,6 +54,7 @@
         rci[
             "manifestEntryCopiedByReferenceRatio"
         ] = manifest_entry_copied_by_reference_ratio
+        rci["compactionAuditUrl"] = compaction_audit_url
         return rci
 
     @property
@@ -81,6 +86,10 @@ class RoundCompletionInfo(dict):
     def sort_keys_bit_width(self) -> int:
         return self["sortKeysBitWidth"]
 
+    @property
+    def compaction_audit(self) -> Optional[CompactionSessionAuditInfo]:
+        return self.get("compactionAudit")
+
     @property
     def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
         return self.get("rebaseSourcePartitionLocator")

deltacat/compute/compactor/steps/dedupe.py
@@ -1,14 +1,14 @@
 import importlib
 import logging
+from typing import Optional
+import time
 from collections import defaultdict
 from contextlib import nullcontext
 from typing import Any, Dict, List, Tuple
-
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import ray
-from ray import cloudpickle
 
 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,6 +25,8 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -105,6 +107,7 @@ def _timed_dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
+    object_store: Optional[IObjectStore],
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -113,15 +116,12 @@ def _timed_dedupe(
     ) if enable_profiler else nullcontext():
         # TODO (pdames): mitigate risk of running out of memory here in cases of
         # severe skew of primary key updates in deltas
-        src_file_records_obj_refs = [
-            cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-        ]
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-            f"groups for {len(src_file_records_obj_refs)} object refs..."
+            f"groups for {len(object_ids)} object refs..."
         )
 
-        delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+        delta_file_envelope_groups_list = object_store.get_many(object_ids)
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -188,17 +188,18 @@ def _timed_dedupe(
             file_idx_col = sc.file_index_column_np(table)
             row_idx_col = sc.record_index_column_np(table)
            is_source_col = sc.is_source_column_np(table)
+            file_record_count_col = sc.file_record_count_column_np(table)
             for row_idx in range(len(table)):
                 src_dfl = DeltaFileLocator.of(
                     is_source_col[row_idx],
                     stream_position_col[row_idx],
                     file_idx_col[row_idx],
+                    file_record_count_col[row_idx],
                 )
                 # TODO(pdames): merge contiguous record number ranges
                 src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])
 
         logger.info(f"Finished all dedupe rounds...")
-        mat_bucket_to_src_file_record_count = defaultdict(dict)
         mat_bucket_to_src_file_records: Dict[
             MaterializeBucketIndex, DeltaFileLocatorToRecords
         ] = defaultdict(dict)
@@ -210,29 +211,30 @@ def _timed_dedupe(
             mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                 src_row_indices,
             )
-            mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                src_row_indices
-            )
 
         mat_bucket_to_dd_idx_obj_id: Dict[
             MaterializeBucketIndex, DedupeTaskIndexWithObjectId
         ] = {}
         for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-            object_ref = ray.put(src_file_records)
-            pickled_object_ref = cloudpickle.dumps(object_ref)
+            object_ref = object_store.put(src_file_records)
             mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                 dedupe_task_index,
-                pickled_object_ref,
+                object_ref,
             )
             del object_ref
-            del pickled_object_ref
         logger.info(
             f"Count of materialize buckets with object refs: "
             f"{len(mat_bucket_to_dd_idx_obj_id)}"
         )
 
+        peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+
         return DedupeResult(
-            mat_bucket_to_dd_idx_obj_id, np.int64(total_deduped_records)
+            mat_bucket_to_dd_idx_obj_id,
+            np.int64(total_deduped_records),
+            np.double(peak_memory_usage_bytes),
+            np.double(0.0),
+            np.double(time.time()),
        )
 
 
@@ -244,6 +246,7 @@ def dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
+    object_store: Optional[IObjectStore],
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -253,11 +256,24 @@ def dedupe(
         num_materialize_buckets=num_materialize_buckets,
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
+        object_store=object_store,
     )
+
+    emit_metrics_time = 0.0
     if metrics_config:
-        emit_timer_metrics(
-            metrics_name="dedupe", value=duration, metrics_config=metrics_config
+        emit_result, latency = timed_invocation(
+            func=emit_timer_metrics,
+            metrics_name="dedupe",
+            value=duration,
+            metrics_config=metrics_config,
         )
+        emit_metrics_time = latency
 
     logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
-    return dedupe_result
+    return DedupeResult(
+        dedupe_result[0],
+        dedupe_result[1],
+        dedupe_result[2],
+        np.double(emit_metrics_time),
+        dedupe_result[4],
+    )
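
The dedupe step no longer pickles Ray object refs by hand; it delegates to the pluggable object store introduced under deltacat/io. A sketch of the put/get_many round trip used above, assuming the new module exposes a RayPlasmaObjectStore class (the interface methods are taken from this diff, the class name is inferred from the new file's name):

    import ray

    from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

    ray.init(ignore_reinit_error=True)
    object_store = RayPlasmaObjectStore()

    ref = object_store.put({"hb_0": [1, 2, 3]})  # store one payload, get a reference back
    payloads = object_store.get_many([ref])      # resolve a batch of references at once
    assert payloads == [{"hb_0": [1, 2, 3]}]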

deltacat/compute/compactor/steps/hash_bucket.py
@@ -1,5 +1,6 @@
 import importlib
 import logging
+import time
 from contextlib import nullcontext
 from itertools import chain
 from typing import Generator, List, Optional, Tuple
@@ -30,6 +31,8 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
 if importlib.util.find_spec("memray"):
     import memray
@@ -114,11 +117,12 @@ def _group_file_records_by_pk_hash_bucket(
                 hb_to_delta_file_envelopes[hb] = []
             hb_to_delta_file_envelopes[hb].append(
                 DeltaFileEnvelope.of(
-                    dfe.stream_position,
-                    dfe.file_index,
-                    dfe.delta_type,
-                    table,
-                    is_src_delta,
+                    stream_position=dfe.stream_position,
+                    file_index=dfe.file_index,
+                    delta_type=dfe.delta_type,
+                    table=table,
+                    is_src_delta=is_src_delta,
+                    file_record_count=dfe.file_record_count,
                 )
             )
     return hb_to_delta_file_envelopes, total_record_count
@@ -157,10 +161,11 @@ def _read_delta_file_envelopes(
     for i, table in enumerate(tables):
         total_record_count += len(table)
         delta_file = DeltaFileEnvelope.of(
-            annotations[i].annotation_stream_position,
-            annotations[i].annotation_file_index,
-            annotations[i].annotation_delta_type,
-            table,
+            stream_position=annotations[i].annotation_stream_position,
+            file_index=annotations[i].annotation_file_index,
+            delta_type=annotations[i].annotation_delta_type,
+            table=table,
+            file_record_count=len(table),
         )
         delta_file_envelopes.append(delta_file)
     return delta_file_envelopes, total_record_count
@@ -175,6 +180,7 @@ def _timed_hash_bucket(
     num_groups: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
     task_id = get_current_ray_task_id()
@@ -203,12 +209,16 @@ def _timed_hash_bucket(
             deltacat_storage,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-            delta_file_envelope_groups,
-            num_buckets,
-            num_groups,
+            delta_file_envelope_groups, num_buckets, num_groups, object_store
         )
+
+        peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
         return HashBucketResult(
-            hash_bucket_group_to_obj_id, np.int64(total_record_count)
+            hash_bucket_group_to_obj_id,
+            np.int64(total_record_count),
+            np.double(peak_memory_usage_bytes),
+            np.double(0.0),
+            np.double(time.time()),
         )
 
 
@@ -223,6 +233,7 @@ def hash_bucket(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> HashBucketResult:
 
@@ -237,11 +248,25 @@ def hash_bucket(
         num_groups=num_groups,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )
+
+    emit_metrics_time = 0.0
     if metrics_config:
-        emit_timer_metrics(
-            metrics_name="hash_bucket", value=duration, metrics_config=metrics_config
+        emit_result, latency = timed_invocation(
+            func=emit_timer_metrics,
+            metrics_name="hash_bucket",
+            value=duration,
+            metrics_config=metrics_config,
        )
+        emit_metrics_time = latency
+
     logger.info(f"Finished hash bucket task...")
-    return hash_bucket_result
+    return HashBucketResult(
+        hash_bucket_result[0],
+        hash_bucket_result[1],
+        hash_bucket_result[2],
+        np.double(emit_metrics_time),
+        hash_bucket_result[4],
+    )
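
Both task entry points now wrap emit_timer_metrics in timed_invocation so that the cost of emitting metrics is itself captured and reported as telemetry_time_in_seconds. A sketch of that pattern with a stand-in metric emitter (emit_example_metric is hypothetical; the func/keyword-argument usage of timed_invocation follows this diff):

    import time

    from deltacat.utils.performance import timed_invocation


    def emit_example_metric(metrics_name: str, value: float) -> None:
        # Stand-in for emit_timer_metrics; simulates the emission work being timed.
        time.sleep(0.01)


    emit_result, latency = timed_invocation(
        func=emit_example_metric,
        metrics_name="hash_bucket",
        value=1.23,
    )
    emit_metrics_time = latency  # reported back as the result tuple's telemetry field
    print(f"metric emission took {emit_metrics_time:.4f}s")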