deltacat 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "1.1.0"
+ __version__ = "1.1.2"
 
 
  __all__ = [
deltacat/aws/clients.py CHANGED
@@ -4,6 +4,7 @@ from typing import Optional
  from http import HTTPStatus
 
  import boto3
+ from botocore.exceptions import CredentialRetrievalError
  from boto3.exceptions import ResourceNotExistsError
  from boto3.resources.base import ServiceResource
  from botocore.client import BaseClient
@@ -15,6 +16,8 @@ from tenacity import (
      wait_fixed,
      retry_if_exception,
      stop_after_delay,
+     retry_if_exception_type,
+     wait_random_exponential,
  )
 
  from deltacat import logs
@@ -37,6 +40,13 @@ RETRYABLE_HTTP_STATUS_CODES = [
      HTTPStatus.GATEWAY_TIMEOUT,
  ]
 
+ boto_retry_wrapper = Retrying(
+     wait=wait_random_exponential(multiplier=1, max=10),
+     stop=stop_after_delay(60 * 5),
+     # CredentialRetrievalError can still be thrown due to throttling, even if IMDS health checks succeed.
+     retry=retry_if_exception_type(CredentialRetrievalError),
+ )
+
 
  class RetryIfRetryableHTTPStatusCode(retry_if_exception):
      """
@@ -183,10 +193,10 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
  def resource_cache(name: str, region: Optional[str], **kwargs) -> ServiceResource:
      # we don't use the @lru_cache decorator because Ray can't pickle it
      cached_function = lru_cache()(_resource)
-     return cached_function(name, region, **kwargs)
+     return boto_retry_wrapper(cached_function, name, region, **kwargs)
 
 
  def client_cache(name: str, region: Optional[str], **kwargs) -> BaseClient:
      # we don't use the @lru_cache decorator because Ray can't pickle it
      cached_function = lru_cache()(_client)
-     return cached_function(name, region, **kwargs)
+     return boto_retry_wrapper(cached_function, name, region, **kwargs)
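Note: the new `boto_retry_wrapper` works because tenacity's `Retrying` object is itself callable with a function plus its arguments. A minimal standalone sketch of the same pattern (the `flaky_credentials_call` below is a hypothetical stand-in, not deltacat code):

```python
from botocore.exceptions import CredentialRetrievalError
from tenacity import (
    Retrying,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)

# Retry only on credential-retrieval errors, with jittered exponential backoff
# capped at 10s between attempts and 30s of total retrying.
retry_wrapper = Retrying(
    wait=wait_random_exponential(multiplier=1, max=10),
    stop=stop_after_delay(30),
    retry=retry_if_exception_type(CredentialRetrievalError),
)


def flaky_credentials_call(region: str) -> str:
    # Hypothetical stand-in for the cached boto3 client/resource factory,
    # which can raise CredentialRetrievalError when IMDS is throttled.
    return f"client-for-{region}"


# A Retrying instance is callable: pass the function and its arguments, and
# the call is re-attempted only for the configured exception type.
client = retry_wrapper(flaky_credentials_call, "us-east-1")
```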
deltacat/aws/constants.py CHANGED
@@ -3,6 +3,10 @@ from typing import List
  from deltacat.utils.common import env_integer, env_string
 
  DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
- BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 15)
+ BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
  TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
  AWS_REGION = env_string("AWS_REGION", "us-east-1")
+
+ # Metric Names
+ DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX = "download_manifest_entry"
+ UPLOAD_SLICED_TABLE_METRIC_PREFIX = "upload_sliced_table"
deltacat/aws/s3u.py CHANGED
@@ -25,7 +25,11 @@ from tenacity import (
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
  import deltacat.aws.clients as aws_utils
  from deltacat import logs
- from deltacat.aws.constants import TIMEOUT_ERROR_CODES
+ from deltacat.aws.constants import (
+     TIMEOUT_ERROR_CODES,
+     DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX,
+     UPLOAD_SLICED_TABLE_METRIC_PREFIX,
+ )
  from deltacat.exceptions import NonRetryableError, RetryableError
  from deltacat.storage import (
      DistributedDataset,
@@ -50,6 +54,7 @@ from deltacat.types.tables import (
  )
  from deltacat.types.partial_download import PartialFileDownloadParams
  from deltacat.utils.common import ReadKwargsProvider
+ from deltacat.utils.metrics import metrics
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -238,6 +243,7 @@ def read_file(
          raise e
 
 
+ @metrics(prefix=UPLOAD_SLICED_TABLE_METRIC_PREFIX)
  def upload_sliced_table(
      table: Union[LocalTable, DistributedDataset],
      s3_url_prefix: str,
@@ -346,6 +352,7 @@ def upload_table(
      return manifest_entries
 
 
+ @metrics(prefix=DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX)
  def download_manifest_entry(
      manifest_entry: ManifestEntry,
      token_holder: Optional[Dict[str, Any]] = None,
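The `@metrics(prefix=...)` decorator added above comes from `deltacat.utils.metrics`; its implementation is not part of this diff. As a rough, simplified illustration only (not deltacat's actual decorator), a prefix-based metrics decorator of this kind times the call and emits counters under names derived from the prefix:

```python
import functools
import time
from typing import Any, Callable


def emit_metric(name: str, value: float) -> None:
    # Placeholder sink; a real implementation would forward to the
    # configured metrics target.
    print(f"{name}={value}")


def metrics_sketch(prefix: str) -> Callable:
    """Simplified stand-in for a prefix-based metrics decorator."""

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            start = time.monotonic()
            try:
                result = func(*args, **kwargs)
                emit_metric(f"{prefix}_success_count", 1)
                return result
            except BaseException:
                emit_metric(f"{prefix}_failure_count", 1)
                raise
            finally:
                emit_metric(f"{prefix}_time", time.monotonic() - start)

        return wrapper

    return decorator


@metrics_sketch(prefix="download_manifest_entry")
def download(entry_id: str) -> str:
    return f"downloaded-{entry_id}"
```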
@@ -20,6 +20,7 @@ from deltacat.compute.compactor_v2.constants import (
      AVERAGE_RECORD_SIZE_BYTES,
      TASK_MAX_PARALLELISM,
      DROP_DUPLICATES,
+     TOTAL_MEMORY_BUFFER_PERCENTAGE,
  )
  from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
  from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -85,12 +86,17 @@ class CompactPartitionParams(dict):
          result.average_record_size_bytes = params.get(
              "average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
          )
+         result.total_memory_buffer_percentage = params.get(
+             "total_memory_buffer_percentage", TOTAL_MEMORY_BUFFER_PERCENTAGE
+         )
          result.hash_group_count = params.get(
              "hash_group_count", result.hash_bucket_count
          )
          result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
          result.ray_custom_resources = params.get("ray_custom_resources")
 
+         result.memory_logs_enabled = params.get("memory_logs_enabled", False)
+
          result.metrics_config = params.get("metrics_config")
 
          if not importlib.util.find_spec("memray"):
@@ -190,6 +196,16 @@ class CompactPartitionParams(dict):
      def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
          self["average_record_size_bytes"] = average_record_size_bytes
 
+     @property
+     def total_memory_buffer_percentage(self) -> int:
+         return self["total_memory_buffer_percentage"]
+
+     @total_memory_buffer_percentage.setter
+     def total_memory_buffer_percentage(
+         self, total_memory_buffer_percentage: int
+     ) -> None:
+         self["total_memory_buffer_percentage"] = total_memory_buffer_percentage
+
      @property
      def min_files_in_batch(self) -> float:
          return self["min_files_in_batch"]
@@ -355,6 +371,14 @@ class CompactPartitionParams(dict):
      def sort_keys(self, keys: List[SortKey]) -> None:
          self["sort_keys"] = keys
 
+     @property
+     def memory_logs_enabled(self) -> bool:
+         return self.get("memory_logs_enabled")
+
+     @memory_logs_enabled.setter
+     def memory_logs_enabled(self, value: bool) -> None:
+         self["memory_logs_enabled"] = value
+
      @property
      def metrics_config(self) -> Optional[MetricsConfig]:
          return self.get("metrics_config")
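Both new settings are plain keys on the dict-backed `CompactPartitionParams`. Assuming the usual `CompactPartitionParams.of(...)` constructor (only partially visible in this diff) and the usual import path, and leaving the other required compaction fields aside, they would be supplied roughly like this:

```python
# Hedged sketch: the key names come from the diff above; the import path,
# the of() constructor, and the omitted required fields are assumptions.
from deltacat.compute.compactor.model.compact_partition_params import (
    CompactPartitionParams,
)

params = CompactPartitionParams.of(
    {
        # ... other required compaction parameters ...
        "total_memory_buffer_percentage": 30,  # extra headroom when sizing tasks
        "memory_logs_enabled": True,  # opt in to periodic peak-memory logging
    }
)

assert params.memory_logs_enabled is True
assert params.total_memory_buffer_percentage == 30
```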
@@ -84,6 +84,13 @@ class CompactionSessionAuditInfo(dict):
          """
          return self.get("recordsDeduped")
 
+     @property
+     def records_deleted(self) -> int:
+         """
+         The total count of deleted records in a compaction session if delete deltas are present.
+         """
+         return self.get("recordsDeleted")
+
      @property
      def input_size_bytes(self) -> float:
          """
@@ -461,6 +468,10 @@
          self["recordsDeduped"] = records_deduped
          return self
 
+     def set_records_deleted(self, records_deleted: int) -> CompactionSessionAuditInfo:
+         self["recordsDeleted"] = records_deleted
+         return self
+
      def set_input_size_bytes(
          self, input_size_bytes: float
      ) -> CompactionSessionAuditInfo:
@@ -62,8 +62,10 @@ from deltacat.utils.resources import (
  from deltacat.compute.compactor_v2.utils.task_options import (
      hash_bucket_resource_options_provider,
      merge_resource_options_provider,
+     local_merge_resource_options_provider,
  )
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+ from deltacat.utils.metrics import MetricsActor, METRICS_CONFIG_ACTOR_NAME
 
  if importlib.util.find_spec("memray"):
      import memray
@@ -117,6 +119,15 @@ def _execute_compaction(
      params: CompactPartitionParams, **kwargs
  ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
 
+     if params.metrics_config:
+         logger.info(
+             f"Setting metrics config with target: {params.metrics_config.metrics_target}"
+         )
+         metrics_actor = MetricsActor.options(
+             name=METRICS_CONFIG_ACTOR_NAME, get_if_exists=True
+         ).remote()
+         ray.get(metrics_actor.set_metrics_config.remote(params.metrics_config))
+
      rcf_source_partition_locator = (
          params.rebase_source_partition_locator or params.source_partition_locator
      )
@@ -258,8 +269,10 @@ def _execute_compaction(
          resource_amount_provider=hash_bucket_resource_options_provider,
          previous_inflation=params.previous_inflation,
          average_record_size_bytes=params.average_record_size_bytes,
+         total_memory_buffer_percentage=params.total_memory_buffer_percentage,
          primary_keys=params.primary_keys,
          ray_custom_resources=params.ray_custom_resources,
+         memory_logs_enabled=params.memory_logs_enabled,
      )
 
      total_input_records_count = np.int64(0)
@@ -275,7 +288,29 @@ def _execute_compaction(
              delete_strategy,
              delete_file_envelopes,
          )
-         local_merge_result = ray.get(mg.merge.remote(local_merge_input))
+         estimated_da_bytes = (
+             compaction_audit.estimated_in_memory_size_bytes_during_discovery
+         )
+         estimated_num_records = sum(
+             [
+                 entry.meta.record_count
+                 for delta in uniform_deltas
+                 for entry in delta.manifest.entries
+             ]
+         )
+         local_merge_options = local_merge_resource_options_provider(
+             estimated_da_size=estimated_da_bytes,
+             estimated_num_rows=estimated_num_records,
+             total_memory_buffer_percentage=params.total_memory_buffer_percentage,
+             round_completion_info=round_completion_info,
+             compacted_delta_manifest=previous_compacted_delta_manifest,
+             ray_custom_resources=params.ray_custom_resources,
+             primary_keys=params.primary_keys,
+             memory_logs_enabled=params.memory_logs_enabled,
+         )
+         local_merge_result = ray.get(
+             mg.merge.options(**local_merge_options).remote(local_merge_input)
+         )
          total_input_records_count += local_merge_result.input_record_count
          merge_results = [local_merge_result]
          merge_invoke_end = time.monotonic()
@@ -296,6 +331,7 @@ def _execute_compaction(
              object_store=params.object_store,
              deltacat_storage=params.deltacat_storage,
              deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+             memory_logs_enabled=params.memory_logs_enabled,
          )
      }
 
@@ -382,12 +418,14 @@ def _execute_compaction(
          num_hash_groups=params.hash_group_count,
          hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
          hash_group_num_rows=all_hash_group_idx_to_num_rows,
+         total_memory_buffer_percentage=params.total_memory_buffer_percentage,
          round_completion_info=round_completion_info,
          compacted_delta_manifest=previous_compacted_delta_manifest,
          primary_keys=params.primary_keys,
          deltacat_storage=params.deltacat_storage,
          deltacat_storage_kwargs=params.deltacat_storage_kwargs,
          ray_custom_resources=params.ray_custom_resources,
+         memory_logs_enabled=params.memory_logs_enabled,
      )
 
      def merge_input_provider(index, item):
@@ -417,6 +455,7 @@ def _execute_compaction(
              deltacat_storage_kwargs=params.deltacat_storage_kwargs,
              delete_strategy=delete_strategy,
              delete_file_envelopes=delete_file_envelopes,
+             memory_logs_enabled=params.memory_logs_enabled,
          )
      }
 
@@ -438,11 +477,11 @@ def _execute_compaction(
      merge_end = time.monotonic()
 
      total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
-     total_dropped_record_count = sum(
+     total_deleted_record_count = sum(
          [ddr.deleted_record_count for ddr in merge_results]
      )
      logger.info(
-         f"Deduped {total_dd_record_count} records and dropped {total_dropped_record_count} records..."
+         f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
      )
 
      compaction_audit.set_input_records(total_input_records_count.item())
@@ -456,7 +495,7 @@ def _execute_compaction(
      )
 
      compaction_audit.set_records_deduped(total_dd_record_count.item())
-
+     compaction_audit.set_records_deleted(total_deleted_record_count.item())
      mat_results = []
      for merge_result in merge_results:
          mat_results.extend(merge_result.materialize_results)
@@ -503,7 +542,7 @@ def _execute_compaction(
      record_info_msg = (
          f"Hash bucket records: {total_hb_record_count},"
          f" Deduped records: {total_dd_record_count}, "
-         f" Dropped records: {total_dropped_record_count}, "
+         f" Deleted records: {total_deleted_record_count}, "
          f" Materialized records: {merged_delta.meta.record_count}"
      )
      logger.info(record_info_msg)
@@ -603,7 +642,6 @@ def _execute_compaction(
          f"partition-{params.source_partition_locator.partition_values},"
          f"compacted at: {params.last_stream_position_to_compact},"
      )
-
      return (
          compacted_partition,
          new_round_completion_info,
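The metrics config is now handed to workers through a named Ray actor created with `get_if_exists=True`, so every task that looks it up by name shares one instance. The get-or-create named-actor pattern itself is standard Ray; a minimal sketch with a hypothetical `ConfigHolder` actor standing in for `MetricsActor`:

```python
import ray


@ray.remote
class ConfigHolder:
    """Hypothetical stand-in for deltacat's MetricsActor."""

    def __init__(self):
        self._config = None

    def set_config(self, config):
        self._config = config

    def get_config(self):
        return self._config


ray.init(ignore_reinit_error=True)

# options(name=..., get_if_exists=True) returns the already-registered actor
# if one exists under that name, otherwise it creates it; repeated lookups
# from other tasks resolve to the same instance.
holder = ConfigHolder.options(name="config_holder", get_if_exists=True).remote()
ray.get(holder.set_config.remote({"metrics_target": "noop"}))
print(ray.get(holder.get_config.remote()))
```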
@@ -40,3 +40,31 @@ DROP_DUPLICATES = True
  # This is the observed upper bound inflation for parquet
  # size in metadata to pyarrow table size.
  PARQUET_TO_PYARROW_INFLATION = 4
+
+ # Metric Names
+ # Time taken for a hash bucket task
+ HASH_BUCKET_TIME_IN_SECONDS = "hash_bucket_time"
+
+ # Hash bucket success count
+ HASH_BUCKET_SUCCESS_COUNT = "hash_bucket_success_count"
+
+ # Hash bucket failure count
+ HASH_BUCKET_FAILURE_COUNT = "hash_bucket_failure_count"
+
+ # Time taken for a merge task
+ MERGE_TIME_IN_SECONDS = "merge_time"
+
+ # Merge success count
+ MERGE_SUCCESS_COUNT = "merge_success_count"
+
+ # Merge failure count
+ MERGE_FAILURE_COUNT = "merge_failure_count"
+
+ # Metric prefix for discover deltas
+ DISCOVER_DELTAS_METRIC_PREFIX = "discover_deltas"
+
+ # Metric prefix for prepare deletes
+ PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
+
+ # Metric prefix for materialize
+ MATERIALIZE_METRIC_PREFIX = "delta_materialize"
@@ -23,6 +23,8 @@ from deltacat.storage import (
      Delta,
  )
  from deltacat import logs
+ from deltacat.utils.metrics import metrics
+ from deltacat.compute.compactor_v2.constants import PREPARE_DELETES_METRIC_PREFIX
 
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -115,6 +117,7 @@ def _get_delete_file_envelopes(
      return delete_file_envelopes
 
 
+ @metrics(prefix=PREPARE_DELETES_METRIC_PREFIX)
  def prepare_deletes(
      params: CompactPartitionParams,
      input_deltas: List[Delta],
@@ -22,6 +22,7 @@ class HashBucketInput(Dict):
          object_store: Optional[IObjectStore] = None,
          deltacat_storage=unimplemented_deltacat_storage,
          deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+         memory_logs_enabled: Optional[bool] = None,
      ) -> HashBucketInput:
 
          result = HashBucketInput()
@@ -36,6 +37,7 @@ class HashBucketInput(Dict):
          result["object_store"] = object_store
          result["deltacat_storage"] = deltacat_storage
          result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+         result["memory_logs_enabled"] = memory_logs_enabled
 
          return result
 
@@ -82,3 +84,7 @@ class HashBucketInput(Dict):
      @property
      def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
          return self.get("deltacat_storage_kwargs")
+
+     @property
+     def memory_logs_enabled(self) -> Optional[bool]:
+         return self.get("memory_logs_enabled")
@@ -46,6 +46,7 @@ class MergeInput(Dict):
          delete_file_envelopes: Optional[List] = None,
          deltacat_storage=unimplemented_deltacat_storage,
          deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+         memory_logs_enabled: Optional[bool] = None,
      ) -> MergeInput:
 
          result = MergeInput()
@@ -67,6 +68,7 @@ class MergeInput(Dict):
          result["delete_strategy"] = delete_strategy
          result["deltacat_storage"] = deltacat_storage
          result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+         result["memory_logs_enabled"] = memory_logs_enabled
          return result
 
      @property
@@ -133,6 +135,10 @@ class MergeInput(Dict):
      def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
          return self.get("deltacat_storage_kwargs")
 
+     @property
+     def memory_logs_enabled(self) -> Optional[bool]:
+         return self.get("memory_logs_enabled")
+
      @property
      def delete_file_envelopes(
          self,
@@ -25,12 +25,17 @@ from deltacat.utils.ray_utils.runtime import (
  )
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.performance import timed_invocation
- from deltacat.utils.metrics import emit_timer_metrics
+ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_metric
  from deltacat.utils.resources import (
      get_current_process_peak_memory_usage_in_bytes,
      ProcessUtilizationOverTimeRange,
  )
  from deltacat.constants import BYTES_PER_GIBIBYTE
+ from deltacat.compute.compactor_v2.constants import (
+     HASH_BUCKET_TIME_IN_SECONDS,
+     HASH_BUCKET_FAILURE_COUNT,
+     HASH_BUCKET_SUCCESS_COUNT,
+ )
 
  if importlib.util.find_spec("memray"):
      import memray
@@ -91,6 +96,8 @@ def _group_file_records_by_pk_hash_bucket(
      return hb_to_delta_file_envelopes, total_record_count, total_size_bytes
 
 
+ @success_metric(name=HASH_BUCKET_SUCCESS_COUNT)
+ @failure_metric(name=HASH_BUCKET_FAILURE_COUNT)
  def _timed_hash_bucket(input: HashBucketInput):
      task_id = get_current_ray_task_id()
      worker_id = get_current_ray_worker_id()
@@ -142,7 +149,8 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
              f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
          )
 
-         process_util.schedule_callback(log_peak_memory, 10)
+         if input.memory_logs_enabled:
+             process_util.schedule_callback(log_peak_memory, 10)
 
          hash_bucket_result, duration = timed_invocation(
              func=_timed_hash_bucket, input=input
@@ -152,7 +160,7 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
          if input.metrics_config:
              emit_result, latency = timed_invocation(
                  func=emit_timer_metrics,
-                 metrics_name="hash_bucket",
+                 metrics_name=HASH_BUCKET_TIME_IN_SECONDS,
                  value=duration,
                  metrics_config=input.metrics_config,
              )
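Both the hash bucket and merge steps measure their work with `timed_invocation` and then report the duration under the new metric-name constants. Its implementation isn't shown in this diff, but its observable contract here is simply "call the function and return its result together with the elapsed seconds", which can be sketched as:

```python
import time
from typing import Any, Callable, Tuple


def timed_invocation_sketch(func: Callable, *args: Any, **kwargs: Any) -> Tuple[Any, float]:
    # Simplified stand-in for deltacat's timed_invocation helper:
    # invoke func and return (result, wall-clock duration in seconds).
    start = time.monotonic()
    result = func(*args, **kwargs)
    return result, time.monotonic() - start


result, duration = timed_invocation_sketch(sum, [1, 2, 3])
print(result, duration)
```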
@@ -12,6 +12,7 @@ from uuid import uuid4
  from deltacat import logs
  from typing import Callable, Iterator, List, Optional, Tuple
  from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+ from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
  from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope
@@ -23,7 +24,7 @@ from deltacat.utils.ray_utils.runtime import (
  )
  from deltacat.compute.compactor.utils import system_columns as sc
  from deltacat.utils.performance import timed_invocation
- from deltacat.utils.metrics import emit_timer_metrics
+ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_metric
  from deltacat.utils.resources import (
      get_current_process_peak_memory_usage_in_bytes,
      ProcessUtilizationOverTimeRange,
@@ -41,6 +42,11 @@ from deltacat.storage import (
  )
  from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
  from deltacat.constants import BYTES_PER_GIBIBYTE
+ from deltacat.compute.compactor_v2.constants import (
+     MERGE_TIME_IN_SECONDS,
+     MERGE_SUCCESS_COUNT,
+     MERGE_FAILURE_COUNT,
+ )
 
 
  if importlib.util.find_spec("memray"):
@@ -269,6 +275,24 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
      )
 
 
+ def _can_copy_by_reference(
+     has_delete: bool, merge_file_group: MergeFileGroup, input: MergeInput
+ ) -> bool:
+     """
+     Can copy by reference only if there are no deletes to merge in
+     and previous compacted stream id matches that of new stream
+     """
+     return (
+         not has_delete
+         and not merge_file_group.dfe_groups
+         and input.round_completion_info is not None
+         and (
+             input.write_to_partition.stream_id
+             == input.round_completion_info.compacted_delta_locator.stream_id
+         )
+     )
+
+
  def _flatten_dfe_list(
      df_envelopes_list: List[List[DeltaFileEnvelope]],
  ) -> List[DeltaFileEnvelope]:
@@ -349,7 +373,7 @@ def _compact_tables(
          1. The compacted PyArrow table.
          2. The total number of records in the incremental data.
          3. The total number of deduplicated records.
-         4. The total number of dropped records due to DELETE operations.
+         4. The total number of deleted records due to DELETE operations.
      """
      df_envelopes: List[DeltaFileEnvelope] = _flatten_dfe_list(dfe_list)
      delete_file_envelopes = input.delete_file_envelopes or []
@@ -460,6 +484,8 @@ def _copy_manifests_from_hash_bucketing(
      return materialized_results
 
 
+ @success_metric(name=MERGE_SUCCESS_COUNT)
+ @failure_metric(name=MERGE_FAILURE_COUNT)
  def _timed_merge(input: MergeInput) -> MergeResult:
      task_id = get_current_ray_task_id()
      worker_id = get_current_ray_worker_id()
@@ -479,10 +505,12 @@ def _timed_merge(input: MergeInput) -> MergeResult:
              assert (
                  input.delete_strategy is not None
              ), "Merge input missing delete_strategy"
-         if not has_delete and not merge_file_group.dfe_groups:
-             # Can copy by reference only if there are no deletes to merge in
+         if _can_copy_by_reference(
+             has_delete=has_delete, merge_file_group=merge_file_group, input=input
+         ):
              hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
              continue
+
          if _has_previous_compacted_table(input, merge_file_group.hb_index):
              compacted_table = _download_compacted_table(
                  hb_index=merge_file_group.hb_index,
@@ -548,7 +576,8 @@ def merge(input: MergeInput) -> MergeResult:
              f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
          )
 
-         process_util.schedule_callback(log_peak_memory, 10)
+         if input.memory_logs_enabled:
+             process_util.schedule_callback(log_peak_memory, 10)
 
          merge_result, duration = timed_invocation(func=_timed_merge, input=input)
 
@@ -556,7 +585,7 @@
          if input.metrics_config:
              emit_result, latency = timed_invocation(
                  func=emit_timer_metrics,
-                 metrics_name="merge",
+                 metrics_name=MERGE_TIME_IN_SECONDS,
                  value=duration,
                  metrics_config=input.metrics_config,
              )
@@ -23,10 +23,13 @@ from deltacat.compute.compactor_v2.utils.task_options import (
  from deltacat.compute.compactor_v2.utils.content_type_params import (
      append_content_type_params,
  )
+ from deltacat.utils.metrics import metrics
+ from deltacat.compute.compactor_v2.constants import DISCOVER_DELTAS_METRIC_PREFIX
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+ @metrics(prefix=DISCOVER_DELTAS_METRIC_PREFIX)
  def discover_deltas(
      source_partition_locator: PartitionLocator,
      last_stream_position_to_compact: int,
@@ -31,11 +31,14 @@ from deltacat.compute.compactor_v2.deletes.delete_strategy import (
  from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
      DeleteFileEnvelope,
  )
+ from deltacat.utils.metrics import metrics
+ from deltacat.compute.compactor_v2.constants import MATERIALIZE_METRIC_PREFIX
 
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+ @metrics(prefix=MATERIALIZE_METRIC_PREFIX)
  def materialize(
      input: MergeInput,
      task_index: int,