deltacat 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only and reflects the changes between those published versions.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +12 -2
- deltacat/aws/constants.py +5 -1
- deltacat/aws/s3u.py +8 -1
- deltacat/compute/compactor/model/compact_partition_params.py +24 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
- deltacat/compute/compactor_v2/compaction_session.py +44 -6
- deltacat/compute/compactor_v2/constants.py +28 -0
- deltacat/compute/compactor_v2/deletes/utils.py +3 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +11 -3
- deltacat/compute/compactor_v2/steps/merge.py +35 -6
- deltacat/compute/compactor_v2/utils/io.py +3 -0
- deltacat/compute/compactor_v2/utils/merge.py +3 -0
- deltacat/compute/compactor_v2/utils/task_options.py +94 -8
- deltacat/io/memcached_object_store.py +20 -0
- deltacat/logs.py +29 -2
- deltacat/tests/compute/test_compact_partition_params.py +5 -0
- deltacat/tests/io/test_memcached_object_store.py +19 -0
- deltacat/tests/utils/test_metrics.py +575 -0
- deltacat/utils/metrics.py +158 -23
- deltacat/utils/resources.py +5 -3
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/METADATA +1 -1
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/RECORD +28 -27
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.0.dist-info → deltacat-1.1.2.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/clients.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Optional
 from http import HTTPStatus

 import boto3
+from botocore.exceptions import CredentialRetrievalError
 from boto3.exceptions import ResourceNotExistsError
 from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
@@ -15,6 +16,8 @@ from tenacity import (
     wait_fixed,
     retry_if_exception,
     stop_after_delay,
+    retry_if_exception_type,
+    wait_random_exponential,
 )

 from deltacat import logs
@@ -37,6 +40,13 @@ RETRYABLE_HTTP_STATUS_CODES = [
     HTTPStatus.GATEWAY_TIMEOUT,
 ]

+boto_retry_wrapper = Retrying(
+    wait=wait_random_exponential(multiplier=1, max=10),
+    stop=stop_after_delay(60 * 5),
+    # CredentialRetrievalError can still be thrown due to throttling, even if IMDS health checks succeed.
+    retry=retry_if_exception_type(CredentialRetrievalError),
+)
+

 class RetryIfRetryableHTTPStatusCode(retry_if_exception):
     """
@@ -183,10 +193,10 @@ def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
 def resource_cache(name: str, region: Optional[str], **kwargs) -> ServiceResource:
     # we don't use the @lru_cache decorator because Ray can't pickle it
     cached_function = lru_cache()(_resource)
-    return cached_function
+    return boto_retry_wrapper(cached_function, name, region, **kwargs)


 def client_cache(name: str, region: Optional[str], **kwargs) -> BaseClient:
     # we don't use the @lru_cache decorator because Ray can't pickle it
     cached_function = lru_cache()(_client)
-    return cached_function
+    return boto_retry_wrapper(cached_function, name, region, **kwargs)
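The new `boto_retry_wrapper` above wraps the cached boto3 client/resource factories so that `CredentialRetrievalError` (which can surface under IMDS throttling) is retried with capped exponential backoff. A minimal, self-contained sketch of the same tenacity pattern; the `make_client` helper below is an illustrative stand-in, not a deltacat API:

```python
import boto3
from botocore.exceptions import CredentialRetrievalError
from tenacity import (
    Retrying,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)

retryer = Retrying(
    wait=wait_random_exponential(multiplier=1, max=10),  # exponential backoff, capped at 10s per wait
    stop=stop_after_delay(60 * 5),                       # give up after 5 minutes total
    retry=retry_if_exception_type(CredentialRetrievalError),
)

def make_client(name: str, region: str):
    # stand-in for the lru_cache-wrapped boto3 factory that boto_retry_wrapper calls
    return boto3.client(name, region_name=region)

# Retrying is callable: it invokes the wrapped callable and re-runs it on matching exceptions.
s3_client = retryer(make_client, "s3", "us-east-1")
```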
deltacat/aws/constants.py
CHANGED
@@ -3,6 +3,10 @@ from typing import List
 from deltacat.utils.common import env_integer, env_string

 DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
-BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES",
+BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
 TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
 AWS_REGION = env_string("AWS_REGION", "us-east-1")
+
+# Metric Names
+DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX = "download_manifest_entry"
+UPLOAD_SLICED_TABLE_METRIC_PREFIX = "upload_sliced_table"
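`BOTO_MAX_RETRIES` now defaults to 5 but continues to honor the environment variable of the same name. A rough sketch of how an env-driven integer constant like this typically resolves; the real helper lives in `deltacat.utils.common.env_integer`, so this reimplementation is purely illustrative:

```python
import os

def env_integer(name: str, default: int) -> int:
    # environment variable wins when set; otherwise fall back to the hard-coded default
    value = os.environ.get(name)
    return int(value) if value is not None else default

BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)  # e.g. export BOTO_MAX_RETRIES=10 to override
```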
deltacat/aws/s3u.py
CHANGED
@@ -25,7 +25,11 @@ from tenacity import (
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.aws.constants import
+from deltacat.aws.constants import (
+    TIMEOUT_ERROR_CODES,
+    DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX,
+    UPLOAD_SLICED_TABLE_METRIC_PREFIX,
+)
 from deltacat.exceptions import NonRetryableError, RetryableError
 from deltacat.storage import (
     DistributedDataset,
@@ -50,6 +54,7 @@ from deltacat.types.tables import (
 )
 from deltacat.types.partial_download import PartialFileDownloadParams
 from deltacat.utils.common import ReadKwargsProvider
+from deltacat.utils.metrics import metrics

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -238,6 +243,7 @@ def read_file(
         raise e


+@metrics(prefix=UPLOAD_SLICED_TABLE_METRIC_PREFIX)
 def upload_sliced_table(
     table: Union[LocalTable, DistributedDataset],
     s3_url_prefix: str,
@@ -346,6 +352,7 @@ def upload_table(
     return manifest_entries


+@metrics(prefix=DOWNLOAD_MANIFEST_ENTRY_METRIC_PREFIX)
 def download_manifest_entry(
     manifest_entry: ManifestEntry,
     token_holder: Optional[Dict[str, Any]] = None,
deltacat/compute/compactor/model/compact_partition_params.py
CHANGED
@@ -20,6 +20,7 @@ from deltacat.compute.compactor_v2.constants import (
     AVERAGE_RECORD_SIZE_BYTES,
     TASK_MAX_PARALLELISM,
     DROP_DUPLICATES,
+    TOTAL_MEMORY_BUFFER_PERCENTAGE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -85,12 +86,17 @@ class CompactPartitionParams(dict):
         result.average_record_size_bytes = params.get(
             "average_record_size_bytes", AVERAGE_RECORD_SIZE_BYTES
         )
+        result.total_memory_buffer_percentage = params.get(
+            "total_memory_buffer_percentage", TOTAL_MEMORY_BUFFER_PERCENTAGE
+        )
         result.hash_group_count = params.get(
             "hash_group_count", result.hash_bucket_count
         )
         result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
         result.ray_custom_resources = params.get("ray_custom_resources")

+        result.memory_logs_enabled = params.get("memory_logs_enabled", False)
+
         result.metrics_config = params.get("metrics_config")

         if not importlib.util.find_spec("memray"):
@@ -190,6 +196,16 @@ class CompactPartitionParams(dict):
     def average_record_size_bytes(self, average_record_size_bytes: float) -> None:
         self["average_record_size_bytes"] = average_record_size_bytes

+    @property
+    def total_memory_buffer_percentage(self) -> int:
+        return self["total_memory_buffer_percentage"]
+
+    @total_memory_buffer_percentage.setter
+    def total_memory_buffer_percentage(
+        self, total_memory_buffer_percentage: int
+    ) -> None:
+        self["total_memory_buffer_percentage"] = total_memory_buffer_percentage
+
     @property
     def min_files_in_batch(self) -> float:
         return self["min_files_in_batch"]
@@ -355,6 +371,14 @@ class CompactPartitionParams(dict):
     def sort_keys(self, keys: List[SortKey]) -> None:
         self["sort_keys"] = keys

+    @property
+    def memory_logs_enabled(self) -> bool:
+        return self.get("memory_logs_enabled")
+
+    @memory_logs_enabled.setter
+    def memory_logs_enabled(self, value: bool) -> None:
+        self["memory_logs_enabled"] = value
+
     @property
     def metrics_config(self) -> Optional[MetricsConfig]:
         return self.get("metrics_config")
deltacat/compute/compactor/model/compaction_session_audit_info.py
CHANGED
@@ -84,6 +84,13 @@ class CompactionSessionAuditInfo(dict):
         """
         return self.get("recordsDeduped")

+    @property
+    def records_deleted(self) -> int:
+        """
+        The total count of deleted records in a compaction session if delete deltas are present.
+        """
+        return self.get("recordsDeleted")
+
     @property
     def input_size_bytes(self) -> float:
         """
@@ -461,6 +468,10 @@ class CompactionSessionAuditInfo(dict):
         self["recordsDeduped"] = records_deduped
         return self

+    def set_records_deleted(self, records_deleted: int) -> CompactionSessionAuditInfo:
+        self["recordsDeleted"] = records_deleted
+        return self
+
     def set_input_size_bytes(
         self, input_size_bytes: float
     ) -> CompactionSessionAuditInfo:
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -62,8 +62,10 @@ from deltacat.utils.resources import (
 from deltacat.compute.compactor_v2.utils.task_options import (
     hash_bucket_resource_options_provider,
     merge_resource_options_provider,
+    local_merge_resource_options_provider,
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.utils.metrics import MetricsActor, METRICS_CONFIG_ACTOR_NAME

 if importlib.util.find_spec("memray"):
     import memray
@@ -117,6 +119,15 @@ def _execute_compaction(
     params: CompactPartitionParams, **kwargs
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

+    if params.metrics_config:
+        logger.info(
+            f"Setting metrics config with target: {params.metrics_config.metrics_target}"
+        )
+        metrics_actor = MetricsActor.options(
+            name=METRICS_CONFIG_ACTOR_NAME, get_if_exists=True
+        ).remote()
+        ray.get(metrics_actor.set_metrics_config.remote(params.metrics_config))
+
     rcf_source_partition_locator = (
         params.rebase_source_partition_locator or params.source_partition_locator
     )
@@ -258,8 +269,10 @@ def _execute_compaction(
         resource_amount_provider=hash_bucket_resource_options_provider,
         previous_inflation=params.previous_inflation,
         average_record_size_bytes=params.average_record_size_bytes,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
         primary_keys=params.primary_keys,
         ray_custom_resources=params.ray_custom_resources,
+        memory_logs_enabled=params.memory_logs_enabled,
     )

     total_input_records_count = np.int64(0)
@@ -275,7 +288,29 @@ def _execute_compaction(
             delete_strategy,
             delete_file_envelopes,
         )
-
+        estimated_da_bytes = (
+            compaction_audit.estimated_in_memory_size_bytes_during_discovery
+        )
+        estimated_num_records = sum(
+            [
+                entry.meta.record_count
+                for delta in uniform_deltas
+                for entry in delta.manifest.entries
+            ]
+        )
+        local_merge_options = local_merge_resource_options_provider(
+            estimated_da_size=estimated_da_bytes,
+            estimated_num_rows=estimated_num_records,
+            total_memory_buffer_percentage=params.total_memory_buffer_percentage,
+            round_completion_info=round_completion_info,
+            compacted_delta_manifest=previous_compacted_delta_manifest,
+            ray_custom_resources=params.ray_custom_resources,
+            primary_keys=params.primary_keys,
+            memory_logs_enabled=params.memory_logs_enabled,
+        )
+        local_merge_result = ray.get(
+            mg.merge.options(**local_merge_options).remote(local_merge_input)
+        )
         total_input_records_count += local_merge_result.input_record_count
         merge_results = [local_merge_result]
         merge_invoke_end = time.monotonic()
@@ -296,6 +331,7 @@ def _execute_compaction(
                 object_store=params.object_store,
                 deltacat_storage=params.deltacat_storage,
                 deltacat_storage_kwargs=params.deltacat_storage_kwargs,
+                memory_logs_enabled=params.memory_logs_enabled,
             )
         }

@@ -382,12 +418,14 @@ def _execute_compaction(
         num_hash_groups=params.hash_group_count,
         hash_group_size_bytes=all_hash_group_idx_to_size_bytes,
         hash_group_num_rows=all_hash_group_idx_to_num_rows,
+        total_memory_buffer_percentage=params.total_memory_buffer_percentage,
         round_completion_info=round_completion_info,
         compacted_delta_manifest=previous_compacted_delta_manifest,
         primary_keys=params.primary_keys,
         deltacat_storage=params.deltacat_storage,
         deltacat_storage_kwargs=params.deltacat_storage_kwargs,
         ray_custom_resources=params.ray_custom_resources,
+        memory_logs_enabled=params.memory_logs_enabled,
     )

     def merge_input_provider(index, item):
@@ -417,6 +455,7 @@ def _execute_compaction(
                 deltacat_storage_kwargs=params.deltacat_storage_kwargs,
                 delete_strategy=delete_strategy,
                 delete_file_envelopes=delete_file_envelopes,
+                memory_logs_enabled=params.memory_logs_enabled,
             )
         }

@@ -438,11 +477,11 @@ def _execute_compaction(
     merge_end = time.monotonic()

     total_dd_record_count = sum([ddr.deduped_record_count for ddr in merge_results])
-
+    total_deleted_record_count = sum(
         [ddr.deleted_record_count for ddr in merge_results]
     )
     logger.info(
-        f"Deduped {total_dd_record_count} records and
+        f"Deduped {total_dd_record_count} records and deleted {total_deleted_record_count} records..."
     )

     compaction_audit.set_input_records(total_input_records_count.item())
@@ -456,7 +495,7 @@ def _execute_compaction(
     )

     compaction_audit.set_records_deduped(total_dd_record_count.item())
-
+    compaction_audit.set_records_deleted(total_deleted_record_count.item())
     mat_results = []
     for merge_result in merge_results:
         mat_results.extend(merge_result.materialize_results)
@@ -503,7 +542,7 @@ def _execute_compaction(
     record_info_msg = (
         f"Hash bucket records: {total_hb_record_count},"
         f" Deduped records: {total_dd_record_count}, "
-        f"
+        f" Deleted records: {total_deleted_record_count}, "
         f" Materialized records: {merged_delta.meta.record_count}"
     )
     logger.info(record_info_msg)
@@ -603,7 +642,6 @@ def _execute_compaction(
         f"partition-{params.source_partition_locator.partition_values},"
         f"compacted at: {params.last_stream_position_to_compact},"
     )
-
     return (
         compacted_partition,
         new_round_completion_info,
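The session now publishes its metrics config through a named Ray actor (`MetricsActor.options(name=METRICS_CONFIG_ACTOR_NAME, get_if_exists=True)`), so every downstream task can read one shared configuration. A minimal sketch of that named-actor pattern with placeholder names; `ConfigHolder` is not a deltacat class:

```python
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class ConfigHolder:
    def __init__(self):
        self._config = None

    def set_config(self, config):
        self._config = config

    def get_config(self):
        return self._config

# get_if_exists=True returns the existing named actor or creates it, so every
# caller that asks for "config_holder" shares the same instance.
holder = ConfigHolder.options(name="config_holder", get_if_exists=True).remote()
ray.get(holder.set_config.remote({"metrics_target": "noop"}))
print(ray.get(holder.get_config.remote()))
```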
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -40,3 +40,31 @@ DROP_DUPLICATES = True
 # This is the observed upper bound inflation for parquet
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4
+
+# Metric Names
+# Time taken for a hash bucket task
+HASH_BUCKET_TIME_IN_SECONDS = "hash_bucket_time"
+
+# Hash bucket success count
+HASH_BUCKET_SUCCESS_COUNT = "hash_bucket_success_count"
+
+# Hash bucket failure count
+HASH_BUCKET_FAILURE_COUNT = "hash_bucket_failure_count"
+
+# Time taken for a merge task
+MERGE_TIME_IN_SECONDS = "merge_time"
+
+# Merge success count
+MERGE_SUCCESS_COUNT = "merge_success_count"
+
+# Merge failure count
+MERGE_FAILURE_COUNT = "merge_failure_count"
+
+# Metric prefix for discover deltas
+DISCOVER_DELTAS_METRIC_PREFIX = "discover_deltas"
+
+# Metric prefix for prepare deletes
+PREPARE_DELETES_METRIC_PREFIX = "prepare_deletes"
+
+# Metric prefix for materialize
+MATERIALIZE_METRIC_PREFIX = "delta_materialize"
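These constants follow a `<name>_time` / `<name>_success_count` / `<name>_failure_count` naming scheme, with the `*_METRIC_PREFIX` values feeding the `@metrics(prefix=...)` decorator used elsewhere in this release. The sketch below is only an illustrative reimplementation of how a prefix-based decorator of that shape could derive such names; it is not deltacat's actual `deltacat.utils.metrics` code, and `emit` is a hypothetical sink:

```python
import time
from functools import wraps

def emit(name: str, value: float) -> None:
    # stand-in sink; real code would route to the configured metrics target
    print(f"{name}={value}")

def metrics(prefix: str):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.monotonic()
            try:
                result = func(*args, **kwargs)
            except BaseException:
                emit(f"{prefix}_failure_count", 1)  # e.g. "discover_deltas_failure_count"
                raise
            emit(f"{prefix}_success_count", 1)      # e.g. "discover_deltas_success_count"
            emit(f"{prefix}_time", time.monotonic() - start)
            return result
        return wrapper
    return decorator

@metrics(prefix="discover_deltas")
def discover_deltas_stub():
    return []
```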
deltacat/compute/compactor_v2/deletes/utils.py
CHANGED
@@ -23,6 +23,8 @@ from deltacat.storage import (
     Delta,
 )
 from deltacat import logs
+from deltacat.utils.metrics import metrics
+from deltacat.compute.compactor_v2.constants import PREPARE_DELETES_METRIC_PREFIX


 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -115,6 +117,7 @@ def _get_delete_file_envelopes(
     return delete_file_envelopes


+@metrics(prefix=PREPARE_DELETES_METRIC_PREFIX)
 def prepare_deletes(
     params: CompactPartitionParams,
     input_deltas: List[Delta],
deltacat/compute/compactor_v2/model/hash_bucket_input.py
CHANGED
@@ -22,6 +22,7 @@ class HashBucketInput(Dict):
         object_store: Optional[IObjectStore] = None,
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+        memory_logs_enabled: Optional[bool] = None,
     ) -> HashBucketInput:

         result = HashBucketInput()
@@ -36,6 +37,7 @@ class HashBucketInput(Dict):
         result["object_store"] = object_store
         result["deltacat_storage"] = deltacat_storage
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+        result["memory_logs_enabled"] = memory_logs_enabled

         return result

@@ -82,3 +84,7 @@ class HashBucketInput(Dict):
     @property
     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
         return self.get("deltacat_storage_kwargs")
+
+    @property
+    def memory_logs_enabled(self) -> Optional[bool]:
+        return self.get("memory_logs_enabled")
deltacat/compute/compactor_v2/model/merge_input.py
CHANGED
@@ -46,6 +46,7 @@ class MergeInput(Dict):
         delete_file_envelopes: Optional[List] = None,
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+        memory_logs_enabled: Optional[bool] = None,
     ) -> MergeInput:

         result = MergeInput()
@@ -67,6 +68,7 @@ class MergeInput(Dict):
         result["delete_strategy"] = delete_strategy
         result["deltacat_storage"] = deltacat_storage
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+        result["memory_logs_enabled"] = memory_logs_enabled
         return result

     @property
@@ -133,6 +135,10 @@ class MergeInput(Dict):
     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
         return self.get("deltacat_storage_kwargs")

+    @property
+    def memory_logs_enabled(self) -> Optional[bool]:
+        return self.get("memory_logs_enabled")
+
     @property
     def delete_file_envelopes(
         self,
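`HashBucketInput` and `MergeInput` are dict subclasses: the `of(...)` factory writes plain keys and the properties read them back with `.get()`, so a newly added field such as `memory_logs_enabled` simply reads as `None` for callers that never set it. A generic sketch of that pattern with placeholder names (`TaskInput` is not a deltacat class):

```python
from typing import Any, Optional

class TaskInput(dict):
    @staticmethod
    def of(memory_logs_enabled: Optional[bool] = None) -> "TaskInput":
        result = TaskInput()
        result["memory_logs_enabled"] = memory_logs_enabled
        return result

    @property
    def memory_logs_enabled(self) -> Optional[bool]:
        # .get() keeps older payloads that never set the key working (returns None)
        return self.get("memory_logs_enabled")

task_input = TaskInput.of(memory_logs_enabled=True)
assert task_input.memory_logs_enabled is True
assert TaskInput().memory_logs_enabled is None
```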
deltacat/compute/compactor_v2/steps/hash_bucket.py
CHANGED
@@ -25,12 +25,17 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
-from deltacat.utils.metrics import emit_timer_metrics
+from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_metric
 from deltacat.utils.resources import (
     get_current_process_peak_memory_usage_in_bytes,
     ProcessUtilizationOverTimeRange,
 )
 from deltacat.constants import BYTES_PER_GIBIBYTE
+from deltacat.compute.compactor_v2.constants import (
+    HASH_BUCKET_TIME_IN_SECONDS,
+    HASH_BUCKET_FAILURE_COUNT,
+    HASH_BUCKET_SUCCESS_COUNT,
+)

 if importlib.util.find_spec("memray"):
     import memray
@@ -91,6 +96,8 @@ def _group_file_records_by_pk_hash_bucket(
     return hb_to_delta_file_envelopes, total_record_count, total_size_bytes


+@success_metric(name=HASH_BUCKET_SUCCESS_COUNT)
+@failure_metric(name=HASH_BUCKET_FAILURE_COUNT)
 def _timed_hash_bucket(input: HashBucketInput):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -142,7 +149,8 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
                 f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
             )

-
+        if input.memory_logs_enabled:
+            process_util.schedule_callback(log_peak_memory, 10)

         hash_bucket_result, duration = timed_invocation(
             func=_timed_hash_bucket, input=input
@@ -152,7 +160,7 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
         if input.metrics_config:
             emit_result, latency = timed_invocation(
                 func=emit_timer_metrics,
-                metrics_name=
+                metrics_name=HASH_BUCKET_TIME_IN_SECONDS,
                 value=duration,
                 metrics_config=input.metrics_config,
             )
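The hash bucket step times the inner task with `timed_invocation` and, when a metrics config is present, reports the duration under `HASH_BUCKET_TIME_IN_SECONDS` via `emit_timer_metrics`. A condensed sketch of that flow, using only the call shapes visible in the hunk above (the `run_step` wrapper is hypothetical, and `step_fn` is assumed to accept an `input` keyword that `timed_invocation` forwards):

```python
from deltacat.utils.performance import timed_invocation
from deltacat.utils.metrics import emit_timer_metrics
from deltacat.compute.compactor_v2.constants import HASH_BUCKET_TIME_IN_SECONDS

def run_step(step_fn, step_input, metrics_config=None):
    # timed_invocation returns (result, elapsed_seconds), as used in the diff above
    result, duration = timed_invocation(func=step_fn, input=step_input)
    if metrics_config:
        emit_timer_metrics(
            metrics_name=HASH_BUCKET_TIME_IN_SECONDS,
            value=duration,
            metrics_config=metrics_config,
        )
    return result
```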
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -12,6 +12,7 @@ from uuid import uuid4
 from deltacat import logs
 from typing import Callable, Iterator, List, Optional, Tuple
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
 from deltacat.compute.compactor import RoundCompletionInfo, DeltaFileEnvelope
@@ -23,7 +24,7 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.utils.performance import timed_invocation
-from deltacat.utils.metrics import emit_timer_metrics
+from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_metric
 from deltacat.utils.resources import (
     get_current_process_peak_memory_usage_in_bytes,
     ProcessUtilizationOverTimeRange,
@@ -41,6 +42,11 @@ from deltacat.storage import (
 )
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
 from deltacat.constants import BYTES_PER_GIBIBYTE
+from deltacat.compute.compactor_v2.constants import (
+    MERGE_TIME_IN_SECONDS,
+    MERGE_SUCCESS_COUNT,
+    MERGE_FAILURE_COUNT,
+)


 if importlib.util.find_spec("memray"):
@@ -269,6 +275,24 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
     )


+def _can_copy_by_reference(
+    has_delete: bool, merge_file_group: MergeFileGroup, input: MergeInput
+) -> bool:
+    """
+    Can copy by reference only if there are no deletes to merge in
+    and previous compacted stream id matches that of new stream
+    """
+    return (
+        not has_delete
+        and not merge_file_group.dfe_groups
+        and input.round_completion_info is not None
+        and (
+            input.write_to_partition.stream_id
+            == input.round_completion_info.compacted_delta_locator.stream_id
+        )
+    )
+
+
 def _flatten_dfe_list(
     df_envelopes_list: List[List[DeltaFileEnvelope]],
 ) -> List[DeltaFileEnvelope]:
@@ -349,7 +373,7 @@ def _compact_tables(
         1. The compacted PyArrow table.
         2. The total number of records in the incremental data.
         3. The total number of deduplicated records.
-        4. The total number of
+        4. The total number of deleted records due to DELETE operations.
     """
     df_envelopes: List[DeltaFileEnvelope] = _flatten_dfe_list(dfe_list)
     delete_file_envelopes = input.delete_file_envelopes or []
@@ -460,6 +484,8 @@ def _copy_manifests_from_hash_bucketing(
     return materialized_results


+@success_metric(name=MERGE_SUCCESS_COUNT)
+@failure_metric(name=MERGE_FAILURE_COUNT)
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -479,10 +505,12 @@ def _timed_merge(input: MergeInput) -> MergeResult:
             assert (
                 input.delete_strategy is not None
             ), "Merge input missing delete_strategy"
-            if
-
+            if _can_copy_by_reference(
+                has_delete=has_delete, merge_file_group=merge_file_group, input=input
+            ):
                 hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
                 continue
+
             if _has_previous_compacted_table(input, merge_file_group.hb_index):
                 compacted_table = _download_compacted_table(
                     hb_index=merge_file_group.hb_index,
@@ -548,7 +576,8 @@ def merge(input: MergeInput) -> MergeResult:
             f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
         )

-
+        if input.memory_logs_enabled:
+            process_util.schedule_callback(log_peak_memory, 10)

         merge_result, duration = timed_invocation(func=_timed_merge, input=input)

@@ -556,7 +585,7 @@ def merge(input: MergeInput) -> MergeResult:
         if input.metrics_config:
             emit_result, latency = timed_invocation(
                 func=emit_timer_metrics,
-                metrics_name=
+                metrics_name=MERGE_TIME_IN_SECONDS,
                 value=duration,
                 metrics_config=input.metrics_config,
             )
deltacat/compute/compactor_v2/utils/io.py
CHANGED
@@ -23,10 +23,13 @@ from deltacat.compute.compactor_v2.utils.task_options import (
 from deltacat.compute.compactor_v2.utils.content_type_params import (
     append_content_type_params,
 )
+from deltacat.utils.metrics import metrics
+from deltacat.compute.compactor_v2.constants import DISCOVER_DELTAS_METRIC_PREFIX

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+@metrics(prefix=DISCOVER_DELTAS_METRIC_PREFIX)
 def discover_deltas(
     source_partition_locator: PartitionLocator,
     last_stream_position_to_compact: int,
deltacat/compute/compactor_v2/utils/merge.py
CHANGED
@@ -31,11 +31,14 @@ from deltacat.compute.compactor_v2.deletes.delete_strategy import (
 from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
     DeleteFileEnvelope,
 )
+from deltacat.utils.metrics import metrics
+from deltacat.compute.compactor_v2.constants import MATERIALIZE_METRIC_PREFIX


 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+@metrics(prefix=MATERIALIZE_METRIC_PREFIX)
 def materialize(
     input: MergeInput,
     task_index: int,