deltacat 1.1.8__py3-none-any.whl → 1.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +6 -0
- deltacat/aws/s3u.py +46 -25
- deltacat/compute/compactor/model/compact_partition_params.py +12 -1
- deltacat/compute/compactor/model/materialize_result.py +0 -4
- deltacat/compute/compactor_v2/compaction_session.py +11 -5
- deltacat/compute/compactor_v2/constants.py +2 -11
- deltacat/compute/compactor_v2/model/merge_input.py +6 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -7
- deltacat/compute/compactor_v2/steps/merge.py +12 -12
- deltacat/compute/compactor_v2/utils/merge.py +1 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +2 -12
- deltacat/exceptions.py +342 -7
- deltacat/io/memcached_object_store.py +7 -4
- deltacat/storage/interface.py +14 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +88 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +3 -2
- deltacat/tests/compute/compact_partition_test_cases.py +4 -2
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +3 -1
- deltacat/tests/compute/test_compact_partition_rebase.py +289 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +1 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -2
- deltacat/tests/local_deltacat_storage/__init__.py +41 -10
- deltacat/tests/local_deltacat_storage/exceptions.py +10 -0
- deltacat/tests/test_exceptions.py +100 -0
- deltacat/tests/test_logs.py +1 -0
- deltacat/tests/utils/test_daft.py +0 -1
- deltacat/tests/utils/test_resources.py +0 -28
- deltacat/utils/daft.py +3 -0
- deltacat/utils/pyarrow.py +8 -5
- deltacat/utils/ray_utils/runtime.py +2 -2
- deltacat/utils/resources.py +0 -45
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/METADATA +2 -2
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/RECORD +38 -34
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/LICENSE +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/WHEEL +0 -0
- {deltacat-1.1.8.dist-info → deltacat-1.1.9.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/constants.py
CHANGED
@@ -1,9 +1,14 @@
 import botocore
 from typing import Set
+from daft.exceptions import DaftTransientError

 from deltacat.utils.common import env_integer, env_string

+
 DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
+DEFAULT_FILE_READ_TIMEOUT_MS = env_integer(
+    "DEFAULT_FILE_READ_TIMEOUT_MS", 300_000
+)  # 5 mins
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
 BOTO_TIMEOUT_ERROR_CODES: Set[str] = {"ReadTimeoutError", "ConnectTimeoutError"}
 BOTO_THROTTLING_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
@@ -14,6 +19,7 @@ RETRYABLE_TRANSIENT_ERRORS = (
     botocore.exceptions.NoCredentialsError,
     botocore.exceptions.ConnectTimeoutError,
     botocore.exceptions.ReadTimeoutError,
+    DaftTransientError,
 )
 AWS_REGION = env_string("AWS_REGION", "us-east-1")
 UPLOAD_DOWNLOAD_RETRY_STOP_AFTER_DELAY = env_integer(
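These constants are resolved through env_integer/env_string when the module is imported, so the new DEFAULT_FILE_READ_TIMEOUT_MS default of 300_000 ms (5 minutes) can be overridden per process by exporting the environment variable before deltacat is first imported. A minimal sketch (the 600000 value is just an example):

    import os

    # Must run before any deltacat import, because the constant is
    # evaluated once when deltacat.aws.constants is loaded.
    os.environ["DEFAULT_FILE_READ_TIMEOUT_MS"] = "600000"  # 10 minutes

    from deltacat.aws import constants

    print(constants.DEFAULT_FILE_READ_TIMEOUT_MS)  # 600000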
deltacat/aws/s3u.py
CHANGED
@@ -26,14 +26,12 @@ from ray.types import ObjectRef
 from tenacity import (
     Retrying,
     retry_if_exception_type,
-    retry_if_not_exception_type,
     stop_after_delay,
     wait_random_exponential,
 )
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 import deltacat.aws.clients as aws_utils
 from deltacat import logs
-from deltacat.exceptions import NonRetryableError, RetryableError
 from deltacat.storage import (
     DistributedDataset,
     LocalDataset,
@@ -55,8 +53,20 @@ from deltacat.types.tables import (
|
|
55
53
|
DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
|
56
54
|
get_table_length,
|
57
55
|
)
|
56
|
+
from deltacat.exceptions import (
|
57
|
+
RetryableError,
|
58
|
+
RetryableUploadTableError,
|
59
|
+
RetryableDownloadTableError,
|
60
|
+
RetryableDownloadFileError,
|
61
|
+
RetryableUploadFileError,
|
62
|
+
NonRetryableDownloadFileError,
|
63
|
+
NonRetryableUploadFileError,
|
64
|
+
NonRetryableUploadTableError,
|
65
|
+
NonRetryableDownloadTableError,
|
66
|
+
)
|
58
67
|
from deltacat.types.partial_download import PartialFileDownloadParams
|
59
68
|
from deltacat.utils.common import ReadKwargsProvider
|
69
|
+
from deltacat.exceptions import categorize_errors
|
60
70
|
|
61
71
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
62
72
|
|
@@ -232,6 +242,7 @@ def filter_objects_by_prefix(
         more_objects_to_list = params["ContinuationToken"] is not None


+@categorize_errors
 def read_file(
     s3_url: str,
     content_type: ContentType,
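read_file is now wrapped in @categorize_errors from the reworked deltacat/exceptions.py (+342 lines, not shown in this excerpt). The decorator's exact behavior is not visible here, but the pattern it implies is a wrapper that maps otherwise-uncaught exceptions onto the Retryable*/NonRetryable* hierarchy so retry policies can match on a category rather than on raw botocore or Daft error types. A hypothetical sketch of that pattern (the class and the mapping below are illustrative, not deltacat's actual implementation):

    import functools

    class NonRetryableDownloadTableError(Exception):
        """Stand-in for one of deltacat's categorized error types."""

    def categorize_errors(func):
        # Re-raise anything unexpected as a categorized error, chaining the
        # original exception so the root cause stays visible in tracebacks.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                raise NonRetryableDownloadTableError(
                    f"{func.__name__} failed with {type(e).__name__}"
                ) from e

        return wrapper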
@@ -263,15 +274,15 @@
             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
         ):
             # Timeout error not caught by botocore
-            raise
-                f"Retry table download from: {s3_url} after receiving {type(e).__name__}"
+            raise RetryableDownloadTableError(
+                f"Retry table download from: {s3_url} after receiving {type(e).__name__}",
             ) from e
-        raise
+        raise NonRetryableDownloadTableError(
             f"Failed table download from: {s3_url} after receiving {type(e).__name__}"
         ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise
-            f"Retry
+        raise RetryableDownloadTableError(
+            f"Retry download for: {s3_url} after receiving {type(e).__name__}"
         ) from e
     except BaseException as e:
         logger.warn(
@@ -279,7 +290,10 @@ def read_file(
             f"and encoding={content_encoding}. Error: {e}",
             exc_info=True,
         )
-        raise
+        raise NonRetryableDownloadTableError(
+            f"Read has failed for {s3_url} and content_type={content_type} "
+            f"and encoding={content_encoding}",
+        ) from e


 def upload_sliced_table(
@@ -378,29 +392,31 @@ def upload_table(
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             # s3fs may swallow S3 errors - we were probably throttled
-            raise
-                f"Retry table
+            raise RetryableUploadTableError(
+                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
             ) from e
         if (
             e.response["Error"]["Code"]
             in BOTO_TIMEOUT_ERROR_CODES | BOTO_THROTTLING_ERROR_CODES
         ):
-            raise
-                f"Retry table
+            raise RetryableUploadTableError(
+                f"Retry table upload from: {s3_url} after receiving {type(e).__name__}",
             ) from e
-        raise
-            f"Failed table upload to: {s3_url} after receiving {type(e).__name__}"
+        raise NonRetryableUploadTableError(
+            f"Failed table upload to: {s3_url} after receiving {type(e).__name__}",
         ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise
-            f"Retry upload for: {s3_url} after receiving {type(e).__name__}"
+        raise RetryableUploadTableError(
+            f"Retry upload for: {s3_url} after receiving {type(e).__name__}",
         ) from e
     except BaseException as e:
         logger.warn(
             f"Upload has failed for {s3_url} and content_type={content_type}. Error: {e}",
             exc_info=True,
         )
-        raise
+        raise NonRetryableUploadTableError(
+            f"Upload has failed for {s3_url} and content_type={content_type} because of {type(e).__name__}",
+        ) from e
     return manifest_entries


@@ -443,7 +459,7 @@ def download_manifest_entry(
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
         stop=stop_after_delay(DOWNLOAD_MANIFEST_ENTRY_RETRY_STOP_AFTER_DELAY),
-        retry=
+        retry=retry_if_exception_type(RetryableError),
     )
     table = retrying(
         read_file,
@@ -559,12 +575,15 @@ def _put_object(
         )
     except ClientError as e:
         if e.response["Error"]["Code"] in BOTO_THROTTLING_ERROR_CODES:
-
-
+            error_code = e.response["Error"]["Code"]
+            raise RetryableUploadFileError(
+                f"Retry upload for: {bucket}/{key} after receiving {error_code}",
            ) from e
-        raise
+        raise NonRetryableUploadFileError(
+            f"Failed table upload to: {bucket}/{key}"
+        ) from e
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise
+        raise RetryableUploadFileError(
             f"Retry upload for: {bucket}/{key} after receiving {type(e).__name__}"
         ) from e
     except BaseException as e:
@@ -572,7 +591,9 @@ def _put_object(
             f"Upload has failed for {bucket}/{key}. Error: {type(e).__name__}",
             exc_info=True,
         )
-        raise
+        raise NonRetryableUploadFileError(
+            f"Failed table upload to: {bucket}/{key}"
+        ) from e


 def download(
@@ -604,12 +625,12 @@ def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True
     except ClientError as e:
         if e.response["Error"]["Code"] == "NoSuchKey":
             if fail_if_not_found:
-                raise
+                raise NonRetryableDownloadFileError(
                     f"Failed get object from: {bucket}/{key}"
                 ) from e
             logger.info(f"file not found: {bucket}/{key}")
     except RETRYABLE_TRANSIENT_ERRORS as e:
-        raise
+        raise RetryableDownloadFileError(
             f"Retry get object: {bucket}/{key} after receiving {type(e).__name__}"
         ) from e

deltacat/compute/compactor/model/compact_partition_params.py
CHANGED
@@ -21,6 +21,7 @@ from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     DROP_DUPLICATES,
     TOTAL_MEMORY_BUFFER_PERCENTAGE,
+    DEFAULT_DISABLE_COPY_BY_REFERENCE,
 )
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -50,7 +51,6 @@ class CompactPartitionParams(dict):

         result = CompactPartitionParams(params)

-        # TODO: move defaults to single file
         result.records_per_compacted_file = params.get(
             "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
         )
@@ -92,6 +92,9 @@ class CompactPartitionParams(dict):
         result.hash_group_count = params.get(
             "hash_group_count", result.hash_bucket_count
         )
+        result.disable_copy_by_reference = params.get(
+            "disable_copy_by_reference", DEFAULT_DISABLE_COPY_BY_REFERENCE
+        )
         result.drop_duplicates = params.get("drop_duplicates", DROP_DUPLICATES)
         result.ray_custom_resources = params.get("ray_custom_resources")

@@ -238,6 +241,14 @@ class CompactPartitionParams(dict):
     def enable_profiler(self, value: bool) -> None:
         self["enable_profiler"] = value

+    @property
+    def disable_copy_by_reference(self) -> bool:
+        return self["disable_copy_by_reference"]
+
+    @disable_copy_by_reference.setter
+    def disable_copy_by_reference(self, value: bool) -> None:
+        self["disable_copy_by_reference"] = value
+
     @property
     def list_deltas_kwargs(self) -> dict:
         return self["list_deltas_kwargs"]
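Callers can now opt out of copy-by-reference and force untouched files to be rewritten; the flag defaults to DEFAULT_DISABLE_COPY_BY_REFERENCE (False). A minimal sketch of the new property and setter, grounded in the dict-backed construction shown above (a real call site also supplies the source/destination partition locators, storage, and the other required parameters):

    from deltacat.compute.compactor.model.compact_partition_params import (
        CompactPartitionParams,
    )

    # Illustrative only: just the new flag, none of the other compaction params.
    params = CompactPartitionParams({"disable_copy_by_reference": True})
    assert params.disable_copy_by_reference is True

    params.disable_copy_by_reference = False  # setter added in this release
    assert params.disable_copy_by_reference is False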
deltacat/compute/compactor/model/materialize_result.py
CHANGED
@@ -55,10 +55,6 @@ class MaterializeResult(dict):
         self["paWriteResult"] = val = PyArrowWriteResult(val)
         return val

-    @property
-    def count_of_src_dfl_not_touched(self) -> int:
-        return self["countOfSrcFileNotTouched"]
-
     @property
     def referenced_pyarrow_write_result(self) -> PyArrowWriteResult:
         val: Dict[str, Any] = self.get("referencedPaWriteResult")
deltacat/compute/compactor_v2/compaction_session.py
CHANGED
@@ -77,6 +77,7 @@ from deltacat.compute.compactor_v2.utils.task_options import (
     local_merge_resource_options_provider,
 )
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
+from deltacat.exceptions import categorize_errors

 if importlib.util.find_spec("memray"):
     import memray
@@ -86,6 +87,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 @metrics
+@categorize_errors
 def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
     assert (
         params.hash_bucket_count is not None and params.hash_bucket_count >= 1
@@ -123,12 +125,12 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
         f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
         f"using previous partition: {previous_partition.locator if previous_partition else None}"
     )
-
+    committed_partition: Partition = params.deltacat_storage.commit_partition(
         execute_compaction_result.new_compacted_partition,
         previous_partition,
         **params.deltacat_storage_kwargs,
     )
-    logger.info(f"Committed compacted partition: {
+    logger.info(f"Committed compacted partition: {committed_partition}")
     round_completion_file_s3_url = rcf.write_round_completion_file(
         params.compaction_artifact_s3_bucket,
         execute_compaction_result.new_round_completion_file_partition_locator,
@@ -479,6 +481,7 @@ def _execute_compaction(
             delete_strategy=delete_strategy,
             delete_file_envelopes=delete_file_envelopes,
             memory_logs_enabled=params.memory_logs_enabled,
+            disable_copy_by_reference=params.disable_copy_by_reference,
         )
     }

@@ -662,13 +665,16 @@ def _execute_compaction(
     )

     logger.info(
-        f"
+        f"Partition-{params.source_partition_locator.partition_values},"
         f"compacted at: {params.last_stream_position_to_compact},"
     )
+    logger.info(
+        f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
+    )
     is_inplace_compacted: bool = (
-
+        rcf_source_partition_locator.partition_values
         == params.destination_partition_locator.partition_values
-        and
+        and rcf_source_partition_locator.stream_id
         == params.destination_partition_locator.stream_id
     )
     if is_inplace_compacted:
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,5 +1,3 @@
-from deltacat.utils.common import env_integer
-
 TOTAL_BYTES_IN_SHA1_HASH = 20

 PK_DELIMITER = "L6kl7u5f"
@@ -43,15 +41,8 @@ DROP_DUPLICATES = True
 # size in metadata to pyarrow table size.
 PARQUET_TO_PYARROW_INFLATION = 4

-#
-
-# This timeout depends on total data processed per task.
-MERGE_TASK_TIMEOUT_IN_SECONDS = env_integer("MERGE_TASK_TIMEOUT_IN_SECONDS", 25 * 60)
-
-# A hash bucket task will fail after this timeout
-HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS = env_integer(
-    "HASH_BUCKET_TASK_TIMEOUT_IN_SECONDS", 25 * 60
-)
+# By default, copy by reference is enabled
+DEFAULT_DISABLE_COPY_BY_REFERENCE = False

 # Metric Names
 # Time taken for a hash bucket task
deltacat/compute/compactor_v2/model/merge_input.py
CHANGED
@@ -47,6 +47,7 @@ class MergeInput(Dict):
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
         memory_logs_enabled: Optional[bool] = None,
+        disable_copy_by_reference: Optional[bool] = None,
     ) -> MergeInput:

         result = MergeInput()
@@ -69,6 +70,7 @@ class MergeInput(Dict):
         result["deltacat_storage"] = deltacat_storage
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
         result["memory_logs_enabled"] = memory_logs_enabled
+        result["disable_copy_by_reference"] = disable_copy_by_reference
         return result

     @property
@@ -148,3 +150,7 @@ class MergeInput(Dict):
     @property
     def delete_strategy(self) -> Optional[DeleteStrategy]:
         return self.get("delete_strategy")
+
+    @property
+    def disable_copy_by_reference(self) -> bool:
+        return self["disable_copy_by_reference"]
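MergeInput is the per-task options dict handed to the merge step; the flag is threaded through from CompactPartitionParams in compaction_session.py (the disable_copy_by_reference=params.disable_copy_by_reference line above) and read back via the new property. A minimal sketch of that round trip, bypassing the full MergeInput.of(...) factory shown in this hunk:

    from deltacat.compute.compactor_v2.model.merge_input import MergeInput

    merge_input = MergeInput()
    merge_input["disable_copy_by_reference"] = True  # normally set by MergeInput.of(...)
    assert merge_input.disable_copy_by_reference is True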
deltacat/compute/compactor_v2/steps/hash_bucket.py
CHANGED
@@ -29,14 +29,15 @@ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_metric
 from deltacat.utils.resources import (
     get_current_process_peak_memory_usage_in_bytes,
     ProcessUtilizationOverTimeRange,
-    timeout,
 )
 from deltacat.constants import BYTES_PER_GIBIBYTE
 from deltacat.compute.compactor_v2.constants import (
     HASH_BUCKET_TIME_IN_SECONDS,
     HASH_BUCKET_FAILURE_COUNT,
     HASH_BUCKET_SUCCESS_COUNT,
-
+)
+from deltacat.exceptions import (
+    categorize_errors,
 )

 if importlib.util.find_spec("memray"):
@@ -79,7 +80,7 @@ def _group_file_records_by_pk_hash_bucket(
         logger.info("Grouping by pk hash bucket")
         group_start = time.monotonic()
         hash_bucket_to_table = group_by_pk_hash_bucket(
-            dfe.table, num_hash_buckets, primary_keys
+            table=dfe.table, num_buckets=num_hash_buckets, primary_keys=primary_keys
         )
         group_end = time.monotonic()
         logger.info(f"Grouping took: {group_end - group_start}")
@@ -98,12 +99,9 @@
     return hb_to_delta_file_envelopes, total_record_count, total_size_bytes


-# TODO: use timeout parameter in ray.remote
-# https://github.com/ray-project/ray/issues/18916
-# Note: order of decorators is important
 @success_metric(name=HASH_BUCKET_SUCCESS_COUNT)
 @failure_metric(name=HASH_BUCKET_FAILURE_COUNT)
-@
+@categorize_errors
 def _timed_hash_bucket(input: HashBucketInput):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -28,7 +28,6 @@ from deltacat.utils.metrics import emit_timer_metrics, failure_metric, success_metric
 from deltacat.utils.resources import (
     get_current_process_peak_memory_usage_in_bytes,
     ProcessUtilizationOverTimeRange,
-    timeout,
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
@@ -47,9 +46,10 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
-    MERGE_TASK_TIMEOUT_IN_SECONDS,
 )
-
+from deltacat.exceptions import (
+    categorize_errors,
+)

 if importlib.util.find_spec("memray"):
     import memray
@@ -284,16 +284,19 @@ def _can_copy_by_reference(
     Can copy by reference only if there are no deletes to merge in
     and previous compacted stream id matches that of new stream
     """
-
+    copy_by_ref = (
         not has_delete
         and not merge_file_group.dfe_groups
        and input.round_completion_info is not None
-        and (
-            input.write_to_partition.stream_id
-            == input.round_completion_info.compacted_delta_locator.stream_id
-        )
     )

+    if input.disable_copy_by_reference:
+        copy_by_ref = False
+
+    logger.info(f"Copy by reference is {copy_by_ref} for {merge_file_group.hb_index}")
+
+    return copy_by_ref
+

 def _flatten_dfe_list(
     df_envelopes_list: List[List[DeltaFileEnvelope]],
@@ -486,12 +489,9 @@ def _copy_manifests_from_hash_bucketing(
     return materialized_results


-# TODO: use timeout parameter in ray.remote
-# https://github.com/ray-project/ray/issues/18916
-# Note: order of decorators is important
 @success_metric(name=MERGE_SUCCESS_COUNT)
 @failure_metric(name=MERGE_FAILURE_COUNT)
-@
+@categorize_errors
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED
@@ -157,7 +157,12 @@ def _optimized_group_record_batches_by_hash_bucket(
 def group_by_pk_hash_bucket(
     table: pa.Table, num_buckets: int, primary_keys: List[str]
 ) -> np.ndarray:
-
+    new_tables = generate_pk_hash_column([table], primary_keys, requires_hash=True)
+    assert (
+        len(new_tables) == 1
+    ), f"Expected only 1 table in the result but found {len(new_tables)}"
+
+    table = generate_pk_hash_column([table], primary_keys, requires_hash=True)[0]

     # group hash bucket record indices
     result = group_record_indices_by_hash_bucket(
@@ -171,7 +176,7 @@ def group_by_pk_hash_bucket(
 def generate_pk_hash_column(
     tables: List[pa.Table],
     primary_keys: Optional[List[str]] = None,
-
+    requires_hash: bool = False,
 ) -> List[pa.Table]:
     """
     Returns a new table list after generating the primary key hash if desired.
@@ -203,12 +208,12 @@
     if primary_keys:
         hash_column_list = [_generate_pk_hash(table) for table in tables]

-        can_sha1 =
+        can_sha1 = requires_hash or _is_sha1_desired(hash_column_list)
     else:
         hash_column_list = [_generate_uuid(table) for table in tables]

     logger.info(
-        f"can_generate_sha1={can_sha1} for the table and requires_sha1={
+        f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_hash}"
     )

     result = []
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED
@@ -1,6 +1,4 @@
-import botocore
 import logging
-import tenacity
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
 from deltacat.compute.compactor_v2.model.merge_file_group import (
@@ -21,8 +19,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
 from deltacat.compute.compactor_v2.constants import (
     PARQUET_TO_PYARROW_INFLATION,
 )
-from
-
+from deltacat.exceptions import RetryableError

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -79,14 +76,7 @@ def get_task_options(

     # List of possible botocore exceptions are available at
     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
-    task_opts["retry_exceptions"] = [
-        botocore.exceptions.ConnectionError,
-        botocore.exceptions.HTTPClientError,
-        ConnectionError,
-        TimeoutError,
-        DaftTransientError,
-        tenacity.RetryError,
-    ]
+    task_opts["retry_exceptions"] = [RetryableError]

     return task_opts

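The options built here are applied to Ray tasks, so task retries now key on the single RetryableError category instead of a hand-maintained list of botocore/Daft/tenacity types. A sketch of the equivalent Ray-level behavior, assuming the dict is passed through to a task via .options()/@ray.remote (the task body below is illustrative only):

    import ray

    class RetryableError(Exception):
        """Stand-in for deltacat.exceptions.RetryableError."""

    @ray.remote(max_retries=3, retry_exceptions=[RetryableError])
    def merge_task(batch_id: int) -> int:
        # Ray re-runs this task (up to max_retries) only when it raises
        # RetryableError; any other exception fails the task immediately.
        return batch_id

    ray.init(ignore_reinit_error=True)
    print(ray.get(merge_task.remote(7)))  # 7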