deltacat 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +3 -1
- deltacat/aws/s3u.py +71 -24
- deltacat/compute/compactor_v2/utils/task_options.py +5 -2
- deltacat/tests/aws/test_s3u.py +112 -0
- {deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/METADATA +1 -1
- {deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/RECORD +10 -10
- {deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/LICENSE +0 -0
- {deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/WHEEL +0 -0
- {deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/aws/constants.py
CHANGED
@@ -1,8 +1,10 @@
-from typing import List
+from typing import List, Set

 from deltacat.utils.common import env_integer, env_string

 DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
 BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
 TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
+RETRYABLE_PUT_OBJECT_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
 AWS_REGION = env_string("AWS_REGION", "us-east-1")
+RETRY_STOP_AFTER_DELAY = env_integer("RETRY_STOP_AFTER_DELAY", 10 * 60)
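The new constants follow the module's existing env_integer pattern, so the retry budget should be tunable per environment. A minimal sketch of that, assuming the usual env_integer semantics (environment variable if set, otherwise the default) and that the variable is exported before any deltacat module is imported:

# Sketch only: tune the retry window before deltacat modules are imported.
# Assumes env_integer("RETRY_STOP_AFTER_DELAY", 10 * 60) reads os.environ at import time.
import os

os.environ["RETRY_STOP_AFTER_DELAY"] = "120"  # cap retries at ~2 minutes instead of 10

from deltacat.aws.constants import (  # imported after the env var is set on purpose
    RETRY_STOP_AFTER_DELAY,
    RETRYABLE_PUT_OBJECT_ERROR_CODES,
)

print(RETRY_STOP_AFTER_DELAY)            # 120, if no earlier import cached the default
print(RETRYABLE_PUT_OBJECT_ERROR_CODES)  # {"Throttling", "SlowDown"} (set, unordered)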
deltacat/aws/s3u.py
CHANGED
@@ -4,14 +4,18 @@ from functools import partial
 from typing import Any, Callable, Dict, Generator, List, Optional, Union
 from uuid import uuid4
 from botocore.config import Config
-from deltacat.aws.constants import
+from deltacat.aws.constants import (
+    BOTO_MAX_RETRIES,
+    RETRY_STOP_AFTER_DELAY,
+    RETRYABLE_PUT_OBJECT_ERROR_CODES,
+)

 import pyarrow as pa
 import ray
 import s3fs
 from boto3.resources.base import ServiceResource
 from botocore.client import BaseClient
-from botocore.exceptions import ClientError
+from botocore.exceptions import ClientError, NoCredentialsError
 from ray.data.block import Block, BlockAccessor, BlockMetadata
 from ray.data.datasource import BlockWritePathProvider
 from ray.types import ObjectRef
@@ -315,7 +319,6 @@ def upload_sliced_table(
             **s3_client_kwargs,
         )
         manifest_entries.extend(slice_entries)
-
     return manifest_entries


@@ -504,41 +507,85 @@ def download_manifest_entries_distributed(

 def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:

-    # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
     s3 = s3_client_cache(None, **s3_client_kwargs)
-
-
-
-
+    retrying = Retrying(
+        wait=wait_random_exponential(multiplier=1, max=15),
+        stop=stop_after_delay(RETRY_STOP_AFTER_DELAY),
+        retry=retry_if_exception_type(RetryableError),
+    )
+    return retrying(
+        _put_object,
+        s3,
+        body,
+        parsed_s3_url.bucket,
+        parsed_s3_url.key,
     )


+def _put_object(
+    s3_client, body: Any, bucket: str, key: str, **s3_put_object_kwargs
+) -> Dict[str, Any]:
+    try:
+        return s3_client.put_object(
+            Body=body, Bucket=bucket, Key=key, **s3_put_object_kwargs
+        )
+    except ClientError as e:
+        if e.response["Error"]["Code"] in RETRYABLE_PUT_OBJECT_ERROR_CODES:
+            raise RetryableError(
+                f"Retry upload for: {bucket}/{key} after receiving {e.response['Error']['Code']}"
+            ) from e
+        raise NonRetryableError(f"Failed table upload to: {bucket}/{key}") from e
+    except NoCredentialsError as e:
+        raise RetryableError(
+            f"Failed to fetch credentials when putting object into: {bucket}/{key}"
+        ) from e
+    except BaseException as e:
+        logger.error(
+            f"Upload has failed for {bucket}/{key}. Error: {e}",
+            exc_info=True,
+        )
+        raise NonRetryableError(f"Failed table upload to: {bucket}/{key}") from e
+
+
 def download(
     s3_url: str, fail_if_not_found: bool = True, **s3_client_kwargs
 ) -> Optional[Dict[str, Any]]:

-    # TODO (pdames): add tenacity retrying
     parsed_s3_url = parse_s3_url(s3_url)
     s3 = s3_client_cache(None, **s3_client_kwargs)
+    retrying = Retrying(
+        wait=wait_random_exponential(multiplier=1, max=15),
+        stop=stop_after_delay(RETRY_STOP_AFTER_DELAY),
+        retry=retry_if_exception_type(RetryableError),
+    )
+    return retrying(
+        _get_object,
+        s3,
+        parsed_s3_url.bucket,
+        parsed_s3_url.key,
+        fail_if_not_found=fail_if_not_found,
+    )
+
+
+def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True):
     try:
-        return
-            Bucket=
-            Key=
+        return s3_client.get_object(
+            Bucket=bucket,
+            Key=key,
         )
     except ClientError as e:
-        if
-
-
-
-
-
-
-
-
-
-
-            logger.info(f"file not found: {s3_url}")
+        if e.response["Error"]["Code"] == "NoSuchKey":
+            if fail_if_not_found:
+                raise NonRetryableError(
+                    f"Failed get object from: {bucket}/{key}"
+                ) from e
+            logger.info(f"file not found: {bucket}/{key}")
+    except NoCredentialsError as e:
+        raise RetryableError(
+            f"Failed to fetch credentials when getting object from: {bucket}/{key}"
+        ) from e
+
     return None


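Both upload and download now share the same tenacity recipe: jittered exponential backoff, a wall-clock stop after RETRY_STOP_AFTER_DELAY seconds, and retries only when a RetryableError is raised. A minimal standalone sketch of that recipe follows; the flaky_put callable and the local RetryableError class are hypothetical stand-ins, not deltacat code:

# Sketch of the retry pattern introduced above, using only the tenacity API.
from tenacity import (
    RetryError,
    Retrying,
    retry_if_exception_type,
    stop_after_delay,
    wait_random_exponential,
)


class RetryableError(RuntimeError):
    """Stand-in for deltacat.exceptions.RetryableError."""


attempts = {"count": 0}


def flaky_put(payload: str) -> str:
    # Hypothetical callable: fails twice with a retryable error, then succeeds.
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RetryableError("simulated S3 Throttling/SlowDown response")
    return f"stored:{payload}"


retrying = Retrying(
    wait=wait_random_exponential(multiplier=1, max=15),  # jittered exponential backoff
    stop=stop_after_delay(10),  # give up after ~10 s (analog of RETRY_STOP_AFTER_DELAY)
    retry=retry_if_exception_type(RetryableError),  # only RetryableError is retried
)

try:
    print(retrying(flaky_put, "test-body"))  # retries until flaky_put succeeds
except RetryError:
    # Raised if the retryable error persists past the stop condition, which is
    # what the new tests assert with pytest.raises(RetryError).
    print("gave up after exhausting the retry budget")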
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED
@@ -20,7 +20,6 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
 from deltacat.compute.compactor_v2.constants import (
     PARQUET_TO_PYARROW_INFLATION,
 )
-
 from daft.exceptions import DaftTransientError


@@ -65,7 +64,11 @@ def get_task_options(
     cpu: float, memory: float, ray_custom_resources: Optional[Dict] = None
 ) -> Dict:

-
+    # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
+    # not spin up enough nodes fast and hence we see only approximately
+    # 20 tasks get scheduled out of 100 tasks in queue.
+    # https://docs.ray.io/en/latest/ray-core/scheduling/index.html
+    task_opts = {"num_cpus": cpu, "memory": memory, "scheduling_strategy": "SPREAD"}

     if ray_custom_resources:
         task_opts["resources"] = ray_custom_resources
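For context, a dict shaped like the one get_task_options builds is applied by splatting it into a Ray task's .options(...). A small illustrative sketch, not deltacat code; the noop task and resource sizes are made up:

# Sketch: launching Ray tasks with SPREAD scheduling and explicit resources.
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
def noop(i: int) -> int:
    return i


# Mirrors the shape returned by get_task_options: num_cpus, memory (bytes),
# and the SPREAD strategy that spreads tasks across nodes instead of packing them.
task_opts = {
    "num_cpus": 1,
    "memory": 256 * 1024 * 1024,
    "scheduling_strategy": "SPREAD",
}

refs = [noop.options(**task_opts).remote(i) for i in range(10)]
print(ray.get(refs))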
deltacat/tests/aws/test_s3u.py
CHANGED
@@ -2,6 +2,21 @@ import unittest
 from deltacat.aws.s3u import UuidBlockWritePathProvider, CapturedBlockWritePaths


+import os
+from unittest import mock
+from unittest.mock import patch
+
+import boto3
+import pytest
+from boto3.resources.base import ServiceResource
+from botocore.exceptions import ClientError, NoCredentialsError
+from deltacat.exceptions import NonRetryableError
+from moto import mock_s3
+from tenacity import RetryError
+
+from deltacat.aws import s3u
+
+
 class TestUuidBlockWritePathProvider(unittest.TestCase):
     def test_uuid_block_write_provider_sanity(self):
         capture_object = CapturedBlockWritePaths()
@@ -10,3 +25,100 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
         result = provider("base_path")

         self.assertRegex(result, r"^base_path/[\w-]{36}$")
+
+
+class TestDownloadUpload(unittest.TestCase):
+    TEST_S3_BUCKET_NAME = "TEST_S3_BUCKET"
+    TEST_S3_KEY = "TEST_S3_KEY"
+
+    @pytest.fixture(autouse=True)
+    def mock_aws_credential(self):
+        os.environ["AWS_ACCESS_KEY_ID"] = "testing"
+        os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
+        os.environ["AWS_SECURITY_TOKEN"] = "testing"
+        os.environ["AWS_SESSION_TOKEN"] = "testing"
+        os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
+        yield
+
+    @pytest.fixture(autouse=True)
+    def setup_s3_resource(self):
+        with mock_s3():
+            yield boto3.resource("s3")
+
+    @pytest.fixture(autouse=True)
+    def setup_test_s3_bucket(self, setup_s3_resource: ServiceResource):
+        setup_s3_resource.create_bucket(
+            ACL="authenticated-read",
+            Bucket=self.TEST_S3_BUCKET_NAME,
+        )
+        yield
+
+    def test_sanity(self):
+        uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
+        body = "test-body"
+        uploaded_file = s3u.upload(uri, body)
+        assert uploaded_file is not None
+        assert uploaded_file["ResponseMetadata"]["HTTPStatusCode"] == 200
+        downloaded_file = s3u.download(uri)
+        downloaded_body = downloaded_file["Body"].read().decode("utf-8")
+        assert downloaded_file["ResponseMetadata"]["HTTPStatusCode"] == 200
+        assert downloaded_body == body
+
+    @patch("deltacat.aws.s3u.RETRY_STOP_AFTER_DELAY", 1)
+    @patch("deltacat.aws.s3u.s3_client_cache")
+    def test_upload_throttled(self, mock_s3_client_cache):
+        uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
+        body = "test-body"
+        throttling_err = ClientError({"Error": {"Code": "Throttling"}}, "put_object")
+        mock_s3_client_cache.return_value = mock_s3 = mock.MagicMock()
+        mock_s3.put_object.side_effect = throttling_err
+        with pytest.raises(RetryError):
+            s3u.upload(uri, body)
+
+        slowdown_err = ClientError({"Error": {"Code": "SlowDown"}}, "put_object")
+        mock_s3.put_object.side_effect = slowdown_err
+        with pytest.raises(RetryError):
+            s3u.upload(uri, body)
+
+        no_credentials_err = NoCredentialsError()
+        mock_s3.put_object.side_effect = no_credentials_err
+        with pytest.raises(RetryError):
+            s3u.upload(uri, body)
+
+        assert mock_s3.put_object.call_count > 3
+
+    @patch("deltacat.aws.s3u.s3_client_cache")
+    def test_upload_unexpected_error_code(self, mock_s3_client_cache):
+        uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
+        body = "test-body"
+        err = ClientError({"Error": {"Code": "UnexpectedError"}}, "put_object")
+        mock_s3_client_cache.return_value = mock_s3 = mock.MagicMock()
+        mock_s3.put_object.side_effect = err
+        file = None
+        with pytest.raises(NonRetryableError):
+            s3u.upload(uri, body)
+        assert file is None
+        assert mock_s3.put_object.call_count == 1
+
+    @patch("deltacat.aws.s3u.RETRY_STOP_AFTER_DELAY", 1)
+    @patch("deltacat.aws.s3u.s3_client_cache")
+    def test_download_throttled(self, mock_s3_client_cache):
+        uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
+        no_credentials_err = NoCredentialsError()
+        mock_s3_client_cache.return_value = mock_s3 = mock.MagicMock()
+        mock_s3.get_object.side_effect = no_credentials_err
+        file = None
+        with pytest.raises(RetryError):
+            file = s3u.download(uri)
+        assert file is None
+        assert mock_s3.get_object.call_count > 1
+
+    def test_download_not_exists(self):
+        uri = f"s3://{self.TEST_S3_BUCKET_NAME}/key-not-exists"
+        file = None
+        with pytest.raises(NonRetryableError):
+            file = s3u.download(uri)
+        assert file is None
+
+        file = s3u.download(uri, fail_if_not_found=False)
+        assert file is None
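The throttling tests above construct botocore ClientError instances by hand so the mocked client can raise them. A tiny standalone illustration of that error shape and of the error-code lookup the new _put_object helper performs on it; the local RETRYABLE_PUT_OBJECT_ERROR_CODES set simply mirrors the new constant:

# Sketch: the ClientError shape used in the mocks, and how the code is read back.
from botocore.exceptions import ClientError

RETRYABLE_PUT_OBJECT_ERROR_CODES = {"Throttling", "SlowDown"}  # mirrors deltacat.aws.constants

err = ClientError({"Error": {"Code": "Throttling"}}, "put_object")
try:
    raise err
except ClientError as e:
    code = e.response["Error"]["Code"]
    # "Throttling" is in the retryable set, so s3u._put_object would raise RetryableError
    # and tenacity would keep retrying until the stop condition is hit.
    print(code, code in RETRYABLE_PUT_OBJECT_ERROR_CODES)  # Throttling True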
{deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=01om7qgj3agAF60Q1qwZXAzsUtP7cabwc_1RXqRr0vw,1777
 deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=xqZf8CwysNYP2d39pf27OnXGStPREgBgIM-e2Tts-TI,199
 deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
 deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/aws/clients.py,sha256=VgddlV3AEjlBGIFmhhHxokYzwJ-lXnmHAeprVyADduI,6948
-deltacat/aws/constants.py,sha256=
-deltacat/aws/s3u.py,sha256=
+deltacat/aws/constants.py,sha256=OnRbtfFdu4buJEsl39Kg5cH-7A-dEL_ESeBSAlR_1Cs,501
+deltacat/aws/s3u.py,sha256=qZL5Omz1onW79vB_KrPHQ2Mox4sNPrLXkNxpFl9HFHM,26525
 deltacat/aws/redshift/__init__.py,sha256=7SvjG-dqox8zZUhFicTsUvpG5vXYDl_QQ3ohlHOgTKc,342
 deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/aws/redshift/model/manifest.py,sha256=ThgpdwzaWz493Zz9e8HSWwuxEheA1nDuypM3pe4vozk,12987
@@ -74,7 +74,7 @@ deltacat/compute/compactor_v2/utils/delta.py,sha256=8hjkDeIIkSX-gAQ2utQSp2sZcO2t
 deltacat/compute/compactor_v2/utils/io.py,sha256=autXlE3uHICdCCuJoS7mfdeJbRRiz2_xlz-3izlccB4,5264
 deltacat/compute/compactor_v2/utils/merge.py,sha256=hK4Y7acrtgfvWWTz-fAGznEg6qn6dBYu8blQUQVHhs0,5244
 deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=MAscmL35WfwN7Is72aFlD_cGhxtZgjRwwR5kS9Yn2uU,11393
-deltacat/compute/compactor_v2/utils/task_options.py,sha256=
+deltacat/compute/compactor_v2/utils/task_options.py,sha256=n1zKOFmAg2cL7CDpT9y9h-J0aYzTMtOdUjkDm1svo9k,14160
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
 deltacat/compute/merge_on_read/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -136,7 +136,7 @@ deltacat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_logs.py,sha256=6BEMw8VApFg2msFwCAVosz8NWJYATtX5furHyz8UluM,3828
 deltacat/tests/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/aws/test_clients.py,sha256=23GMWfz27WWBDXSqphG9mfputsyS7j3I5P_HRk4YoKE,3790
-deltacat/tests/aws/test_s3u.py,sha256=
+deltacat/tests/aws/test_s3u.py,sha256=aFvUa9f63hFU8T4r_cuKYxcFg6jVUoJWygiPwDUd09s,4654
 deltacat/tests/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/catalog/test_default_catalog_impl.py,sha256=9srCU5yQ159oZ9_PoJ_mWMzVUW5bKV0mnmPJc5zKCQQ,3125
 deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -206,8 +206,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=5eaBWTDm0IXVoc5Y6aacoVB-f0Mnv-K2ewyTSjHKHwM,5009
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.6.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.6.dist-info/METADATA,sha256=C5eD7a_S7Zxm5W6A5dBUGPKKBnwttmcu2qHELs6YImw,1780
+deltacat-1.1.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+deltacat-1.1.6.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.6.dist-info/RECORD,,
{deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/LICENSE
File without changes
{deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/WHEEL
File without changes
{deltacat-1.1.5.dist-info → deltacat-1.1.6.dist-info}/top_level.txt
File without changes