deltacat 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.5"
47
+ __version__ = "1.1.6"
48
48
 
49
49
 
50
50
  __all__ = [
deltacat/aws/constants.py CHANGED
@@ -1,8 +1,10 @@
1
- from typing import List
1
+ from typing import List, Set
2
2
 
3
3
  from deltacat.utils.common import env_integer, env_string
4
4
 
5
5
  DAFT_MAX_S3_CONNECTIONS_PER_FILE = env_integer("DAFT_MAX_S3_CONNECTIONS_PER_FILE", 8)
6
6
  BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 5)
7
7
  TIMEOUT_ERROR_CODES: List[str] = ["ReadTimeoutError", "ConnectTimeoutError"]
8
+ RETRYABLE_PUT_OBJECT_ERROR_CODES: Set[str] = {"Throttling", "SlowDown"}
8
9
  AWS_REGION = env_string("AWS_REGION", "us-east-1")
10
+ RETRY_STOP_AFTER_DELAY = env_integer("RETRY_STOP_AFTER_DELAY", 10 * 60)
deltacat/aws/s3u.py CHANGED
@@ -4,14 +4,18 @@ from functools import partial
4
4
  from typing import Any, Callable, Dict, Generator, List, Optional, Union
5
5
  from uuid import uuid4
6
6
  from botocore.config import Config
7
- from deltacat.aws.constants import BOTO_MAX_RETRIES
7
+ from deltacat.aws.constants import (
8
+ BOTO_MAX_RETRIES,
9
+ RETRY_STOP_AFTER_DELAY,
10
+ RETRYABLE_PUT_OBJECT_ERROR_CODES,
11
+ )
8
12
 
9
13
  import pyarrow as pa
10
14
  import ray
11
15
  import s3fs
12
16
  from boto3.resources.base import ServiceResource
13
17
  from botocore.client import BaseClient
14
- from botocore.exceptions import ClientError
18
+ from botocore.exceptions import ClientError, NoCredentialsError
15
19
  from ray.data.block import Block, BlockAccessor, BlockMetadata
16
20
  from ray.data.datasource import BlockWritePathProvider
17
21
  from ray.types import ObjectRef
@@ -315,7 +319,6 @@ def upload_sliced_table(
315
319
  **s3_client_kwargs,
316
320
  )
317
321
  manifest_entries.extend(slice_entries)
318
-
319
322
  return manifest_entries
320
323
 
321
324
 
@@ -504,41 +507,85 @@ def download_manifest_entries_distributed(
504
507
 
505
508
  def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
506
509
 
507
- # TODO (pdames): add tenacity retrying
508
510
  parsed_s3_url = parse_s3_url(s3_url)
509
511
  s3 = s3_client_cache(None, **s3_client_kwargs)
510
- return s3.put_object(
511
- Body=body,
512
- Bucket=parsed_s3_url.bucket,
513
- Key=parsed_s3_url.key,
512
+ retrying = Retrying(
513
+ wait=wait_random_exponential(multiplier=1, max=15),
514
+ stop=stop_after_delay(RETRY_STOP_AFTER_DELAY),
515
+ retry=retry_if_exception_type(RetryableError),
516
+ )
517
+ return retrying(
518
+ _put_object,
519
+ s3,
520
+ body,
521
+ parsed_s3_url.bucket,
522
+ parsed_s3_url.key,
514
523
  )
515
524
 
516
525
 
526
+ def _put_object(
527
+ s3_client, body: Any, bucket: str, key: str, **s3_put_object_kwargs
528
+ ) -> Dict[str, Any]:
529
+ try:
530
+ return s3_client.put_object(
531
+ Body=body, Bucket=bucket, Key=key, **s3_put_object_kwargs
532
+ )
533
+ except ClientError as e:
534
+ if e.response["Error"]["Code"] in RETRYABLE_PUT_OBJECT_ERROR_CODES:
535
+ raise RetryableError(
536
+ f"Retry upload for: {bucket}/{key} after receiving {e.response['Error']['Code']}"
537
+ ) from e
538
+ raise NonRetryableError(f"Failed table upload to: {bucket}/{key}") from e
539
+ except NoCredentialsError as e:
540
+ raise RetryableError(
541
+ f"Failed to fetch credentials when putting object into: {bucket}/{key}"
542
+ ) from e
543
+ except BaseException as e:
544
+ logger.error(
545
+ f"Upload has failed for {bucket}/{key}. Error: {e}",
546
+ exc_info=True,
547
+ )
548
+ raise NonRetryableError(f"Failed table upload to: {bucket}/{key}") from e
549
+
550
+
517
551
  def download(
518
552
  s3_url: str, fail_if_not_found: bool = True, **s3_client_kwargs
519
553
  ) -> Optional[Dict[str, Any]]:
520
554
 
521
- # TODO (pdames): add tenacity retrying
522
555
  parsed_s3_url = parse_s3_url(s3_url)
523
556
  s3 = s3_client_cache(None, **s3_client_kwargs)
557
+ retrying = Retrying(
558
+ wait=wait_random_exponential(multiplier=1, max=15),
559
+ stop=stop_after_delay(RETRY_STOP_AFTER_DELAY),
560
+ retry=retry_if_exception_type(RetryableError),
561
+ )
562
+ return retrying(
563
+ _get_object,
564
+ s3,
565
+ parsed_s3_url.bucket,
566
+ parsed_s3_url.key,
567
+ fail_if_not_found=fail_if_not_found,
568
+ )
569
+
570
+
571
+ def _get_object(s3_client, bucket: str, key: str, fail_if_not_found: bool = True):
524
572
  try:
525
- return s3.get_object(
526
- Bucket=parsed_s3_url.bucket,
527
- Key=parsed_s3_url.key,
573
+ return s3_client.get_object(
574
+ Bucket=bucket,
575
+ Key=key,
528
576
  )
529
577
  except ClientError as e:
530
- if fail_if_not_found:
531
- raise
532
- else:
533
- if e.response["Error"]["Code"] != "404":
534
- if e.response["Error"]["Code"] != "NoSuchKey":
535
- raise
536
- logger.info(f"file not found: {s3_url}")
537
- except s3.exceptions.NoSuchKey:
538
- if fail_if_not_found:
539
- raise
540
- else:
541
- logger.info(f"file not found: {s3_url}")
578
+ if e.response["Error"]["Code"] == "NoSuchKey":
579
+ if fail_if_not_found:
580
+ raise NonRetryableError(
581
+ f"Failed get object from: {bucket}/{key}"
582
+ ) from e
583
+ logger.info(f"file not found: {bucket}/{key}")
584
+ except NoCredentialsError as e:
585
+ raise RetryableError(
586
+ f"Failed to fetch credentials when getting object from: {bucket}/{key}"
587
+ ) from e
588
+
542
589
  return None
543
590
 
544
591
 
@@ -20,7 +20,6 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
20
20
  from deltacat.compute.compactor_v2.constants import (
21
21
  PARQUET_TO_PYARROW_INFLATION,
22
22
  )
23
-
24
23
  from daft.exceptions import DaftTransientError
25
24
 
26
25
 
@@ -65,7 +64,11 @@ def get_task_options(
65
64
  cpu: float, memory: float, ray_custom_resources: Optional[Dict] = None
66
65
  ) -> Dict:
67
66
 
68
- task_opts = {"num_cpus": cpu, "memory": memory}
67
+ # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
68
+ # not spin up enough nodes fast and hence we see only approximately
69
+ # 20 tasks get scheduled out of 100 tasks in queue.
70
+ # https://docs.ray.io/en/latest/ray-core/scheduling/index.html
71
+ task_opts = {"num_cpus": cpu, "memory": memory, "scheduling_strategy": "SPREAD"}
69
72
 
70
73
  if ray_custom_resources:
71
74
  task_opts["resources"] = ray_custom_resources
@@ -2,6 +2,21 @@ import unittest
2
2
  from deltacat.aws.s3u import UuidBlockWritePathProvider, CapturedBlockWritePaths
3
3
 
4
4
 
5
+ import os
6
+ from unittest import mock
7
+ from unittest.mock import patch
8
+
9
+ import boto3
10
+ import pytest
11
+ from boto3.resources.base import ServiceResource
12
+ from botocore.exceptions import ClientError, NoCredentialsError
13
+ from deltacat.exceptions import NonRetryableError
14
+ from moto import mock_s3
15
+ from tenacity import RetryError
16
+
17
+ from deltacat.aws import s3u
18
+
19
+
5
20
  class TestUuidBlockWritePathProvider(unittest.TestCase):
6
21
  def test_uuid_block_write_provider_sanity(self):
7
22
  capture_object = CapturedBlockWritePaths()
@@ -10,3 +25,100 @@ class TestUuidBlockWritePathProvider(unittest.TestCase):
10
25
  result = provider("base_path")
11
26
 
12
27
  self.assertRegex(result, r"^base_path/[\w-]{36}$")
28
+
29
+
30
+ class TestDownloadUpload(unittest.TestCase):
31
+ TEST_S3_BUCKET_NAME = "TEST_S3_BUCKET"
32
+ TEST_S3_KEY = "TEST_S3_KEY"
33
+
34
+ @pytest.fixture(autouse=True)
35
+ def mock_aws_credential(self):
36
+ os.environ["AWS_ACCESS_KEY_ID"] = "testing"
37
+ os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
38
+ os.environ["AWS_SECURITY_TOKEN"] = "testing"
39
+ os.environ["AWS_SESSION_TOKEN"] = "testing"
40
+ os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
41
+ yield
42
+
43
+ @pytest.fixture(autouse=True)
44
+ def setup_s3_resource(self):
45
+ with mock_s3():
46
+ yield boto3.resource("s3")
47
+
48
+ @pytest.fixture(autouse=True)
49
+ def setup_test_s3_bucket(self, setup_s3_resource: ServiceResource):
50
+ setup_s3_resource.create_bucket(
51
+ ACL="authenticated-read",
52
+ Bucket=self.TEST_S3_BUCKET_NAME,
53
+ )
54
+ yield
55
+
56
+ def test_sanity(self):
57
+ uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
58
+ body = "test-body"
59
+ uploaded_file = s3u.upload(uri, body)
60
+ assert uploaded_file is not None
61
+ assert uploaded_file["ResponseMetadata"]["HTTPStatusCode"] == 200
62
+ downloaded_file = s3u.download(uri)
63
+ downloaded_body = downloaded_file["Body"].read().decode("utf-8")
64
+ assert downloaded_file["ResponseMetadata"]["HTTPStatusCode"] == 200
65
+ assert downloaded_body == body
66
+
67
+ @patch("deltacat.aws.s3u.RETRY_STOP_AFTER_DELAY", 1)
68
+ @patch("deltacat.aws.s3u.s3_client_cache")
69
+ def test_upload_throttled(self, mock_s3_client_cache):
70
+ uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
71
+ body = "test-body"
72
+ throttling_err = ClientError({"Error": {"Code": "Throttling"}}, "put_object")
73
+ mock_s3_client_cache.return_value = mock_s3 = mock.MagicMock()
74
+ mock_s3.put_object.side_effect = throttling_err
75
+ with pytest.raises(RetryError):
76
+ s3u.upload(uri, body)
77
+
78
+ slowdown_err = ClientError({"Error": {"Code": "SlowDown"}}, "put_object")
79
+ mock_s3.put_object.side_effect = slowdown_err
80
+ with pytest.raises(RetryError):
81
+ s3u.upload(uri, body)
82
+
83
+ no_credentials_err = NoCredentialsError()
84
+ mock_s3.put_object.side_effect = no_credentials_err
85
+ with pytest.raises(RetryError):
86
+ s3u.upload(uri, body)
87
+
88
+ assert mock_s3.put_object.call_count > 3
89
+
90
+ @patch("deltacat.aws.s3u.s3_client_cache")
91
+ def test_upload_unexpected_error_code(self, mock_s3_client_cache):
92
+ uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
93
+ body = "test-body"
94
+ err = ClientError({"Error": {"Code": "UnexpectedError"}}, "put_object")
95
+ mock_s3_client_cache.return_value = mock_s3 = mock.MagicMock()
96
+ mock_s3.put_object.side_effect = err
97
+ file = None
98
+ with pytest.raises(NonRetryableError):
99
+ s3u.upload(uri, body)
100
+ assert file is None
101
+ assert mock_s3.put_object.call_count == 1
102
+
103
+ @patch("deltacat.aws.s3u.RETRY_STOP_AFTER_DELAY", 1)
104
+ @patch("deltacat.aws.s3u.s3_client_cache")
105
+ def test_download_throttled(self, mock_s3_client_cache):
106
+ uri = f"s3://{self.TEST_S3_BUCKET_NAME}/{self.TEST_S3_KEY}"
107
+ no_credentials_err = NoCredentialsError()
108
+ mock_s3_client_cache.return_value = mock_s3 = mock.MagicMock()
109
+ mock_s3.get_object.side_effect = no_credentials_err
110
+ file = None
111
+ with pytest.raises(RetryError):
112
+ file = s3u.download(uri)
113
+ assert file is None
114
+ assert mock_s3.get_object.call_count > 1
115
+
116
+ def test_download_not_exists(self):
117
+ uri = f"s3://{self.TEST_S3_BUCKET_NAME}/key-not-exists"
118
+ file = None
119
+ with pytest.raises(NonRetryableError):
120
+ file = s3u.download(uri)
121
+ assert file is None
122
+
123
+ file = s3u.download(uri, fail_if_not_found=False)
124
+ assert file is None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.5
3
+ Version: 1.1.6
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -1,11 +1,11 @@
1
- deltacat/__init__.py,sha256=i3bpmX1O0ZJD9XFiuqF0RHURR1lLKvdTBzVOP90fKQA,1777
1
+ deltacat/__init__.py,sha256=01om7qgj3agAF60Q1qwZXAzsUtP7cabwc_1RXqRr0vw,1777
2
2
  deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
3
3
  deltacat/exceptions.py,sha256=xqZf8CwysNYP2d39pf27OnXGStPREgBgIM-e2Tts-TI,199
4
4
  deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
5
5
  deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  deltacat/aws/clients.py,sha256=VgddlV3AEjlBGIFmhhHxokYzwJ-lXnmHAeprVyADduI,6948
7
- deltacat/aws/constants.py,sha256=aAhOKeLVgtpekA3h9otHUrHqY2bLDWs2QlL7GrdI63g,352
8
- deltacat/aws/s3u.py,sha256=lgoE6es6N4xfzwyydxmVspROP1hrNfanB6JqjyBoRb4,24859
7
+ deltacat/aws/constants.py,sha256=OnRbtfFdu4buJEsl39Kg5cH-7A-dEL_ESeBSAlR_1Cs,501
8
+ deltacat/aws/s3u.py,sha256=qZL5Omz1onW79vB_KrPHQ2Mox4sNPrLXkNxpFl9HFHM,26525
9
9
  deltacat/aws/redshift/__init__.py,sha256=7SvjG-dqox8zZUhFicTsUvpG5vXYDl_QQ3ohlHOgTKc,342
10
10
  deltacat/aws/redshift/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  deltacat/aws/redshift/model/manifest.py,sha256=ThgpdwzaWz493Zz9e8HSWwuxEheA1nDuypM3pe4vozk,12987
@@ -74,7 +74,7 @@ deltacat/compute/compactor_v2/utils/delta.py,sha256=8hjkDeIIkSX-gAQ2utQSp2sZcO2t
74
74
  deltacat/compute/compactor_v2/utils/io.py,sha256=autXlE3uHICdCCuJoS7mfdeJbRRiz2_xlz-3izlccB4,5264
75
75
  deltacat/compute/compactor_v2/utils/merge.py,sha256=hK4Y7acrtgfvWWTz-fAGznEg6qn6dBYu8blQUQVHhs0,5244
76
76
  deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=MAscmL35WfwN7Is72aFlD_cGhxtZgjRwwR5kS9Yn2uU,11393
77
- deltacat/compute/compactor_v2/utils/task_options.py,sha256=WK-1-1gncUSZI9yxBEG1AokDZKTBBhisYcYGijlitHY,13857
77
+ deltacat/compute/compactor_v2/utils/task_options.py,sha256=n1zKOFmAg2cL7CDpT9y9h-J0aYzTMtOdUjkDm1svo9k,14160
78
78
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
79
79
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
80
80
  deltacat/compute/merge_on_read/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -136,7 +136,7 @@ deltacat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
136
136
  deltacat/tests/test_logs.py,sha256=6BEMw8VApFg2msFwCAVosz8NWJYATtX5furHyz8UluM,3828
137
137
  deltacat/tests/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
138
  deltacat/tests/aws/test_clients.py,sha256=23GMWfz27WWBDXSqphG9mfputsyS7j3I5P_HRk4YoKE,3790
139
- deltacat/tests/aws/test_s3u.py,sha256=QflXbR94o7WobGBm6jhQDK5lJJD2Pd9z2uvi4J7WEJg,437
139
+ deltacat/tests/aws/test_s3u.py,sha256=aFvUa9f63hFU8T4r_cuKYxcFg6jVUoJWygiPwDUd09s,4654
140
140
  deltacat/tests/catalog/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
141
  deltacat/tests/catalog/test_default_catalog_impl.py,sha256=9srCU5yQ159oZ9_PoJ_mWMzVUW5bKV0mnmPJc5zKCQQ,3125
142
142
  deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -206,8 +206,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
206
206
  deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
207
207
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
208
208
  deltacat/utils/ray_utils/runtime.py,sha256=5eaBWTDm0IXVoc5Y6aacoVB-f0Mnv-K2ewyTSjHKHwM,5009
209
- deltacat-1.1.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
210
- deltacat-1.1.5.dist-info/METADATA,sha256=ZXriHPvPca0lz3LrEfynDq-u4dyxhGvhKaM3m9veiT0,1780
211
- deltacat-1.1.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
212
- deltacat-1.1.5.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
213
- deltacat-1.1.5.dist-info/RECORD,,
209
+ deltacat-1.1.6.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
210
+ deltacat-1.1.6.dist-info/METADATA,sha256=C5eD7a_S7Zxm5W6A5dBUGPKKBnwttmcu2qHELs6YImw,1780
211
+ deltacat-1.1.6.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
212
+ deltacat-1.1.6.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
213
+ deltacat-1.1.6.dist-info/RECORD,,