konduktor-nightly 0.1.0.dev20250409105017__py3-none-any.whl → 0.1.0.dev20250411104646__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +2 -2
- konduktor/adaptors/aws.py +221 -0
- konduktor/cli.py +3 -4
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1114 -0
- konduktor/data/data_utils.py +46 -1
- konduktor/data/registry.py +2 -1
- konduktor/data/storage.py +22 -8
- konduktor/task.py +13 -0
- konduktor/templates/pod.yaml.j2 +5 -1
- konduktor/utils/common_utils.py +29 -0
- {konduktor_nightly-0.1.0.dev20250409105017.dist-info → konduktor_nightly-0.1.0.dev20250411104646.dist-info}/METADATA +5 -1
- {konduktor_nightly-0.1.0.dev20250409105017.dist-info → konduktor_nightly-0.1.0.dev20250411104646.dist-info}/RECORD +16 -13
- {konduktor_nightly-0.1.0.dev20250409105017.dist-info → konduktor_nightly-0.1.0.dev20250411104646.dist-info}/LICENSE +0 -0
- {konduktor_nightly-0.1.0.dev20250409105017.dist-info → konduktor_nightly-0.1.0.dev20250411104646.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250409105017.dist-info → konduktor_nightly-0.1.0.dev20250411104646.dist-info}/entry_points.txt +0 -0
konduktor/data/aws/s3.py
ADDED
@@ -0,0 +1,1114 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Amazon Web Services (AWS) S3 Storage."""

import enum
import hashlib
import os
import re
import shlex
import subprocess
import tempfile
import time
import typing
import uuid
from typing import Any, Dict, List, Optional, Tuple

import boto3
import colorama

from konduktor import config, logging
from konduktor.adaptors import aws
from konduktor.data import constants, data_utils, storage_utils
from konduktor.utils import (
    annotations,
    base64_utils,
    common_utils,
    exceptions,
    kubernetes_utils,
    rich_utils,
    ux_utils,
)

logger = logging.get_logger(__name__)

# Maximum number of concurrent rsync upload processes
_MAX_CONCURRENT_UPLOADS = 32

_CREDENTIAL_FILES = ['credentials', 'config']

AWS_SECRET_NAME = 'awscredentials'
AWS_CREDENTIALS_KEY = 'awscredentials'

DEFAULT_AWS_CREDENTIALS_DIR = '~/.aws/'
DEFAULT_AWS_CREDENTIAL_PATH = '~/.aws/credentials'
DEFAULT_AWS_CONFIG_PATH = '~/.aws/config'

class AWSIdentityType(enum.Enum):
    """AWS identity type.

    The account type is determined by the current user identity, based on `aws
    configure list`. We will check the existence of the value in the output of
    `aws configure list` to determine the account type.
    """

    # Name Value Type Location
    # ---- ----- ---- --------
    # profile 1234 env ...
    # access_key ****************abcd sso
    # secret_key ****************abcd sso
    # region <not set> None None
    SSO = 'sso'
    ENV = 'env'
    IAM_ROLE = 'iam-role'
    CONTAINER_ROLE = 'container-role'
    CUSTOM_PROCESS = 'custom-process'
    ASSUME_ROLE = 'assume-role'

    # Name Value Type Location
    # ---- ----- ---- --------
    # profile <not set> None None
    # access_key ****************abcd shared-credentials-file
    # secret_key ****************abcd shared-credentials-file
    # region us-east-1 config-file ~/.aws/config
    SHARED_CREDENTIALS_FILE = 'shared-credentials-file'

    # IN GCS.PY
    def can_credential_expire(self) -> bool:
        """Check if the AWS identity type can expire.

        SSO, IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed
        automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived
        credentials without refresh.
        IAM ROLE:
        https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
        SSO/Container-role refresh token:
        https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html
        """
        # TODO(hong): Add a CLI based check for the expiration of the temporary
        # credentials
        expirable_types = {AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE}
        return self in expirable_types

class S3Store(storage_utils.AbstractStore):
    """S3Store inherits from Storage Object and represents the backend
    for S3 buckets.
    """

    # k8s secret name for aws credentials
    _AWS_SECRET_NAME = f'{AWS_SECRET_NAME}-{common_utils.user_and_hostname_hash()}'
    _AWS_CREDENTIALS_KEY = AWS_CREDENTIALS_KEY

    _DEFAULT_REGION = 'us-east-1'
    _ACCESS_DENIED_MESSAGE = 'Access Denied'
    _CUSTOM_ENDPOINT_REGIONS = [
        'ap-east-1',
        'me-south-1',
        'af-south-1',
        'eu-south-1',
        'eu-south-2',
        'ap-south-2',
        'ap-southeast-3',
        'ap-southeast-4',
        'me-central-1',
        'il-central-1',
    ]

    _INDENT_PREFIX = ' '

    _STATIC_CREDENTIAL_HELP_STR = (
        'Run the following commands:'
        f'\n{_INDENT_PREFIX} $ aws configure'
        f'\n{_INDENT_PREFIX} $ aws configure list '
        '# Ensure that this shows identity is set.'
        f'\n{_INDENT_PREFIX}For more info: '
        'https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html'  # pylint: disable=line-too-long
    )

    _REPR = 'S3Store'

    def __init__(
        self,
        name: str,
        source: str,
        region: Optional[str] = _DEFAULT_REGION,
        is_sky_managed: Optional[bool] = False,
        sync_on_reconstruction: Optional[bool] = True,
        _bucket_sub_path: Optional[str] = None,
    ):
        self.client: 'boto3.client.Client'
        self.bucket: 'constants.StorageHandle'
        if region in self._CUSTOM_ENDPOINT_REGIONS:
            logger.warning(
                'AWS opt-in regions are not supported for S3. '
                f'Falling back to default region '
                f'{self._DEFAULT_REGION} for bucket {name!r}.'
            )
            region = self._DEFAULT_REGION
        super().__init__(
            name,
            source,
            region,
            is_sky_managed,
            sync_on_reconstruction,
            _bucket_sub_path,
        )

    def __repr__(self):
        return self._REPR

    # IN GCS.PY
    def _validate(self):
        if self.source is not None and isinstance(self.source, str):
            if self.source.startswith('s3://'):
                assert self.name == data_utils.split_s3_path(self.source)[0], (
                    'S3 Bucket is specified as path, the name should be the'
                    ' same as S3 bucket.'
                )
                assert data_utils.verify_s3_bucket(self.name), (
                    f'Source specified as {self.source}, an S3 bucket. ',
                    'S3 Bucket should exist.',
                )
            # if self.source.startswith('gs://'):
            #     assert self.name == data_utils.split_gcs_path(self.source)[0], (
            #         'GCS Bucket is specified as path, the name should be '
            #         'the same as GCS bucket.'
            #     )
            # elif data_utils.is_az_container_endpoint(self.source):
            #     storage_account_name, container_name, _ = (
            #         data_utils.split_az_path(self.source))
            #     assert self.name == container_name, (
            #         'Azure bucket is specified as path, the name should be '
            #         'the same as Azure bucket.')
            #     assert data_utils.verify_az_bucket(
            #         storage_account_name, self.name), (
            #         f'Source specified as {self.source}, an Azure bucket. '
            #         'Azure bucket should exist.')
            # elif self.source.startswith('r2://'):
            #     assert self.name == data_utils.split_r2_path(self.source)[0], (
            #         'R2 Bucket is specified as path, the name should be '
            #         'the same as R2 bucket.')
            #     assert data_utils.verify_r2_bucket(self.name), (
            #         f'Source specified as {self.source}, a R2 bucket. ',
            #         'R2 Bucket should exist.')
            # elif self.source.startswith('cos://'):
            #     assert self.name == data_utils.split_cos_path(self.source)[0], (
            #         'COS Bucket is specified as path, the name should be '
            #         'the same as COS bucket.')
            #     assert data_utils.verify_ibm_cos_bucket(self.name), (
            #         f'Source specified as {self.source}, a COS bucket. ',
            #         'COS Bucket should exist.')
        # Validate name
        self.name = self.validate_name(self.name)
    # IN GCS.PY
    @classmethod
    def validate_name(cls, name: str) -> str:
        """Validates the name of the S3 store.

        Source for rules:
        https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html
        """

        def _raise_no_traceback_name_error(err_str):
            with ux_utils.print_exception_no_traceback():
                raise exceptions.StorageNameError(err_str)

        if name is not None and isinstance(name, str):
            # Check for overall length
            if not 3 <= len(name) <= 63:
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} must be between 3 (min) '
                    'and 63 (max) characters long.'
                )

            # Check for valid characters and start/end with a number or letter
            pattern = r'^[a-z0-9][-a-z0-9._]*[a-z0-9]$'
            if not re.match(pattern, name):
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} can consist only of '
                    'lowercase letters, numbers, dots (.), and hyphens (-). '
                    'It must begin and end with a letter or number.'
                )

            # Check for two adjacent periods
            if '..' in name:
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} must not contain '
                    'two adjacent periods.'
                )

            # Check for IP address format
            ip_pattern = r'^(?:\d{1,3}\.){3}\d{1,3}$'
            if re.match(ip_pattern, name):
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} must not be formatted as '
                    'an IP address (for example, 192.168.5.4).'
                )

            # Check for 'xn--' prefix
            if name.startswith('xn--'):
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} must not start with the '
                    'prefix "xn--".'
                )

            # Check for '-s3alias' suffix
            if name.endswith('-s3alias'):
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} must not end with the '
                    'suffix "-s3alias".'
                )

            # Check for '--ol-s3' suffix
            if name.endswith('--ol-s3'):
                _raise_no_traceback_name_error(
                    f'Invalid store name: name {name} must not end with the '
                    'suffix "--ol-s3".'
                )
        else:
            _raise_no_traceback_name_error('Store name must be specified.')
        return name

    # IN GCS.PY
    def initialize(self):
        """Initializes the S3 store object on the cloud.

        Initialization involves fetching bucket if exists, or creating it if
        it does not.

        Raises:
            StorageBucketCreateError: If bucket creation fails
            StorageBucketGetError: If fetching existing bucket fails
            StorageInitError: If general initialization fails.
        """
        self.client = data_utils.create_s3_client(self.region)
        self.bucket, is_new_bucket = self._get_bucket()
        if self.is_sky_managed is None:
            # If is_sky_managed is not specified, then this is a new storage
            # object (i.e., did not exist in global_user_state) and we should
            # set the is_sky_managed property.
            # If is_sky_managed is specified, then we take no action.
            self.is_sky_managed = is_new_bucket

    # IN GCS.PY
    def upload(self):
        """Uploads source to store bucket.

        Upload must be called by the Storage handler - it is not called on
        Store initialization.

        Raises:
            StorageUploadError: if upload fails.
        """
        try:
            if isinstance(self.source, list):
                self.batch_aws_rsync(self.source, create_dirs=True)
            elif self.source is not None:
                if self.source.startswith('s3://'):
                    pass
                # elif self.source.startswith('gs://'):
                #     self._transfer_to_s3()
                # elif self.source.startswith('r2://'):
                #     self._transfer_to_s3()
                else:
                    self.batch_aws_rsync([self.source])
        except exceptions.StorageUploadError:
            raise
        except Exception as e:
            raise exceptions.StorageUploadError(
                f'Upload failed for store {self.name}'
            ) from e

    # IN GCS.PY
    def delete(self) -> None:
        deleted_by_skypilot = self._delete_s3_bucket(self.name)
        if deleted_by_skypilot:
            msg_str = f'Deleted S3 bucket {self.name}.'
        else:
            msg_str = (
                f'S3 bucket {self.name} may have been deleted '
                f'externally. Removing from local state.'
            )
        logger.info(f'{colorama.Fore.GREEN}{msg_str}' f'{colorama.Style.RESET_ALL}')

    # IN GCS.PY
    def get_handle(self) -> 'constants.StorageHandle':
        return aws.resource('s3').Bucket(self.name)

    # FROM data/storage.py but matches GCS.PY batch_gsutil_rsync() (s3 specific)
    def batch_aws_rsync(
        self, source_path_list: List['constants.Path'], create_dirs: bool = False
    ) -> None:
        """Invokes aws s3 sync to batch upload a list of local paths to S3

        AWS Sync by default uses 10 threads to upload files to the bucket. To
        increase parallelism, modify max_concurrent_requests in your aws config
        file (Default path: ~/.aws/config).

        Since aws s3 sync does not support batch operations, we construct
        multiple commands to be run in parallel.

        Args:
            source_path_list: List of paths to local files or directories
            create_dirs: If the local_path is a directory and this is set to
                False, the contents of the directory are directly uploaded to
                root of the bucket. If the local_path is a directory and this is
                set to True, the directory is created in the bucket root and
                contents are uploaded to it.
        """
        sub_path = f'/{self._bucket_sub_path}' if self._bucket_sub_path else ''

        def get_file_sync_command(base_dir_path, file_names):
            includes = ' '.join(
                [f'--include {shlex.quote(file_name)}' for file_name in file_names]
            )
            base_dir_path = shlex.quote(base_dir_path)
            sync_command = (
                'aws s3 sync --no-follow-symlinks --exclude="*" '
                f'{includes} {base_dir_path} '
                f's3://{self.name}{sub_path}'
            )
            return sync_command

        def get_dir_sync_command(src_dir_path, dest_dir_name):
            # we exclude .git directory from the sync
            excluded_list = storage_utils.get_excluded_files(src_dir_path)
            excluded_list.append('.git/*')
            excludes = ' '.join(
                [f'--exclude {shlex.quote(file_name)}' for file_name in excluded_list]
            )
            src_dir_path = shlex.quote(src_dir_path)
            sync_command = (
                f'aws s3 sync --no-follow-symlinks {excludes} '
                f'{src_dir_path} '
                f's3://{self.name}{sub_path}/{dest_dir_name}'
            )
            return sync_command

        # Generate message for upload
        if len(source_path_list) > 1:
            source_message = f'{len(source_path_list)} paths'
        else:
            source_message = source_path_list[0]

        log_path = logging.generate_tmp_logging_file_path(
            constants._STORAGE_LOG_FILE_NAME
        )
        sync_path = f'{source_message} -> s3://{self.name}{sub_path}/'
        with rich_utils.safe_status(
            ux_utils.spinner_message(f'Syncing {sync_path}', log_path=log_path)
        ):
            data_utils.parallel_upload(
                source_path_list,
                get_file_sync_command,
                get_dir_sync_command,
                log_path,
                self.name,
                self._ACCESS_DENIED_MESSAGE,
                create_dirs=create_dirs,
                max_concurrent_uploads=_MAX_CONCURRENT_UPLOADS,
            )
            logger.info(
                ux_utils.finishing_message(f'Storage synced: {sync_path}', log_path)
            )
    # IN GCS.PY
    def _get_bucket(self) -> Tuple['constants.StorageHandle', bool]:
        """Obtains the S3 bucket.

        If the bucket exists, this method will return the bucket.
        If the bucket does not exist, there are three cases:
          1) Raise an error if the bucket source starts with s3://
          2) Return None if bucket has been externally deleted and
             sync_on_reconstruction is False
          3) Create and return a new bucket otherwise

        Raises:
            StorageSpecError: If externally created bucket is attempted to be
                mounted without specifying storage source.
            StorageBucketCreateError: If creating the bucket fails
            StorageBucketGetError: If fetching a bucket fails
            StorageExternalDeletionError: If externally deleted storage is
                attempted to be fetched while reconstructing the storage for
                'sky storage delete' or 'sky start'
        """
        s3 = aws.resource('s3')
        bucket = s3.Bucket(self.name)

        try:
            # Try Public bucket case.
            # This line does not error out if the bucket is an external public
            # bucket or if it is a user's bucket that is publicly
            # accessible.
            self.client.head_bucket(Bucket=self.name)
            self._validate_existing_bucket()
            return bucket, False
        except aws.botocore_exceptions().ClientError as e:
            error_code = e.response['Error']['Code']
            # AccessDenied error for buckets that are private and not owned by
            # user.
            if error_code == '403':
                command = f'aws s3 ls {self.name}'
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageBucketGetError(
                        f'Bucket {self.name} does not exist.'
                        + f' To debug, consider running `{command}`.'
                    ) from e

        if isinstance(self.source, str) and self.source.startswith('s3://'):
            with ux_utils.print_exception_no_traceback():
                raise exceptions.StorageBucketGetError(
                    'Attempted to use a non-existent bucket as a source: '
                    f'{self.source}. Consider using `aws s3 ls '
                    f'{self.source}` to debug.'
                )

        # If bucket cannot be found in both private and public settings,
        # the bucket is to be created by Sky. However, creation is skipped if
        # Store object is being reconstructed for deletion or re-mount with
        # sky start, and error is raised instead.
        if self.sync_on_reconstruction:
            bucket = self._create_s3_bucket(self.name, self.region)
            return bucket, True
        else:
            # Raised when Storage object is reconstructed for sky storage
            # delete or to re-mount Storages with sky start but the storage
            # is already removed externally.
            raise exceptions.StorageExternalDeletionError(
                'Attempted to fetch a non-existent bucket: ' f'{self.name}'
            )

    # IN GCS.PY
    def _download_file(self, remote_path: str, local_path: str) -> None:
        """Downloads file from remote to local on s3 bucket
        using the boto3 API

        Args:
            remote_path: str; Remote path on S3 bucket
            local_path: str; Local path on user's device
        """
        self.bucket.download_file(remote_path, local_path)

    # IN GCS.PY
    def _create_s3_bucket(
        self, bucket_name: str, region=_DEFAULT_REGION
    ) -> 'constants.StorageHandle':
        """Creates S3 bucket with specific name in specific region

        Args:
            bucket_name: str; Name of bucket
            region: str; Region name, e.g. us-west-1, us-east-2
        Raises:
            StorageBucketCreateError: If bucket creation fails.
        """
        s3_client = self.client
        try:
            create_bucket_config: Dict[str, Any] = {'Bucket': bucket_name}
            # If default us-east-1 region of create_bucket API is used,
            # the LocationConstraint must not be specified.
            # Reference: https://stackoverflow.com/a/51912090
            if region is not None and region != 'us-east-1':
                create_bucket_config['CreateBucketConfiguration'] = {
                    'LocationConstraint': region
                }
            s3_client.create_bucket(**create_bucket_config)
            logger.info(
                f' {colorama.Style.DIM}Created S3 bucket {bucket_name!r} in '
                f'{region or "us-east-1"}{colorama.Style.RESET_ALL}'
            )

            # Add AWS tags configured in config.yaml to the bucket.
            # This is useful for cost tracking and external cleanup.
            bucket_tags = config.get_nested(('aws', 'labels'), {})
            if bucket_tags:
                s3_client.put_bucket_tagging(
                    Bucket=bucket_name,
                    Tagging={
                        'TagSet': [
                            {'Key': k, 'Value': v} for k, v in bucket_tags.items()
                        ]
                    },
                )

        except aws.botocore_exceptions().ClientError as e:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.StorageBucketCreateError(
                    f'Attempted to create a bucket {self.name} but failed.'
                ) from e
        return aws.resource('s3').Bucket(bucket_name)

    # NOT IN GCS.PY but FROM data/storage.py (s3 specific)
    def _execute_s3_remove_command(
        self, command: str, bucket_name: str, hint_operating: str, hint_failed: str
    ) -> bool:
        try:
            with rich_utils.safe_status(ux_utils.spinner_message(hint_operating)):
                subprocess.check_output(command.split(' '), stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            if 'NoSuchBucket' in e.output.decode('utf-8'):
                logger.debug(f'Bucket {bucket_name} does not exist.')
                return False
            else:
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageBucketDeleteError(
                        f'{hint_failed}' f'Detailed error: {e.output}'
                    )
        return True
    # IN GCS.PY
    def _delete_s3_bucket(self, bucket_name: str) -> bool:
        """Deletes S3 bucket, including all objects in bucket

        Args:
            bucket_name: str; Name of bucket

        Returns:
            bool; True if bucket was deleted, False if it was deleted externally.

        Raises:
            StorageBucketDeleteError: If deleting the bucket fails.
        """
        # Deleting objects is very slow programmatically
        # (i.e. bucket.objects.all().delete() is slow).
        # In addition, standard delete operations (i.e. via `aws s3 rm`)
        # are slow, since AWS puts deletion markers.
        # https://stackoverflow.com/questions/49239351/why-is-it-so-much-slower-to-delete-objects-in-aws-s3-than-it-is-to-create-them
        # The fastest way to delete is to run `aws s3 rb --force`,
        # which removes the bucket by force.
        remove_command = f'aws s3 rb s3://{bucket_name} --force'
        success = self._execute_s3_remove_command(
            remove_command,
            bucket_name,
            f'Deleting S3 bucket [green]{bucket_name}[/]',
            f'Failed to delete S3 bucket {bucket_name}.',
        )
        if not success:
            return False

        # Wait until bucket deletion propagates on AWS servers
        while data_utils.verify_s3_bucket(bucket_name):
            time.sleep(0.1)
        return True

    # NOT IN GCS.PY but FROM data/storage.py (s3 specific)
    def _delete_s3_bucket_sub_path(self, bucket_name: str, sub_path: str) -> bool:
        """Deletes the sub path from the bucket."""
        remove_command = f'aws s3 rm s3://{bucket_name}/{sub_path}/ --recursive'
        return self._execute_s3_remove_command(
            remove_command,
            bucket_name,
            f'Removing objects from S3 bucket ' f'[green]{bucket_name}/{sub_path}[/]',
            f'Failed to remove objects from S3 bucket {bucket_name}/{sub_path}.',
        )
    @classmethod
    @annotations.lru_cache(scope='global', maxsize=1)
    def _aws_configure_list(cls) -> Optional[bytes]:
        proc = subprocess.run(
            'aws configure list',
            shell=True,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if proc.returncode != 0:
            return None
        return proc.stdout

    @classmethod
    def _sso_credentials_help_str(cls, expired: bool = False) -> str:
        help_str = 'Run the following commands (must use AWS CLI v2):'
        if not expired:
            help_str += f'\n{cls._INDENT_PREFIX} $ aws configure sso'
        help_str += (
            f'\n{cls._INDENT_PREFIX} $ aws sso login --profile <profile_name>'
            f'\n{cls._INDENT_PREFIX}For more info: '
            'https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-sso.html'
        )
        return help_str

    @classmethod
    @annotations.lru_cache(
        scope='global', maxsize=1
    )  # Cache since getting identity is slow.
    def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
        try:
            sts = aws.client('sts', check_credentials=False)
            # The caller identity contains 3 fields: UserId, Account, Arn.
            # 1. 'UserId' is unique across all AWS entity, which looks like
            # "AROADBQP57FF2AEXAMPLE:role-session-name"
            # 2. 'Account' can be shared by multiple users under the same
            # organization
            # 3. 'Arn' is the full path to the user, which can be reused when
            # the user is deleted and recreated.
            # Refer to: <https://docs.aws.amazon.com/cli/latest/reference/sts/get-caller-identity.html>
            # and <https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_variables.html#principaltable>
            user_info = sts.get_caller_identity()
            # Allow fallback to AccountId if UserId does not match, because:
            # 1. In the case where multiple IAM users belong to a single root account,
            # those users normally share the visibility of the VMs, so we do not
            # need to identify them with each other. (There can be some cases,
            # when an IAM user is given a limited permission by the admin, we may
            # ignore that case for now, or print out a warning if the underlying
            # userid changed for a cluster).
            # 2. In the case where the multiple users belong to an organization,
            # those users will have different account id, so fallback works.
            user_ids = [user_info['UserId'], user_info['Account']]
        except aws.botocore_exceptions().NoCredentialsError as e:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.CloudUserIdentityError(
                    'AWS credentials are not set. '
                    f'{cls._STATIC_CREDENTIAL_HELP_STR}\n'
                    f'{cls._INDENT_PREFIX}Details: `aws sts '
                    'get-caller-identity` failed with error:'
                    f' {common_utils.format_exception(e, use_bracket=True)}.'
                ) from None
        except aws.botocore_exceptions().ClientError as e:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.CloudUserIdentityError(
                    'Failed to access AWS services with credentials. '
                    'Make sure that the access and secret keys are correct.'
                    f' {cls._STATIC_CREDENTIAL_HELP_STR}\n'
                    f'{cls._INDENT_PREFIX}Details: `aws sts '
                    'get-caller-identity` failed with error:'
                    f' {common_utils.format_exception(e, use_bracket=True)}.'
                ) from None
        except aws.botocore_exceptions().InvalidConfigError as e:
            # pylint: disable=import-outside-toplevel
            import awscli
            from packaging import version

            awscli_version = version.parse(awscli.__version__)
            if awscli_version < version.parse(
                '1.27.10'
            ) and 'configured to use SSO' in str(e):
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.CloudUserIdentityError(
                        'awscli is too old to use SSO.'
                        'Run the following command to upgrade:'
                        f'\n{cls._INDENT_PREFIX} $ pip install awscli>=1.27.10'
                        f'\n{cls._INDENT_PREFIX}You may need to log into SSO again '
                        f'after upgrading. {cls._sso_credentials_help_str()}'
                    ) from None
            with ux_utils.print_exception_no_traceback():
                raise exceptions.CloudUserIdentityError(
                    f'Invalid AWS configuration.\n'
                    f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
                ) from None
        except aws.botocore_exceptions().TokenRetrievalError:
            # This is raised when the access token is expired, which mainly
            # happens when the user is using temporary credentials or SSO
            # login.
            with ux_utils.print_exception_no_traceback():
                raise exceptions.CloudUserIdentityError(
                    'AWS access token is expired.'
                    f' {cls._sso_credentials_help_str(expired=True)}'
                ) from None
        except Exception as e:  # pylint: disable=broad-except
            with ux_utils.print_exception_no_traceback():
                raise exceptions.CloudUserIdentityError(
                    f'Failed to get AWS user.\n'
                    f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
                ) from None
        # TODO: Return a list of identities in the profile when we support
        # automatic switching for AWS. Currently we only support one identity.
        return [user_ids]

    # IN GCS.PY
    @classmethod
    @annotations.lru_cache(
        scope='global', maxsize=1
    )  # Cache since getting identity is slow.
    def get_user_identities(cls) -> List[List[str]]:
        """Returns a [UserId, Account] list that uniquely identifies the user.

        These fields come from `aws sts get-caller-identity` and are cached
        locally by `aws configure list` output. The identities are assumed to
        be stable for the duration of the `sky` process. Modifying the
        credentials while the `sky` process is running will not affect the
        identity returned by this function.

        We permit the same actual user to:

        - switch between different root accounts (after which both elements
          of the list will be different) and have their clusters owned by
          each account be protected; or

        - within the same root account, switch between different IAM
          users, and treat [user_id=1234, account=A] and
          [user_id=4567, account=A] to be the *same*. Namely, switching
          between these IAM roles within the same root account will cause
          the first element of the returned list to differ, and will allow
          the same actual user to continue to interact with their clusters.
          Note: this is not 100% safe, since the IAM users can have very
          specific permissions, that disallow them to access the clusters
          but it is a reasonable compromise as that could be rare.

        Returns:
            A list of strings that uniquely identifies the user on this cloud.
            For identity check, we will fallback through the list of strings
            until we find a match, and print a warning if we fail for the
            first string.

        Raises:
            exceptions.CloudUserIdentityError: if the user identity cannot be
                retrieved.
        """
        stdout = cls._aws_configure_list()
        if stdout is None:
            # `aws configure list` is not available, possible reasons:
            # - awscli is not installed but credentials are valid, e.g. run from
            #   an EC2 instance with IAM role
            # - aws credentials are not set, proceed anyway to get unified error
            #   message for users
            return cls._sts_get_caller_identity()
        config_hash = hashlib.md5(stdout).hexdigest()[:8]  # noqa: F841
        # Getting aws identity costs ~1s, so we cache the result with the output of
        # `aws configure list` as cache key. Different `aws configure list` output
        # can have same aws identity, our assumption is the output would be stable
        # in real world, so the number of cache files would be limited.
        # TODO(aylei): consider using a more stable cache key and evaluate eviction.
        # TODO:(ryan) ??? Ignoring caching for now (returning early)
        return cls._sts_get_caller_identity()
        # cache_path = catalog_common.get_catalog_path(
        #     f'aws/.cache/user-identity-{config_hash}.txt')
        # if os.path.exists(cache_path):
        #     try:
        #         with open(cache_path, 'r', encoding='utf-8') as f:
        #             return json.loads(f.read())
        #     except json.JSONDecodeError:
        #         # cache is invalid, ignore it and fetch identity again
        #         pass
        #
        # result = cls._sts_get_caller_identity()
        # with open(cache_path, 'w', encoding='utf-8') as f:
        #     f.write(json.dumps(result))
        # return result
    # IN GCS.PY
    @classmethod
    def get_active_user_identity_str(cls) -> Optional[str]:
        user_identity = cls.get_active_user_identity()
        if user_identity is None:
            return None
        identity_str = f'{user_identity[0]} [account={user_identity[1]}]'
        return identity_str

    # IN GCS.PY
    @classmethod
    @annotations.lru_cache(scope='global', maxsize=1)
    def check_credentials(cls) -> Tuple[bool, Optional[str]]:
        """Checks if the user has access credentials to AWS."""

        dependency_installation_hints = (
            'AWS dependencies are not installed. '
            'Run the following commands:'
            f'\n{cls._INDENT_PREFIX} $ pip install boto3 botocore awscli'
            f'\n{cls._INDENT_PREFIX}Credentials may also need to be set. '
            f'{cls._STATIC_CREDENTIAL_HELP_STR}'
        )

        # Checks if the AWS CLI is installed properly
        proc = subprocess.run(
            'aws --version',
            shell=True,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if proc.returncode != 0:
            return False, dependency_installation_hints
        try:
            # Checks if aws boto is installed properly
            # pylint: disable=import-outside-toplevel, unused-import
            import boto3  # noqa: F401
            import botocore  # noqa: F401
        except ImportError:
            return False, dependency_installation_hints

        # Checks if AWS credentials 1) exist and 2) are valid.
        # https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
        try:
            identity_str = cls.get_active_user_identity_str()  # noqa: F841
        except exceptions.CloudUserIdentityError as e:
            return False, str(e)

        static_credential_exists = os.path.isfile(
            os.path.expanduser('~/.aws/credentials')
        )
        hints = None
        identity_type = cls._current_identity_type()
        single_cloud_hint = (
            ' It will work if you use AWS only, but will cause problems '
            'if you want to use multiple clouds. To set up static credentials, '
            'try: aws configure'
        )
        if identity_type == AWSIdentityType.SSO:
            hints = 'AWS SSO is set.'
            if static_credential_exists:
                hints += (
                    ' To ensure multiple clouds work correctly, please use SkyPilot '
                    'with static credentials (e.g., ~/.aws/credentials) by unsetting '
                    'the AWS_PROFILE environment variable.'
                )
            else:
                hints += single_cloud_hint
        elif identity_type == AWSIdentityType.IAM_ROLE:
            # When using an IAM role, the credentials may not exist in the
            # ~/.aws/credentials file. So we don't check for the existence of the
            # file. This will happen when the user is on a VM (or
            # jobs-controller) created by an SSO account, i.e. the VM will be
            # assigned the IAM role: skypilot-v1.
            hints = f'AWS IAM role is set.{single_cloud_hint}'
        elif identity_type == AWSIdentityType.CONTAINER_ROLE:
            # Similar to the IAM ROLE, an ECS container may not store credentials
            # in the ~/.aws/credentials file. So we don't check for the existence of
            # the file. i.e. the container will be assigned the IAM role of the
            # task: skypilot-v1.
            hints = f'AWS container-role is set.{single_cloud_hint}'
        elif identity_type == AWSIdentityType.CUSTOM_PROCESS:
            # Similar to the IAM ROLE, a custom process may not store credentials
            # in the ~/.aws/credentials file. So we don't check for the existence of
            # the file. i.e. the custom process will be assigned the IAM role of the
            # task: skypilot-v1.
            hints = f'AWS custom-process is set.{single_cloud_hint}'
        elif identity_type == AWSIdentityType.ASSUME_ROLE:
            # When using ASSUME ROLE, the credentials are coming from a different
            # source profile. So we don't check for the existence of ~/.aws/credentials.
            # i.e. the assumed role will be assigned the IAM role of the
            # task: skypilot-v1.
            hints = f'AWS assume-role is set.{single_cloud_hint}'
        elif identity_type == AWSIdentityType.ENV:
            # When using ENV vars, the credentials are coming from the environment
            # variables. So we don't check for the existence of ~/.aws/credentials.
            # i.e. the identity is not determined by the file.
            hints = f'AWS env is set.{single_cloud_hint}'
        else:
            # This file is required because it is required by the VMs launched on
            # other clouds to access private s3 buckets and resources like EC2.
            # `get_active_user_identity` does not guarantee this file exists.
            if not static_credential_exists:
                return (
                    False,
                    '~/.aws/credentials does not exist. '
                    + cls._STATIC_CREDENTIAL_HELP_STR,
                )

        try:
            s3 = boto3.client('s3')

            suffix = uuid.uuid4().hex[:6]
            test_bucket = f'konduktor-check-s3-{int(time.time())}-{suffix}'

            try:
                s3.create_bucket(Bucket=test_bucket)

                time.sleep(1)

                s3.get_bucket_location(Bucket=test_bucket)
                s3.list_objects_v2(Bucket=test_bucket, MaxKeys=1)

                # Object-related checks
                s3.put_object(Bucket=test_bucket, Key='test.txt', Body='hello')
                s3.get_object(Bucket=test_bucket, Key='test.txt')
                s3.delete_object(Bucket=test_bucket, Key='test.txt')

            finally:
                # Always attempt to clean up, even if earlier checks failed
                try:
                    s3.delete_bucket(Bucket=test_bucket)
                except Exception:
                    raise RuntimeError(
                        'AWS S3 delete bucket permission is missing. '
                        'Please check your policy.\n'
                    )

        except Exception:
            return False, (
                'One or more AWS S3 permissions are missing. '
                'Please check your policy.\n'
            )

        logger.info(
            f'AWS credentials are valid '
            f'for the current identity {logging.CHECK_MARK_EMOJI}'
        )
        logger.info('Creating k8s secret with AWS credentials...')
        set_ok, result = cls.set_secret_credentials()
        if not set_ok:
            logger.error(f'Failed to create k8s secret with AWS credentials: {result}')
            return False, result
        return True, hints
    @classmethod
    def _current_identity_type(cls) -> Optional[AWSIdentityType]:
        stdout = cls._aws_configure_list()
        if stdout is None:
            return None
        output = stdout.decode()

        # We determine the identity type by looking at the output of
        # `aws configure list`. The output looks like:
        # Name Value Type Location
        # ---- ----- ---- --------
        # profile <not set> None None
        # access_key * <not set> sso None
        # secret_key * <not set> sso None
        # region <not set> None None
        # We try to determine the identity type by looking for the
        # string "sso"/"iam-role" in the output, i.e. the "Type" column.

        def _is_access_key_of_type(type_str: str) -> bool:
            # The dot (.) does not match line separators.
            results = re.findall(rf'access_key.*{type_str}', output)
            if len(results) > 1:
                raise RuntimeError(f'Unexpected `aws configure list` output:\n{output}')
            return len(results) == 1

        for identity_type in AWSIdentityType:
            if _is_access_key_of_type(identity_type.value):
                return identity_type
        return AWSIdentityType.SHARED_CREDENTIALS_FILE

    # IN GCS.PY
    @classmethod
    def set_secret_credentials(cls) -> Tuple[bool, Optional[str]]:
        """
        Set the k8s secret storing the AWS credentials
        """
        context = kubernetes_utils.get_current_kube_config_context_name()
        namespace = kubernetes_utils.get_kube_config_context_namespace()

        # Check if credentials are provided via environment
        access_key = os.environ.get('AWS_ACCESS_KEY_ID')
        secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
        region = os.environ.get('AWS_DEFAULT_REGION', 'us-east-1')

        if access_key and secret_key:
            logger.info('Using AWS credentials from env')
            credentials_dir = tempfile.mkdtemp()
            credentials_path = os.path.join(credentials_dir, 'credentials')
            config_path = os.path.join(credentials_dir, 'config')

            with open(credentials_path, 'w') as f:
                f.write(f"""[default]
aws_access_key_id = {access_key}
aws_secret_access_key = {secret_key}
""")

            with open(config_path, 'w') as f:
                f.write(f"""[default]
region = {region}
""")
        else:
            logger.info('Using AWS credentials from ~/.aws/')
            credentials_dir = DEFAULT_AWS_CREDENTIALS_DIR

        credentials_files = [
            os.path.expanduser(os.path.join(credentials_dir, f))
            for f in _CREDENTIAL_FILES
        ]
        ok, result = kubernetes_utils.set_secret(
            secret_name=cls._AWS_SECRET_NAME,
            namespace=namespace,
            context=context,
            secret_key=cls._AWS_CREDENTIALS_KEY,
            secret_value=base64_utils.zip_base64encode(credentials_files),
        )
        if not ok:
            logger.error(f'Failed to set AWS credentials in k8s secret: \n{result}')
            return False, result
        else:
            logger.info(
                f'AWS credentials set in k8s secret: {cls._AWS_SECRET_NAME} '
                f'in namespace {namespace} in context {context} '
                f'{logging.CHECK_MARK_EMOJI}'
            )

        try:
            identity = boto3.client('sts').get_caller_identity()
            logger.info(
                f"Using AWS credentials for ARN: {identity['Arn']} "
                f"(UserId: {identity['UserId']}, Account: {identity['Account']})"
            )
        except Exception as e:
            logger.warning(f'Could not fetch caller identity: {e}')

        return True, None
    # IN GCS.PY
    @classmethod
    def get_k8s_credential_name(cls) -> str:
        return cls._AWS_SECRET_NAME


class S3CloudStorage(storage_utils.CloudStorage):
    """S3 Storage."""

    # List of commands to install AWS CLI
    _GET_AWSCLI = [
        'command -v aws >/dev/null 2>&1 || ('
        'apt-get update && apt-get install -y curl unzip && '
        'curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && '  # noqa: E501
        'unzip awscliv2.zip && '
        './aws/install -i ~/aws-cli -b ~/bin && '
        'export PATH=$HOME/bin:$PATH && '
        'rm -rf aws awscliv2.zip'
        ') && export PATH=$HOME/bin:$PATH'
    ]

    _STORE: typing.Type[storage_utils.AbstractStore] = S3Store

    # IN GCS.PY
    def is_directory(self, url: str) -> bool:
        """Returns whether S3 'url' is a directory.

        In cloud object stores, a "directory" refers to a regular object whose
        name is a prefix of other objects.
        """
        s3 = aws.resource('s3')
        bucket_name, path = data_utils.split_s3_path(url)
        bucket = s3.Bucket(bucket_name)

        num_objects = 0
        for obj in bucket.objects.filter(Prefix=path):
            num_objects += 1
            if obj.key == path:
                return False
            # If there are more than 3 objects in filter, then it is a directory
            if num_objects == 3:
                return True

        # A directory with few or no items
        return True

    # IN GCS.PY
    def make_sync_dir_command(self, source: str, destination: str) -> str:
        """Downloads using AWS CLI."""
        # AWS Sync by default uses 10 threads to upload files to the bucket.
        # To increase parallelism, modify max_concurrent_requests in your
        # aws config file (Default path: ~/.aws/config).
        all_commands = list(self._GET_AWSCLI)

        all_commands.append(f'aws s3 sync --no-follow-symlinks {source} {destination}')
        return ' && '.join(all_commands)

    # IN GCS.PY
    def make_sync_file_command(self, source: str, destination: str) -> str:
        all_commands = list(self._GET_AWSCLI)

        all_commands.append(f'aws s3 cp {source} {destination}')
        return ' && '.join(all_commands)