konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/data/storage.py
@@ -0,0 +1,799 @@
# Proprietary Changes made for Trainy under the Trainy Software License
# Original source: skypilot: https://github.com/skypilot-org/skypilot
# which is Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Different cloud storage definitions. This modules responsibility
1.) Create the secrets for each cloud as k8s secrets
2.) Mount the secrets as volumes into each container
3.) Provide utilities/scripts for the pods to download files syncd
to object storage

For each cloud/storage class we'll only have a single namespace at
`konduktor` and each run will correspond to a new folder e.g.
`s3://konduktor/my-llm-run-a34be-a3ebf`
"""

import enum
import os
import re
import urllib.parse
from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union

from konduktor import check, config, logging
from konduktor.data import constants, data_utils, gcp, storage_utils
from konduktor.utils import annotations, common_utils, exceptions, schemas, ux_utils

logger = logging.get_logger(__file__)


@annotations.lru_cache(scope='global')
def get_cached_enabled_storage_clouds_or_refresh(
    raise_if_no_cloud_access: bool = False,
) -> List[str]:
    # This is a temporary solution until https://github.com/skypilot-org/skypilot/issues/1943 # noqa: E501
    # (asaiacai): This function does not do any actual checking right now.
    # this is temporary.In the future, we can cache to disk.
    # For now, we just print a warning to the user saying what
    # clouds are enabled and if the task fails to run `konduktor check`
    # to update the credentials.
    enabled_clouds = config.get_nested(('allowed_clouds',), [])
    if len(enabled_clouds) == 0:
        enabled_clouds = constants.STORE_ENABLED_CLOUDS
    else:
        enabled_clouds = [str(cloud) for cloud in enabled_clouds]
    logger.warning(
        f'Enabled storage clouds: {enabled_clouds}. Defaulting to '
        f'{enabled_clouds[0]}. If sync fails, '
        're-run `konduktor check` to verify credentials.'
    )
    return enabled_clouds


def _is_storage_cloud_enabled(
    cloud_name: str, try_fix_with_sky_check: bool = True
) -> bool:
    enabled_storage_clouds = get_cached_enabled_storage_clouds_or_refresh()
    if cloud_name in enabled_storage_clouds:
        return True
    if try_fix_with_sky_check:
        # TODO(zhwu): Only check the specified cloud to speed up.
        check.check(quiet=True)
        return _is_storage_cloud_enabled(cloud_name, try_fix_with_sky_check=False)
    return False
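A minimal sketch of how these two helpers behave, assuming the konduktor config sets `allowed_clouds: [gs]` (the value is illustrative and not part of the package source):

# With allowed_clouds set, only those entries are reported as enabled;
# with it unset, the list falls back to constants.STORE_ENABLED_CLOUDS.
enabled = get_cached_enabled_storage_clouds_or_refresh()
print(enabled)  # e.g. ['gs']

# Retries once via check.check() before giving up, mirroring the
# 're-run `konduktor check`' advice in the warning above.
print(_is_storage_cloud_enabled('gs'))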


class StorageMode(enum.Enum):
    COPY = 'COPY'
    MOUNT = 'MOUNT'
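COPY pre-fetches the data onto the remote machine's disk, while MOUNT exposes the backing bucket read-write (see the Storage class docstring below). A minimal sketch, not part of the package source, using the Storage class defined later in this file and illustrative names:

# COPY: upload ~/ckpts to the bucket and copy it down at run time.
ckpts = Storage(name='my-ckpts', source='~/ckpts', mode=StorageMode.COPY)

# MOUNT: provision (or reuse) an empty bucket and mount it read-write,
# so files written under the mount path land in the bucket.
outputs = Storage(name='my-outputs', mode=StorageMode.MOUNT)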


class StoreType(enum.Enum):
    """Enum for the different types of stores."""

    GCS = 'GCS'

    @classmethod
    def from_cloud(cls, cloud: str) -> 'StoreType':
        # these need to match the cloud store classes in konduktor/cloud_stores.py
        if cloud.lower() == 'gs':
            return StoreType.GCS
        else:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Unknown cloud: {cloud}')

    @classmethod
    def from_store(cls, store: 'storage_utils.AbstractStore') -> 'StoreType':
        if store.__repr__() == 'GcsStore':
            return StoreType.GCS
        else:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Unknown store type: {store}')

    def store_prefix(self) -> str:
        if self == StoreType.GCS:
            return 'gs://'
        else:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(f'Unknown store type: {self}')

    @classmethod
    def get_fields_from_store_url(
        cls, store_url: str
    ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]:
        """Returns the store type, bucket name, and sub path from
        a store URL, and the storage account name and region if applicable.

        Args:
            store_url: str; The store URL.
        """
        # The full path from the user config of IBM COS contains the region,
        # and Azure Blob Storage contains the storage account name, we need to
        # pass these information to the store constructor.
        storage_account_name = None
        region = None
        for store_type in StoreType:
            if store_url.startswith(store_type.store_prefix()):
                if store_type == StoreType.GCS:
                    bucket_name, sub_path = data_utils.split_gcs_path(store_url)
                    return store_type, bucket_name, sub_path, storage_account_name, region
        raise ValueError(f'Unknown store URL: {store_url}')

    @classmethod
    def get_endpoint_url(cls, store: 'storage_utils.AbstractStore', path: str) -> str:
        """Generates the endpoint URL for a given store and path.

        Args:
            store: Store object implementing AbstractStore.
            path: Path within the store.

        Returns:
            Endpoint URL of the bucket as a string.
        """
        store_type = cls.from_store(store)
        bucket_endpoint_url = f'{store_type.store_prefix()}{path}'
        return bucket_endpoint_url


# this should match the above StoreType enum
STORE_TYPES = Literal[StoreType.GCS]
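A short sketch of the StoreType helpers (illustrative, not part of the package source), assuming data_utils.split_gcs_path splits a gs:// URL into bucket name and key:

store_type = StoreType.from_cloud('gs')   # StoreType.GCS
prefix = store_type.store_prefix()        # 'gs://'

# Parse an existing bucket URL into its components.
store_type, bucket, sub_path, account, region = StoreType.get_fields_from_store_url(
    'gs://my-bucket/checkpoints'
)
# -> (StoreType.GCS, 'my-bucket', 'checkpoints', None, None)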


class Storage(object):
    """Storage objects handle persistent and large volume storage in the sky.

    Storage represents an abstract data store containing large data files
    required by the task. Compared to file_mounts, storage is faster and
    can persist across runs, requiring fewer uploads from your local machine.

    A storage object can be used in either MOUNT mode or COPY mode. In MOUNT
    mode (the default), the backing store is directly "mounted" to the remote
    VM, and files are fetched when accessed by the task and files written to the
    mount path are also written to the remote store. In COPY mode, the files are
    pre-fetched and cached on the local disk and writes are not replicated on
    the remote store.

    Behind the scenes, storage automatically uploads all data in the source
    to a backing object store in a particular cloud (S3/GCS/Azure Blob).

    Typical Usage: (See examples/playground/storage_playground.py)
      storage = Storage(name='imagenet-bucket', source='~/Documents/imagenet')

      # Move data to S3
      storage.add_store('S3')

      # Move data to Google Cloud Storage
      storage.add_store('GCS')

      # Delete Storage for both S3 and GCS
      storage.delete()
    """

    class StorageMetadata(object):
        """A pickle-able tuple of:

        - (required) Storage name.
        - (required) Source
        - (optional) Storage mode.
        - (optional) Set of stores managed by sky added to the Storage object
        """

        def __init__(
            self,
            *,
            storage_name: Optional[str],
            source: Optional[constants.SourceType],
            mode: Optional[StorageMode] = None,
            sky_stores: Optional[
                Dict[StoreType, 'storage_utils.AbstractStore.StoreMetadata']
            ] = None,
        ):
            assert storage_name is not None or source is not None
            self.storage_name = storage_name
            self.source = source
            self.mode = mode

            # Only stores managed by sky are stored here in the
            # global_user_state
            self.sky_stores = {} if sky_stores is None else sky_stores

        def __repr__(self):
            return (
                f'StorageMetadata('
                f'\n\tstorage_name={self.storage_name},'
                f'\n\tsource={self.source},'
                f'\n\tmode={self.mode},'
                f'\n\t{self.sky_stores}'
            )

        def add_store(self, store: 'storage_utils.AbstractStore') -> None:
            storetype = StoreType.from_store(store)
            self.sky_stores[storetype] = store.get_metadata()

        def remove_store(self, store: 'storage_utils.AbstractStore') -> None:
            storetype = StoreType.from_store(store)
            if storetype in self.sky_stores:
                del self.sky_stores[storetype]

    def __init__(
        self,
        name: Optional[str] = None,
        source: Optional[constants.SourceType] = None,
        stores: Optional[List[STORE_TYPES]] = None,
        persistent: Optional[bool] = True,
        mode: StorageMode = StorageMode.COPY,
        sync_on_reconstruction: Optional[bool] = True,
        _is_sky_managed: Optional[bool] = False,
        _bucket_sub_path: Optional[str] = None,
    ) -> None:
        """Initializes a Storage object.

        Three fields are required: the name of the storage, the source
        path where the data is initially located, and the default mount
        path where the data will be mounted to on the cloud.

        Storage object validation depends on the name, source and mount mode.
        There are four combinations possible for name and source inputs:

        - name is None, source is None: Underspecified storage object.
        - name is not None, source is None: If MOUNT mode, provision an empty
          bucket with name <name>. If COPY mode, raise error since source is
          required.
        - name is None, source is not None: If source is local, raise error
          since name is required to create destination bucket. If source is
          a bucket URL, use the source bucket as the backing store (if
          permissions allow, else raise error).
        - name is not None, source is not None: If source is local, upload the
          contents of the source path to <name> bucket. Create new bucket if
          required. If source is bucket url - raise error. Name should not be
          specified if the source is a URL; name will be inferred from source.

        Args:
          name: str; Name of the storage object. Typically used as the
            bucket name in backing object stores.
          source: str, List[str]; File path where the data is initially stored.
            Can be a single local path, a list of local paths, or a cloud URI
            (s3://, gs://, etc.). Local paths do not need to be absolute.
          stores: Optional; Specify pre-initialized stores (S3Store, GcsStore).
          persistent: bool; Whether to persist across sky launches.
          mode: StorageMode; Specify how the storage object is manifested on
            the remote VM. Can be either MOUNT or COPY. Defaults to MOUNT.
          sync_on_reconstruction: bool; Whether to sync the data if the storage
            object is found in the global_user_state and reconstructed from
            there. This is set to false when the Storage object is created not
            for direct use, e.g. for 'sky storage delete', or the storage is
            being re-used, e.g., for `sky start` on a stopped cluster.
          _is_sky_managed: Optional[bool]; Indicates if the storage is managed
            by Sky. Without this argument, the controller's behavior differs
            from the local machine. For example, if a bucket does not exist:
            Local Machine (is_sky_managed=True) →
            Controller (is_sky_managed=False).
            With this argument, the controller aligns with the local machine,
            ensuring it retains the is_sky_managed information from the YAML.
            During teardown, if is_sky_managed is True, the controller should
            delete the bucket. Otherwise, it might mistakenly delete only the
            sub-path, assuming is_sky_managed is False.
          _bucket_sub_path: Optional[str]; The subdirectory to use for the
            storage object.
        """
        self.name: str
        self.source = source
        self.persistent = persistent
        self.mode = mode
        assert mode in StorageMode
        self.stores: Dict[StoreType, Optional['storage_utils.AbstractStore']] = {}
        if stores is not None:
            for store in stores:
                self.stores[store] = None
        self.sync_on_reconstruction = sync_on_reconstruction
        self._is_sky_managed = _is_sky_managed
        self._bucket_sub_path = _bucket_sub_path

        # TODO(romilb, zhwu): This is a workaround to support storage deletion
        # for spot. Once sky storage supports forced management for external
        # buckets, this can be deprecated.
        self.force_delete = False

        # Validate and correct inputs if necessary
        self._validate_storage_spec(name)

        if self.source is not None:
            # If source is a pre-existing bucket, connect to the bucket
            # If the bucket does not exist, this will error out
            if isinstance(self.source, str):
                if self.source.startswith('gs://'):
                    self.add_store(StoreType.GCS)

    @staticmethod
    def _validate_source(
        source: constants.SourceType,
        mode: StorageMode,
        sync_on_reconstruction: Optional[bool] = None,
    ) -> Tuple[constants.SourceType, bool]:
        """Validates the source path.

        Args:
          source: str; File path where the data is initially stored. Can be a
            local path or a cloud URI (s3://, gs://, r2:// etc.).
            Local paths do not need to be absolute.
          mode: StorageMode; StorageMode of the storage object

        Returns:
          Tuple[source, is_local_source]
          source: str; The source path.
          is_local_path: bool; Whether the source is a local path. False if URI.
        """

        def _check_basename_conflicts(source_list: List[str]) -> None:
            """Checks if two paths in source_list have the same basename."""
            basenames = [os.path.basename(s) for s in source_list]
            conflicts = {x for x in basenames if basenames.count(x) > 1}
            if conflicts:
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageSourceError(
                        'Cannot have multiple files or directories with the '
                        'same name in source. Conflicts found for: '
                        f'{", ".join(conflicts)}'
                    )

        def _validate_local_source(local_source):
            if local_source.endswith('/'):
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageSourceError(
                        'Storage source paths cannot end with a slash '
                        '(try "/mydir: /mydir" or "/myfile: /myfile"). '
                        f'Found source={local_source}'
                    )
            # Local path, check if it exists
            full_src = os.path.abspath(os.path.expanduser(local_source))
            # Only check if local source exists if it is synced to the bucket
            if not os.path.exists(full_src) and sync_on_reconstruction:
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageSourceError(
                        'Local source path does not' f' exist: {local_source}'
                    )
            # Raise warning if user's path is a symlink
            elif os.path.islink(full_src):
                logger.warning(
                    f'Source path {source} is a symlink. '
                    'Referenced contents are uploaded, matching '
                    'the default behavior for S3 and GCS syncing.'
                )

        # Check if source is a list of paths
        if isinstance(source, list):
            # Check for conflicts in basenames
            _check_basename_conflicts(source)
            # Validate each path
            for local_source in source:
                _validate_local_source(local_source)
            is_local_source = True
        else:
            # Check if str source is a valid local/remote URL
            split_path = urllib.parse.urlsplit(source)
            if split_path.scheme == '':
                _validate_local_source(source)
                # Check if source is a file - throw error if it is
                full_src = os.path.abspath(os.path.expanduser(source))
                if os.path.isfile(full_src):
                    with ux_utils.print_exception_no_traceback():
                        raise exceptions.StorageSourceError(
                            'Storage source path cannot be a file - only'
                            ' directories are supported as a source. '
                            'To upload a single file, specify it in a list '
                            f'by writing source: [{source}]. Note '
                            'that the file will be uploaded to the root of the '
                            'bucket and will appear at <destination_path>/'
                            f'{os.path.basename(source)}. Alternatively, you '
                            'can directly upload the file to the VM without '
                            'using a bucket by writing <destination_path>: '
                            f'{source} in the file_mounts section of your YAML'
                        )
                is_local_source = True
            elif split_path.scheme in ['s3', 'gs', 'https', 'r2', 'cos']:
                is_local_source = False
                # Storage mounting does not support mounting specific files from
                # cloud store - ensure path points to only a directory
                if mode == StorageMode.MOUNT:
                    if split_path.scheme != 'https' and (
                        (
                            split_path.scheme != 'cos'
                            and split_path.path.strip('/') != ''
                        )
                        or (
                            split_path.scheme == 'cos'
                            and not re.match(r'^/[-\w]+(/\s*)?$', split_path.path)
                        )
                    ):
                        # regex allows split_path.path to include /bucket
                        # or /bucket/optional_whitespaces while considering
                        # cos URI's regions (cos://region/bucket_name)
                        with ux_utils.print_exception_no_traceback():
                            raise exceptions.StorageModeError(
                                'MOUNT mode does not support'
                                ' mounting specific files from cloud'
                                ' storage. Please use COPY mode or'
                                ' specify only the bucket name as'
                                ' the source.'
                            )
            else:
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageSourceError(
                        f'Supported paths: local, s3://, gs://, https://, '
                        f'r2://, cos://. Got: {source}'
                    )
        return source, is_local_source

    def _validate_storage_spec(self, name: Optional[str]) -> None:
        """Validates the storage spec and updates local fields if necessary."""

        def validate_name(name):
            """Checks for validating the storage name.

            Checks if the name starts the s3, gcs or r2 prefix and raise error
            if it does. Store specific validation checks (e.g., S3 specific
            rules) happen in the corresponding store class.
            """
            prefix = name.split('://')[0]
            prefix = prefix.lower()
            if prefix in ['s3', 'gs', 'https', 'r2', 'cos']:
                with ux_utils.print_exception_no_traceback():
                    raise exceptions.StorageNameError(
                        'Prefix detected: `name` cannot start with '
                        f'{prefix}://. If you are trying to use an existing '
                        'bucket created outside of SkyPilot, please specify it '
                        'using the `source` field (e.g. '
                        '`source: s3://mybucket/`). If you are trying to '
                        'create a new bucket, please use the `store` field to '
                        'specify the store type (e.g. `store: s3`).'
                    )

        if self.source is None:
            # If the mode is COPY, the source must be specified
            if self.mode == StorageMode.COPY:
                # Check if a Storage object already exists in global_user_state
                # (e.g. used as scratch previously). Such storage objects can be
                # mounted in copy mode even though they have no source in the
                # yaml spec (the name is the source).
                # TODO(asaiacai): remove references to global_user_state
                # handle = global_user_state.get_handle_from_storage_name(name)
                handle = None
                if handle is None:
                    with ux_utils.print_exception_no_traceback():
                        raise exceptions.StorageSourceError(
                            'New storage object: source must be specified when '
                            'using COPY mode.'
                        )
            else:
                # If source is not specified in COPY mode, the intent is to
                # create a bucket and use it as scratch disk. Name must be
                # specified to create bucket.
                if not name:
                    with ux_utils.print_exception_no_traceback():
                        raise exceptions.StorageSpecError(
                            'Storage source or storage name must be specified.'
                        )
            assert name is not None, handle
            validate_name(name)
            self.name = name
            return
        elif self.source is not None:
            source, is_local_source = Storage._validate_source(
                self.source, self.mode, self.sync_on_reconstruction
            )
            if not name:
                if is_local_source:
                    with ux_utils.print_exception_no_traceback():
                        raise exceptions.StorageNameError(
                            'Storage name must be specified if the source is ' 'local.'
                        )
                else:
                    assert isinstance(source, str)
                    # Set name to source bucket name and continue
                    name = urllib.parse.urlsplit(source).netloc
                    assert name is not None, source
                    self.name = name
                    return
            else:
                if is_local_source:
                    # If name is specified and source is local, upload to bucket
                    assert name is not None, source
                    validate_name(name)
                    self.name = name
                    return
                else:
                    # Both name and source should not be specified if the source
                    # is a URI. Name will be inferred from the URI.
                    with ux_utils.print_exception_no_traceback():
                        raise exceptions.StorageSpecError(
                            'Storage name should not be specified if the '
                            'source is a remote URI.'
                        )
        raise exceptions.StorageSpecError(
            f'Validation failed for storage source {self.source}, name '
            f'{self.name} and mode {self.mode}. Please check the arguments.'
        )

    def _add_store_from_metadata(
        self, sky_stores: Dict[StoreType, 'storage_utils.AbstractStore.StoreMetadata']
    ) -> None:
        """Reconstructs Storage.stores from sky_stores.

        Reconstruct AbstractStore objects from sky_store's metadata and
        adds them into Storage.stores
        """
        for s_type, s_metadata in sky_stores.items():
            # When initializing from global_user_state, we override the
            # source from the YAML
            try:
                # if s_type == StoreType.S3:
                #     store = S3Store.from_metadata(
                #         s_metadata,
                #         source=self.source,
                #         sync_on_reconstruction=self.sync_on_reconstruction)
                if s_type == StoreType.GCS:
                    store = gcp.GcsStore.from_metadata(
                        s_metadata,
                        source=self.source,
                        sync_on_reconstruction=self.sync_on_reconstruction,
                    )
                # elif s_type == StoreType.AZURE:
                #     assert isinstance(s_metadata,
                #                       AzureBlobStore.AzureBlobStoreMetadata)
                #     store = AzureBlobStore.from_metadata(
                #         s_metadata,
                #         source=self.source,
                #         sync_on_reconstruction=self.sync_on_reconstruction)
                # elif s_type == StoreType.R2:
                #     store = R2Store.from_metadata(
                #         s_metadata,
                #         source=self.source,
                #         sync_on_reconstruction=self.sync_on_reconstruction)
                # elif s_type == StoreType.IBM:
                #     store = IBMCosStore.from_metadata(
                #         s_metadata,
                #         source=self.source,
                #         sync_on_reconstruction=self.sync_on_reconstruction)
                else:
                    with ux_utils.print_exception_no_traceback():
                        raise ValueError(f'Unknown store type: {s_type}')
            # Following error is caught when an externally removed storage
            # is attempted to be fetched.
            except exceptions.StorageExternalDeletionError:
                logger.debug(
                    f'Storage object {self.name!r} was attempted '
                    'to be reconstructed while the corresponding '
                    'bucket was externally deleted.'
                )
                continue

            self._add_store(store, is_reconstructed=True)

    @classmethod
    def from_metadata(cls, metadata: StorageMetadata, **override_args) -> 'Storage':
        """Create Storage from StorageMetadata object.

        Used when reconstructing Storage object and AbstractStore objects from
        global_user_state.
        """
        # Name should not be specified if the source is a cloud store URL.
        source = override_args.get('source', metadata.source)
        name = override_args.get('name', metadata.storage_name)
        # If the source is a list, it consists of local paths
        if not isinstance(source, list) and data_utils.is_cloud_store_url(source):
            name = None

        storage_obj = cls(
            name=name,
            source=source,
            sync_on_reconstruction=override_args.get('sync_on_reconstruction', True),
        )

        # For backward compatibility
        if hasattr(metadata, 'mode'):
            if metadata.mode:
                storage_obj.mode = override_args.get('mode', metadata.mode)

        return storage_obj

    def add_store(
        self, store_type: Union[str, StoreType], region: Optional[str] = None
    ) -> 'storage_utils.AbstractStore':
        """Initializes and adds a new store to the storage.

        Invoked by the optimizer after it has selected a store to
        add it to Storage.

        Args:
          store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM]
          region: str; Region to place the bucket in. Caller must ensure that
            the region is valid for the chosen store_type.
        """
        if isinstance(store_type, str):
            store_type = StoreType(store_type)

        store_cls: Type['storage_utils.AbstractStore']
        if store_type == StoreType.GCS:
            store_cls = gcp.GcsStore
        else:
            with ux_utils.print_exception_no_traceback():
                raise exceptions.StorageSpecError(
                    f'{store_type} not supported as a Store.'
                )

        # Initialize store object and get/create bucket
        try:
            assert self.source is not None
            store = store_cls(
                name=self.name,
                source=self.source,
                region=region,
                sync_on_reconstruction=self.sync_on_reconstruction,
                is_sky_managed=self._is_sky_managed,
                _bucket_sub_path=self._bucket_sub_path,
            )
        except exceptions.StorageBucketCreateError:
            # Creation failed, so this must be sky managed store. Add failure
            # to state.
            logger.error(
                f'Could not create {store_type} store ' f'with name {self.name}.'
            )
            raise
        except exceptions.StorageBucketGetError:
            # Bucket get failed, so this is not sky managed. Do not update state
            logger.error(f'Could not get {store_type} store ' f'with name {self.name}.')
            raise
        except exceptions.StorageInitError:
            logger.error(
                f'Could not initialize {store_type} store with '
                f'name {self.name}. General initialization error.'
            )
            raise
        except exceptions.StorageSpecError:
            logger.error(
                f'Could not mount externally created {store_type}'
                f'store with name {self.name!r}.'
            )
            raise

        # Add store to storage
        self._add_store(store)

        # Upload source to store
        self._sync_store(store)

        return store

    def _add_store(
        self, store: 'storage_utils.AbstractStore', is_reconstructed: bool = False
    ):
        # Adds a store object to the storage
        store_type = StoreType.from_store(store)
        self.stores[store_type] = store

    def delete(self, store_type: Optional[StoreType] = None) -> None:
        """Deletes data for all sky-managed storage objects.

        If a storage is not managed by sky, it is not deleted from the cloud.
        User must manually delete any object stores created outside of sky.

        Args:
          store_type: StoreType; Specific cloud store to remove from the list
            of backing stores.
        """
        if not self.stores:
            logger.info('No backing stores found. Deleting storage.')
        if store_type:
            store = self.stores[store_type]
            assert store is not None
            # We delete a store from the cloud if it's sky managed. Else just
            # remove handle and return
            if self.force_delete:
                store.delete()
            # Remove store from bookkeeping
            del self.stores[store_type]
        else:
            for _, store in self.stores.items():
                assert store is not None
                if self.force_delete:
                    store.delete()
            self.stores = {}

    def sync_all_stores(self):
        """Syncs the source and destinations of all stores in the Storage"""
        for _, store in self.stores.items():
            self._sync_store(store)

    def _sync_store(self, store: 'storage_utils.AbstractStore'):
        """Runs the upload routine for the store and handles failures"""

        def warn_for_git_dir(source: str):
            if os.path.isdir(os.path.join(source, '.git')):
                logger.warning(
                    f"'.git' directory under '{self.source}' "
                    'is excluded during sync.'
                )

        try:
            if self.source is not None:
                if isinstance(self.source, str):
                    warn_for_git_dir(self.source)
                else:
                    for source in self.source:
                        warn_for_git_dir(source)
            store.upload()
        except exceptions.StorageUploadError:
            logger.error(
                f'Could not upload {self.source!r} to store ' f'name {store.name!r}.'
            )
            raise

    @classmethod
    def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage':
        common_utils.validate_schema(
            config, schemas.get_storage_schema(), 'Invalid storage YAML: '
        )

        name = config.pop('name', None)
        source = config.pop('source', None)
        store = config.pop('store', None)
        mode_str = config.pop('mode', None)
        force_delete = config.pop('_force_delete', None)
        if force_delete is None:
            force_delete = False

        if isinstance(mode_str, str):
            # Make mode case insensitive, if specified
            mode = StorageMode(mode_str.upper())
        else:
            # Make sure this keeps the same as the default mode in __init__
            mode = StorageMode.MOUNT
        persistent = config.pop('persistent', None)
        if persistent is None:
            persistent = True

        assert not config, f'Invalid storage args: {config.keys()}'

        # Validation of the config object happens on instantiation.
        storage_obj = cls(name=name, source=source, persistent=persistent, mode=mode)
        if store is not None:
            storage_obj.add_store(StoreType(store.upper()))

        # Add force deletion flag
        storage_obj.force_delete = force_delete
        return storage_obj

    def to_yaml_config(self) -> Dict[str, str]:
        config = {}

        def add_if_not_none(key: str, value: Optional[Any]):
            if value is not None:
                config[key] = value

        name = None
        if (
            self.source is None
            or not isinstance(self.source, str)
            or not data_utils.is_cloud_store_url(self.source)
        ):
            # Remove name if source is a cloud store URL
            name = self.name
        add_if_not_none('name', name)
        add_if_not_none('source', self.source)

        stores = None
        if len(self.stores) > 0:
            stores = ','.join([store.value for store in self.stores])
        add_if_not_none('store', stores)
        add_if_not_none('persistent', self.persistent)
        add_if_not_none('mode', self.mode.value)
        if self.force_delete:
            config['_force_delete'] = True
        return config
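Putting the pieces together, a minimal end-to-end sketch of the YAML-config round trip (illustrative only, not part of the package source; it assumes GCS credentials are already configured and the bucket and source names are made up):

# Build a Storage object from a config dict, e.g. parsed from a task YAML.
storage = Storage.from_yaml_config({
    'name': 'my-llm-run',
    'source': '~/Documents/imagenet',
    'store': 'gcs',   # normalized via StoreType(store.upper()) -> StoreType.GCS
    'mode': 'copy',   # normalized via StorageMode(mode_str.upper()) -> COPY
})

# Equivalent to the imperative form shown in the Storage class docstring:
#   storage = Storage(name='my-llm-run', source='~/Documents/imagenet')
#   storage.add_store('GCS')   # gets/creates the bucket and uploads the source

# Serialize back to a plain dict; 'store' becomes the comma-joined
# StoreType values currently attached to the object.
print(storage.to_yaml_config())

# delete() removes the backing bucket only when force-delete is set.
storage.delete()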