konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
konduktor/data/storage.py
@@ -0,0 +1,799 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Different cloud storage definitions. This modules responsibility
14
+ 1.) Create the secrets for each cloud as k8s secrets
15
+ 2.) Mount the secrets as volumes into each container
16
+ 3.) Provide utilities/scripts for the pods to download files syncd
17
+ to object storage
18
+
19
+ For each cloud/storage class we'll only have a single namespace at
20
+ `konduktor` and each run will correspond to a new folder e.g.
21
+ `s3://konduktor/my-llm-run-a34be-a3ebf`
22
+ """
23
+
24
+ import enum
25
+ import os
26
+ import re
27
+ import urllib.parse
28
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
29
+
30
+ from konduktor import check, config, logging
31
+ from konduktor.data import constants, data_utils, gcp, storage_utils
32
+ from konduktor.utils import annotations, common_utils, exceptions, schemas, ux_utils
33
+
34
+ logger = logging.get_logger(__file__)
35
+
36
+
37
+ @annotations.lru_cache(scope='global')
38
+ def get_cached_enabled_storage_clouds_or_refresh(
39
+ raise_if_no_cloud_access: bool = False,
40
+ ) -> List[str]:
41
+ # This is a temporary solution until https://github.com/skypilot-org/skypilot/issues/1943 # noqa: E501
42
+ # (asaiacai): This function does not do any actual checking right now.
43
+ # This is temporary. In the future, we can cache to disk.
44
+ # For now, we just print a warning to the user saying which
45
+ # clouds are enabled and, if the task fails, to run `konduktor check`
46
+ # to update the credentials.
47
+ enabled_clouds = config.get_nested(('allowed_clouds',), [])
48
+ if len(enabled_clouds) == 0:
49
+ enabled_clouds = constants.STORE_ENABLED_CLOUDS
50
+ else:
51
+ enabled_clouds = [str(cloud) for cloud in enabled_clouds]
52
+ logger.warning(
53
+ f'Enabled storage clouds: {enabled_clouds}. Defaulting to '
54
+ f'{enabled_clouds[0]}. If sync fails, '
55
+ 're-run `konduktor check` to verify credentials.'
56
+ )
57
+ return enabled_clouds
58
+
59
+
60
+ def _is_storage_cloud_enabled(
61
+ cloud_name: str, try_fix_with_sky_check: bool = True
62
+ ) -> bool:
63
+ enabled_storage_clouds = get_cached_enabled_storage_clouds_or_refresh()
64
+ if cloud_name in enabled_storage_clouds:
65
+ return True
66
+ if try_fix_with_sky_check:
67
+ # TODO(zhwu): Only check the specified cloud to speed up.
68
+ check.check(quiet=True)
69
+ return _is_storage_cloud_enabled(cloud_name, try_fix_with_sky_check=False)
70
+ return False
71
+
72
+
73
+ class StorageMode(enum.Enum):
74
+ COPY = 'COPY'
75
+ MOUNT = 'MOUNT'
76
+
77
+
78
+ class StoreType(enum.Enum):
79
+ """Enum for the different types of stores."""
80
+
81
+ GCS = 'GCS'
82
+
83
+ @classmethod
84
+ def from_cloud(cls, cloud: str) -> 'StoreType':
85
+ # these need to match the cloud store classes in konduktor/cloud_stores.py
86
+ if cloud.lower() == 'gs':
87
+ return StoreType.GCS
88
+ else:
89
+ with ux_utils.print_exception_no_traceback():
90
+ raise ValueError(f'Unknown cloud: {cloud}')
91
+
92
+ @classmethod
93
+ def from_store(cls, store: 'storage_utils.AbstractStore') -> 'StoreType':
94
+ if store.__repr__() == 'GcsStore':
95
+ return StoreType.GCS
96
+ else:
97
+ with ux_utils.print_exception_no_traceback():
98
+ raise ValueError(f'Unknown store type: {store}')
99
+
100
+ def store_prefix(self) -> str:
101
+ if self == StoreType.GCS:
102
+ return 'gs://'
103
+ else:
104
+ with ux_utils.print_exception_no_traceback():
105
+ raise ValueError(f'Unknown store type: {self}')
106
+
107
+ @classmethod
108
+ def get_fields_from_store_url(
109
+ cls, store_url: str
110
+ ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]:
111
+ """Returns the store type, bucket name, and sub path from
112
+ a store URL, and the storage account name and region if applicable.
113
+
114
+ Args:
115
+ store_url: str; The store URL.
116
+ """
117
+ # The full path from the user config of IBM COS contains the region,
118
+ # and Azure Blob Storage contains the storage account name, so we need
119
+ # to pass this information to the store constructor.
120
+ storage_account_name = None
121
+ region = None
122
+ for store_type in StoreType:
123
+ if store_url.startswith(store_type.store_prefix()):
124
+ if store_type == StoreType.GCS:
125
+ bucket_name, sub_path = data_utils.split_gcs_path(store_url)
126
+ return store_type, bucket_name, sub_path, storage_account_name, region
127
+ raise ValueError(f'Unknown store URL: {store_url}')
128
+
129
+ @classmethod
130
+ def get_endpoint_url(cls, store: 'storage_utils.AbstractStore', path: str) -> str:
131
+ """Generates the endpoint URL for a given store and path.
132
+
133
+ Args:
134
+ store: Store object implementing AbstractStore.
135
+ path: Path within the store.
136
+
137
+ Returns:
138
+ Endpoint URL of the bucket as a string.
139
+ """
140
+ store_type = cls.from_store(store)
141
+ bucket_endpoint_url = f'{store_type.store_prefix()}{path}'
142
+ return bucket_endpoint_url
143
+
144
+
145
+ # this should match the above StoreType enum
146
+ STORE_TYPES = Literal[StoreType.GCS]
147
+
148
+
149
+ class Storage(object):
150
+ """Storage objects handle persistent and large volume storage in the sky.
151
+
152
+ Storage represents an abstract data store containing large data files
153
+ required by the task. Compared to file_mounts, storage is faster and
154
+ can persist across runs, requiring fewer uploads from your local machine.
155
+
156
+ A storage object can be used in either MOUNT mode or COPY mode. In MOUNT
157
+ mode (the default), the backing store is directly "mounted" to the remote
158
+ VM: files are fetched when accessed by the task, and files written to the
159
+ mount path are also written to the remote store. In COPY mode, the files are
160
+ pre-fetched and cached on the local disk, and writes are not replicated to
161
+ the remote store.
162
+
163
+ Behind the scenes, storage automatically uploads all data in the source
164
+ to a backing object store in a particular cloud (S3/GCS/Azure Blob).
165
+
166
+ Typical Usage: (See examples/playground/storage_playground.py)
167
+ storage = Storage(name='imagenet-bucket', source='~/Documents/imagenet')
168
+
169
+ # Move data to S3
170
+ storage.add_store('S3')
171
+
172
+ # Move data to Google Cloud Storage
173
+ storage.add_store('GCS')
174
+
175
+ # Delete Storage for both S3 and GCS
176
+ storage.delete()
177
+ """
178
+
179
+ class StorageMetadata(object):
180
+ """A pickle-able tuple of:
181
+
182
+ - (required) Storage name.
183
+ - (required) Source
184
+ - (optional) Storage mode.
185
+ - (optional) Set of stores managed by sky added to the Storage object
186
+ """
187
+
188
+ def __init__(
189
+ self,
190
+ *,
191
+ storage_name: Optional[str],
192
+ source: Optional[constants.SourceType],
193
+ mode: Optional[StorageMode] = None,
194
+ sky_stores: Optional[
195
+ Dict[StoreType, 'storage_utils.AbstractStore.StoreMetadata']
196
+ ] = None,
197
+ ):
198
+ assert storage_name is not None or source is not None
199
+ self.storage_name = storage_name
200
+ self.source = source
201
+ self.mode = mode
202
+
203
+ # Only stores managed by sky are stored here in the
204
+ # global_user_state
205
+ self.sky_stores = {} if sky_stores is None else sky_stores
206
+
207
+ def __repr__(self):
208
+ return (
209
+ f'StorageMetadata('
210
+ f'\n\tstorage_name={self.storage_name},'
211
+ f'\n\tsource={self.source},'
212
+ f'\n\tmode={self.mode},'
213
+ f'\n\t{self.sky_stores}'
214
+ )
215
+
216
+ def add_store(self, store: 'storage_utils.AbstractStore') -> None:
217
+ storetype = StoreType.from_store(store)
218
+ self.sky_stores[storetype] = store.get_metadata()
219
+
220
+ def remove_store(self, store: 'storage_utils.AbstractStore') -> None:
221
+ storetype = StoreType.from_store(store)
222
+ if storetype in self.sky_stores:
223
+ del self.sky_stores[storetype]
224
+
225
+ def __init__(
226
+ self,
227
+ name: Optional[str] = None,
228
+ source: Optional[constants.SourceType] = None,
229
+ stores: Optional[List[STORE_TYPES]] = None,
230
+ persistent: Optional[bool] = True,
231
+ mode: StorageMode = StorageMode.COPY,
232
+ sync_on_reconstruction: Optional[bool] = True,
233
+ _is_sky_managed: Optional[bool] = False,
234
+ _bucket_sub_path: Optional[str] = None,
235
+ ) -> None:
236
+ """Initializes a Storage object.
237
+
238
+ Three fields are required: the name of the storage, the source
239
+ path where the data is initially located, and the default mount
240
+ path where the data will be mounted on the cloud.
241
+
242
+ Storage object validation depends on the name, source and mount mode.
243
+ There are four combinations possible for name and source inputs:
244
+
245
+ - name is None, source is None: Underspecified storage object.
246
+ - name is not None, source is None: If MOUNT mode, provision an empty
247
+ bucket with name <name>. If COPY mode, raise error since source is
248
+ required.
249
+ - name is None, source is not None: If source is local, raise error
250
+ since name is required to create destination bucket. If source is
251
+ a bucket URL, use the source bucket as the backing store (if
252
+ permissions allow, else raise error).
253
+ - name is not None, source is not None: If source is local, upload the
254
+ contents of the source path to <name> bucket. Create new bucket if
255
+ required. If source is bucket url - raise error. Name should not be
256
+ specified if the source is a URL; name will be inferred from source.
257
+
258
+ Args:
259
+ name: str; Name of the storage object. Typically used as the
260
+ bucket name in backing object stores.
261
+ source: str, List[str]; File path where the data is initially stored.
262
+ Can be a single local path, a list of local paths, or a cloud URI
263
+ (s3://, gs://, etc.). Local paths do not need to be absolute.
264
+ stores: Optional; Specify pre-initialized stores (S3Store, GcsStore).
265
+ persistent: bool; Whether to persist across sky launches.
266
+ mode: StorageMode; Specify how the storage object is manifested on
267
+ the remote VM. Can be either MOUNT or COPY. Defaults to MOUNT.
268
+ sync_on_reconstruction: bool; Whether to sync the data if the storage
269
+ object is found in the global_user_state and reconstructed from
270
+ there. This is set to false when the Storage object is created not
271
+ for direct use, e.g. for 'sky storage delete', or the storage is
272
+ being re-used, e.g., for `sky start` on a stopped cluster.
273
+ _is_sky_managed: Optional[bool]; Indicates if the storage is managed
274
+ by Sky. Without this argument, the controller's behavior differs
275
+ from the local machine. For example, if a bucket does not exist:
276
+ Local Machine (is_sky_managed=True) →
277
+ Controller (is_sky_managed=False).
278
+ With this argument, the controller aligns with the local machine,
279
+ ensuring it retains the is_sky_managed information from the YAML.
280
+ During teardown, if is_sky_managed is True, the controller should
281
+ delete the bucket. Otherwise, it might mistakenly delete only the
282
+ sub-path, assuming is_sky_managed is False.
283
+ _bucket_sub_path: Optional[str]; The subdirectory to use for the
284
+ storage object.
285
+ """
286
+ self.name: str
287
+ self.source = source
288
+ self.persistent = persistent
289
+ self.mode = mode
290
+ assert mode in StorageMode
291
+ self.stores: Dict[StoreType, Optional['storage_utils.AbstractStore']] = {}
292
+ if stores is not None:
293
+ for store in stores:
294
+ self.stores[store] = None
295
+ self.sync_on_reconstruction = sync_on_reconstruction
296
+ self._is_sky_managed = _is_sky_managed
297
+ self._bucket_sub_path = _bucket_sub_path
298
+
299
+ # TODO(romilb, zhwu): This is a workaround to support storage deletion
300
+ # for spot. Once sky storage supports forced management for external
301
+ # buckets, this can be deprecated.
302
+ self.force_delete = False
303
+
304
+ # Validate and correct inputs if necessary
305
+ self._validate_storage_spec(name)
306
+
307
+ if self.source is not None:
308
+ # If source is a pre-existing bucket, connect to the bucket
309
+ # If the bucket does not exist, this will error out
310
+ if isinstance(self.source, str):
311
+ if self.source.startswith('gs://'):
312
+ self.add_store(StoreType.GCS)
313
+
314
+ @staticmethod
315
+ def _validate_source(
316
+ source: constants.SourceType,
317
+ mode: StorageMode,
318
+ sync_on_reconstruction: Optional[bool] = None,
319
+ ) -> Tuple[constants.SourceType, bool]:
320
+ """Validates the source path.
321
+
322
+ Args:
323
+ source: str; File path where the data is initially stored. Can be a
324
+ local path or a cloud URI (s3://, gs://, r2:// etc.).
325
+ Local paths do not need to be absolute.
326
+ mode: StorageMode; StorageMode of the storage object
327
+
328
+ Returns:
329
+ Tuple[source, is_local_source]
330
+ source: str; The source path.
331
+ is_local_source: bool; Whether the source is a local path. False if URI.
332
+ """
333
+
334
+ def _check_basename_conflicts(source_list: List[str]) -> None:
335
+ """Checks if two paths in source_list have the same basename."""
336
+ basenames = [os.path.basename(s) for s in source_list]
337
+ conflicts = {x for x in basenames if basenames.count(x) > 1}
338
+ if conflicts:
339
+ with ux_utils.print_exception_no_traceback():
340
+ raise exceptions.StorageSourceError(
341
+ 'Cannot have multiple files or directories with the '
342
+ 'same name in source. Conflicts found for: '
343
+ f'{", ".join(conflicts)}'
344
+ )
345
+
346
+ def _validate_local_source(local_source):
347
+ if local_source.endswith('/'):
348
+ with ux_utils.print_exception_no_traceback():
349
+ raise exceptions.StorageSourceError(
350
+ 'Storage source paths cannot end with a slash '
351
+ '(try "/mydir: /mydir" or "/myfile: /myfile"). '
352
+ f'Found source={local_source}'
353
+ )
354
+ # Local path, check if it exists
355
+ full_src = os.path.abspath(os.path.expanduser(local_source))
356
+ # Only check if local source exists if it is synced to the bucket
357
+ if not os.path.exists(full_src) and sync_on_reconstruction:
358
+ with ux_utils.print_exception_no_traceback():
359
+ raise exceptions.StorageSourceError(
360
+ 'Local source path does not' f' exist: {local_source}'
361
+ )
362
+ # Raise warning if user's path is a symlink
363
+ elif os.path.islink(full_src):
364
+ logger.warning(
365
+ f'Source path {source} is a symlink. '
366
+ 'Referenced contents are uploaded, matching '
367
+ 'the default behavior for S3 and GCS syncing.'
368
+ )
369
+
370
+ # Check if source is a list of paths
371
+ if isinstance(source, list):
372
+ # Check for conflicts in basenames
373
+ _check_basename_conflicts(source)
374
+ # Validate each path
375
+ for local_source in source:
376
+ _validate_local_source(local_source)
377
+ is_local_source = True
378
+ else:
379
+ # Check if str source is a valid local/remote URL
380
+ split_path = urllib.parse.urlsplit(source)
381
+ if split_path.scheme == '':
382
+ _validate_local_source(source)
383
+ # Check if source is a file - throw error if it is
384
+ full_src = os.path.abspath(os.path.expanduser(source))
385
+ if os.path.isfile(full_src):
386
+ with ux_utils.print_exception_no_traceback():
387
+ raise exceptions.StorageSourceError(
388
+ 'Storage source path cannot be a file - only'
389
+ ' directories are supported as a source. '
390
+ 'To upload a single file, specify it in a list '
391
+ f'by writing source: [{source}]. Note '
392
+ 'that the file will be uploaded to the root of the '
393
+ 'bucket and will appear at <destination_path>/'
394
+ f'{os.path.basename(source)}. Alternatively, you '
395
+ 'can directly upload the file to the VM without '
396
+ 'using a bucket by writing <destination_path>: '
397
+ f'{source} in the file_mounts section of your YAML'
398
+ )
399
+ is_local_source = True
400
+ elif split_path.scheme in ['s3', 'gs', 'https', 'r2', 'cos']:
401
+ is_local_source = False
402
+ # Storage mounting does not support mounting specific files from
403
+ # cloud store - ensure path points to only a directory
404
+ if mode == StorageMode.MOUNT:
405
+ if split_path.scheme != 'https' and (
406
+ (
407
+ split_path.scheme != 'cos'
408
+ and split_path.path.strip('/') != ''
409
+ )
410
+ or (
411
+ split_path.scheme == 'cos'
412
+ and not re.match(r'^/[-\w]+(/\s*)?$', split_path.path)
413
+ )
414
+ ):
415
+ # regex allows split_path.path to include /bucket
416
+ # or /bucket/optional_whitespaces while considering
417
+ # cos URI's regions (cos://region/bucket_name)
418
+ with ux_utils.print_exception_no_traceback():
419
+ raise exceptions.StorageModeError(
420
+ 'MOUNT mode does not support'
421
+ ' mounting specific files from cloud'
422
+ ' storage. Please use COPY mode or'
423
+ ' specify only the bucket name as'
424
+ ' the source.'
425
+ )
426
+ else:
427
+ with ux_utils.print_exception_no_traceback():
428
+ raise exceptions.StorageSourceError(
429
+ f'Supported paths: local, s3://, gs://, https://, '
430
+ f'r2://, cos://. Got: {source}'
431
+ )
432
+ return source, is_local_source
433
+
434
+ def _validate_storage_spec(self, name: Optional[str]) -> None:
435
+ """Validates the storage spec and updates local fields if necessary."""
436
+
437
+ def validate_name(name):
438
+ """Checks for validating the storage name.
439
+
440
+ Checks if the name starts the s3, gcs or r2 prefix and raise error
441
+ if it does. Store specific validation checks (e.g., S3 specific
442
+ rules) happen in the corresponding store class.
443
+ """
444
+ prefix = name.split('://')[0]
445
+ prefix = prefix.lower()
446
+ if prefix in ['s3', 'gs', 'https', 'r2', 'cos']:
447
+ with ux_utils.print_exception_no_traceback():
448
+ raise exceptions.StorageNameError(
449
+ 'Prefix detected: `name` cannot start with '
450
+ f'{prefix}://. If you are trying to use an existing '
451
+ 'bucket created outside of SkyPilot, please specify it '
452
+ 'using the `source` field (e.g. '
453
+ '`source: s3://mybucket/`). If you are trying to '
454
+ 'create a new bucket, please use the `store` field to '
455
+ 'specify the store type (e.g. `store: s3`).'
456
+ )
457
+
458
+ if self.source is None:
459
+ # If the mode is COPY, the source must be specified
460
+ if self.mode == StorageMode.COPY:
461
+ # Check if a Storage object already exists in global_user_state
462
+ # (e.g. used as scratch previously). Such storage objects can be
463
+ # mounted in copy mode even though they have no source in the
464
+ # yaml spec (the name is the source).
465
+ # TODO(asaiacai): remove references to global_user_state
466
+ # handle = global_user_state.get_handle_from_storage_name(name)
467
+ handle = None
468
+ if handle is None:
469
+ with ux_utils.print_exception_no_traceback():
470
+ raise exceptions.StorageSourceError(
471
+ 'New storage object: source must be specified when '
472
+ 'using COPY mode.'
473
+ )
474
+ else:
475
+ # If source is not specified in MOUNT mode, the intent is to
476
+ # create a bucket and use it as scratch disk. Name must be
477
+ # specified to create bucket.
478
+ if not name:
479
+ with ux_utils.print_exception_no_traceback():
480
+ raise exceptions.StorageSpecError(
481
+ 'Storage source or storage name must be specified.'
482
+ )
483
+ assert name is not None, handle
484
+ validate_name(name)
485
+ self.name = name
486
+ return
487
+ elif self.source is not None:
488
+ source, is_local_source = Storage._validate_source(
489
+ self.source, self.mode, self.sync_on_reconstruction
490
+ )
491
+ if not name:
492
+ if is_local_source:
493
+ with ux_utils.print_exception_no_traceback():
494
+ raise exceptions.StorageNameError(
495
+ 'Storage name must be specified if the source is ' 'local.'
496
+ )
497
+ else:
498
+ assert isinstance(source, str)
499
+ # Set name to source bucket name and continue
500
+ name = urllib.parse.urlsplit(source).netloc
501
+ assert name is not None, source
502
+ self.name = name
503
+ return
504
+ else:
505
+ if is_local_source:
506
+ # If name is specified and source is local, upload to bucket
507
+ assert name is not None, source
508
+ validate_name(name)
509
+ self.name = name
510
+ return
511
+ else:
512
+ # Both name and source should not be specified if the source
513
+ # is a URI. Name will be inferred from the URI.
514
+ with ux_utils.print_exception_no_traceback():
515
+ raise exceptions.StorageSpecError(
516
+ 'Storage name should not be specified if the '
517
+ 'source is a remote URI.'
518
+ )
519
+ raise exceptions.StorageSpecError(
520
+ f'Validation failed for storage source {self.source}, name '
521
+ f'{self.name} and mode {self.mode}. Please check the arguments.'
522
+ )
523
+
524
+ def _add_store_from_metadata(
525
+ self, sky_stores: Dict[StoreType, 'storage_utils.AbstractStore.StoreMetadata']
526
+ ) -> None:
527
+ """Reconstructs Storage.stores from sky_stores.
528
+
529
+ Reconstructs AbstractStore objects from sky_store's metadata and
530
+ adds them to Storage.stores.
531
+ """
532
+ for s_type, s_metadata in sky_stores.items():
533
+ # When initializing from global_user_state, we override the
534
+ # source from the YAML
535
+ try:
536
+ # if s_type == StoreType.S3:
537
+ # store = S3Store.from_metadata(
538
+ # s_metadata,
539
+ # source=self.source,
540
+ # sync_on_reconstruction=self.sync_on_reconstruction)
541
+ if s_type == StoreType.GCS:
542
+ store = gcp.GcsStore.from_metadata(
543
+ s_metadata,
544
+ source=self.source,
545
+ sync_on_reconstruction=self.sync_on_reconstruction,
546
+ )
547
+ # elif s_type == StoreType.AZURE:
548
+ # assert isinstance(s_metadata,
549
+ # AzureBlobStore.AzureBlobStoreMetadata)
550
+ # store = AzureBlobStore.from_metadata(
551
+ # s_metadata,
552
+ # source=self.source,
553
+ # sync_on_reconstruction=self.sync_on_reconstruction)
554
+ # elif s_type == StoreType.R2:
555
+ # store = R2Store.from_metadata(
556
+ # s_metadata,
557
+ # source=self.source,
558
+ # sync_on_reconstruction=self.sync_on_reconstruction)
559
+ # elif s_type == StoreType.IBM:
560
+ # store = IBMCosStore.from_metadata(
561
+ # s_metadata,
562
+ # source=self.source,
563
+ # sync_on_reconstruction=self.sync_on_reconstruction)
564
+ else:
565
+ with ux_utils.print_exception_no_traceback():
566
+ raise ValueError(f'Unknown store type: {s_type}')
567
+ # The following error is caught when attempting to fetch a
568
+ # storage bucket that was removed externally.
569
+ except exceptions.StorageExternalDeletionError:
570
+ logger.debug(
571
+ f'Storage object {self.name!r} was attempted '
572
+ 'to be reconstructed while the corresponding '
573
+ 'bucket was externally deleted.'
574
+ )
575
+ continue
576
+
577
+ self._add_store(store, is_reconstructed=True)
578
+
579
+ @classmethod
580
+ def from_metadata(cls, metadata: StorageMetadata, **override_args) -> 'Storage':
581
+ """Create Storage from StorageMetadata object.
582
+
583
+ Used when reconstructing Storage object and AbstractStore objects from
584
+ global_user_state.
585
+ """
586
+ # Name should not be specified if the source is a cloud store URL.
587
+ source = override_args.get('source', metadata.source)
588
+ name = override_args.get('name', metadata.storage_name)
589
+ # If the source is a list, it consists of local paths
590
+ if not isinstance(source, list) and data_utils.is_cloud_store_url(source):
591
+ name = None
592
+
593
+ storage_obj = cls(
594
+ name=name,
595
+ source=source,
596
+ sync_on_reconstruction=override_args.get('sync_on_reconstruction', True),
597
+ )
598
+
599
+ # For backward compatibility
600
+ if hasattr(metadata, 'mode'):
601
+ if metadata.mode:
602
+ storage_obj.mode = override_args.get('mode', metadata.mode)
603
+
604
+ return storage_obj
605
+
606
+ def add_store(
607
+ self, store_type: Union[str, StoreType], region: Optional[str] = None
608
+ ) -> 'storage_utils.AbstractStore':
609
+ """Initializes and adds a new store to the storage.
610
+
611
+ Invoked by the optimizer after it has selected a store to
612
+ add to the Storage object.
613
+
614
+ Args:
615
+ store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM]
616
+ region: str; Region to place the bucket in. Caller must ensure that
617
+ the region is valid for the chosen store_type.
618
+ """
619
+ if isinstance(store_type, str):
620
+ store_type = StoreType(store_type)
621
+
622
+ store_cls: Type['storage_utils.AbstractStore']
623
+ if store_type == StoreType.GCS:
624
+ store_cls = gcp.GcsStore
625
+ else:
626
+ with ux_utils.print_exception_no_traceback():
627
+ raise exceptions.StorageSpecError(
628
+ f'{store_type} not supported as a Store.'
629
+ )
630
+
631
+ # Initialize store object and get/create bucket
632
+ try:
633
+ assert self.source is not None
634
+ store = store_cls(
635
+ name=self.name,
636
+ source=self.source,
637
+ region=region,
638
+ sync_on_reconstruction=self.sync_on_reconstruction,
639
+ is_sky_managed=self._is_sky_managed,
640
+ _bucket_sub_path=self._bucket_sub_path,
641
+ )
642
+ except exceptions.StorageBucketCreateError:
643
+ # Creation failed, so this must be sky managed store. Add failure
644
+ # to state.
645
+ logger.error(
646
+ f'Could not create {store_type} store ' f'with name {self.name}.'
647
+ )
648
+ raise
649
+ except exceptions.StorageBucketGetError:
650
+ # Bucket get failed, so this is not sky managed. Do not update state
651
+ logger.error(f'Could not get {store_type} store ' f'with name {self.name}.')
652
+ raise
653
+ except exceptions.StorageInitError:
654
+ logger.error(
655
+ f'Could not initialize {store_type} store with '
656
+ f'name {self.name}. General initialization error.'
657
+ )
658
+ raise
659
+ except exceptions.StorageSpecError:
660
+ logger.error(
661
+ f'Could not mount externally created {store_type} '
662
+ f'store with name {self.name!r}.'
663
+ )
664
+ raise
665
+
666
+ # Add store to storage
667
+ self._add_store(store)
668
+
669
+ # Upload source to store
670
+ self._sync_store(store)
671
+
672
+ return store
673
+
674
+ def _add_store(
675
+ self, store: 'storage_utils.AbstractStore', is_reconstructed: bool = False
676
+ ):
677
+ # Adds a store object to the storage
678
+ store_type = StoreType.from_store(store)
679
+ self.stores[store_type] = store
680
+
681
+ def delete(self, store_type: Optional[StoreType] = None) -> None:
682
+ """Deletes data for all sky-managed storage objects.
683
+
684
+ If a storage is not managed by sky, it is not deleted from the cloud.
685
+ User must manually delete any object stores created outside of sky.
686
+
687
+ Args:
688
+ store_type: StoreType; Specific cloud store to remove from the list
689
+ of backing stores.
690
+ """
691
+ if not self.stores:
692
+ logger.info('No backing stores found. Deleting storage.')
693
+ if store_type:
694
+ store = self.stores[store_type]
695
+ assert store is not None
696
+ # We delete a store from the cloud if it's sky managed. Else just
697
+ # remove handle and return
698
+ if self.force_delete:
699
+ store.delete()
700
+ # Remove store from bookkeeping
701
+ del self.stores[store_type]
702
+ else:
703
+ for _, store in self.stores.items():
704
+ assert store is not None
705
+ if self.force_delete:
706
+ store.delete()
707
+ self.stores = {}
708
+
709
+ def sync_all_stores(self):
710
+ """Syncs the source and destinations of all stores in the Storage"""
711
+ for _, store in self.stores.items():
712
+ self._sync_store(store)
713
+
714
+ def _sync_store(self, store: 'storage_utils.AbstractStore'):
715
+ """Runs the upload routine for the store and handles failures"""
716
+
717
+ def warn_for_git_dir(source: str):
718
+ if os.path.isdir(os.path.join(source, '.git')):
719
+ logger.warning(
720
+ f"'.git' directory under '{self.source}' "
721
+ 'is excluded during sync.'
722
+ )
723
+
724
+ try:
725
+ if self.source is not None:
726
+ if isinstance(self.source, str):
727
+ warn_for_git_dir(self.source)
728
+ else:
729
+ for source in self.source:
730
+ warn_for_git_dir(source)
731
+ store.upload()
732
+ except exceptions.StorageUploadError:
733
+ logger.error(
734
+ f'Could not upload {self.source!r} to store ' f'name {store.name!r}.'
735
+ )
736
+ raise
737
+
738
+ @classmethod
739
+ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage':
740
+ common_utils.validate_schema(
741
+ config, schemas.get_storage_schema(), 'Invalid storage YAML: '
742
+ )
743
+
744
+ name = config.pop('name', None)
745
+ source = config.pop('source', None)
746
+ store = config.pop('store', None)
747
+ mode_str = config.pop('mode', None)
748
+ force_delete = config.pop('_force_delete', None)
749
+ if force_delete is None:
750
+ force_delete = False
751
+
752
+ if isinstance(mode_str, str):
753
+ # Make mode case insensitive, if specified
754
+ mode = StorageMode(mode_str.upper())
755
+ else:
756
+ # Make sure this stays the same as the default mode in __init__
757
+ mode = StorageMode.MOUNT
758
+ persistent = config.pop('persistent', None)
759
+ if persistent is None:
760
+ persistent = True
761
+
762
+ assert not config, f'Invalid storage args: {config.keys()}'
763
+
764
+ # Validation of the config object happens on instantiation.
765
+ storage_obj = cls(name=name, source=source, persistent=persistent, mode=mode)
766
+ if store is not None:
767
+ storage_obj.add_store(StoreType(store.upper()))
768
+
769
+ # Add force deletion flag
770
+ storage_obj.force_delete = force_delete
771
+ return storage_obj
772
+
773
+ def to_yaml_config(self) -> Dict[str, str]:
774
+ config = {}
775
+
776
+ def add_if_not_none(key: str, value: Optional[Any]):
777
+ if value is not None:
778
+ config[key] = value
779
+
780
+ name = None
781
+ if (
782
+ self.source is None
783
+ or not isinstance(self.source, str)
784
+ or not data_utils.is_cloud_store_url(self.source)
785
+ ):
786
+ # Only include the name when the source is not a cloud store URL
787
+ name = self.name
788
+ add_if_not_none('name', name)
789
+ add_if_not_none('source', self.source)
790
+
791
+ stores = None
792
+ if len(self.stores) > 0:
793
+ stores = ','.join([store.value for store in self.stores])
794
+ add_if_not_none('store', stores)
795
+ add_if_not_none('persistent', self.persistent)
796
+ add_if_not_none('mode', self.mode.value)
797
+ if self.force_delete:
798
+ config['_force_delete'] = True
799
+ return config
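
For orientation, the sketch below shows how the new Storage API added in konduktor/data/storage.py above appears to be used, based solely on the code in this diff. It is not part of the package diff itself: the bucket name and local path are hypothetical, and actually running it assumes the local path exists and GCS credentials are configured (e.g. verified via `konduktor check`).

from konduktor.data.storage import Storage, StorageMode, StoreType

# Upload a local directory to a GCS-backed bucket (name and path are hypothetical).
storage = Storage(
    name='my-training-data',
    source='~/datasets/imagenet',
    mode=StorageMode.COPY,
)
# Initializes the GcsStore, creates or fetches the bucket, and syncs the source.
storage.add_store(StoreType.GCS)

# Equivalent construction from a task YAML snippet via from_yaml_config:
storage_from_yaml = Storage.from_yaml_config({
    'name': 'my-training-data',
    'source': '~/datasets/imagenet',
    'store': 'gcs',
    'mode': 'copy',
})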