konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
konduktor/data/storage.py
@@ -0,0 +1,812 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Different cloud storage definitions. This modules responsibility
14
+ 1.) Create the secrets for each cloud as k8s secrets
15
+ 2.) Mount the secrets as volumes into each container
16
+ 3.) Provide utilities/scripts for the pods to download files syncd
17
+ to object storage
18
+
19
+ For each cloud/storage class we'll only have a single namespace at
20
+ `konduktor` and each run will correspond to a new folder e.g.
21
+ `s3://konduktor/my-llm-run-a34be-a3ebf`
22
+ """
23
+
24
+ import enum
25
+ import os
26
+ import re
27
+ import urllib.parse
28
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
29
+
30
+ from konduktor import check, config, logging
31
+ from konduktor.data import aws, constants, data_utils, gcp, registry, storage_utils
32
+ from konduktor.utils import annotations, common_utils, exceptions, schemas, ux_utils
33
+
34
+ logger = logging.get_logger(__file__)
35
+
36
+
37
+ @annotations.lru_cache(scope='global')
38
+ def get_cached_enabled_storage_clouds_or_refresh(
39
+ raise_if_no_cloud_access: bool = False,
40
+ ) -> List[str]:
41
+ # This is a temporary solution until https://github.com/skypilot-org/skypilot/issues/1943 # noqa: E501
42
+ # (asaiacai): This function does not do any actual checking right now.
43
+ # this is temporary. In the future, we can cache to disk.
44
+ # For now, we just print a warning to the user saying what
45
+ # clouds are enabled and if the task fails to run `konduktor check`
46
+ # to update the credentials.
47
+ enabled_clouds = config.get_nested(('allowed_clouds',), [])
48
+ if len(enabled_clouds) == 0:
49
+ enabled_clouds = registry._STORE_ENABLED_CLOUDS
50
+ else:
51
+ enabled_clouds = [str(cloud) for cloud in enabled_clouds]
52
+ logger.warning(
53
+ f'Enabled storage clouds: {enabled_clouds}. Defaulting to '
54
+ f'{enabled_clouds[0]}. If sync fails, '
55
+ 're-run `konduktor check` to verify credentials.'
56
+ )
57
+ return enabled_clouds
58
+
59
+
60
+ def _is_storage_cloud_enabled(
61
+ cloud_name: str, try_fix_with_sky_check: bool = True
62
+ ) -> bool:
63
+ enabled_storage_clouds = get_cached_enabled_storage_clouds_or_refresh()
64
+ if cloud_name in enabled_storage_clouds:
65
+ return True
66
+ if try_fix_with_sky_check:
67
+ # TODO(zhwu): Only check the specified cloud to speed up.
68
+ check.check(quiet=True)
69
+ return _is_storage_cloud_enabled(cloud_name, try_fix_with_sky_check=False)
70
+ return False
71
+
72
+
73
+ class StorageMode(enum.Enum):
74
+ COPY = 'COPY'
75
+ MOUNT = 'MOUNT'
76
+
77
+
78
+ class StoreType(enum.Enum):
79
+ """Enum for the different types of stores."""
80
+
81
+ GCS = 'GCS'
82
+ S3 = 'S3'
83
+
84
+ @classmethod
85
+ def from_cloud(cls, cloud: str) -> 'StoreType':
86
+ # these need to match the cloud store classes in konduktor/cloud_stores.py
87
+ if cloud.lower() == 'gs':
88
+ return StoreType.GCS
89
+ elif cloud.lower() == 's3':
90
+ return StoreType.S3
91
+ else:
92
+ with ux_utils.print_exception_no_traceback():
93
+ raise ValueError(f'Unknown cloud: {cloud}')
94
+
95
+ @classmethod
96
+ def from_store(cls, store: 'storage_utils.AbstractStore') -> 'StoreType':
97
+ if store.__repr__() == 'GcsStore':
98
+ return StoreType.GCS
99
+ elif store.__repr__() == 'S3Store':
100
+ return StoreType.S3
101
+ else:
102
+ with ux_utils.print_exception_no_traceback():
103
+ raise ValueError(f'Unknown store type: {store}')
104
+
105
+ def store_prefix(self) -> str:
106
+ if self == StoreType.GCS:
107
+ return 'gs://'
108
+ elif self == StoreType.S3:
109
+ return 's3://'
110
+ else:
111
+ with ux_utils.print_exception_no_traceback():
112
+ raise ValueError(f'Unknown store type: {self}')
113
+
114
+ @classmethod
115
+ def get_fields_from_store_url(
116
+ cls, store_url: str
117
+ ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]:
118
+ """Returns the store type, bucket name, and sub path from
119
+ a store URL, and the storage account name and region if applicable.
120
+
121
+ Args:
122
+ store_url: str; The store URL.
123
+ """
124
+ # The full path from the user config of IBM COS contains the region,
125
+ # and Azure Blob Storage contains the storage account name, we need to
126
+ # pass these information to the store constructor.
127
+ storage_account_name = None
128
+ region = None
129
+ for store_type in StoreType:
130
+ if store_url.startswith(store_type.store_prefix()):
131
+ if store_type == StoreType.GCS:
132
+ bucket_name, sub_path = data_utils.split_gcs_path(store_url)
133
+ elif store_type == StoreType.S3:
134
+ bucket_name, sub_path = data_utils.split_s3_path(store_url)
135
+ return store_type, bucket_name, sub_path, storage_account_name, region
136
+ raise ValueError(f'Unknown store URL: {store_url}')
137
+
138
+ @classmethod
139
+ def get_endpoint_url(cls, store: 'storage_utils.AbstractStore', path: str) -> str:
140
+ """Generates the endpoint URL for a given store and path.
141
+
142
+ Args:
143
+ store: Store object implementing AbstractStore.
144
+ path: Path within the store.
145
+
146
+ Returns:
147
+ Endpoint URL of the bucket as a string.
148
+ """
149
+ store_type = cls.from_store(store)
150
+ bucket_endpoint_url = f'{store_type.store_prefix()}{path}'
151
+ return bucket_endpoint_url
152
+
153
+
154
+ # this should match the above StoreType enum
155
+ STORE_TYPES = Literal[StoreType.GCS, StoreType.S3]
156
+
157
+
158
+ class Storage(object):
159
+ """Storage objects handle persistent and large volume storage in the sky.
160
+
161
+ Storage represents an abstract data store containing large data files
162
+ required by the task. Compared to file_mounts, storage is faster and
163
+ can persist across runs, requiring fewer uploads from your local machine.
164
+
165
+ A storage object can be used in either MOUNT mode or COPY mode. In MOUNT
166
+ mode (the default), the backing store is directly "mounted" to the remote
167
+ VM, and files are fetched when accessed by the task and files written to the
168
+ mount path are also written to the remote store. In COPY mode, the files are
169
+ pre-fetched and cached on the local disk and writes are not replicated on
170
+ the remote store.
171
+
172
+ Behind the scenes, storage automatically uploads all data in the source
173
+ to a backing object store in a particular cloud (S3/GCS/Azure Blob).
174
+
175
+ Typical Usage: (See examples/playground/storage_playground.py)
176
+ storage = Storage(name='imagenet-bucket', source='~/Documents/imagenet')
177
+
178
+ # Move data to S3
179
+ storage.add_store('S3')
180
+
181
+ # Move data to Google Cloud Storage
182
+ storage.add_store('GCS')
183
+
184
+ # Delete Storage for both S3 and GCS
185
+ storage.delete()
186
+ """
187
+
188
+ class StorageMetadata(object):
189
+ """A pickle-able tuple of:
190
+
191
+ - (required) Storage name.
192
+ - (required) Source
193
+ - (optional) Storage mode.
194
+ - (optional) Set of stores managed by sky added to the Storage object
195
+ """
196
+
197
+ def __init__(
198
+ self,
199
+ *,
200
+ storage_name: Optional[str],
201
+ source: Optional[constants.SourceType],
202
+ mode: Optional[StorageMode] = None,
203
+ sky_stores: Optional[
204
+ Dict[StoreType, 'storage_utils.AbstractStore.StoreMetadata']
205
+ ] = None,
206
+ ):
207
+ assert storage_name is not None or source is not None
208
+ self.storage_name = storage_name
209
+ self.source = source
210
+ self.mode = mode
211
+
212
+ # Only stores managed by sky are stored here in the
213
+ # global_user_state
214
+ self.sky_stores = {} if sky_stores is None else sky_stores
215
+
216
+ def __repr__(self):
217
+ return (
218
+ f'StorageMetadata('
219
+ f'\n\tstorage_name={self.storage_name},'
220
+ f'\n\tsource={self.source},'
221
+ f'\n\tmode={self.mode},'
222
+ f'\n\t{self.sky_stores})'
223
+ )
224
+
225
+ def add_store(self, store: 'storage_utils.AbstractStore') -> None:
226
+ storetype = StoreType.from_store(store)
227
+ self.sky_stores[storetype] = store.get_metadata()
228
+
229
+ def remove_store(self, store: 'storage_utils.AbstractStore') -> None:
230
+ storetype = StoreType.from_store(store)
231
+ if storetype in self.sky_stores:
232
+ del self.sky_stores[storetype]
233
+
234
+ def __init__(
235
+ self,
236
+ name: Optional[str] = None,
237
+ source: Optional[constants.SourceType] = None,
238
+ stores: Optional[List[STORE_TYPES]] = None,
239
+ persistent: Optional[bool] = True,
240
+ mode: StorageMode = StorageMode.COPY,
241
+ sync_on_reconstruction: Optional[bool] = True,
242
+ _is_sky_managed: Optional[bool] = False,
243
+ _bucket_sub_path: Optional[str] = None,
244
+ ) -> None:
245
+ """Initializes a Storage object.
246
+
247
+ Three fields are required: the name of the storage, the source
248
+ path where the data is initially located, and the default mount
249
+ path where the data will be mounted to on the cloud.
250
+
251
+ Storage object validation depends on the name, source and mount mode.
252
+ There are four combinations possible for name and source inputs:
253
+
254
+ - name is None, source is None: Underspecified storage object.
255
+ - name is not None, source is None: If MOUNT mode, provision an empty
256
+ bucket with name <name>. If COPY mode, raise error since source is
257
+ required.
258
+ - name is None, source is not None: If source is local, raise error
259
+ since name is required to create destination bucket. If source is
260
+ a bucket URL, use the source bucket as the backing store (if
261
+ permissions allow, else raise error).
262
+ - name is not None, source is not None: If source is local, upload the
263
+ contents of the source path to <name> bucket. Create new bucket if
264
+ required. If source is bucket url - raise error. Name should not be
265
+ specified if the source is a URL; name will be inferred from source.
266
+
267
+ Args:
268
+ name: str; Name of the storage object. Typically used as the
269
+ bucket name in backing object stores.
270
+ source: str, List[str]; File path where the data is initially stored.
271
+ Can be a single local path, a list of local paths, or a cloud URI
272
+ (s3://, gs://, etc.). Local paths do not need to be absolute.
273
+ stores: Optional; Specify pre-initialized stores (S3Store, GcsStore).
274
+ persistent: bool; Whether to persist across konduktor launches.
275
+ mode: StorageMode; Specify how the storage object is manifested on
276
+ the remote VM. Can be either MOUNT or COPY. Defaults to MOUNT.
277
+ sync_on_reconstruction: bool; [defunct] Whether to sync the
278
+ data if the storage object is found in the global_user_state
279
+ and reconstructed from there. This is set to
280
+ false when the Storage object is created not for direct use
281
+ _is_sky_managed: Optional[bool]; [defunct] Indicates if the storage is managed
282
+ by Sky. Without this argument, the controller's behavior differs
283
+ from the local machine. For example, if a bucket does not exist:
284
+ Local Machine (is_sky_managed=True) →
285
+ Controller (is_sky_managed=False).
286
+ With this argument, the controller aligns with the local machine,
287
+ ensuring it retains the is_sky_managed information from the YAML.
288
+ During teardown, if is_sky_managed is True, the controller should
289
+ delete the bucket. Otherwise, it might mistakenly delete only the
290
+ sub-path, assuming is_sky_managed is False.
291
+ _bucket_sub_path: Optional[str]; The subdirectory to use for the
292
+ storage object.
293
+ """
294
+ self.name: str
295
+ self.source = source
296
+ self.persistent = persistent
297
+ self.mode = mode
298
+ assert mode in StorageMode
299
+ self.stores: Dict[StoreType, Optional['storage_utils.AbstractStore']] = {}
300
+ if stores is not None:
301
+ for store in stores:
302
+ self.stores[store] = None
303
+ self.sync_on_reconstruction = sync_on_reconstruction
304
+ self._is_sky_managed = _is_sky_managed
305
+ self._bucket_sub_path = _bucket_sub_path
306
+
307
+ # TODO(romilb, zhwu): This is a workaround to support storage deletion
308
+ # for spot. Once sky storage supports forced management for external
309
+ # buckets, this can be deprecated.
310
+ self.force_delete = False
311
+
312
+ # Validate and correct inputs if necessary
313
+ self._validate_storage_spec(name)
314
+
315
+ if self.source is not None:
316
+ # If source is a pre-existing bucket, connect to the bucket
317
+ # If the bucket does not exist, this will error out
318
+ if isinstance(self.source, str):
319
+ if self.source.startswith('gs://'):
320
+ self.add_store(StoreType.GCS)
321
+ elif self.source.startswith('s3://'):
322
+ self.add_store(StoreType.S3)
323
+
324
+ @staticmethod
325
+ def _validate_source(
326
+ source: constants.SourceType,
327
+ mode: StorageMode,
328
+ sync_on_reconstruction: Optional[bool] = None,
329
+ ) -> Tuple[constants.SourceType, bool]:
330
+ """Validates the source path.
331
+
332
+ Args:
333
+ source: str; File path where the data is initially stored. Can be a
334
+ local path or a cloud URI (s3://, gs://, r2:// etc.).
335
+ Local paths do not need to be absolute.
336
+ mode: StorageMode; StorageMode of the storage object
337
+
338
+ Returns:
339
+ Tuple[source, is_local_source]
340
+ source: str; The source path.
341
+ is_local_path: bool; Whether the source is a local path. False if URI.
342
+ """
343
+
344
+ def _check_basename_conflicts(source_list: List[str]) -> None:
345
+ """Checks if two paths in source_list have the same basename."""
346
+ basenames = [os.path.basename(s) for s in source_list]
347
+ conflicts = {x for x in basenames if basenames.count(x) > 1}
348
+ if conflicts:
349
+ with ux_utils.print_exception_no_traceback():
350
+ raise exceptions.StorageSourceError(
351
+ 'Cannot have multiple files or directories with the '
352
+ 'same name in source. Conflicts found for: '
353
+ f'{", ".join(conflicts)}'
354
+ )
355
+
356
+ def _validate_local_source(local_source):
357
+ if local_source.endswith('/'):
358
+ with ux_utils.print_exception_no_traceback():
359
+ raise exceptions.StorageSourceError(
360
+ 'Storage source paths cannot end with a slash '
361
+ '(try "/mydir: /mydir" or "/myfile: /myfile"). '
362
+ f'Found source={local_source}'
363
+ )
364
+ # Local path, check if it exists
365
+ full_src = os.path.abspath(os.path.expanduser(local_source))
366
+ # Only check if local source exists if it is synced to the bucket
367
+ if not os.path.exists(full_src) and sync_on_reconstruction:
368
+ with ux_utils.print_exception_no_traceback():
369
+ raise exceptions.StorageSourceError(
370
+ 'Local source path does not' f' exist: {local_source}'
371
+ )
372
+ # Raise warning if user's path is a symlink
373
+ elif os.path.islink(full_src):
374
+ logger.warning(
375
+ f'Source path {source} is a symlink. '
376
+ 'Referenced contents are uploaded, matching '
377
+ 'the default behavior for S3 and GCS syncing.'
378
+ )
379
+
380
+ # Check if source is a list of paths
381
+ if isinstance(source, list):
382
+ # Check for conflicts in basenames
383
+ _check_basename_conflicts(source)
384
+ # Validate each path
385
+ for local_source in source:
386
+ _validate_local_source(local_source)
387
+ is_local_source = True
388
+ else:
389
+ # Check if str source is a valid local/remote URL
390
+ split_path = urllib.parse.urlsplit(source)
391
+ if split_path.scheme == '':
392
+ _validate_local_source(source)
393
+ # Check if source is a file - throw error if it is
394
+ full_src = os.path.abspath(os.path.expanduser(source))
395
+ if os.path.isfile(full_src):
396
+ with ux_utils.print_exception_no_traceback():
397
+ raise exceptions.StorageSourceError(
398
+ 'Storage source path cannot be a file - only'
399
+ ' directories are supported as a source. '
400
+ 'To upload a single file, specify it in a list '
401
+ f'by writing source: [{source}]. Note '
402
+ 'that the file will be uploaded to the root of the '
403
+ 'bucket and will appear at <destination_path>/'
404
+ f'{os.path.basename(source)}. Alternatively, you '
405
+ 'can directly upload the file to the VM without '
406
+ 'using a bucket by writing <destination_path>: '
407
+ f'{source} in the file_mounts section of your YAML'
408
+ )
409
+ is_local_source = True
410
+ elif split_path.scheme in ['s3', 'gs', 'https', 'r2', 'cos']:
411
+ is_local_source = False
412
+ # Storage mounting does not support mounting specific files from
413
+ # cloud store - ensure path points to only a directory
414
+ if mode == StorageMode.MOUNT:
415
+ if split_path.scheme != 'https' and (
416
+ (
417
+ split_path.scheme != 'cos'
418
+ and split_path.path.strip('/') != ''
419
+ )
420
+ or (
421
+ split_path.scheme == 'cos'
422
+ and not re.match(r'^/[-\w]+(/\s*)?$', split_path.path)
423
+ )
424
+ ):
425
+ # regex allows split_path.path to include /bucket
426
+ # or /bucket/optional_whitespaces while considering
427
+ # cos URI's regions (cos://region/bucket_name)
428
+ with ux_utils.print_exception_no_traceback():
429
+ raise exceptions.StorageModeError(
430
+ 'MOUNT mode does not support'
431
+ ' mounting specific files from cloud'
432
+ ' storage. Please use COPY mode or'
433
+ ' specify only the bucket name as'
434
+ ' the source.'
435
+ )
436
+ else:
437
+ with ux_utils.print_exception_no_traceback():
438
+ raise exceptions.StorageSourceError(
439
+ f'Supported paths: local, s3://, gs://, https://, '
440
+ f'r2://, cos://. Got: {source}'
441
+ )
442
+ return source, is_local_source
443
+
444
+ def _validate_storage_spec(self, name: Optional[str]) -> None:
445
+ """Validates the storage spec and updates local fields if necessary."""
446
+
447
+ def validate_name(name):
448
+ """Checks for validating the storage name.
449
+
450
+ Checks if the name starts the s3, gcs or r2 prefix and raise error
451
+ if it does. Store specific validation checks (e.g., S3 specific
452
+ rules) happen in the corresponding store class.
453
+ """
454
+ prefix = name.split('://')[0]
455
+ prefix = prefix.lower()
456
+ if prefix in ['s3', 'gs', 'https', 'r2', 'cos']:
457
+ with ux_utils.print_exception_no_traceback():
458
+ raise exceptions.StorageNameError(
459
+ 'Prefix detected: `name` cannot start with '
460
+ f'{prefix}://. If you are trying to use an existing '
461
+ 'bucket created outside of SkyPilot, please specify it '
462
+ 'using the `source` field (e.g. '
463
+ '`source: s3://mybucket/`). If you are trying to '
464
+ 'create a new bucket, please use the `store` field to '
465
+ 'specify the store type (e.g. `store: s3`).'
466
+ )
467
+
468
+ if self.source is None:
469
+ # If the mode is COPY, the source must be specified
470
+ if self.mode == StorageMode.COPY:
471
+ # Check if a Storage object already exists in global_user_state
472
+ # (e.g. used as scratch previously). Such storage objects can be
473
+ # mounted in copy mode even though they have no source in the
474
+ # yaml spec (the name is the source).
475
+ # TODO(asaiacai): remove references to global_user_state
476
+ # handle = global_user_state.get_handle_from_storage_name(name)
477
+ handle = None
478
+ if handle is None:
479
+ with ux_utils.print_exception_no_traceback():
480
+ raise exceptions.StorageSourceError(
481
+ 'New storage object: source must be specified when '
482
+ 'using COPY mode.'
483
+ )
484
+ else:
485
+ # If source is not specified in COPY mode, the intent is to
486
+ # create a bucket and use it as scratch disk. Name must be
487
+ # specified to create bucket.
488
+ if not name:
489
+ with ux_utils.print_exception_no_traceback():
490
+ raise exceptions.StorageSpecError(
491
+ 'Storage source or storage name must be specified.'
492
+ )
493
+ assert name is not None, handle
494
+ validate_name(name)
495
+ self.name = name
496
+ return
497
+ elif self.source is not None:
498
+ source, is_local_source = Storage._validate_source(
499
+ self.source, self.mode, self.sync_on_reconstruction
500
+ )
501
+ if not name:
502
+ if is_local_source:
503
+ with ux_utils.print_exception_no_traceback():
504
+ raise exceptions.StorageNameError(
505
+ 'Storage name must be specified if the source is ' 'local.'
506
+ )
507
+ else:
508
+ assert isinstance(source, str)
509
+ # Set name to source bucket name and continue
510
+ name = urllib.parse.urlsplit(source).netloc
511
+ assert name is not None, source
512
+ self.name = name
513
+ return
514
+ else:
515
+ if is_local_source:
516
+ # If name is specified and source is local, upload to bucket
517
+ assert name is not None, source
518
+ validate_name(name)
519
+ self.name = name
520
+ return
521
+ else:
522
+ # Both name and source should not be specified if the source
523
+ # is a URI. Name will be inferred from the URI.
524
+ with ux_utils.print_exception_no_traceback():
525
+ raise exceptions.StorageSpecError(
526
+ 'Storage name should not be specified if the '
527
+ 'source is a remote URI.'
528
+ )
529
+ raise exceptions.StorageSpecError(
530
+ f'Validation failed for storage source {self.source}, name '
531
+ f'{self.name} and mode {self.mode}. Please check the arguments.'
532
+ )
533
+
534
+ def _add_store_from_metadata(
535
+ self, sky_stores: Dict[StoreType, 'storage_utils.AbstractStore.StoreMetadata']
536
+ ) -> None:
537
+ """Reconstructs Storage.stores from sky_stores.
538
+
539
+ Reconstruct AbstractStore objects from sky_store's metadata and
540
+ adds them into Storage.stores
541
+ """
542
+ for s_type, s_metadata in sky_stores.items():
543
+ # When initializing from global_user_state, we override the
544
+ # source from the YAML
545
+ try:
546
+ if s_type == StoreType.S3:
547
+ store = aws.S3Store.from_metadata(
548
+ s_metadata,
549
+ source=self.source,
550
+ sync_on_reconstruction=self.sync_on_reconstruction,
551
+ )
552
+ elif s_type == StoreType.GCS:
553
+ store = gcp.GcsStore.from_metadata(
554
+ s_metadata,
555
+ source=self.source,
556
+ sync_on_reconstruction=self.sync_on_reconstruction,
557
+ )
558
+ # elif s_type == StoreType.AZURE:
559
+ # assert isinstance(s_metadata,
560
+ # AzureBlobStore.AzureBlobStoreMetadata)
561
+ # store = AzureBlobStore.from_metadata(
562
+ # s_metadata,
563
+ # source=self.source,
564
+ # sync_on_reconstruction=self.sync_on_reconstruction)
565
+ # elif s_type == StoreType.R2:
566
+ # store = R2Store.from_metadata(
567
+ # s_metadata,
568
+ # source=self.source,
569
+ # sync_on_reconstruction=self.sync_on_reconstruction)
570
+ # elif s_type == StoreType.IBM:
571
+ # store = IBMCosStore.from_metadata(
572
+ # s_metadata,
573
+ # source=self.source,
574
+ # sync_on_reconstruction=self.sync_on_reconstruction)
575
+ else:
576
+ with ux_utils.print_exception_no_traceback():
577
+ raise ValueError(f'Unknown store type: {s_type}')
578
+ # Following error is caught when an externally removed storage
579
+ # is attempted to be fetched.
580
+ except exceptions.StorageExternalDeletionError:
581
+ logger.debug(
582
+ f'Storage object {self.name!r} was attempted '
583
+ 'to be reconstructed while the corresponding '
584
+ 'bucket was externally deleted.'
585
+ )
586
+ continue
587
+
588
+ self._add_store(store, is_reconstructed=True)
589
+
590
+ @classmethod
591
+ def from_metadata(cls, metadata: StorageMetadata, **override_args) -> 'Storage':
592
+ """Create Storage from StorageMetadata object.
593
+
594
+ Used when reconstructing Storage object and AbstractStore objects from
595
+ global_user_state.
596
+ """
597
+ # Name should not be specified if the source is a cloud store URL.
598
+ source = override_args.get('source', metadata.source)
599
+ name = override_args.get('name', metadata.storage_name)
600
+ # If the source is a list, it consists of local paths
601
+ if not isinstance(source, list) and data_utils.is_cloud_store_url(source):
602
+ name = None
603
+
604
+ storage_obj = cls(
605
+ name=name,
606
+ source=source,
607
+ sync_on_reconstruction=override_args.get('sync_on_reconstruction', True),
608
+ )
609
+
610
+ # For backward compatibility
611
+ if hasattr(metadata, 'mode'):
612
+ if metadata.mode:
613
+ storage_obj.mode = override_args.get('mode', metadata.mode)
614
+
615
+ return storage_obj
616
+
617
+ def add_store(
618
+ self, store_type: Union[str, StoreType], region: Optional[str] = None
619
+ ) -> 'storage_utils.AbstractStore':
620
+ """Initializes and adds a new store to the storage.
621
+
622
+ Invoked by the optimizer after it has selected a store to
623
+ add it to Storage.
624
+
625
+ Args:
626
+ store_type: StoreType; Type of the storage [S3, GCS, AZURE, R2, IBM]
627
+ region: str; Region to place the bucket in. Caller must ensure that
628
+ the region is valid for the chosen store_type.
629
+ """
630
+ if isinstance(store_type, str):
631
+ store_type = StoreType(store_type)
632
+
633
+ store_cls: Type['storage_utils.AbstractStore']
634
+ if store_type == StoreType.GCS:
635
+ store_cls = gcp.GcsStore
636
+ elif store_type == StoreType.S3:
637
+ store_cls = aws.S3Store
638
+ else:
639
+ with ux_utils.print_exception_no_traceback():
640
+ raise exceptions.StorageSpecError(
641
+ f'{store_type} not supported as a Store.'
642
+ )
643
+
644
+ # Initialize store object and get/create bucket
645
+ try:
646
+ assert self.source is not None
647
+ store = store_cls(
648
+ name=self.name,
649
+ source=self.source,
650
+ region=region,
651
+ sync_on_reconstruction=self.sync_on_reconstruction,
652
+ is_sky_managed=self._is_sky_managed,
653
+ _bucket_sub_path=self._bucket_sub_path,
654
+ )
655
+ except exceptions.StorageBucketCreateError:
656
+ # Creation failed, so this must be sky managed store. Add failure
657
+ # to state.
658
+ logger.error(
659
+ f'Could not create {store_type} store ' f'with name {self.name}.'
660
+ )
661
+ raise
662
+ except exceptions.StorageBucketGetError:
663
+ # Bucket get failed, so this is not sky managed. Do not update state
664
+ logger.error(f'Could not get {store_type} store ' f'with name {self.name}.')
665
+ raise
666
+ except exceptions.StorageInitError:
667
+ logger.error(
668
+ f'Could not initialize {store_type} store with '
669
+ f'name {self.name}. General initialization error.'
670
+ )
671
+ raise
672
+ except exceptions.StorageSpecError:
673
+ logger.error(
674
+ f'Could not mount externally created {store_type}'
675
+ f'store with name {self.name!r}.'
676
+ )
677
+ raise
678
+
679
+ # Add store to storage
680
+ self._add_store(store)
681
+
682
+ # Upload source to store
683
+ self._sync_store(store)
684
+
685
+ return store
686
+
687
+ def _add_store(
688
+ self, store: 'storage_utils.AbstractStore', is_reconstructed: bool = False
689
+ ):
690
+ # Adds a store object to the storage
691
+ store_type = StoreType.from_store(store)
692
+ self.stores[store_type] = store
693
+
694
+ def delete(self, store_type: Optional[StoreType] = None) -> None:
695
+ """Deletes data for all sky-managed storage objects.
696
+
697
+ If a storage is not managed by sky, it is not deleted from the cloud.
698
+ User must manually delete any object stores created outside of sky.
699
+
700
+ Args:
701
+ store_type: StoreType; Specific cloud store to remove from the list
702
+ of backing stores.
703
+ """
704
+ if not self.stores:
705
+ logger.info('No backing stores found. Deleting storage.')
706
+ if store_type:
707
+ store = self.stores[store_type]
708
+ assert store is not None
709
+ # We delete a store from the cloud if it's sky managed. Else just
710
+ # remove handle and return
711
+ if self.force_delete:
712
+ store.delete()
713
+ # Remove store from bookkeeping
714
+ del self.stores[store_type]
715
+ else:
716
+ for _, store in self.stores.items():
717
+ assert store is not None
718
+ if self.force_delete:
719
+ store.delete()
720
+ self.stores = {}
721
+
722
+ def sync_all_stores(self):
723
+ """Syncs the source and destinations of all stores in the Storage"""
724
+ for _, store in self.stores.items():
725
+ self._sync_store(store)
726
+
727
+ def _sync_store(self, store: 'storage_utils.AbstractStore'):
728
+ """Runs the upload routine for the store and handles failures"""
729
+
730
+ def warn_for_git_dir(source: str):
731
+ if os.path.isdir(os.path.join(source, '.git')):
732
+ logger.warning(
733
+ f"'.git' directory under '{self.source}' "
734
+ 'is excluded during sync.'
735
+ )
736
+
737
+ try:
738
+ if self.source is not None:
739
+ if isinstance(self.source, str):
740
+ warn_for_git_dir(self.source)
741
+ else:
742
+ for source in self.source:
743
+ warn_for_git_dir(source)
744
+ store.upload()
745
+ except exceptions.StorageUploadError:
746
+ logger.error(
747
+ f'Could not upload {self.source!r} to store ' f'name {store.name!r}.'
748
+ )
749
+ raise
750
+
751
+ @classmethod
752
+ def from_yaml_config(cls, config: Dict[str, Any]) -> 'Storage':
753
+ common_utils.validate_schema(
754
+ config, schemas.get_storage_schema(), 'Invalid storage YAML: '
755
+ )
756
+
757
+ name = config.pop('name', None)
758
+ source = config.pop('source', None)
759
+ store = config.pop('store', None)
760
+ mode_str = config.pop('mode', None)
761
+ force_delete = config.pop('_force_delete', None)
762
+ if force_delete is None:
763
+ force_delete = False
764
+
765
+ if isinstance(mode_str, str):
766
+ # Make mode case insensitive, if specified
767
+ mode = StorageMode(mode_str.upper())
768
+ else:
769
+ # Make sure this keeps the same as the default mode in __init__
770
+ mode = StorageMode.MOUNT
771
+ persistent = config.pop('persistent', None)
772
+ if persistent is None:
773
+ persistent = True
774
+
775
+ assert not config, f'Invalid storage args: {config.keys()}'
776
+
777
+ # Validation of the config object happens on instantiation.
778
+ storage_obj = cls(name=name, source=source, persistent=persistent, mode=mode)
779
+ if store is not None:
780
+ storage_obj.add_store(StoreType(store.upper()))
781
+
782
+ # Add force deletion flag
783
+ storage_obj.force_delete = force_delete
784
+ return storage_obj
785
+
786
+ def to_yaml_config(self) -> Dict[str, str]:
787
+ config = {}
788
+
789
+ def add_if_not_none(key: str, value: Optional[Any]):
790
+ if value is not None:
791
+ config[key] = value
792
+
793
+ name = None
794
+ if (
795
+ self.source is None
796
+ or not isinstance(self.source, str)
797
+ or not data_utils.is_cloud_store_url(self.source)
798
+ ):
799
+ # Remove name if source is a cloud store URL
800
+ name = self.name
801
+ add_if_not_none('name', name)
802
+ add_if_not_none('source', self.source)
803
+
804
+ stores = None
805
+ if len(self.stores) > 0:
806
+ stores = ','.join([store.value for store in self.stores])
807
+ add_if_not_none('store', stores)
808
+ add_if_not_none('persistent', self.persistent)
809
+ add_if_not_none('mode', self.mode.value)
810
+ if self.force_delete:
811
+ config['_force_delete'] = True
812
+ return config
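Finally, a sketch of a round trip through the YAML helpers above. The keys mirror `from_yaml_config`; 'store' is omitted so no bucket is touched, the names are hypothetical, and note that `from_yaml_config` pops keys from the dict it is given.

from konduktor.data.storage import Storage

spec = {'name': 'my-run-artifacts', 'source': '~/artifacts',
        'mode': 'copy', 'persistent': True}
storage = Storage.from_yaml_config(dict(spec))
print(storage.to_yaml_config())
# expected: name, source, persistent and mode echoed back; 'store' appears once stores are added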