skypilot-nightly 1.0.0.dev20250401__py3-none-any.whl → 1.0.0.dev20250403__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +58 -13
- sky/check.py +2 -2
- sky/cli.py +2 -1
- sky/client/cli.py +2 -1
- sky/cloud_stores.py +8 -10
- sky/clouds/kubernetes.py +9 -0
- sky/data/data_utils.py +178 -90
- sky/data/mounting_utils.py +79 -22
- sky/data/storage.py +100 -34
- sky/data/storage_utils.py +13 -3
- sky/global_user_state.py +5 -0
- sky/jobs/controller.py +39 -7
- sky/jobs/server/server.py +1 -1
- sky/models.py +4 -1
- sky/provision/kubernetes/utils.py +37 -0
- sky/server/requests/payloads.py +4 -4
- sky/server/server.py +8 -1
- sky/skylet/constants.py +8 -2
- sky/task.py +4 -4
- sky/utils/controller_utils.py +3 -3
- sky/utils/kubernetes/gpu_labeler.py +46 -57
- {skypilot_nightly-1.0.0.dev20250401.dist-info → skypilot_nightly-1.0.0.dev20250403.dist-info}/METADATA +1 -5
- {skypilot_nightly-1.0.0.dev20250401.dist-info → skypilot_nightly-1.0.0.dev20250403.dist-info}/RECORD +29 -29
- {skypilot_nightly-1.0.0.dev20250401.dist-info → skypilot_nightly-1.0.0.dev20250403.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250401.dist-info → skypilot_nightly-1.0.0.dev20250403.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250401.dist-info → skypilot_nightly-1.0.0.dev20250403.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250401.dist-info → skypilot_nightly-1.0.0.dev20250403.dist-info}/top_level.txt +0 -0
sky/data/mounting_utils.py
CHANGED
@@ -1,10 +1,13 @@
 """Helper functions for object store mounting in Sky Storage"""
+import hashlib
+import os
 import random
 import shlex
 import textwrap
 from typing import Optional
 
 from sky import exceptions
+from sky.skylet import constants
 from sky.utils import command_runner
 
 # Values used to construct mounting commands
@@ -14,11 +17,17 @@ _TYPE_CACHE_TTL = '5s'
 _RENAME_DIR_LIMIT = 10000
 # https://github.com/GoogleCloudPlatform/gcsfuse/releases
 GCSFUSE_VERSION = '2.2.0'
+# Creates a fusermount3 soft link on older (<22) Ubuntu systems to utilize
+# Rclone's mounting utility.
+FUSERMOUNT3_SOFT_LINK_CMD = ('[ ! -f /bin/fusermount3 ] && '
+                             'sudo ln -s /bin/fusermount /bin/fusermount3 || '
+                             'true')
 # https://github.com/Azure/azure-storage-fuse/releases
 BLOBFUSE2_VERSION = '2.2.0'
 _BLOBFUSE_CACHE_ROOT_DIR = '~/.sky/blobfuse2_cache'
 _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
                        '{storage_account_name}_{container_name}')
+# https://github.com/rclone/rclone/releases
 RCLONE_VERSION = 'v1.68.2'
 
 
@@ -112,7 +121,12 @@ def get_az_mount_install_cmd() -> str:
                    'sudo apt-get update; '
                    'sudo apt-get install -y '
                    '-o Dpkg::Options::="--force-confdef" '
-                   'fuse3 libfuse3-dev
+                   'fuse3 libfuse3-dev || { '
+                   '  echo "fuse3 not available, falling back to fuse"; '
+                   '  sudo apt-get install -y '
+                   '  -o Dpkg::Options::="--force-confdef" '
+                   '  fuse libfuse-dev; '
+                   '} && '
                    'ARCH=$(uname -m) && '
                    'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
                    '  echo "blobfuse2 is not supported on $ARCH" && '
@@ -203,31 +217,17 @@ def get_r2_mount_cmd(r2_credentials_path: str,
     return mount_cmd
 
 
-def 
-
-    install_cmd = ('rclone version >/dev/null 2>&1 || '
-                   '(curl https://rclone.org/install.sh | '
-                   'sudo bash)')
-    return install_cmd
-
-
-def get_cos_mount_cmd(rclone_config_data: str,
-                      rclone_config_path: str,
-                      bucket_rclone_profile: str,
+def get_cos_mount_cmd(rclone_config: str,
+                      rclone_profile_name: str,
                       bucket_name: str,
                       mount_path: str,
                       _bucket_sub_path: Optional[str] = None) -> str:
     """Returns a command to mount an IBM COS bucket using rclone."""
-    # creates a fusermount soft link on older (<22) Ubuntu systems for
-    # rclone's mount utility.
-    set_fuser3_soft_link = ('[ ! -f /bin/fusermount3 ] && '
-                            'sudo ln -s /bin/fusermount /bin/fusermount3 || '
-                            'true')
     # stores bucket profile in rclone config file at the cluster's nodes.
-    configure_rclone_profile = (f'{
-                                'mkdir -p 
-                                f'echo "{
-                                f'{
+    configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
+                                f'mkdir -p {constants.RCLONE_CONFIG_DIR} && '
+                                f'echo "{rclone_config}" >> '
+                                f'{constants.RCLONE_CONFIG_PATH}')
     if _bucket_sub_path is None:
         sub_path_arg = f'{bucket_name}/{_bucket_sub_path}'
     else:
@@ -235,11 +235,68 @@ def get_cos_mount_cmd(rclone_config_data: str,
     # --daemon will keep the mounting process running in the background.
     mount_cmd = (f'{configure_rclone_profile} && '
                  'rclone mount '
-                 f'{
+                 f'{rclone_profile_name}:{sub_path_arg} {mount_path} '
                  '--daemon')
     return mount_cmd
 
 
+def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
+                         bucket_name: str, mount_path: str) -> str:
+    """Returns a command to mount a bucket using rclone with vfs cache."""
+    # stores bucket profile in rclone config file at the remote nodes.
+    configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
+                                f'mkdir -p {constants.RCLONE_CONFIG_DIR} && '
+                                f'echo {shlex.quote(rclone_config)} >> '
+                                f'{constants.RCLONE_CONFIG_PATH}')
+    # Assume mount path is unique. We use a hash of mount path as
+    # various filenames related to the mount.
+    # This is because the full path may be longer than
+    # the filename length limit.
+    # The hash is a non-negative integer in string form.
+    hashed_mount_path = hashlib.md5(mount_path.encode()).hexdigest()
+    log_file_path = os.path.join(constants.RCLONE_LOG_DIR,
+                                 f'{hashed_mount_path}.log')
+    create_log_cmd = (f'mkdir -p {constants.RCLONE_LOG_DIR} && '
+                      f'touch {log_file_path}')
+    # when mounting multiple directories with vfs cache mode, it's handled by
+    # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is
+    # not necessary to specify separate cache directories.
+    mount_cmd = (
+        f'{create_log_cmd} && '
+        f'{configure_rclone_profile} && '
+        'rclone mount '
+        f'{rclone_profile_name}:{bucket_name} {mount_path} '
+        # '--daemon' keeps the mounting process running in the background.
+        # fail in 10 seconds if mount cannot complete by then,
+        # which should be plenty of time.
+        '--daemon --daemon-wait 10 '
+        f'--log-file {log_file_path} --log-level INFO '
+        # '--dir-cache-time' sets how long directory listings are cached before
+        # rclone checks the remote storage for changes again. A shorter
+        # interval allows for faster detection of new or updated files on the
+        # remote, but increases the frequency of metadata lookups.
+        '--allow-other --vfs-cache-mode full --dir-cache-time 10s '
+        # '--transfers 1' guarantees the files written at the local mount point
+        # to be uploaded to the backend storage in the order of creation.
+        # '--vfs-cache-poll-interval' specifies the frequency of how often
+        # rclone checks the local mount point for stale objects in cache.
+        # '--vfs-write-back' defines the time to write files on remote storage
+        # after last use of the file in local mountpoint.
+        '--transfers 1 --vfs-cache-poll-interval 10s --vfs-write-back 1s '
+        # Have rclone evict files if the cache size exceeds 10G.
+        # This is to prevent cache from growing too large and
+        # using up all the disk space. Note that files that opened
+        # by a process is not evicted from the cache.
+        '--vfs-cache-max-size 10G '
+        # give each mount its own cache directory
+        f'--cache-dir {constants.RCLONE_CACHE_DIR}/{hashed_mount_path} '
+        # This command produces children processes, which need to be
+        # detached from the current process's terminal. The command doesn't
+        # produce any output, so we aren't dropping any logs.
+        '> /dev/null 2>&1')
+    return mount_cmd
+
+
 def get_rclone_install_cmd() -> str:
     """ RClone installation for both apt-get and rpm.
     This would be common command.
sky/data/storage.py
CHANGED
@@ -30,7 +30,6 @@ from sky.data import data_transfer
 from sky.data import data_utils
 from sky.data import mounting_utils
 from sky.data import storage_utils
-from sky.data.data_utils import Rclone
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import rich_utils
@@ -266,6 +265,15 @@ class StoreType(enum.Enum):
 class StorageMode(enum.Enum):
     MOUNT = 'MOUNT'
     COPY = 'COPY'
+    MOUNT_CACHED = 'MOUNT_CACHED'
+
+
+MOUNTABLE_STORAGE_MODES = [
+    StorageMode.MOUNT,
+    StorageMode.MOUNT_CACHED,
+]
+
+DEFAULT_STORAGE_MODE = StorageMode.MOUNT
 
 
 class AbstractStore:
@@ -451,13 +459,27 @@ class AbstractStore:
     def mount_command(self, mount_path: str) -> str:
         """Returns the command to mount the Store to the specified mount_path.
 
-        Includes the setup commands to
+        This command is used for MOUNT mode. Includes the setup commands to
+        install mounting tools.
 
         Args:
           mount_path: str; Mount path on remote server
         """
         raise NotImplementedError
 
+    def mount_cached_command(self, mount_path: str) -> str:
+        """Returns the command to mount the Store to the specified mount_path.
+
+        This command is used for MOUNT_CACHED mode. Includes the setup commands
+        to install mounting tools.
+
+        Args:
+          mount_path: str; Mount path on remote server
+        """
+        raise exceptions.NotSupportedError(
+            f'{StorageMode.MOUNT_CACHED.value} is '
+            f'not supported for {self.name}.')
+
     def __deepcopy__(self, memo):
         # S3 Client and GCS Client cannot be deep copied, hence the
         # original Store object is returned
@@ -571,7 +593,7 @@ class Storage(object):
                  source: Optional[SourceType] = None,
                  stores: Optional[List[StoreType]] = None,
                  persistent: Optional[bool] = True,
-                 mode: StorageMode =
+                 mode: StorageMode = DEFAULT_STORAGE_MODE,
                  sync_on_reconstruction: bool = True,
                  # pylint: disable=invalid-name
                  _is_sky_managed: Optional[bool] = None,
@@ -835,7 +857,7 @@ class Storage(object):
             is_local_source = False
             # Storage mounting does not support mounting specific files from
            # cloud store - ensure path points to only a directory
-            if mode
+            if mode in MOUNTABLE_STORAGE_MODES:
                 if (split_path.scheme != 'https' and
                         ((split_path.scheme != 'cos' and
                           split_path.path.strip('/') != '') or
@@ -1264,8 +1286,7 @@ class Storage(object):
             # Make mode case insensitive, if specified
             mode = StorageMode(mode_str.upper())
         else:
-
-            mode = StorageMode.MOUNT
+            mode = DEFAULT_STORAGE_MODE
         persistent = config.pop('persistent', None)
         if persistent is None:
             persistent = True
@@ -1725,6 +1746,17 @@ class S3Store(AbstractStore):
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cmd)
 
+    def mount_cached_command(self, mount_path: str) -> str:
+        install_cmd = mounting_utils.get_rclone_install_cmd()
+        rclone_profile_name = (
+            data_utils.Rclone.RcloneStores.S3.get_profile_name(self.name))
+        rclone_config = data_utils.Rclone.RcloneStores.S3.get_config(
+            rclone_profile_name=rclone_profile_name)
+        mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+            rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+        return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                   mount_cached_cmd)
+
     def _create_s3_bucket(self,
                           bucket_name: str,
                           region=_DEFAULT_REGION) -> StorageHandle:
@@ -2252,6 +2284,17 @@ class GcsStore(AbstractStore):
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cmd, version_check_cmd)
 
+    def mount_cached_command(self, mount_path: str) -> str:
+        install_cmd = mounting_utils.get_rclone_install_cmd()
+        rclone_profile_name = (
+            data_utils.Rclone.RcloneStores.GCS.get_profile_name(self.name))
+        rclone_config = data_utils.Rclone.RcloneStores.GCS.get_config(
+            rclone_profile_name=rclone_profile_name)
+        mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+            rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+        return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                   mount_cached_cmd)
+
     def _download_file(self, remote_path: str, local_path: str) -> None:
         """Downloads file from remote to local on GS bucket
 
@@ -3126,6 +3169,19 @@ class AzureBlobStore(AbstractStore):
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cmd)
 
+    def mount_cached_command(self, mount_path: str) -> str:
+        install_cmd = mounting_utils.get_rclone_install_cmd()
+        rclone_profile_name = (
+            data_utils.Rclone.RcloneStores.AZURE.get_profile_name(self.name))
+        rclone_config = data_utils.Rclone.RcloneStores.AZURE.get_config(
+            rclone_profile_name=rclone_profile_name,
+            storage_account_name=self.storage_account_name,
+            storage_account_key=self.storage_account_key)
+        mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+            rclone_config, rclone_profile_name, self.container_name, mount_path)
+        return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                   mount_cached_cmd)
+
     def _create_az_bucket(self, container_name: str) -> StorageHandle:
         """Creates AZ Container.
 
@@ -3562,6 +3618,17 @@ class R2Store(AbstractStore):
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cmd)
 
+    def mount_cached_command(self, mount_path: str) -> str:
+        install_cmd = mounting_utils.get_rclone_install_cmd()
+        rclone_profile_name = (
+            data_utils.Rclone.RcloneStores.R2.get_profile_name(self.name))
+        rclone_config = data_utils.Rclone.RcloneStores.R2.get_config(
+            rclone_profile_name=rclone_profile_name)
+        mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+            rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+        return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                   mount_cached_cmd)
+
     def _create_r2_bucket(self,
                           bucket_name: str,
                           region='auto') -> StorageHandle:
@@ -3681,11 +3748,10 @@ class IBMCosStore(AbstractStore):
                  _bucket_sub_path: Optional[str] = None):
         self.client: 'storage.Client'
         self.bucket: 'StorageHandle'
+        self.rclone_profile_name = (
+            data_utils.Rclone.RcloneStores.IBM.get_profile_name(self.name))
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
-        self.bucket_rclone_profile = \
-            Rclone.generate_rclone_bucket_profile_name(
-                self.name, Rclone.RcloneClouds.IBM)
 
     def _validate(self):
         if self.source is not None and isinstance(self.source, str):
@@ -3897,11 +3963,10 @@ class IBMCosStore(AbstractStore):
             # .git directory is excluded from the sync
             # wrapping src_dir_path with "" to support path with spaces
             src_dir_path = shlex.quote(src_dir_path)
-            sync_command = (
-
-
-
-                            f'/{dest_dir_name}')
+            sync_command = ('rclone copy --exclude ".git/*" '
+                            f'{src_dir_path} '
+                            f'{self.rclone_profile_name}:{self.name}{sub_path}'
+                            f'/{dest_dir_name}')
             return sync_command
 
         def get_file_sync_command(base_dir_path, file_names) -> str:
@@ -3927,10 +3992,9 @@ class IBMCosStore(AbstractStore):
                 for file_name in file_names
             ])
             base_dir_path = shlex.quote(base_dir_path)
-            sync_command = (
-
-
-                            f'{self.bucket_rclone_profile}:{self.name}{sub_path}')
+            sync_command = ('rclone copy '
+                            f'{includes} {base_dir_path} '
+                            f'{self.rclone_profile_name}:{self.name}{sub_path}')
             return sync_command
 
         # Generate message for upload
@@ -3976,7 +4040,8 @@ class IBMCosStore(AbstractStore):
          'sky storage delete' or 'sky start'
         """
 
-        bucket_profile_name = Rclone.
+        bucket_profile_name = (data_utils.Rclone.RcloneStores.IBM.value +
+                               self.name)
         try:
             bucket_region = data_utils.get_ibm_cos_bucket_region(self.name)
         except exceptions.StorageBucketGetError as e:
@@ -4011,9 +4076,9 @@ class IBMCosStore(AbstractStore):
                 '`rclone lsd <remote>` on relevant remotes returned '
                 'via `rclone listremotes` to debug.')
 
-        Rclone.store_rclone_config(
+        data_utils.Rclone.store_rclone_config(
             self.name,
-            Rclone.
+            data_utils.Rclone.RcloneStores.IBM,
             self.region,  # type: ignore
         )
 
@@ -4053,18 +4118,18 @@ class IBMCosStore(AbstractStore):
          mount_path: str; Path to mount the bucket to.
        """
        # install rclone if not installed.
-        install_cmd = mounting_utils.
-
-            self.
-
-
-
-
-
-
-
-
+        install_cmd = mounting_utils.get_rclone_install_cmd()
+        rclone_config = data_utils.Rclone.RcloneStores.IBM.get_config(
+            rclone_profile_name=self.rclone_profile_name,
+            region=self.region)  # type: ignore
+        mount_cmd = (
+            mounting_utils.get_cos_mount_cmd(
+                rclone_config,
+                self.rclone_profile_name,
+                self.bucket.name,
+                mount_path,
+                self._bucket_sub_path,  # type: ignore
+            ))
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cmd)
 
@@ -4128,7 +4193,8 @@ class IBMCosStore(AbstractStore):
         except ibm.ibm_botocore.exceptions.ClientError as e:
             if e.__class__.__name__ == 'NoSuchBucket':
                 logger.debug('bucket already removed')
-        Rclone.delete_rclone_bucket_profile(
+        data_utils.Rclone.delete_rclone_bucket_profile(
+            self.name, data_utils.Rclone.RcloneStores.IBM)
 
 
 class OciStore(AbstractStore):
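As a quick sanity check of the new surface area (a sketch, not SkyPilot documentation): the mode string from a task's storage config is parsed case-insensitively into the enum, MOUNT_CACHED counts as a mountable mode, and the default mode is still MOUNT.

from sky.data import storage as storage_lib

mode = storage_lib.StorageMode('mount_cached'.upper())
assert mode is storage_lib.StorageMode.MOUNT_CACHED
assert mode in storage_lib.MOUNTABLE_STORAGE_MODES
assert storage_lib.DEFAULT_STORAGE_MODE is storage_lib.StorageMode.MOUNT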
sky/data/storage_utils.py
CHANGED
@@ -223,7 +223,7 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
 def get_excluded_files(src_dir_path: str) -> List[str]:
     # TODO: this could return a huge list of files,
     # should think of ways to optimize.
-    """
+    """List files and directories to be excluded."""
     expand_src_dir_path = os.path.expanduser(src_dir_path)
     skyignore_path = os.path.join(expand_src_dir_path,
                                   constants.SKY_IGNORE_FILE)
@@ -273,12 +273,22 @@ def zip_files_and_folders(items: List[str],
                 zipf.write(item)
             elif os.path.isdir(item):
                 for root, dirs, files in os.walk(item, followlinks=False):
+                    # Modify dirs in-place to control os.walk()'s traversal
+                    # behavior. This filters out excluded directories BEFORE
+                    # os.walk() visits the files and sub-directories under
+                    # them, preventing traversal into any excluded directory
+                    # and its contents.
+                    # Note: dirs[:] = ... is required for in-place
+                    # modification.
+                    dirs[:] = [
+                        d for d in dirs
+                        if os.path.join(root, d) not in excluded_files
+                    ]
+
                     # Store directory entries (important for empty
                     # directories)
                     for dir_name in dirs:
                         dir_path = os.path.join(root, dir_name)
-                        if dir_path in excluded_files:
-                            continue
                         # If it's a symlink, store it as a symlink
                         if os.path.islink(dir_path):
                             _store_symlink(zipf, dir_path, is_dir=True)
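The in-place dirs[:] assignment is what makes os.walk() prune excluded trees instead of merely hiding their names. A small self-contained sketch of that idiom (names here are illustrative, not SkyPilot APIs):

import os

def walk_excluding(root_dir, excluded_paths):
    """Yield (root, dirs, files) while never descending into excluded dirs."""
    for root, dirs, files in os.walk(root_dir, followlinks=False):
        # Rebinding `dirs` would not affect traversal; slice assignment does,
        # because os.walk() keeps iterating over the same list object.
        dirs[:] = [
            d for d in dirs if os.path.join(root, d) not in excluded_paths
        ]
        yield root, dirs, files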
sky/global_user_state.py
CHANGED
@@ -186,6 +186,11 @@ def get_user(user_id: str) -> models.User:
     return models.User(id=row[0], name=row[1])
 
 
+def get_all_users() -> List[models.User]:
+    rows = _DB.cursor.execute('SELECT id, name FROM users').fetchall()
+    return [models.User(id=row[0], name=row[1]) for row in rows]
+
+
 def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
sky/jobs/controller.py
CHANGED
@@ -227,13 +227,13 @@ class JobsController:
                 self._backend, cluster_name)
 
             if job_status == job_lib.JobStatus.SUCCEEDED:
-
+                success_end_time = managed_job_utils.try_to_get_job_end_time(
                     self._backend, cluster_name)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
                                                 task_id,
-                                                end_time=
+                                                end_time=success_end_time,
                                                 callback_func=callback_func)
                 logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
@@ -299,23 +299,40 @@ class JobsController:
             if job_status is not None and not job_status.is_terminal():
                 # The multi-node job is still running, continue monitoring.
                 continue
-            elif job_status
+            elif (job_status
+                  in job_lib.JobStatus.user_code_failure_states() or
+                  job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
                     self._backend, cluster_name)
                 logger.info(
-                    'The user job failed. Please check the
+                    f'The user job failed ({job_status}). Please check the '
+                    'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
                 self._download_log_and_stream(task_id, handle)
+
+                failure_reason = (
+                    'To see the details, run: '
+                    f'sky jobs logs --controller {self._job_id}')
+
                 managed_job_status = (
                     managed_job_state.ManagedJobStatus.FAILED)
                 if job_status == job_lib.JobStatus.FAILED_SETUP:
                     managed_job_status = (
                         managed_job_state.ManagedJobStatus.FAILED_SETUP)
-
-
-
+                elif job_status == job_lib.JobStatus.FAILED_DRIVER:
+                    # FAILED_DRIVER is kind of an internal error, so we mark
+                    # this as FAILED_CONTROLLER, even though the failure is
+                    # not strictly within the controller.
+                    managed_job_status = (
+                        managed_job_state.ManagedJobStatus.FAILED_CONTROLLER
+                    )
+                    failure_reason = (
+                        'The job driver on the remote cluster failed. This '
+                        'can be caused by the job taking too much memory '
+                        'or other resources. Try adding more memory, CPU, '
+                        f'or disk in your job definition. {failure_reason}')
                 should_restart_on_failure = (
                     self._strategy_executor.should_restart_on_failure())
                 if should_restart_on_failure:
@@ -337,6 +354,21 @@ class JobsController:
                     end_time=end_time,
                     callback_func=callback_func)
                 return False
+            elif job_status is not None:
+                # Either the job is cancelled (should not happen) or in some
+                # unknown new state that we do not handle.
+                logger.error(f'Unknown job status: {job_status}')
+                failure_reason = (
+                    f'Unknown job status {job_status}. To see the details, '
+                    f'run: sky jobs logs --controller {self._job_id}')
+                managed_job_state.set_failed(
+                    self._job_id,
+                    task_id,
+                    failure_type=managed_job_state.ManagedJobStatus.
+                    FAILED_CONTROLLER,
+                    failure_reason=failure_reason,
+                    callback_func=callback_func)
+                return False
             else:
                 # Although the cluster is healthy, we fail to access the
                 # job status. Try to recover the job (will not restart the
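Read as a standalone mapping, the new failure handling amounts to the following sketch (illustrative only; the real controller also streams logs, may restart, and records a failure_reason, and the import aliases assume the controller's own imports of sky.jobs.state and sky.skylet.job_lib):

from sky.jobs import state as managed_job_state
from sky.skylet import job_lib

def classify_terminal_status(job_status):
    # Setup failures keep their dedicated managed status.
    if job_status == job_lib.JobStatus.FAILED_SETUP:
        return managed_job_state.ManagedJobStatus.FAILED_SETUP
    # Driver failures are now surfaced as controller-side failures.
    if job_status == job_lib.JobStatus.FAILED_DRIVER:
        return managed_job_state.ManagedJobStatus.FAILED_CONTROLLER
    # Ordinary user-code failures stay FAILED.
    if job_status in job_lib.JobStatus.user_code_failure_states():
        return managed_job_state.ManagedJobStatus.FAILED
    # Anything else terminal and unexpected is treated as a controller failure.
    return managed_job_state.ManagedJobStatus.FAILED_CONTROLLER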
sky/jobs/server/server.py
CHANGED
@@ -160,7 +160,7 @@ async def dashboard(request: fastapi.Request,
             async with httpx.AsyncClient() as client:
                 response = await client.request('GET',
                                                 dashboard_url,
-                                                timeout=
+                                                timeout=5)
                 break  # Connection successful, proceed with the request
         except Exception as e:  # pylint: disable=broad-except
             # We catch all exceptions to gracefully handle unknown
sky/models.py
CHANGED
@@ -2,7 +2,7 @@
 
 import collections
 import dataclasses
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
 
 
 @dataclasses.dataclass
@@ -12,6 +12,9 @@ class User:
     # Display name of the user
     name: Optional[str] = None
 
+    def to_dict(self) -> Dict[str, Any]:
+        return {'id': self.id, 'name': self.name}
+
 
 RealtimeGpuAvailability = collections.namedtuple(
     'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
sky/provision/kubernetes/utils.py
CHANGED
@@ -2457,6 +2457,43 @@ def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
     return kubernetes.api_client().deserialize(fake_kube_response, object_type)
 
 
+def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
+    """Gets a list of unlabeled GPU nodes in the cluster.
+
+    This function returns a list of nodes that have GPU resources but no label
+    that indicates the accelerator type.
+
+    Args:
+        context: The context to check.
+
+    Returns:
+        List[Any]: List of unlabeled nodes with accelerators.
+    """
+    nodes = get_kubernetes_nodes(context=context)
+    nodes_with_accelerator = []
+    for node in nodes:
+        if get_gpu_resource_key() in node.status.capacity:
+            nodes_with_accelerator.append(node)
+
+    label_formatter, _ = detect_gpu_label_formatter(context)
+    if not label_formatter:
+        return nodes_with_accelerator
+    else:
+        label_keys = label_formatter.get_label_keys()
+
+    unlabeled_nodes = []
+    for node in nodes_with_accelerator:
+        labeled = False
+        for label_key in label_keys:
+            if label_key in node.metadata.labels:
+                labeled = True
+                break
+        if not labeled:
+            unlabeled_nodes.append(node)
+
+    return unlabeled_nodes
+
+
 def get_kubernetes_node_info(
         context: Optional[str] = None) -> Dict[str, models.KubernetesNodeInfo]:
     """Gets the resource information for all the nodes in the cluster.
sky/server/requests/payloads.py
CHANGED
@@ -115,8 +115,8 @@ class RequestBody(pydantic.BaseModel):
 
 class CheckBody(RequestBody):
     """The request body for the check endpoint."""
-    clouds: Optional[Tuple[str, ...]]
-    verbose: bool
+    clouds: Optional[Tuple[str, ...]] = None
+    verbose: bool = False
 
 
 class DagRequestBody(RequestBody):
@@ -340,8 +340,8 @@ class JobsQueueBody(RequestBody):
 
 class JobsCancelBody(RequestBody):
     """The request body for the jobs cancel endpoint."""
-    name: Optional[str]
-    job_ids: Optional[List[int]]
+    name: Optional[str] = None
+    job_ids: Optional[List[int]] = None
     all: bool = False
     all_users: bool = False
 
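The practical effect of the new defaults (a sketch, assuming the inherited RequestBody fields also carry defaults): request bodies that omit these fields now validate instead of raising a pydantic validation error.

from sky.server.requests import payloads

body = payloads.CheckBody()                  # clouds=None, verbose=False
cancel = payloads.JobsCancelBody(all=True)   # name and job_ids default to None
print(body.clouds, body.verbose, cancel.name, cancel.job_ids)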
sky/server/server.py
CHANGED
@@ -12,7 +12,7 @@ import pathlib
 import re
 import shutil
 import sys
-from typing import Dict, List, Literal, Optional, Set, Tuple
+from typing import Any, Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
 
@@ -675,6 +675,13 @@ async def logs(
     )
 
 
+@app.get('/users')
+async def users() -> List[Dict[str, Any]]:
+    """Gets all users."""
+    user_list = global_user_state.get_all_users()
+    return [user.to_dict() for user in user_list]
+
+
 @app.post('/download_logs')
 async def download_logs(
     request: fastapi.Request,