skypilot-nightly 1.0.0.dev20250401__py3-none-any.whl → 1.0.0.dev20250403__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/data/mounting_utils.py CHANGED
@@ -1,10 +1,13 @@
  """Helper functions for object store mounting in Sky Storage"""
+ import hashlib
+ import os
  import random
  import shlex
  import textwrap
  from typing import Optional

  from sky import exceptions
+ from sky.skylet import constants
  from sky.utils import command_runner

  # Values used to construct mounting commands
@@ -14,11 +17,17 @@ _TYPE_CACHE_TTL = '5s'
  _RENAME_DIR_LIMIT = 10000
  # https://github.com/GoogleCloudPlatform/gcsfuse/releases
  GCSFUSE_VERSION = '2.2.0'
+ # Creates a fusermount3 soft link on older (<22) Ubuntu systems to utilize
+ # Rclone's mounting utility.
+ FUSERMOUNT3_SOFT_LINK_CMD = ('[ ! -f /bin/fusermount3 ] && '
+                              'sudo ln -s /bin/fusermount /bin/fusermount3 || '
+                              'true')
  # https://github.com/Azure/azure-storage-fuse/releases
  BLOBFUSE2_VERSION = '2.2.0'
  _BLOBFUSE_CACHE_ROOT_DIR = '~/.sky/blobfuse2_cache'
  _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
                         '{storage_account_name}_{container_name}')
+ # https://github.com/rclone/rclone/releases
  RCLONE_VERSION = 'v1.68.2'


@@ -112,7 +121,12 @@ def get_az_mount_install_cmd() -> str:
                   'sudo apt-get update; '
                   'sudo apt-get install -y '
                   '-o Dpkg::Options::="--force-confdef" '
-                  'fuse3 libfuse3-dev && '
+                  'fuse3 libfuse3-dev || { '
+                  '  echo "fuse3 not available, falling back to fuse"; '
+                  '  sudo apt-get install -y '
+                  '  -o Dpkg::Options::="--force-confdef" '
+                  '  fuse libfuse-dev; '
+                  '} && '
                   'ARCH=$(uname -m) && '
                   'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
                   '  echo "blobfuse2 is not supported on $ARCH" && '
@@ -203,31 +217,17 @@ def get_r2_mount_cmd(r2_credentials_path: str,
      return mount_cmd


- def get_cos_mount_install_cmd() -> str:
-     """Returns a command to install IBM COS mount utility rclone."""
-     install_cmd = ('rclone version >/dev/null 2>&1 || '
-                    '(curl https://rclone.org/install.sh | '
-                    'sudo bash)')
-     return install_cmd
-
-
- def get_cos_mount_cmd(rclone_config_data: str,
-                       rclone_config_path: str,
-                       bucket_rclone_profile: str,
+ def get_cos_mount_cmd(rclone_config: str,
+                       rclone_profile_name: str,
                        bucket_name: str,
                        mount_path: str,
                        _bucket_sub_path: Optional[str] = None) -> str:
      """Returns a command to mount an IBM COS bucket using rclone."""
-     # creates a fusermount soft link on older (<22) Ubuntu systems for
-     # rclone's mount utility.
-     set_fuser3_soft_link = ('[ ! -f /bin/fusermount3 ] && '
-                             'sudo ln -s /bin/fusermount /bin/fusermount3 || '
-                             'true')
      # stores bucket profile in rclone config file at the cluster's nodes.
-     configure_rclone_profile = (f'{set_fuser3_soft_link}; '
-                                 'mkdir -p ~/.config/rclone/ && '
-                                 f'echo "{rclone_config_data}" >> '
-                                 f'{rclone_config_path}')
+     configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
+                                 f'mkdir -p {constants.RCLONE_CONFIG_DIR} && '
+                                 f'echo "{rclone_config}" >> '
+                                 f'{constants.RCLONE_CONFIG_PATH}')
      if _bucket_sub_path is None:
          sub_path_arg = f'{bucket_name}/{_bucket_sub_path}'
      else:
@@ -235,11 +235,68 @@ def get_cos_mount_cmd(rclone_config_data: str,
      # --daemon will keep the mounting process running in the background.
      mount_cmd = (f'{configure_rclone_profile} && '
                   'rclone mount '
-                  f'{bucket_rclone_profile}:{sub_path_arg} {mount_path} '
+                  f'{rclone_profile_name}:{sub_path_arg} {mount_path} '
                   '--daemon')
      return mount_cmd


+ def get_mount_cached_cmd(rclone_config: str, rclone_profile_name: str,
+                          bucket_name: str, mount_path: str) -> str:
+     """Returns a command to mount a bucket using rclone with vfs cache."""
+     # stores bucket profile in rclone config file at the remote nodes.
+     configure_rclone_profile = (f'{FUSERMOUNT3_SOFT_LINK_CMD}; '
+                                 f'mkdir -p {constants.RCLONE_CONFIG_DIR} && '
+                                 f'echo {shlex.quote(rclone_config)} >> '
+                                 f'{constants.RCLONE_CONFIG_PATH}')
+     # Assume mount path is unique. We use a hash of mount path as
+     # various filenames related to the mount.
+     # This is because the full path may be longer than
+     # the filename length limit.
+     # The hash is a non-negative integer in string form.
+     hashed_mount_path = hashlib.md5(mount_path.encode()).hexdigest()
+     log_file_path = os.path.join(constants.RCLONE_LOG_DIR,
+                                  f'{hashed_mount_path}.log')
+     create_log_cmd = (f'mkdir -p {constants.RCLONE_LOG_DIR} && '
+                       f'touch {log_file_path}')
+     # when mounting multiple directories with vfs cache mode, it's handled by
+     # rclone to create separate cache directories at ~/.cache/rclone/vfs. It is
+     # not necessary to specify separate cache directories.
+     mount_cmd = (
+         f'{create_log_cmd} && '
+         f'{configure_rclone_profile} && '
+         'rclone mount '
+         f'{rclone_profile_name}:{bucket_name} {mount_path} '
+         # '--daemon' keeps the mounting process running in the background.
+         # fail in 10 seconds if mount cannot complete by then,
+         # which should be plenty of time.
+         '--daemon --daemon-wait 10 '
+         f'--log-file {log_file_path} --log-level INFO '
+         # '--dir-cache-time' sets how long directory listings are cached before
+         # rclone checks the remote storage for changes again. A shorter
+         # interval allows for faster detection of new or updated files on the
+         # remote, but increases the frequency of metadata lookups.
+         '--allow-other --vfs-cache-mode full --dir-cache-time 10s '
+         # '--transfers 1' guarantees the files written at the local mount point
+         # to be uploaded to the backend storage in the order of creation.
+         # '--vfs-cache-poll-interval' specifies the frequency of how often
+         # rclone checks the local mount point for stale objects in cache.
+         # '--vfs-write-back' defines the time to write files on remote storage
+         # after last use of the file in local mountpoint.
+         '--transfers 1 --vfs-cache-poll-interval 10s --vfs-write-back 1s '
+         # Have rclone evict files if the cache size exceeds 10G.
+         # This is to prevent cache from growing too large and
+         # using up all the disk space. Note that files that opened
+         # by a process is not evicted from the cache.
+         '--vfs-cache-max-size 10G '
+         # give each mount its own cache directory
+         f'--cache-dir {constants.RCLONE_CACHE_DIR}/{hashed_mount_path} '
+         # This command produces children processes, which need to be
+         # detached from the current process's terminal. The command doesn't
+         # produce any output, so we aren't dropping any logs.
+         '> /dev/null 2>&1')
+     return mount_cmd
+
+
  def get_rclone_install_cmd() -> str:
      """ RClone installation for both apt-get and rpm.
      This would be common command.
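
Illustrative sketch (not part of the package diff): how the new and existing mounting_utils helpers are expected to compose, based on the call sites added in sky/data/storage.py below. The profile name, config text, bucket name, and mount path are hypothetical placeholders; real values come from data_utils.Rclone.RcloneStores.*.get_config() and get_profile_name().

# Illustrative only; all literal values below are placeholders.
from sky.data import mounting_utils

rclone_profile_name = 'sky-demo-profile'            # placeholder name
rclone_config = '[sky-demo-profile]\ntype = s3\n'   # placeholder config text

install_cmd = mounting_utils.get_rclone_install_cmd()
mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
    rclone_config, rclone_profile_name, 'my-bucket', '/data')
full_cmd = mounting_utils.get_mounting_command('/data', install_cmd,
                                               mount_cached_cmd)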
sky/data/storage.py CHANGED
@@ -30,7 +30,6 @@ from sky.data import data_transfer
  from sky.data import data_utils
  from sky.data import mounting_utils
  from sky.data import storage_utils
- from sky.data.data_utils import Rclone
  from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import rich_utils
@@ -266,6 +265,15 @@ class StoreType(enum.Enum):
  class StorageMode(enum.Enum):
      MOUNT = 'MOUNT'
      COPY = 'COPY'
+     MOUNT_CACHED = 'MOUNT_CACHED'
+
+
+ MOUNTABLE_STORAGE_MODES = [
+     StorageMode.MOUNT,
+     StorageMode.MOUNT_CACHED,
+ ]
+
+ DEFAULT_STORAGE_MODE = StorageMode.MOUNT


  class AbstractStore:
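
Illustrative sketch (not part of the package diff): selecting the new mode when constructing a Storage object. The bucket name is a hypothetical placeholder, and MOUNT_CACHED only works for store types that implement mount_cached_command() (S3/GCS/Azure/R2 per the hunks below). Given the case-insensitive StorageMode(mode_str.upper()) parsing further below, a YAML `mode: MOUNT_CACHED` would presumably map to the same enum value.

# Illustrative only; the bucket name is a placeholder.
from sky.data import storage as storage_lib

store = storage_lib.Storage(name='my-bucket',
                            mode=storage_lib.StorageMode.MOUNT_CACHED)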
@@ -451,13 +459,27 @@ class AbstractStore:
      def mount_command(self, mount_path: str) -> str:
          """Returns the command to mount the Store to the specified mount_path.

-         Includes the setup commands to install mounting tools.
+         This command is used for MOUNT mode. Includes the setup commands to
+         install mounting tools.

          Args:
            mount_path: str; Mount path on remote server
          """
          raise NotImplementedError

+     def mount_cached_command(self, mount_path: str) -> str:
+         """Returns the command to mount the Store to the specified mount_path.
+
+         This command is used for MOUNT_CACHED mode. Includes the setup commands
+         to install mounting tools.
+
+         Args:
+           mount_path: str; Mount path on remote server
+         """
+         raise exceptions.NotSupportedError(
+             f'{StorageMode.MOUNT_CACHED.value} is '
+             f'not supported for {self.name}.')
+
      def __deepcopy__(self, memo):
          # S3 Client and GCS Client cannot be deep copied, hence the
          # original Store object is returned
@@ -571,7 +593,7 @@ class Storage(object):
                   source: Optional[SourceType] = None,
                   stores: Optional[List[StoreType]] = None,
                   persistent: Optional[bool] = True,
-                  mode: StorageMode = StorageMode.MOUNT,
+                  mode: StorageMode = DEFAULT_STORAGE_MODE,
                   sync_on_reconstruction: bool = True,
                   # pylint: disable=invalid-name
                   _is_sky_managed: Optional[bool] = None,
@@ -835,7 +857,7 @@ class Storage(object):
                  is_local_source = False
              # Storage mounting does not support mounting specific files from
              # cloud store - ensure path points to only a directory
-             if mode == StorageMode.MOUNT:
+             if mode in MOUNTABLE_STORAGE_MODES:
                  if (split_path.scheme != 'https' and
                          ((split_path.scheme != 'cos' and
                            split_path.path.strip('/') != '') or
@@ -1264,8 +1286,7 @@ class Storage(object):
              # Make mode case insensitive, if specified
              mode = StorageMode(mode_str.upper())
          else:
-             # Make sure this keeps the same as the default mode in __init__
-             mode = StorageMode.MOUNT
+             mode = DEFAULT_STORAGE_MODE
          persistent = config.pop('persistent', None)
          if persistent is None:
              persistent = True
@@ -1725,6 +1746,17 @@ class S3Store(AbstractStore):
          return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                     mount_cmd)

+     def mount_cached_command(self, mount_path: str) -> str:
+         install_cmd = mounting_utils.get_rclone_install_cmd()
+         rclone_profile_name = (
+             data_utils.Rclone.RcloneStores.S3.get_profile_name(self.name))
+         rclone_config = data_utils.Rclone.RcloneStores.S3.get_config(
+             rclone_profile_name=rclone_profile_name)
+         mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+             rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+         return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                    mount_cached_cmd)
+
      def _create_s3_bucket(self,
                            bucket_name: str,
                            region=_DEFAULT_REGION) -> StorageHandle:
@@ -2252,6 +2284,17 @@ class GcsStore(AbstractStore):
          return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                     mount_cmd, version_check_cmd)

+     def mount_cached_command(self, mount_path: str) -> str:
+         install_cmd = mounting_utils.get_rclone_install_cmd()
+         rclone_profile_name = (
+             data_utils.Rclone.RcloneStores.GCS.get_profile_name(self.name))
+         rclone_config = data_utils.Rclone.RcloneStores.GCS.get_config(
+             rclone_profile_name=rclone_profile_name)
+         mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+             rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+         return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                    mount_cached_cmd)
+
      def _download_file(self, remote_path: str, local_path: str) -> None:
          """Downloads file from remote to local on GS bucket

@@ -3126,6 +3169,19 @@ class AzureBlobStore(AbstractStore):
          return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                     mount_cmd)

+     def mount_cached_command(self, mount_path: str) -> str:
+         install_cmd = mounting_utils.get_rclone_install_cmd()
+         rclone_profile_name = (
+             data_utils.Rclone.RcloneStores.AZURE.get_profile_name(self.name))
+         rclone_config = data_utils.Rclone.RcloneStores.AZURE.get_config(
+             rclone_profile_name=rclone_profile_name,
+             storage_account_name=self.storage_account_name,
+             storage_account_key=self.storage_account_key)
+         mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+             rclone_config, rclone_profile_name, self.container_name, mount_path)
+         return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                    mount_cached_cmd)
+
      def _create_az_bucket(self, container_name: str) -> StorageHandle:
          """Creates AZ Container.

@@ -3562,6 +3618,17 @@ class R2Store(AbstractStore):
          return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                     mount_cmd)

+     def mount_cached_command(self, mount_path: str) -> str:
+         install_cmd = mounting_utils.get_rclone_install_cmd()
+         rclone_profile_name = (
+             data_utils.Rclone.RcloneStores.R2.get_profile_name(self.name))
+         rclone_config = data_utils.Rclone.RcloneStores.R2.get_config(
+             rclone_profile_name=rclone_profile_name)
+         mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+             rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+         return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                    mount_cached_cmd)
+
      def _create_r2_bucket(self,
                            bucket_name: str,
                            region='auto') -> StorageHandle:
@@ -3681,11 +3748,10 @@ class IBMCosStore(AbstractStore):
                   _bucket_sub_path: Optional[str] = None):
          self.client: 'storage.Client'
          self.bucket: 'StorageHandle'
+         self.rclone_profile_name = (
+             data_utils.Rclone.RcloneStores.IBM.get_profile_name(self.name))
          super().__init__(name, source, region, is_sky_managed,
                           sync_on_reconstruction, _bucket_sub_path)
-         self.bucket_rclone_profile = \
-             Rclone.generate_rclone_bucket_profile_name(
-                 self.name, Rclone.RcloneClouds.IBM)

      def _validate(self):
          if self.source is not None and isinstance(self.source, str):
@@ -3897,11 +3963,10 @@ class IBMCosStore(AbstractStore):
              # .git directory is excluded from the sync
              # wrapping src_dir_path with "" to support path with spaces
              src_dir_path = shlex.quote(src_dir_path)
-             sync_command = (
-                 'rclone copy --exclude ".git/*" '
-                 f'{src_dir_path} '
-                 f'{self.bucket_rclone_profile}:{self.name}{sub_path}'
-                 f'/{dest_dir_name}')
+             sync_command = ('rclone copy --exclude ".git/*" '
+                             f'{src_dir_path} '
+                             f'{self.rclone_profile_name}:{self.name}{sub_path}'
+                             f'/{dest_dir_name}')
              return sync_command

          def get_file_sync_command(base_dir_path, file_names) -> str:
@@ -3927,10 +3992,9 @@ class IBMCosStore(AbstractStore):
                  for file_name in file_names
              ])
              base_dir_path = shlex.quote(base_dir_path)
-             sync_command = (
-                 'rclone copy '
-                 f'{includes} {base_dir_path} '
-                 f'{self.bucket_rclone_profile}:{self.name}{sub_path}')
+             sync_command = ('rclone copy '
+                             f'{includes} {base_dir_path} '
+                             f'{self.rclone_profile_name}:{self.name}{sub_path}')
              return sync_command

          # Generate message for upload
@@ -3976,7 +4040,8 @@ class IBMCosStore(AbstractStore):
            'sky storage delete' or 'sky start'
          """

-         bucket_profile_name = Rclone.RcloneClouds.IBM.value + self.name
+         bucket_profile_name = (data_utils.Rclone.RcloneStores.IBM.value +
+                                self.name)
          try:
              bucket_region = data_utils.get_ibm_cos_bucket_region(self.name)
          except exceptions.StorageBucketGetError as e:
@@ -4011,9 +4076,9 @@ class IBMCosStore(AbstractStore):
                  '`rclone lsd <remote>` on relevant remotes returned '
                  'via `rclone listremotes` to debug.')

-         Rclone.store_rclone_config(
+         data_utils.Rclone.store_rclone_config(
              self.name,
-             Rclone.RcloneClouds.IBM,
+             data_utils.Rclone.RcloneStores.IBM,
              self.region,  # type: ignore
          )

@@ -4053,18 +4118,18 @@ class IBMCosStore(AbstractStore):
            mount_path: str; Path to mount the bucket to.
          """
          # install rclone if not installed.
-         install_cmd = mounting_utils.get_cos_mount_install_cmd()
-         rclone_config_data = Rclone.get_rclone_config(
-             self.bucket.name,
-             Rclone.RcloneClouds.IBM,
-             self.region,  # type: ignore
-         )
-         mount_cmd = mounting_utils.get_cos_mount_cmd(rclone_config_data,
-                                                      Rclone.RCLONE_CONFIG_PATH,
-                                                      self.bucket_rclone_profile,
-                                                      self.bucket.name,
-                                                      mount_path,
-                                                      self._bucket_sub_path)
+         install_cmd = mounting_utils.get_rclone_install_cmd()
+         rclone_config = data_utils.Rclone.RcloneStores.IBM.get_config(
+             rclone_profile_name=self.rclone_profile_name,
+             region=self.region)  # type: ignore
+         mount_cmd = (
+             mounting_utils.get_cos_mount_cmd(
+                 rclone_config,
+                 self.rclone_profile_name,
+                 self.bucket.name,
+                 mount_path,
+                 self._bucket_sub_path,  # type: ignore
+             ))
          return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                     mount_cmd)

@@ -4128,7 +4193,8 @@ class IBMCosStore(AbstractStore):
          except ibm.ibm_botocore.exceptions.ClientError as e:
              if e.__class__.__name__ == 'NoSuchBucket':
                  logger.debug('bucket already removed')
-         Rclone.delete_rclone_bucket_profile(self.name, Rclone.RcloneClouds.IBM)
+         data_utils.Rclone.delete_rclone_bucket_profile(
+             self.name, data_utils.Rclone.RcloneStores.IBM)


  class OciStore(AbstractStore):
sky/data/storage_utils.py CHANGED
@@ -223,7 +223,7 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
  def get_excluded_files(src_dir_path: str) -> List[str]:
      # TODO: this could return a huge list of files,
      # should think of ways to optimize.
-     """ List files and directories to be excluded."""
+     """List files and directories to be excluded."""
      expand_src_dir_path = os.path.expanduser(src_dir_path)
      skyignore_path = os.path.join(expand_src_dir_path,
                                    constants.SKY_IGNORE_FILE)
@@ -273,12 +273,22 @@ def zip_files_and_folders(items: List[str],
                      zipf.write(item)
                  elif os.path.isdir(item):
                      for root, dirs, files in os.walk(item, followlinks=False):
+                         # Modify dirs in-place to control os.walk()'s traversal
+                         # behavior. This filters out excluded directories BEFORE
+                         # os.walk() visits the files and sub-directories under
+                         # them, preventing traversal into any excluded directory
+                         # and its contents.
+                         # Note: dirs[:] = ... is required for in-place
+                         # modification.
+                         dirs[:] = [
+                             d for d in dirs
+                             if os.path.join(root, d) not in excluded_files
+                         ]
+
                          # Store directory entries (important for empty
                          # directories)
                          for dir_name in dirs:
                              dir_path = os.path.join(root, dir_name)
-                             if dir_path in excluded_files:
-                                 continue
                              # If it's a symlink, store it as a symlink
                              if os.path.islink(dir_path):
                                  _store_symlink(zipf, dir_path, is_dir=True)
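
Standalone sketch (not part of the package diff) of the os.walk() pruning idiom used above; the paths are hypothetical. Rebinding the list (dirs = [...]) would not affect traversal; only in-place mutation via dirs[:] does.

# Illustrative only; '/tmp/project' and the exclusion set are placeholders.
import os

excluded = {'/tmp/project/.git'}
for root, dirs, files in os.walk('/tmp/project', followlinks=False):
    # Prune in place: os.walk() will not descend into the removed entries.
    dirs[:] = [d for d in dirs if os.path.join(root, d) not in excluded]
    for name in files:
        print(os.path.join(root, name))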
sky/global_user_state.py CHANGED
@@ -186,6 +186,11 @@ def get_user(user_id: str) -> models.User:
      return models.User(id=row[0], name=row[1])


+ def get_all_users() -> List[models.User]:
+     rows = _DB.cursor.execute('SELECT id, name FROM users').fetchall()
+     return [models.User(id=row[0], name=row[1]) for row in rows]
+
+
  def add_or_update_cluster(cluster_name: str,
                            cluster_handle: 'backends.ResourceHandle',
                            requested_resources: Optional[Set[Any]],
sky/jobs/controller.py CHANGED
@@ -227,13 +227,13 @@ class JobsController:
                      self._backend, cluster_name)

                  if job_status == job_lib.JobStatus.SUCCEEDED:
-                     end_time = managed_job_utils.try_to_get_job_end_time(
+                     success_end_time = managed_job_utils.try_to_get_job_end_time(
                          self._backend, cluster_name)
                      # The job is done. Set the job to SUCCEEDED first before start
                      # downloading and streaming the logs to make it more responsive.
                      managed_job_state.set_succeeded(self._job_id,
                                                      task_id,
-                                                     end_time=end_time,
+                                                     end_time=success_end_time,
                                                      callback_func=callback_func)
                      logger.info(
                          f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
@@ -299,23 +299,40 @@ class JobsController:
                  if job_status is not None and not job_status.is_terminal():
                      # The multi-node job is still running, continue monitoring.
                      continue
-                 elif job_status in job_lib.JobStatus.user_code_failure_states():
+                 elif (job_status
+                       in job_lib.JobStatus.user_code_failure_states() or
+                       job_status == job_lib.JobStatus.FAILED_DRIVER):
                      # The user code has probably crashed, fail immediately.
                      end_time = managed_job_utils.try_to_get_job_end_time(
                          self._backend, cluster_name)
                      logger.info(
-                         'The user job failed. Please check the logs below.\n'
+                         f'The user job failed ({job_status}). Please check the '
+                         'logs below.\n'
                          f'== Logs of the user job (ID: {self._job_id}) ==\n')

                      self._download_log_and_stream(task_id, handle)
+
+                     failure_reason = (
+                         'To see the details, run: '
+                         f'sky jobs logs --controller {self._job_id}')
+
                      managed_job_status = (
                          managed_job_state.ManagedJobStatus.FAILED)
                      if job_status == job_lib.JobStatus.FAILED_SETUP:
                          managed_job_status = (
                              managed_job_state.ManagedJobStatus.FAILED_SETUP)
-                     failure_reason = (
-                         'To see the details, run: '
-                         f'sky jobs logs --controller {self._job_id}')
+                     elif job_status == job_lib.JobStatus.FAILED_DRIVER:
+                         # FAILED_DRIVER is kind of an internal error, so we mark
+                         # this as FAILED_CONTROLLER, even though the failure is
+                         # not strictly within the controller.
+                         managed_job_status = (
+                             managed_job_state.ManagedJobStatus.FAILED_CONTROLLER
+                         )
+                         failure_reason = (
+                             'The job driver on the remote cluster failed. This '
+                             'can be caused by the job taking too much memory '
+                             'or other resources. Try adding more memory, CPU, '
+                             f'or disk in your job definition. {failure_reason}')
                      should_restart_on_failure = (
                          self._strategy_executor.should_restart_on_failure())
                      if should_restart_on_failure:
@@ -337,6 +354,21 @@ class JobsController:
                              end_time=end_time,
                              callback_func=callback_func)
                          return False
+                 elif job_status is not None:
+                     # Either the job is cancelled (should not happen) or in some
+                     # unknown new state that we do not handle.
+                     logger.error(f'Unknown job status: {job_status}')
+                     failure_reason = (
+                         f'Unknown job status {job_status}. To see the details, '
+                         f'run: sky jobs logs --controller {self._job_id}')
+                     managed_job_state.set_failed(
+                         self._job_id,
+                         task_id,
+                         failure_type=managed_job_state.ManagedJobStatus.
+                         FAILED_CONTROLLER,
+                         failure_reason=failure_reason,
+                         callback_func=callback_func)
+                     return False
                  else:
                      # Although the cluster is healthy, we fail to access the
                      # job status. Try to recover the job (will not restart the
sky/jobs/server/server.py CHANGED
@@ -160,7 +160,7 @@ async def dashboard(request: fastapi.Request,
              async with httpx.AsyncClient() as client:
                  response = await client.request('GET',
                                                  dashboard_url,
-                                                 timeout=1)
+                                                 timeout=5)
                  break  # Connection successful, proceed with the request
          except Exception as e:  # pylint: disable=broad-except
              # We catch all exceptions to gracefully handle unknown
sky/models.py CHANGED
@@ -2,7 +2,7 @@

  import collections
  import dataclasses
- from typing import Dict, Optional
+ from typing import Any, Dict, Optional


  @dataclasses.dataclass
@@ -12,6 +12,9 @@ class User:
      # Display name of the user
      name: Optional[str] = None

+     def to_dict(self) -> Dict[str, Any]:
+         return {'id': self.id, 'name': self.name}
+

  RealtimeGpuAvailability = collections.namedtuple(
      'RealtimeGpuAvailability', ['gpu', 'counts', 'capacity', 'available'])
sky/provision/kubernetes/utils.py CHANGED
@@ -2457,6 +2457,43 @@ def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
      return kubernetes.api_client().deserialize(fake_kube_response, object_type)


+ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
+     """Gets a list of unlabeled GPU nodes in the cluster.
+
+     This function returns a list of nodes that have GPU resources but no label
+     that indicates the accelerator type.
+
+     Args:
+         context: The context to check.
+
+     Returns:
+         List[Any]: List of unlabeled nodes with accelerators.
+     """
+     nodes = get_kubernetes_nodes(context=context)
+     nodes_with_accelerator = []
+     for node in nodes:
+         if get_gpu_resource_key() in node.status.capacity:
+             nodes_with_accelerator.append(node)
+
+     label_formatter, _ = detect_gpu_label_formatter(context)
+     if not label_formatter:
+         return nodes_with_accelerator
+     else:
+         label_keys = label_formatter.get_label_keys()
+
+     unlabeled_nodes = []
+     for node in nodes_with_accelerator:
+         labeled = False
+         for label_key in label_keys:
+             if label_key in node.metadata.labels:
+                 labeled = True
+                 break
+         if not labeled:
+             unlabeled_nodes.append(node)
+
+     return unlabeled_nodes
+
+
  def get_kubernetes_node_info(
          context: Optional[str] = None) -> Dict[str, models.KubernetesNodeInfo]:
      """Gets the resource information for all the nodes in the cluster.
sky/server/requests/payloads.py CHANGED
@@ -115,8 +115,8 @@ class RequestBody(pydantic.BaseModel):

  class CheckBody(RequestBody):
      """The request body for the check endpoint."""
-     clouds: Optional[Tuple[str, ...]]
-     verbose: bool
+     clouds: Optional[Tuple[str, ...]] = None
+     verbose: bool = False


  class DagRequestBody(RequestBody):
@@ -340,8 +340,8 @@ class JobsQueueBody(RequestBody):

  class JobsCancelBody(RequestBody):
      """The request body for the jobs cancel endpoint."""
-     name: Optional[str]
-     job_ids: Optional[List[int]]
+     name: Optional[str] = None
+     job_ids: Optional[List[int]] = None
      all: bool = False
      all_users: bool = False

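Illustrative sketch (not part of the package diff) of why these defaults matter, assuming pydantic v2 semantics: an Optional annotation alone does not make a field optional to supply, so without the `= None` / `= False` defaults, constructing the body without those fields raises a validation error. Simplified stand-in model below.

# Illustrative only; a simplified stand-in for the request body classes.
from typing import List, Optional

import pydantic


class JobsCancelBodyDemo(pydantic.BaseModel):
    name: Optional[str] = None
    job_ids: Optional[List[int]] = None
    all: bool = False
    all_users: bool = False


JobsCancelBodyDemo()  # valid; without the defaults this would raise
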
sky/server/server.py CHANGED
@@ -12,7 +12,7 @@ import pathlib
  import re
  import shutil
  import sys
- from typing import Dict, List, Literal, Optional, Set, Tuple
+ from typing import Any, Dict, List, Literal, Optional, Set, Tuple
  import uuid
  import zipfile

@@ -675,6 +675,13 @@ async def logs(
      )


+ @app.get('/users')
+ async def users() -> List[Dict[str, Any]]:
+     """Gets all users."""
+     user_list = global_user_state.get_all_users()
+     return [user.to_dict() for user in user_list]
+
+
  @app.post('/download_logs')
  async def download_logs(
      request: fastapi.Request,
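
Illustrative sketch (not part of the package diff) of consuming the new endpoint; the server URL and port are hypothetical placeholders, and the response items follow User.to_dict() above.

# Illustrative only; URL and port are placeholders.
import httpx

resp = httpx.get('http://127.0.0.1:46580/users')
resp.raise_for_status()
for user in resp.json():  # each item: {'id': ..., 'name': ...}
    print(user['id'], user['name'])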