skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@ import hashlib
4
4
  import os
5
5
  import pathlib
6
6
  import shlex
7
+ import sys
7
8
  import time
8
9
  from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
9
10
 
@@ -231,9 +232,9 @@ class CommandRunner:
231
232
  self,
232
233
  source: str,
233
234
  target: str,
234
- node_destination: str,
235
+ node_destination: Optional[str],
235
236
  up: bool,
236
- rsh_option: str,
237
+ rsh_option: Optional[str],
237
238
  # Advanced options.
238
239
  log_path: str = os.devnull,
239
240
  stream_logs: bool = True,
@@ -283,28 +284,43 @@ class CommandRunner:
283
284
  RSYNC_EXCLUDE_OPTION.format(
284
285
  shlex.quote(str(resolved_source / GIT_EXCLUDE))))
285
286
 
286
- rsync_command.append(f'-e {shlex.quote(rsh_option)}')
287
+ if rsh_option is not None:
288
+ rsync_command.append(f'-e {shlex.quote(rsh_option)}')
289
+ maybe_dest_prefix = ('' if node_destination is None else
290
+ f'{node_destination}:')
287
291
 
288
292
  if up:
289
293
  resolved_target = target
290
- if target.startswith('~'):
291
- remote_home_dir = _get_remote_home_dir_with_retry()
292
- resolved_target = target.replace('~', remote_home_dir)
294
+ if node_destination is None:
295
+ # Is a local rsync. Directly resolve the target.
296
+ resolved_target = str(
297
+ pathlib.Path(target).expanduser().resolve())
298
+ else:
299
+ if target.startswith('~'):
300
+ remote_home_dir = _get_remote_home_dir_with_retry()
301
+ resolved_target = target.replace('~', remote_home_dir)
293
302
  full_source_str = str(resolved_source)
294
303
  if resolved_source.is_dir():
295
304
  full_source_str = os.path.join(full_source_str, '')
296
305
  rsync_command.extend([
297
306
  f'{full_source_str!r}',
298
- f'{node_destination}:{resolved_target!r}',
307
+ f'{maybe_dest_prefix}{resolved_target!r}',
299
308
  ])
300
309
  else:
301
310
  resolved_source = source
302
- if source.startswith('~'):
303
- remote_home_dir = _get_remote_home_dir_with_retry()
304
- resolved_source = source.replace('~', remote_home_dir)
311
+ if node_destination is None:
312
+ resolved_target = str(
313
+ pathlib.Path(target).expanduser().resolve())
314
+ resolved_source = str(
315
+ pathlib.Path(source).expanduser().resolve())
316
+ else:
317
+ resolved_target = os.path.expanduser(target)
318
+ if source.startswith('~'):
319
+ remote_home_dir = _get_remote_home_dir_with_retry()
320
+ resolved_source = source.replace('~', remote_home_dir)
305
321
  rsync_command.extend([
306
- f'{node_destination}:{resolved_source!r}',
307
- f'{os.path.expanduser(target)!r}',
322
+ f'{maybe_dest_prefix}{resolved_source!r}',
323
+ f'{resolved_target!r}',
308
324
  ])
309
325
  command = ' '.join(rsync_command)
310
326
  logger.debug(f'Running rsync command: {command}')
@@ -964,3 +980,93 @@ class KubernetesCommandRunner(CommandRunner):
964
980
  # /~/xx, so we need to replace ~ with the remote home directory. We
965
981
  # only need to do this when ~ is at the beginning of the path.
966
982
  get_remote_home_dir=get_remote_home_dir)
983
+
984
+
985
class LocalProcessCommandRunner(CommandRunner):
    """Command runner that executes commands as local subprocesses.

    ``run`` hands the command string to ``log_lib.run_with_log`` with
    ``shell=True``; ``rsync`` delegates to ``_rsync`` with no remote
    destination and no remote-shell option.
    """

    def __init__(self):
        super().__init__('local')

    @timeline.event
    @context_utils.cancellation_guard
    def run(
            self,
            cmd: Union[str, List[str]],
            *,
            require_outputs: bool = False,
            port_forward: Optional[List[Tuple[int, int]]] = None,
            # Advanced options.
            log_path: str = os.devnull,
            # If False, do not redirect stdout/stderr to optimize performance.
            process_stream: bool = True,
            stream_logs: bool = True,
            ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
            separate_stderr: bool = False,
            connect_timeout: Optional[int] = None,
            source_bashrc: bool = False,
            skip_num_lines: int = 0,
            **kwargs) -> Union[int, Tuple[int, str, str]]:
        """Run ``cmd`` through a local shell.

        Args:
            cmd: Command (or list of commands) passed to
                ``_get_command_to_run``.
            require_outputs: Forwarded to ``log_lib.run_with_log``.
            port_forward: Ignored by this runner.
            log_path: Log file; its parent directory is created if missing.
            process_stream: When False, output is redirected by the shell
                itself (``tee``/``>``) and bash is used as the executable.
            stream_logs: Forwarded to ``log_lib.run_with_log``; with
                ``process_stream=False`` it also selects ``tee`` over ``>``.
            ssh_mode: Ignored by this runner.
            separate_stderr: Forwarded to ``_get_command_to_run``.
            connect_timeout: Ignored by this runner.
            source_bashrc: Forwarded to ``_get_command_to_run``.
            skip_num_lines: Forwarded to ``_get_command_to_run``.
            **kwargs: Forwarded to ``log_lib.run_with_log``.

        Returns:
            Whatever ``log_lib.run_with_log`` returns for this invocation.
        """
        del port_forward, ssh_mode, connect_timeout  # Unused.

        base_command = self._get_command_to_run(cmd,
                                                process_stream,
                                                separate_stderr,
                                                skip_num_lines=skip_num_lines,
                                                source_bashrc=source_bashrc)

        os.makedirs(os.path.expanduser(os.path.dirname(log_path)),
                    exist_ok=True)

        pieces = [base_command]
        if process_stream:
            shell_executable = None
        else:
            # ${PIPESTATUS[0]} is a bash feature, so the executor must be
            # '/bin/bash' instead of the default '/bin/sh'.
            shell_executable = '/bin/bash'
            if stream_logs:
                pieces.append(f'| tee {log_path}')
                pieces.append('; exit ${PIPESTATUS[0]}')
            else:
                pieces.append(f'> {log_path}')

        # For local process, the API server might not have this python path
        # setup. But this command runner should only be triggered from the API
        # server (in controller consolidation mode), so we can safely replace
        # the python path with the executable of the API server.
        final_command = ' '.join(pieces).replace(constants.SKY_PYTHON_CMD,
                                                 sys.executable)
        logger.debug(f'Running command locally: {final_command}')
        return log_lib.run_with_log(final_command,
                                    log_path,
                                    require_outputs=require_outputs,
                                    stream_logs=stream_logs,
                                    process_stream=process_stream,
                                    shell=True,
                                    executable=shell_executable,
                                    **kwargs)

    @timeline.event
    def rsync(
        self,
        source: str,
        target: str,
        *,
        up: bool,
        # Advanced options.
        log_path: str = os.devnull,
        stream_logs: bool = True,
        max_retry: int = 1,
    ) -> None:
        """Sync ``source`` to ``target`` with a local rsync.

        Delegates to ``_rsync`` with ``node_destination=None`` and
        ``rsh_option=None``, i.e. no remote host prefix and no ``-e`` option.
        """
        self._rsync(source,
                    target,
                    node_destination=None,
                    up=up,
                    rsh_option=None,
                    log_path=log_path,
                    stream_logs=stream_logs,
                    max_retry=max_retry)
@@ -271,3 +271,60 @@ class KubernetesCommandRunner(CommandRunner):
271
271
  stream_logs: bool = ...,
272
272
  max_retry: int = ...) -> None:
273
273
  ...
274
+
275
+
276
class LocalProcessCommandRunner(CommandRunner):
    """Type stub for the runner that executes commands as local processes."""

    def __init__(self) -> None:
        ...

    # The overloads below narrow the return type on ``require_outputs``:
    # ``int`` when it is False (the default), ``Tuple[int, str, str]`` when
    # it is True, and the union when only known to be a ``bool``.
    # Keyword names and types mirror the implementation's ``run`` signature
    # (``skip_num_lines``, ``port_forward`` as a list of int pairs).
    @typing.overload
    def run(self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[Tuple[int, int]]] = ...,
            require_outputs: Literal[False] = ...,
            log_path: str = ...,
            process_stream: bool = ...,
            stream_logs: bool = ...,
            ssh_mode: SshMode = ...,
            separate_stderr: bool = ...,
            connect_timeout: Optional[int] = ...,
            source_bashrc: bool = ...,
            skip_num_lines: int = ...,
            **kwargs) -> int:
        ...

    @typing.overload
    def run(self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[Tuple[int, int]]] = ...,
            require_outputs: Literal[True],
            log_path: str = ...,
            process_stream: bool = ...,
            stream_logs: bool = ...,
            ssh_mode: SshMode = ...,
            separate_stderr: bool = ...,
            connect_timeout: Optional[int] = ...,
            source_bashrc: bool = ...,
            skip_num_lines: int = ...,
            **kwargs) -> Tuple[int, str, str]:
        ...

    @typing.overload
    def run(self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[Tuple[int, int]]] = ...,
            require_outputs: bool = ...,
            log_path: str = ...,
            process_stream: bool = ...,
            stream_logs: bool = ...,
            ssh_mode: SshMode = ...,
            separate_stderr: bool = ...,
            connect_timeout: Optional[int] = ...,
            source_bashrc: bool = ...,
            skip_num_lines: int = ...,
            **kwargs) -> Union[Tuple[int, str, str], int]:
        ...

    # Matches the implementation's keyword-only ``rsync`` override.
    def rsync(self,
              source: str,
              target: str,
              *,
              up: bool,
              log_path: str = ...,
              stream_logs: bool = ...,
              max_retry: int = ...) -> None:
        ...
sky/utils/common_utils.py CHANGED
@@ -26,6 +26,7 @@ from sky.adaptors import common as adaptors_common
26
26
  from sky.skylet import constants
27
27
  from sky.usage import constants as usage_constants
28
28
  from sky.utils import annotations
29
+ from sky.utils import common_utils
29
30
  from sky.utils import ux_utils
30
31
  from sky.utils import validator
31
32
 
@@ -298,6 +299,13 @@ def get_current_user() -> 'models.User':
298
299
  return models.User.get_current_user()
299
300
 
300
301
 
302
def get_current_user_name() -> str:
    """Return the ``name`` of the user given by ``get_current_user()``.

    The assertion narrows the optional name to ``str``; a ``None`` name is
    treated as a programming error rather than returned to the caller.
    """
    current_user = common_utils.get_current_user()
    user_name = current_user.name
    assert user_name is not None
    return user_name
307
+
308
+
301
309
  def set_current_user(user: 'models.User'):
302
310
  """Sets the current user."""
303
311
  global _current_user
@@ -754,7 +762,7 @@ def get_cleaned_username(username: str = '') -> str:
754
762
  Returns:
755
763
  A cleaned username.
756
764
  """
757
- username = username or getpass.getuser()
765
+ username = username or common_utils.get_current_user_name()
758
766
  username = username.lower()
759
767
  username = re.sub(r'[^a-z0-9-_]', '', username)
760
768
  username = re.sub(r'^[0-9-]+', '', username)
sky/utils/context.py CHANGED
@@ -254,7 +254,9 @@ class Popen(subprocess.Popen):
254
254
  def __init__(self, *args, **kwargs):
255
255
  env = kwargs.pop('env', None)
256
256
  if env is None:
257
- env = os.environ
257
+ # Pass a copy of current context.environ to avoid race condition
258
+ # when the context is updated after the Popen is created.
259
+ env = os.environ.copy()
258
260
  super().__init__(*args, env=env, **kwargs)
259
261
 
260
262
 
@@ -2,7 +2,6 @@
2
2
  import copy
3
3
  import dataclasses
4
4
  import enum
5
- import getpass
6
5
  import os
7
6
  import tempfile
8
7
  import typing
@@ -498,7 +497,7 @@ def shared_controller_vars_to_fill(
498
497
  env_vars.update({
499
498
  # Should not use $USER here, as that env var can be empty when
500
499
  # running in a container.
501
- constants.USER_ENV_VAR: getpass.getuser(),
500
+ constants.USER_ENV_VAR: common_utils.get_current_user_name(),
502
501
  constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
503
502
  # Skip cloud identity check to avoid the overhead.
504
503
  env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
@@ -8,6 +8,7 @@ import typing
8
8
  from typing import Dict, List, Optional, Set, Union
9
9
 
10
10
  from sky import skypilot_config
11
+ from sky.skylet import constants
11
12
  from sky.utils import common_utils
12
13
  from sky.utils import registry
13
14
  from sky.utils import ux_utils
@@ -331,3 +332,68 @@ def make_launchables_for_valid_region_zones(
331
332
  # Batch the requests at the granularity of a single region.
332
333
  launchables.append(launchable_resources.copy(region=region.name))
333
334
  return launchables
335
+
336
+
337
def parse_memory_resource(resource_qty_str: Union[str, int, float],
                          field_name: str,
                          ret_type: type = int,
                          unit: str = 'gb',
                          allow_plus: bool = False,
                          allow_x: bool = False,
                          allow_rounding: bool = False) -> str:
    """Normalize a memory quantity to a string expressed in ``unit``.

    Args:
        resource_qty_str: Quantity to parse, e.g. a bare number or a number
            followed by one of the keys of ``constants.MEMORY_SIZE_UNITS``.
        field_name: Name of the field being parsed; only used in the error
            message.
        ret_type: Numeric type used to parse and to render the value.
        unit: Target unit; must be a key of ``constants.MEMORY_SIZE_UNITS``.
        allow_plus: Whether a trailing '+' is accepted.
        allow_x: Whether a trailing 'x' is accepted (only used internally
            for the jobs controller).
        allow_rounding: Whether a conversion that is not exactly
            representable in ``ret_type`` may be truncated by ``ret_type``.

    Returns:
        The quantity as a string, followed by any accepted '+' / 'x' marker.
        A bare number is returned unchanged (it is assumed to already be in
        the wanted unit).

    Raises:
        ValueError: If a marker is present but not allowed, or the quantity
            cannot be parsed/converted.
    """
    assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'

    error_msg = (f'"{field_name}" field should be a '
                 f'{constants.MEMORY_SIZE_PATTERN}+?,'
                 f' got {resource_qty_str}')

    quantity = str(resource_qty_str)

    # Peel off the trailing markers, '+' first and then 'x'; they are
    # re-appended to the result in that same order.
    markers = ''
    for marker, allowed in (('+', allow_plus), ('x', allow_x)):
        if not quantity.endswith(marker):
            continue
        if not allowed:
            raise ValueError(error_msg)
        quantity = quantity[:-1]
        markers += marker

    # A bare number is assumed to already be in the wanted unit, to maintain
    # backwards compatibility.
    try:
        ret_type(quantity)
    except ValueError:
        pass
    else:
        return f'{quantity}{markers}'

    quantity = quantity.lower()
    for mem_suffix, multiplier in constants.MEMORY_SIZE_UNITS.items():
        if not quantity.endswith(mem_suffix):
            continue
        try:
            amount = ret_type(quantity[:-len(mem_suffix)])
            in_target_unit = (amount * multiplier /
                              constants.MEMORY_SIZE_UNITS[unit])
            if not allow_rounding and ret_type(
                    in_target_unit) != in_target_unit:
                # Caught just below: the next unit suffix is tried and, if
                # none fits, the final raise reports the same message.
                raise ValueError(error_msg)
            return f'{ret_type(in_target_unit)}{markers}'
        except ValueError:
            continue

    raise ValueError(error_msg)
sky/utils/rich_utils.py CHANGED
@@ -7,6 +7,7 @@ import threading
7
7
  import typing
8
8
  from typing import Callable, Iterator, Optional, Tuple, Union
9
9
 
10
+ from sky import exceptions
10
11
  from sky.adaptors import common as adaptors_common
11
12
  from sky.utils import annotations
12
13
  from sky.utils import context
@@ -58,6 +59,7 @@ class Control(enum.Enum):
58
59
  EXIT = 'rich_exit'
59
60
  UPDATE = 'rich_update'
60
61
  HEARTBEAT = 'heartbeat'
62
+ RETRY = 'retry'
61
63
 
62
64
  def encode(self, msg: str) -> str:
63
65
  return f'<{self.value}>{msg}</{self.value}>'
@@ -365,6 +367,10 @@ def decode_rich_status(
365
367
  yield line
366
368
  continue
367
369
 
370
+ if control == Control.RETRY:
371
+ raise exceptions.ServerTemporarilyUnavailableError(
372
+ 'The server is temporarily unavailable. Please try '
373
+ 'again.')
368
374
  # control is not None, i.e. it is a rich status control message.
369
375
  if threading.current_thread() is not threading.main_thread():
370
376
  yield None
sky/utils/schemas.py CHANGED
@@ -70,8 +70,36 @@ _AUTOSTOP_SCHEMA = {
70
70
  }
71
71
 
72
72
 
73
- def _get_single_resources_schema():
74
- """Schema for a single resource in a resources list."""
73
+ # Note: This is similar to _get_infra_pattern()
74
+ # but without the wildcard patterns.
75
def _get_volume_infra_pattern():
    """Build the regex used to validate a volume's ``infra`` field.

    Accepted forms are ``cloud[/region[/zone]]`` for every cloud in
    ``constants.ALL_CLOUDS`` except kubernetes, or a kubernetes spec
    (``kubernetes`` / ``k8s``, optionally followed by ``/<context>``).
    No wildcard forms are included.
    """
    # Any non-kubernetes cloud name, matched case-insensitively.
    non_k8s_clouds = list(constants.ALL_CLOUDS)
    non_k8s_clouds.remove('kubernetes')
    cloud_alternatives = '|'.join(non_k8s_clouds)
    cloud_pattern = f'(?i:({cloud_alternatives}))'

    # Optional "/region", itself optionally followed by "/zone"; each part is
    # a slash plus one or more non-slash characters.
    region_zone_pattern = '(?:/[^/]+(?:/[^/]+)?)?'

    # "kubernetes" or "k8s" alone, or followed by "/<context>" where the
    # context name may itself contain slashes.
    kubernetes_pattern = '(?i:kubernetes|k8s)(?:/.+)?'

    # Anchor the alternation so the whole string has to match.
    return (f'^(?:{cloud_pattern}{region_zone_pattern}|'
            f'{kubernetes_pattern})$')
100
+
101
+
102
+ def _get_infra_pattern():
75
103
  # Building the regex pattern for the infra field
76
104
  # Format: cloud[/region[/zone]] or wildcards or kubernetes context
77
105
  # Match any cloud name (case insensitive)
@@ -103,7 +131,11 @@ def _get_single_resources_schema():
103
131
  infra_pattern = (f'^(?:{cloud_pattern}{region_zone_pattern}|'
104
132
  f'{wildcard_cloud}{wildcard_with_region}|'
105
133
  f'{kubernetes_pattern})$')
134
+ return infra_pattern
135
+
106
136
 
137
+ def _get_single_resources_schema():
138
+ """Schema for a single resource in a resources list."""
107
139
  return {
108
140
  '$schema': 'https://json-schema.org/draft/2020-12/schema',
109
141
  'type': 'object',
@@ -133,7 +165,7 @@ def _get_single_resources_schema():
133
165
  # 3. Kubernetes patterns - e.g. "kubernetes/my-context",
134
166
  # "k8s/context-name",
135
167
  # "k8s/aws:eks:us-east-1:123456789012:cluster/my-cluster"
136
- 'pattern': infra_pattern,
168
+ 'pattern': _get_infra_pattern(),
137
169
  },
138
170
  'cpus': {
139
171
  'anyOf': [{
@@ -383,6 +415,66 @@ def get_resources_schema():
383
415
  }
384
416
 
385
417
 
418
def get_volume_schema():
    """Return the JSON schema for a volume config.

    ``name``, ``type`` and ``infra`` are required and unknown top-level keys
    are rejected. Enum values come from ``volume.VolumeType`` and
    ``volume.VolumeAccessMode``.
    """
    # pylint: disable=import-outside-toplevel
    from sky.volumes import volume

    volume_types = [member.value for member in volume.VolumeType]
    access_modes = [member.value for member in volume.VolumeAccessMode]

    # Nested ``config`` object: all keys optional.
    config_schema = {
        'type': 'object',
        'required': [],
        'properties': {
            'storage_class_name': {
                'type': 'string',
            },
            'access_mode': {
                'type': 'string',
                'case_sensitive_enum': access_modes,
            },
            'namespace': {
                'type': 'string',
            },
        },
    }

    infra_schema = {
        'type': 'string',
        'description': ('Infrastructure specification in format: '
                        'cloud[/region[/zone]].'),
        # Pattern validates:
        # 1. cloud[/region[/zone]] - e.g. "aws", "aws/us-east-1",
        #    "aws/us-east-1/us-east-1a"
        # 2. Kubernetes patterns - e.g. "kubernetes/my-context",
        #    "k8s/context-name",
        #    "k8s/aws:eks:us-east-1:123456789012:cluster/my-cluster"
        'pattern': _get_volume_infra_pattern(),
    }

    properties = {
        'name': {
            'type': 'string',
        },
        'type': {
            'type': 'string',
            'case_sensitive_enum': volume_types,
        },
        'infra': infra_schema,
        'size': {
            'type': 'string',
            'pattern': constants.MEMORY_SIZE_PATTERN,
        },
        'resource_name': {
            'type': 'string',
        },
        'config': config_schema,
    }

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': ['name', 'type', 'infra'],
        'additionalProperties': False,
        'properties': properties,
    }
476
+
477
+
386
478
  def get_storage_schema():
387
479
  # pylint: disable=import-outside-toplevel
388
480
  from sky.data import storage
@@ -457,6 +549,49 @@ def get_storage_schema():
457
549
  }
458
550
 
459
551
 
552
def get_volume_mount_schema():
    """Schema for volume mount object in task config (internal use only).

    No key is required. Unknown top-level keys are rejected, while
    ``volume_config`` accepts additional properties beyond ``cloud``,
    ``region`` and ``zone``.
    """

    def _string_or_null():
        # Fresh dict on every call so ``region`` and ``zone`` never share
        # one mutable schema object.
        return {'anyOf': [{'type': 'string'}, {'type': 'null'}]}

    volume_config_schema = {
        'type': 'object',
        'required': [],
        'additionalProperties': True,
        'properties': {
            'cloud': {
                'type': 'string',
                'case_insensitive_enum': list(constants.ALL_CLOUDS)
            },
            'region': _string_or_null(),
            'zone': _string_or_null(),
        },
    }

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': {
            'path': {
                'type': 'string',
            },
            'volume_name': {
                'type': 'string',
            },
            'volume_config': volume_config_schema,
        }
    }
593
+
594
+
460
595
  def get_service_schema():
461
596
  """Schema for top-level `service:` field (for SkyServe)."""
462
597
  # To avoid circular imports, only import when needed.
@@ -672,18 +807,6 @@ def get_task_schema():
672
807
  'service': {
673
808
  'type': 'object',
674
809
  },
675
- 'job': {
676
- 'type': 'object',
677
- 'required': [],
678
- 'additionalProperties': False,
679
- 'properties': {
680
- 'priority': {
681
- 'type': 'integer',
682
- 'minimum': 0,
683
- 'maximum': 1000,
684
- },
685
- },
686
- },
687
810
  'setup': {
688
811
  'type': 'string',
689
812
  },
@@ -735,6 +858,14 @@ def get_task_schema():
735
858
  'config': _filter_schema(
736
859
  get_config_schema(),
737
860
  constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK),
861
+ # volumes config is validated separately using get_volume_schema
862
+ 'volumes': {
863
+ 'type': 'object',
864
+ },
865
+ 'volume_mounts': {
866
+ 'type': 'array',
867
+ 'items': get_volume_mount_schema(),
868
+ },
738
869
  **_experimental_task_schema(),
739
870
  }
740
871
  }
@@ -899,30 +1030,41 @@ def get_config_schema():
899
1030
  if k != '$schema'
900
1031
  }
901
1032
  resources_schema['properties'].pop('ports')
902
- controller_resources_schema = {
903
- 'type': 'object',
904
- 'required': [],
905
- 'additionalProperties': False,
906
- 'properties': {
907
- 'controller': {
908
- 'type': 'object',
909
- 'required': [],
910
- 'additionalProperties': False,
911
- 'properties': {
912
- 'resources': resources_schema,
913
- 'high_availability': {
914
- 'type': 'boolean',
915
- },
916
- 'autostop': _AUTOSTOP_SCHEMA,
917
- }
1033
+
1034
+ def _get_controller_schema(add_consolidation_mode: bool = False):
1035
+ controller_properties = {
1036
+ 'resources': resources_schema,
1037
+ 'high_availability': {
1038
+ 'type': 'boolean',
1039
+ 'default': False,
918
1040
  },
919
- 'bucket': {
920
- 'type': 'string',
921
- 'pattern': '^(https|s3|gs|r2|cos)://.+',
922
- 'required': [],
1041
+ 'autostop': _AUTOSTOP_SCHEMA,
1042
+ }
1043
+ if add_consolidation_mode:
1044
+ controller_properties['consolidation_mode'] = {
1045
+ 'type': 'boolean',
1046
+ 'default': False,
1047
+ }
1048
+
1049
+ return {
1050
+ 'type': 'object',
1051
+ 'required': [],
1052
+ 'additionalProperties': False,
1053
+ 'properties': {
1054
+ 'controller': {
1055
+ 'type': 'object',
1056
+ 'required': [],
1057
+ 'additionalProperties': False,
1058
+ 'properties': controller_properties,
1059
+ },
1060
+ 'bucket': {
1061
+ 'type': 'string',
1062
+ 'pattern': '^(https|s3|gs|r2|cos)://.+',
1063
+ 'required': [],
1064
+ }
923
1065
  }
924
1066
  }
925
- }
1067
+
926
1068
  cloud_configs = {
927
1069
  'aws': {
928
1070
  'type': 'object',
@@ -1440,8 +1582,8 @@ def get_config_schema():
1440
1582
  'db': {
1441
1583
  'type': 'string',
1442
1584
  },
1443
- 'jobs': controller_resources_schema,
1444
- 'serve': controller_resources_schema,
1585
+ 'jobs': _get_controller_schema(add_consolidation_mode=True),
1586
+ 'serve': _get_controller_schema(add_consolidation_mode=False),
1445
1587
  'allowed_clouds': allowed_clouds,
1446
1588
  'admin_policy': admin_policy_schema,
1447
1589
  'docker': docker_configs,