skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/utils/command_runner.py
CHANGED
@@ -4,6 +4,7 @@ import hashlib
|
|
4
4
|
import os
|
5
5
|
import pathlib
|
6
6
|
import shlex
|
7
|
+
import sys
|
7
8
|
import time
|
8
9
|
from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union
|
9
10
|
|
@@ -231,9 +232,9 @@ class CommandRunner:
|
|
231
232
|
self,
|
232
233
|
source: str,
|
233
234
|
target: str,
|
234
|
-
node_destination: str,
|
235
|
+
node_destination: Optional[str],
|
235
236
|
up: bool,
|
236
|
-
rsh_option: str,
|
237
|
+
rsh_option: Optional[str],
|
237
238
|
# Advanced options.
|
238
239
|
log_path: str = os.devnull,
|
239
240
|
stream_logs: bool = True,
|
@@ -283,28 +284,43 @@ class CommandRunner:
|
|
283
284
|
RSYNC_EXCLUDE_OPTION.format(
|
284
285
|
shlex.quote(str(resolved_source / GIT_EXCLUDE))))
|
285
286
|
|
286
|
-
|
287
|
+
if rsh_option is not None:
|
288
|
+
rsync_command.append(f'-e {shlex.quote(rsh_option)}')
|
289
|
+
maybe_dest_prefix = ('' if node_destination is None else
|
290
|
+
f'{node_destination}:')
|
287
291
|
|
288
292
|
if up:
|
289
293
|
resolved_target = target
|
290
|
-
if
|
291
|
-
|
292
|
-
resolved_target =
|
294
|
+
if node_destination is None:
|
295
|
+
# Is a local rsync. Directly resolve the target.
|
296
|
+
resolved_target = str(
|
297
|
+
pathlib.Path(target).expanduser().resolve())
|
298
|
+
else:
|
299
|
+
if target.startswith('~'):
|
300
|
+
remote_home_dir = _get_remote_home_dir_with_retry()
|
301
|
+
resolved_target = target.replace('~', remote_home_dir)
|
293
302
|
full_source_str = str(resolved_source)
|
294
303
|
if resolved_source.is_dir():
|
295
304
|
full_source_str = os.path.join(full_source_str, '')
|
296
305
|
rsync_command.extend([
|
297
306
|
f'{full_source_str!r}',
|
298
|
-
f'{
|
307
|
+
f'{maybe_dest_prefix}{resolved_target!r}',
|
299
308
|
])
|
300
309
|
else:
|
301
310
|
resolved_source = source
|
302
|
-
if
|
303
|
-
|
304
|
-
|
311
|
+
if node_destination is None:
|
312
|
+
resolved_target = str(
|
313
|
+
pathlib.Path(target).expanduser().resolve())
|
314
|
+
resolved_source = str(
|
315
|
+
pathlib.Path(source).expanduser().resolve())
|
316
|
+
else:
|
317
|
+
resolved_target = os.path.expanduser(target)
|
318
|
+
if source.startswith('~'):
|
319
|
+
remote_home_dir = _get_remote_home_dir_with_retry()
|
320
|
+
resolved_source = source.replace('~', remote_home_dir)
|
305
321
|
rsync_command.extend([
|
306
|
-
f'{
|
307
|
-
f'{
|
322
|
+
f'{maybe_dest_prefix}{resolved_source!r}',
|
323
|
+
f'{resolved_target!r}',
|
308
324
|
])
|
309
325
|
command = ' '.join(rsync_command)
|
310
326
|
logger.debug(f'Running rsync command: {command}')
|
@@ -964,3 +980,93 @@ class KubernetesCommandRunner(CommandRunner):
|
|
964
980
|
# /~/xx, so we need to replace ~ with the remote home directory. We
|
965
981
|
# only need to do this when ~ is at the beginning of the path.
|
966
982
|
get_remote_home_dir=get_remote_home_dir)
|
983
|
+
|
984
|
+
|
985
|
+
class LocalProcessCommandRunner(CommandRunner):
    """Runner that executes commands as local subprocesses.

    No remote node is involved: `run` shells out on the current machine and
    `rsync` performs a local-to-local rsync.
    """

    def __init__(self):
        super().__init__('local')

    @timeline.event
    @context_utils.cancellation_guard
    def run(
            self,
            cmd: Union[str, List[str]],
            *,
            require_outputs: bool = False,
            port_forward: Optional[List[Tuple[int, int]]] = None,
            # Advanced options.
            log_path: str = os.devnull,
            # If False, do not redirect stdout/stderr to optimize performance.
            process_stream: bool = True,
            stream_logs: bool = True,
            ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
            separate_stderr: bool = False,
            connect_timeout: Optional[int] = None,
            source_bashrc: bool = False,
            skip_num_lines: int = 0,
            **kwargs) -> Union[int, Tuple[int, str, str]]:
        """Run the command in a local subprocess.

        Args:
            cmd: The command to run, as a string or a list of strings.
            require_outputs: Forwarded to `log_lib.run_with_log`.
            port_forward: Unused for local execution.
            log_path: File that receives the command output.
            process_stream: If False, output is redirected by the shell
                itself (`tee` or `>`) rather than processed in Python.
            stream_logs: Forwarded to `log_lib.run_with_log`; when
                `process_stream` is False it also selects `tee` over `>`.
            ssh_mode: Unused for local execution.
            separate_stderr: Forwarded to `_get_command_to_run`.
            connect_timeout: Unused for local execution.
            source_bashrc: Forwarded to `_get_command_to_run`.
            skip_num_lines: Forwarded to `_get_command_to_run`.
            **kwargs: Forwarded to `log_lib.run_with_log`.

        Returns:
            Whatever `log_lib.run_with_log` returns for this command.
        """
        del port_forward, ssh_mode, connect_timeout  # Unused.

        command_str = self._get_command_to_run(cmd,
                                               process_stream,
                                               separate_stderr,
                                               skip_num_lines=skip_num_lines,
                                               source_bashrc=source_bashrc)

        # A bare file name has an empty dirname, and os.makedirs('') raises
        # FileNotFoundError, so only create the directory when there is one.
        log_dir = os.path.expanduser(os.path.dirname(log_path))
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        executable = None
        if not process_stream:
            # Expand '~' ourselves and quote the result so that paths with
            # spaces or shell metacharacters survive the shell.
            quoted_log_path = shlex.quote(os.path.expanduser(log_path))
            if stream_logs:
                # ${PIPESTATUS[0]} is a bash feature, hence the '/bin/bash'
                # executable below instead of the default '/bin/sh'.
                command_str = (f'{command_str} | tee {quoted_log_path} '
                               '; exit ${PIPESTATUS[0]}')
            else:
                command_str = f'{command_str} > {quoted_log_path}'
            executable = '/bin/bash'
        # For local process, the API server might not have this python path
        # setup. But this command runner should only be triggered from the API
        # server (in controller consolidation mode), so we can safely replace
        # the python path with the executable of the API server.
        command_str = command_str.replace(constants.SKY_PYTHON_CMD,
                                          sys.executable)
        logger.debug(f'Running command locally: {command_str}')
        return log_lib.run_with_log(command_str,
                                    log_path,
                                    require_outputs=require_outputs,
                                    stream_logs=stream_logs,
                                    process_stream=process_stream,
                                    shell=True,
                                    executable=executable,
                                    **kwargs)

    @timeline.event
    def rsync(
        self,
        source: str,
        target: str,
        *,
        up: bool,
        # Advanced options.
        log_path: str = os.devnull,
        stream_logs: bool = True,
        max_retry: int = 1,
    ) -> None:
        """Rsync `source` to `target` on the local machine.

        Delegates to `_rsync` with no node destination and no remote-shell
        option, i.e. a plain local rsync.
        """
        self._rsync(source,
                    target,
                    node_destination=None,
                    up=up,
                    rsh_option=None,
                    log_path=log_path,
                    stream_logs=stream_logs,
                    max_retry=max_retry)
|
sky/utils/command_runner.pyi
CHANGED
@@ -271,3 +271,60 @@ class KubernetesCommandRunner(CommandRunner):
|
|
271
271
|
stream_logs: bool = ...,
|
272
272
|
max_retry: int = ...) -> None:
|
273
273
|
...
|
274
|
+
|
275
|
+
|
276
|
+
class LocalProcessCommandRunner(CommandRunner):
    # Type stub for the local-subprocess command runner. Parameter names and
    # types mirror the runtime signatures of `run` and `rsync`.

    def __init__(self) -> None:
        ...

    # require_outputs=False (the default): only the return code is returned.
    @typing.overload
    def run(self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[Tuple[int, int]]] = ...,
            require_outputs: Literal[False] = ...,
            log_path: str = ...,
            process_stream: bool = ...,
            stream_logs: bool = ...,
            ssh_mode: SshMode = ...,
            separate_stderr: bool = ...,
            connect_timeout: Optional[int] = ...,
            source_bashrc: bool = ...,
            skip_num_lines: int = ...,
            **kwargs) -> int:
        ...

    # require_outputs=True: a (returncode, stdout, stderr) tuple is returned.
    @typing.overload
    def run(self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[Tuple[int, int]]] = ...,
            require_outputs: Literal[True],
            log_path: str = ...,
            process_stream: bool = ...,
            stream_logs: bool = ...,
            ssh_mode: SshMode = ...,
            separate_stderr: bool = ...,
            connect_timeout: Optional[int] = ...,
            source_bashrc: bool = ...,
            skip_num_lines: int = ...,
            **kwargs) -> Tuple[int, str, str]:
        ...

    # require_outputs only known as a bool: either shape may be returned.
    @typing.overload
    def run(self,
            cmd: Union[str, List[str]],
            *,
            port_forward: Optional[List[Tuple[int, int]]] = ...,
            require_outputs: bool = ...,
            log_path: str = ...,
            process_stream: bool = ...,
            stream_logs: bool = ...,
            ssh_mode: SshMode = ...,
            separate_stderr: bool = ...,
            connect_timeout: Optional[int] = ...,
            source_bashrc: bool = ...,
            skip_num_lines: int = ...,
            **kwargs) -> Union[Tuple[int, str, str], int]:
        ...

    def rsync(self,
              source: str,
              target: str,
              *,
              up: bool,
              log_path: str = ...,
              stream_logs: bool = ...,
              max_retry: int = ...) -> None:
        ...
|
sky/utils/common_utils.py
CHANGED
@@ -26,6 +26,7 @@ from sky.adaptors import common as adaptors_common
|
|
26
26
|
from sky.skylet import constants
|
27
27
|
from sky.usage import constants as usage_constants
|
28
28
|
from sky.utils import annotations
|
29
|
+
from sky.utils import common_utils
|
29
30
|
from sky.utils import ux_utils
|
30
31
|
from sky.utils import validator
|
31
32
|
|
@@ -298,6 +299,13 @@ def get_current_user() -> 'models.User':
|
|
298
299
|
return models.User.get_current_user()
|
299
300
|
|
300
301
|
|
302
|
+
def get_current_user_name() -> str:
    """Returns the current user name.

    Returns:
        The `name` attribute of the user returned by `get_current_user()`.

    Raises:
        AssertionError: If the current user has no name.
    """
    # get_current_user is defined in this module; call it directly rather
    # than through the module's import of itself.
    name = get_current_user().name
    assert name is not None, 'Current user has no name.'
    return name
|
307
|
+
|
308
|
+
|
301
309
|
def set_current_user(user: 'models.User'):
|
302
310
|
"""Sets the current user."""
|
303
311
|
global _current_user
|
@@ -754,7 +762,7 @@ def get_cleaned_username(username: str = '') -> str:
|
|
754
762
|
Returns:
|
755
763
|
A cleaned username.
|
756
764
|
"""
|
757
|
-
username = username or
|
765
|
+
username = username or common_utils.get_current_user_name()
|
758
766
|
username = username.lower()
|
759
767
|
username = re.sub(r'[^a-z0-9-_]', '', username)
|
760
768
|
username = re.sub(r'^[0-9-]+', '', username)
|
sky/utils/context.py
CHANGED
@@ -254,7 +254,9 @@ class Popen(subprocess.Popen):
|
|
254
254
|
def __init__(self, *args, **kwargs):
    """Create the process, defaulting `env` to a snapshot of the environment.

    When the caller gives no `env` (or passes None), a copy of `os.environ`
    is taken here and handed to `subprocess.Popen`.
    """
    if kwargs.get('env') is None:
        # Copy now so that updates made to the context after this Popen is
        # created cannot race with it.
        kwargs['env'] = os.environ.copy()
    super().__init__(*args, **kwargs)
|
259
261
|
|
260
262
|
|
sky/utils/controller_utils.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
import copy
|
3
3
|
import dataclasses
|
4
4
|
import enum
|
5
|
-
import getpass
|
6
5
|
import os
|
7
6
|
import tempfile
|
8
7
|
import typing
|
@@ -498,7 +497,7 @@ def shared_controller_vars_to_fill(
|
|
498
497
|
env_vars.update({
|
499
498
|
# Should not use $USER here, as that env var can be empty when
|
500
499
|
# running in a container.
|
501
|
-
constants.USER_ENV_VAR:
|
500
|
+
constants.USER_ENV_VAR: common_utils.get_current_user_name(),
|
502
501
|
constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
|
503
502
|
# Skip cloud identity check to avoid the overhead.
|
504
503
|
env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
|
sky/utils/resources_utils.py
CHANGED
@@ -8,6 +8,7 @@ import typing
|
|
8
8
|
from typing import Dict, List, Optional, Set, Union
|
9
9
|
|
10
10
|
from sky import skypilot_config
|
11
|
+
from sky.skylet import constants
|
11
12
|
from sky.utils import common_utils
|
12
13
|
from sky.utils import registry
|
13
14
|
from sky.utils import ux_utils
|
@@ -331,3 +332,68 @@ def make_launchables_for_valid_region_zones(
|
|
331
332
|
# Batch the requests at the granularity of a single region.
|
332
333
|
launchables.append(launchable_resources.copy(region=region.name))
|
333
334
|
return launchables
|
335
|
+
|
336
|
+
|
337
|
+
def parse_memory_resource(resource_qty_str: Union[str, int, float],
                          field_name: str,
                          ret_type: type = int,
                          unit: str = 'gb',
                          allow_plus: bool = False,
                          allow_x: bool = False,
                          allow_rounding: bool = False) -> str:
    """Returns memory size in chosen units given a resource quantity string.

    Args:
        resource_qty_str: Resource quantity string (or number).
        field_name: Name of the field being parsed; only used in the error
            message.
        ret_type: Numeric type used to parse and format the quantity.
        unit: Unit to convert to; must be a key of
            `constants.MEMORY_SIZE_UNITS`.
        allow_plus: Whether to allow a trailing '+'.
        allow_x: Whether to allow a trailing 'x'.
        allow_rounding: Whether a converted value that is not exactly
            representable as `ret_type` may be rounded by `ret_type`.

    Returns:
        The quantity as a string, followed by the '+' and/or 'x' suffix if
        one was present. A value given without a unit is returned unchanged.

    Raises:
        ValueError: If the value cannot be parsed, uses a disallowed suffix,
            or would need rounding while `allow_rounding` is False.
    """
    assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'

    error_msg = (f'"{field_name}" field should be a '
                 f'{constants.MEMORY_SIZE_PATTERN}+?,'
                 f' got {resource_qty_str}')

    resource_str = str(resource_qty_str)

    # Handle plus and x suffixes, x is only used internally for jobs controller
    plus = ''
    if resource_str.endswith('+'):
        if not allow_plus:
            raise ValueError(error_msg)
        resource_str = resource_str[:-1]
        plus = '+'

    x = ''
    if resource_str.endswith('x'):
        if not allow_x:
            raise ValueError(error_msg)
        resource_str = resource_str[:-1]
        x = 'x'

    try:
        # We assume it is already in the wanted units to maintain backwards
        # compatibility
        ret_type(resource_str)
    except ValueError:
        pass
    else:
        return f'{resource_str}{plus}{x}'

    resource_str = resource_str.lower()
    target_multiplier = constants.MEMORY_SIZE_UNITS[unit]
    for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
        if not resource_str.endswith(mem_unit):
            continue
        try:
            value = ret_type(resource_str[:-len(mem_unit)])
        except ValueError:
            # The suffix matched a shorter unit name but the remainder is
            # not numeric; try the next unit.
            continue
        converted = value * multiplier / target_multiplier
        # Raised outside the try so the rounding error is reported directly
        # instead of being swallowed by the handler above.
        if not allow_rounding and ret_type(converted) != converted:
            raise ValueError(error_msg)
        return f'{ret_type(converted)}{plus}{x}'

    raise ValueError(error_msg)
|
sky/utils/rich_utils.py
CHANGED
@@ -7,6 +7,7 @@ import threading
|
|
7
7
|
import typing
|
8
8
|
from typing import Callable, Iterator, Optional, Tuple, Union
|
9
9
|
|
10
|
+
from sky import exceptions
|
10
11
|
from sky.adaptors import common as adaptors_common
|
11
12
|
from sky.utils import annotations
|
12
13
|
from sky.utils import context
|
@@ -58,6 +59,7 @@ class Control(enum.Enum):
|
|
58
59
|
EXIT = 'rich_exit'
|
59
60
|
UPDATE = 'rich_update'
|
60
61
|
HEARTBEAT = 'heartbeat'
|
62
|
+
RETRY = 'retry'
|
61
63
|
|
62
64
|
def encode(self, msg: str) -> str:
|
63
65
|
return f'<{self.value}>{msg}</{self.value}>'
|
@@ -365,6 +367,10 @@ def decode_rich_status(
|
|
365
367
|
yield line
|
366
368
|
continue
|
367
369
|
|
370
|
+
if control == Control.RETRY:
|
371
|
+
raise exceptions.ServerTemporarilyUnavailableError(
|
372
|
+
'The server is temporarily unavailable. Please try '
|
373
|
+
'again.')
|
368
374
|
# control is not None, i.e. it is a rich status control message.
|
369
375
|
if threading.current_thread() is not threading.main_thread():
|
370
376
|
yield None
|
sky/utils/schemas.py
CHANGED
@@ -70,8 +70,36 @@ _AUTOSTOP_SCHEMA = {
|
|
70
70
|
}
|
71
71
|
|
72
72
|
|
73
|
-
|
74
|
-
|
73
|
+
# Note: This is similar to _get_infra_pattern()
|
74
|
+
# but without the wildcard patterns.
|
75
|
+
def _get_volume_infra_pattern():
    """Builds the regex pattern for a volume's infra field.

    Accepted formats are `cloud[/region[/zone]]` or a Kubernetes context.
    Wildcards are not accepted.
    """
    # Kubernetes gets its own alternative below, so leave it out of the
    # generic cloud list. Filtering (rather than list.remove) does not raise
    # if 'kubernetes' is missing from constants.ALL_CLOUDS.
    non_k8s_clouds = [
        cloud for cloud in constants.ALL_CLOUDS if cloud != 'kubernetes'
    ]
    # Match any cloud name (case insensitive)
    cloud_pattern = f'(?i:({"|".join(non_k8s_clouds)}))'

    # Optional /region followed by optional /zone
    # /[^/]+ matches a slash followed by any characters except slash (region or
    # zone name)
    # The outer (?:...)? makes the entire region/zone part optional
    region_zone_pattern = '(?:/[^/]+(?:/[^/]+)?)?'

    # Kubernetes specific pattern - matches:
    # 1. Just the word "kubernetes" or "k8s" by itself
    # 2. "k8s/" or "kubernetes/" followed by any context name (which may contain
    #    slashes)
    kubernetes_pattern = '(?i:kubernetes|k8s)(?:/.+)?'

    # Combine all patterns with alternation (|)
    # ^ marks start of string, $ marks end of string
    infra_pattern = (f'^(?:{cloud_pattern}{region_zone_pattern}|'
                     f'{kubernetes_pattern})$')
    return infra_pattern
|
100
|
+
|
101
|
+
|
102
|
+
def _get_infra_pattern():
|
75
103
|
# Building the regex pattern for the infra field
|
76
104
|
# Format: cloud[/region[/zone]] or wildcards or kubernetes context
|
77
105
|
# Match any cloud name (case insensitive)
|
@@ -103,7 +131,11 @@ def _get_single_resources_schema():
|
|
103
131
|
infra_pattern = (f'^(?:{cloud_pattern}{region_zone_pattern}|'
|
104
132
|
f'{wildcard_cloud}{wildcard_with_region}|'
|
105
133
|
f'{kubernetes_pattern})$')
|
134
|
+
return infra_pattern
|
135
|
+
|
106
136
|
|
137
|
+
def _get_single_resources_schema():
|
138
|
+
"""Schema for a single resource in a resources list."""
|
107
139
|
return {
|
108
140
|
'$schema': 'https://json-schema.org/draft/2020-12/schema',
|
109
141
|
'type': 'object',
|
@@ -133,7 +165,7 @@ def _get_single_resources_schema():
|
|
133
165
|
# 3. Kubernetes patterns - e.g. "kubernetes/my-context",
|
134
166
|
# "k8s/context-name",
|
135
167
|
# "k8s/aws:eks:us-east-1:123456789012:cluster/my-cluster"
|
136
|
-
'pattern':
|
168
|
+
'pattern': _get_infra_pattern(),
|
137
169
|
},
|
138
170
|
'cpus': {
|
139
171
|
'anyOf': [{
|
@@ -383,6 +415,66 @@ def get_resources_schema():
|
|
383
415
|
}
|
384
416
|
|
385
417
|
|
418
|
+
def get_volume_schema():
    """Schema for a volume config object.

    `name`, `type` and `infra` are required; unknown top-level keys are
    rejected.
    """
    # pylint: disable=import-outside-toplevel
    from sky.volumes import volume

    volume_types = [volume_type.value for volume_type in volume.VolumeType]
    access_modes = [mode.value for mode in volume.VolumeAccessMode]

    config_schema = {
        'type': 'object',
        'required': [],
        'properties': {
            'storage_class_name': {
                'type': 'string',
            },
            'access_mode': {
                'type': 'string',
                'case_sensitive_enum': access_modes,
            },
            'namespace': {
                'type': 'string',
            },
        },
    }

    infra_schema = {
        'type': 'string',
        'description': ('Infrastructure specification in format: '
                        'cloud[/region[/zone]].'),
        # Pattern validates:
        # 1. cloud[/region[/zone]] - e.g. "aws", "aws/us-east-1",
        #    "aws/us-east-1/us-east-1a"
        # 2. Kubernetes patterns - e.g. "kubernetes/my-context",
        #    "k8s/context-name",
        #    "k8s/aws:eks:us-east-1:123456789012:cluster/my-cluster"
        'pattern': _get_volume_infra_pattern(),
    }

    properties = {
        'name': {
            'type': 'string',
        },
        'type': {
            'type': 'string',
            'case_sensitive_enum': volume_types,
        },
        'infra': infra_schema,
        'size': {
            'type': 'string',
            'pattern': constants.MEMORY_SIZE_PATTERN,
        },
        'resource_name': {
            'type': 'string',
        },
        'config': config_schema,
    }

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': ['name', 'type', 'infra'],
        'additionalProperties': False,
        'properties': properties,
    }
|
476
|
+
|
477
|
+
|
386
478
|
def get_storage_schema():
|
387
479
|
# pylint: disable=import-outside-toplevel
|
388
480
|
from sky.data import storage
|
@@ -457,6 +549,49 @@ def get_storage_schema():
|
|
457
549
|
}
|
458
550
|
|
459
551
|
|
552
|
+
def get_volume_mount_schema():
    """Schema for volume mount object in task config (internal use only)."""

    def _string_or_null():
        # Fresh dict on every call so 'region' and 'zone' never share state.
        return {'anyOf': [{'type': 'string'}, {'type': 'null'}]}

    # Extra keys are tolerated inside volume_config, unlike at the top level.
    volume_config_schema = {
        'type': 'object',
        'required': [],
        'additionalProperties': True,
        'properties': {
            'cloud': {
                'type': 'string',
                'case_insensitive_enum': list(constants.ALL_CLOUDS)
            },
            'region': _string_or_null(),
            'zone': _string_or_null(),
        },
    }

    mount_properties = {
        'path': {
            'type': 'string',
        },
        'volume_name': {
            'type': 'string',
        },
        'volume_config': volume_config_schema,
    }

    return {
        '$schema': 'https://json-schema.org/draft/2020-12/schema',
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': mount_properties,
    }
|
593
|
+
|
594
|
+
|
460
595
|
def get_service_schema():
|
461
596
|
"""Schema for top-level `service:` field (for SkyServe)."""
|
462
597
|
# To avoid circular imports, only import when needed.
|
@@ -672,18 +807,6 @@ def get_task_schema():
|
|
672
807
|
'service': {
|
673
808
|
'type': 'object',
|
674
809
|
},
|
675
|
-
'job': {
|
676
|
-
'type': 'object',
|
677
|
-
'required': [],
|
678
|
-
'additionalProperties': False,
|
679
|
-
'properties': {
|
680
|
-
'priority': {
|
681
|
-
'type': 'integer',
|
682
|
-
'minimum': 0,
|
683
|
-
'maximum': 1000,
|
684
|
-
},
|
685
|
-
},
|
686
|
-
},
|
687
810
|
'setup': {
|
688
811
|
'type': 'string',
|
689
812
|
},
|
@@ -735,6 +858,14 @@ def get_task_schema():
|
|
735
858
|
'config': _filter_schema(
|
736
859
|
get_config_schema(),
|
737
860
|
constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK),
|
861
|
+
# volumes config is validated separately using get_volume_schema
|
862
|
+
'volumes': {
|
863
|
+
'type': 'object',
|
864
|
+
},
|
865
|
+
'volume_mounts': {
|
866
|
+
'type': 'array',
|
867
|
+
'items': get_volume_mount_schema(),
|
868
|
+
},
|
738
869
|
**_experimental_task_schema(),
|
739
870
|
}
|
740
871
|
}
|
@@ -899,30 +1030,41 @@ def get_config_schema():
|
|
899
1030
|
if k != '$schema'
|
900
1031
|
}
|
901
1032
|
resources_schema['properties'].pop('ports')
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
'
|
909
|
-
'required': [],
|
910
|
-
'additionalProperties': False,
|
911
|
-
'properties': {
|
912
|
-
'resources': resources_schema,
|
913
|
-
'high_availability': {
|
914
|
-
'type': 'boolean',
|
915
|
-
},
|
916
|
-
'autostop': _AUTOSTOP_SCHEMA,
|
917
|
-
}
|
1033
|
+
|
1034
|
+
def _get_controller_schema(add_consolidation_mode: bool = False):
    """Schema for a controller config section.

    Args:
        add_consolidation_mode: Whether the `controller` object also accepts
            a boolean `consolidation_mode` key.
    """

    def _boolean_defaulting_to_false():
        return {
            'type': 'boolean',
            'default': False,
        }

    controller_properties = {
        'resources': resources_schema,
        'high_availability': _boolean_defaulting_to_false(),
        'autostop': _AUTOSTOP_SCHEMA,
    }
    if add_consolidation_mode:
        controller_properties['consolidation_mode'] = (
            _boolean_defaulting_to_false())

    controller_schema = {
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': controller_properties,
    }
    bucket_schema = {
        'type': 'string',
        'pattern': '^(https|s3|gs|r2|cos)://.+',
        'required': [],
    }

    return {
        'type': 'object',
        'required': [],
        'additionalProperties': False,
        'properties': {
            'controller': controller_schema,
            'bucket': bucket_schema,
        }
    }
|
925
|
-
|
1067
|
+
|
926
1068
|
cloud_configs = {
|
927
1069
|
'aws': {
|
928
1070
|
'type': 'object',
|
@@ -1440,8 +1582,8 @@ def get_config_schema():
|
|
1440
1582
|
'db': {
|
1441
1583
|
'type': 'string',
|
1442
1584
|
},
|
1443
|
-
'jobs':
|
1444
|
-
'serve':
|
1585
|
+
'jobs': _get_controller_schema(add_consolidation_mode=True),
|
1586
|
+
'serve': _get_controller_schema(add_consolidation_mode=False),
|
1445
1587
|
'allowed_clouds': allowed_clouds,
|
1446
1588
|
'admin_policy': admin_policy_schema,
|
1447
1589
|
'docker': docker_configs,
|