skypilot-nightly 1.0.0.dev20250820__py3-none-any.whl → 1.0.0.dev20250822__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +9 -1
- sky/client/cli/command.py +4 -1
- sky/client/cli/flags.py +3 -3
- sky/client/sdk.py +64 -19
- sky/client/sdk_async.py +1 -1
- sky/core.py +3 -5
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{8ZscIHnvBWz3AXkxsJL6H → WD29VpW0S7wsYey0qFBHQ}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-6c9c09593b1e67b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.bc5d2853355c9c47.js → 3785.d5b86f6ebc88e6e6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{9277.71481d5b2e606e33.js → 4783.c485f48348349f47.js} +8 -3
- sky/dashboard/out/_next/static/chunks/{6633-efe924b9b8136699.js → 7205-88191679e7988c57.js} +9 -4
- sky/dashboard/out/_next/static/chunks/8969-4a6f1a928fb6d370.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8838.e7953f42af2b0544.js → 9946.3b7b43c217ff70ec.js} +9 -4
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-ec747e4f2dc39b57.js → [cluster]-a0527109c2fab467.js} +7 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-4b3ba1792dc6f21d.js → jobs-7421e63ac35f8fce.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-65f72dee417237ef.js → [name]-de06e613e20bc977.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-338de9df523d883a.js → workspaces-be35b22e2046564c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-6e76f636a048e145.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +8 -0
- sky/jobs/utils.py +10 -3
- sky/optimizer.py +14 -4
- sky/provision/docker_utils.py +20 -1
- sky/provision/kubernetes/instance.py +4 -1
- sky/resources.py +17 -7
- sky/server/requests/executor.py +2 -2
- sky/server/requests/serializers/decoders.py +5 -0
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/rest.py +38 -8
- sky/server/server.py +8 -6
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +6 -7
- {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/RECORD +60 -60
- sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-23c8fbdb8b397d59.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +0 -6
- sky/dashboard/out/_next/static/chunks/webpack-008593a02784a2df.js +0 -1
- /sky/dashboard/out/_next/static/{8ZscIHnvBWz3AXkxsJL6H → WD29VpW0S7wsYey0qFBHQ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1121-2edb8ab2ba080a76.js → 1121-8afcf719ea87debc.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-2f60a90b7d76838e.js → 1141-943efc7aff0f0c06.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6856-e6f350f567182e87.js → 6856-049014c6d43d127b.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-7d4182df6625fe10.js → [pool]-07349868f7905d37.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/top_level.txt +0 -0
sky/dashboard/out/workspace/new.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"WD29VpW0S7wsYey0qFBHQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces/[name].html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-8afcf719ea87debc.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/3015-6c9c09593b1e67b6.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"WD29VpW0S7wsYey0qFBHQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-6e76f636a048e145.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WD29VpW0S7wsYey0qFBHQ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"WD29VpW0S7wsYey0qFBHQ","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/exceptions.py
CHANGED
@@ -661,3 +661,11 @@ class RequestInterruptedError(Exception):
 class SkyletInternalError(Exception):
     """Raised when a Skylet internal error occurs."""
     pass
+
+
+class ClientError(Exception):
+    """Raised when a client error occurs.
+
+    If a request encounters a ClientError, it will not be retried against the server.
+    """
+    pass
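The new ClientError gives callers a way to mark failures that retrying cannot fix. A minimal sketch of how a caller might use it, with a hypothetical check_response helper (only exceptions.ClientError itself comes from this release):

import requests

from sky import exceptions

def check_response(response: requests.Response) -> None:
    # 4xx means the request itself is wrong; retrying the same request
    # cannot succeed, so surface it as the new non-retryable ClientError.
    if 400 <= response.status_code < 500:
        raise exceptions.ClientError(
            f'Request failed with {response.status_code}: {response.text}')
    # 5xx stays an HTTPError, which the retry wrapper treats as transient.
    response.raise_for_status()

As the sky/server/rest.py section below shows, is_transient_error() now returns False for ClientError, so the retry decorator re-raises it immediately.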
sky/jobs/utils.py
CHANGED
@@ -1494,7 +1494,7 @@ def format_job_table(
         'JOB DURATION',
         '#RECOVERIES',
         'STATUS',
-        '
+        'POOL',
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
@@ -1597,6 +1597,10 @@ def format_job_table(
 
         user_values = get_user_column_values(job_tasks[0])
 
+        pool = job_tasks[0].get('pool')
+        if pool is None:
+            pool = '-'
+
         job_id = job_hash[1] if tasks_have_k8s_user else job_hash
         job_values = [
             job_id,
@@ -1610,7 +1614,7 @@ def format_job_table(
             job_duration,
             recovery_cnt,
             status_str,
-
+            pool,
         ]
         if show_all:
             details = job_tasks[current_task_id].get('details')
@@ -1637,6 +1641,9 @@ def format_job_table(
             submitted = log_utils.readable_time_duration(task['submitted_at'])
             user_values = get_user_column_values(task)
             task_workspace = '-' if len(job_tasks) > 1 else workspace
+            pool = task.get('pool')
+            if pool is None:
+                pool = '-'
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
@@ -1653,7 +1660,7 @@ def format_job_table(
                 job_duration,
                 task['recovery_count'],
                 task['status'].colored_str(),
-
+                pool,
             ]
             if show_all:
                 # schedule_state is only set at the job level, so if we have
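The new POOL column falls back to '-' for jobs that were not submitted through a pool. A small illustration with a made-up queue record (the dict shape is simplified; real records carry many more keys):

job_task = {'job_id': 42, 'status': 'RUNNING'}  # hypothetical record, no pool
pool = job_task.get('pool')
if pool is None:
    pool = '-'
assert pool == '-'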
sky/optimizer.py
CHANGED
@@ -1262,12 +1262,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
 
 
 def _check_specified_regions(task: task_lib.Task) -> None:
-    """Check if specified regions (Kubernetes contexts) are enabled.
+    """Check if specified regions (Kubernetes/SSH contexts) are enabled.
 
     Args:
         task: The task to check.
     """
-    # Only check for Kubernetes now
+    # Only check for Kubernetes/SSH for now.
+    # The below check works because SSH inherits the Kubernetes cloud.
     if not all(
             isinstance(resources.cloud, clouds.Kubernetes)
             for resources in task.resources):
@@ -1276,12 +1277,21 @@ def _check_specified_regions(task: task_lib.Task) -> None:
     for resources in task.resources:
         if resources.region is None:
             continue
-
+
+        is_ssh = isinstance(resources.cloud, clouds.SSH)
+        if is_ssh:
+            existing_contexts = clouds.SSH.existing_allowed_contexts()
+        else:
+            existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
+
         region = resources.region
         task_name = f' {task.name!r}' if task.name is not None else ''
         msg = f'Task{task_name} requires '
         if region not in existing_contexts:
-
+            if is_ssh:
+                infra_str = f'SSH/{region.lstrip("ssh-")}'
+            else:
+                infra_str = f'Kubernetes/{region}'
             logger.warning(f'{infra_str} is not enabled.')
             volume_mounts_str = ''
             if task.volume_mounts:
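One caveat when reading the new SSH branch: str.lstrip removes any leading characters from the given set rather than a literal prefix, so context names that happen to start with those letters lose them too. A quick demonstration, with str.removeprefix (Python 3.9+) shown as the prefix-safe alternative:

print('ssh-my-cluster'.lstrip('ssh-'))   # 'my-cluster' (happens to work)
print('ssh-hosts'.lstrip('ssh-'))        # 'osts' — 'h' and 's' are in the strip set
print('ssh-hosts'.removeprefix('ssh-'))  # 'hosts'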
sky/provision/docker_utils.py
CHANGED
@@ -83,6 +83,21 @@ def check_docker_image(cname, docker_cmd):
     return _check_helper(cname, '.Config.Image', docker_cmd)
 
 
+def maybe_remove_container_cmds(container_name, docker_cmd):
+    """Remove the container if it exists. If not, it will be a no-op.
+    """
+    docker_rm = [
+        docker_cmd,
+        'rm',
+        '-f',
+        container_name,
+        '2>/dev/null',
+        '||',
+        'true',
+    ]
+    return ' '.join(docker_rm)
+
+
 def docker_start_cmds(
     image,
     container_name,
@@ -285,6 +300,10 @@ class DockerInitializer:
             'sudo mv /tmp/daemon.json /etc/docker/daemon.json;'
             'sudo systemctl restart docker; } || true')
         user_docker_run_options = self.docker_config.get('run_options', [])
+        remove_container_cmd = maybe_remove_container_cmds(
+            self.container_name,
+            self.docker_cmd,
+        )
         start_command = docker_start_cmds(
             specific_image,
             self.container_name,
@@ -292,7 +311,7 @@ class DockerInitializer:
                 self._auto_configure_shm(user_docker_run_options)),
             self.docker_cmd,
         )
-        self._run(start_command)
+        self._run(f'{remove_container_cmd}; {start_command}')
 
         # SkyPilot: Setup Commands.
         # TODO(zhwu): the following setups should be aligned with the kubernetes
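The helper just joins shell tokens, so its output can be read straight off the list above. With a hypothetical container name:

print(maybe_remove_container_cmds('sky_container', 'docker'))
# docker rm -f sky_container 2>/dev/null || true

The trailing '|| true' is what makes the removal a no-op when no stale container exists: docker rm -f fails, the failure is swallowed, and the start command that follows still runs.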
sky/provision/kubernetes/instance.py
CHANGED
@@ -797,15 +797,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html')  # pylint: disable=line-too-long
 
     needs_gpus = False
+    needs_gpus_nvidia = False
     limits = pod_spec['spec']['containers'][0].get('resources',
                                                    {}).get('limits')
     if limits is not None:
         needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
                                 0) > 0
+        needs_gpus_nvidia = limits.get(
+            kubernetes_utils.SUPPORTED_GPU_RESOURCE_KEYS['nvidia'], 0) > 0
 
     # TPU pods provisioned on GKE use the default containerd runtime.
     # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
-    if nvidia_runtime_exists and needs_gpus:
+    if nvidia_runtime_exists and needs_gpus_nvidia:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
     logger.debug(f'run_instances: calling create_namespaced_pod '
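The extra flag narrows when the 'nvidia' runtime class is applied: previously any GPU request sufficed, now only a request for the NVIDIA resource key does. A hedged sketch, assuming SUPPORTED_GPU_RESOURCE_KEYS['nvidia'] maps to the usual 'nvidia.com/gpu' key:

pod_spec = {'spec': {}}
limits = {'amd.com/gpu': 1}  # hypothetical non-NVIDIA GPU request
needs_gpus_nvidia = limits.get('nvidia.com/gpu', 0) > 0  # False here
if needs_gpus_nvidia:  # old code gated on any GPU request and would mislabel this pod
    pod_spec['spec']['runtimeClassName'] = 'nvidia'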
sky/resources.py
CHANGED
@@ -1260,10 +1260,14 @@ class Resources:
     def extract_docker_image(self) -> Optional[str]:
         if self.image_id is None:
             return None
-
-
-        if
-
+        # Handle dict image_id
+        if len(self.image_id) == 1:
+            # Check if the single key matches the region or is None (any region)
+            image_key = list(self.image_id.keys())[0]
+            if image_key == self.region or image_key is None:
+                image_id = self.image_id[image_key]
+                if image_id.startswith('docker:'):
+                    return image_id[len('docker:'):]
         return None
 
     def _try_validate_image_id(self) -> None:
@@ -1333,13 +1337,19 @@ class Resources:
                     'Kubernetes, please explicitly specify the cloud.') from e
 
         if self._region is not None:
-
+            # If the image_id has None as key (region-agnostic),
+            # use it for any region
+            if None in self._image_id:
+                # Replace None key with the actual region
+                self._image_id = {self._region: self._image_id[None]}
+            elif self._region not in self._image_id:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
                         f'image_id {self._image_id} should contain the image '
                         f'for the specified region {self._region}.')
-
-
+            else:
+                # Narrow down the image_id to the specified region.
+                self._image_id = {self._region: self._image_id[self._region]}
 
         # Check the image_id's are valid.
         for region, image_id in self._image_id.items():
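In effect, a region-agnostic docker image (stored under the None key) now resolves for any region instead of only when no region matches. A hedged usage sketch, assuming the usual normalization of a plain image_id string into a single-entry dict keyed by None:

import sky

r = sky.Resources(image_id='docker:ubuntu:22.04')  # assumed stored as {None: 'docker:ubuntu:22.04'}
print(r.extract_docker_image())  # 'ubuntu:22.04'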
sky/server/requests/executor.py
CHANGED
@@ -427,9 +427,9 @@ async def execute_request_coroutine(request: api_requests.Request):
     event loop. This is designed for executing tasks that are not CPU
     intensive, e.g. sky logs.
     """
+    context.initialize()
     ctx = context.get()
-
-        raise ValueError('Context is not initialized')
+    assert ctx is not None, 'Context is not initialized'
     logger.info(f'Executing request {request.request_id} in coroutine')
     func = request.entrypoint
     request_body = request.request_body
sky/server/requests/serializers/decoders.py
CHANGED
@@ -203,3 +203,8 @@ def decode_job_status(
 def decode_kubernetes_node_info(
         return_value: Dict[str, Any]) -> models.KubernetesNodesInfo:
     return models.KubernetesNodesInfo.from_dict(return_value)
+
+
+@register_decoders('endpoints')
+def decode_endpoints(return_value: Dict[int, str]) -> Dict[int, str]:
+    return {int(k): v for k, v in return_value.items()}
sky/server/requests/serializers/encoders.py
CHANGED
@@ -209,3 +209,8 @@ def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
 def encode_kubernetes_node_info(
         return_value: 'models.KubernetesNodesInfo') -> Dict[str, Any]:
     return return_value.to_dict()
+
+
+@register_encoder('endpoints')
+def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
+    return {str(k): v for k, v in return_value.items()}
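The encoder/decoder pair exists because JSON objects only allow string keys; without an explicit decoder, integer port numbers would come back as strings. The round trip in miniature:

import json

endpoints = {8080: 'http://203.0.113.7:8080'}  # sample mapping
wire = json.dumps({str(k): v for k, v in endpoints.items()})  # encode side
restored = {int(k): v for k, v in json.loads(wire).items()}   # decode side
assert restored == endpoints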
sky/server/rest.py
CHANGED
@@ -47,9 +47,10 @@ class RetryContext:
 
 @contextlib.contextmanager
 def _retry_in_context():
-
+    context = RetryContext()
+    token = _RETRY_CONTEXT.set(context)
     try:
-        yield
+        yield context
     finally:
         _RETRY_CONTEXT.reset(token)
 
@@ -76,6 +77,8 @@ def retry_transient_errors(max_retries: int = 3,
         if isinstance(e, requests.exceptions.HTTPError):
             # Only server error is considered as transient.
            return e.response.status_code >= 500
+        if isinstance(e, exceptions.ClientError):
+            return False
         # It is hard to enumerate all other errors that are transient, e.g.
         # broken pipe, connection refused, etc. Instead, it is safer to assume
         # all other errors might be transient since we only retry for 3 times
@@ -88,26 +91,53 @@ def retry_transient_errors(max_retries: int = 3,
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
         backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
-
-
+        consecutive_failed_count = 0
+
+        with _retry_in_context() as context:
+            previous_line_processed = context.line_processed  # should be 0
+
+            def _handle_exception():
+                # If the function made progress on a retry,
+                # clears the backoff and resets the failed retry count.
+                # Otherwise, increments the failed retry count.
+                nonlocal backoff
+                nonlocal consecutive_failed_count
+                nonlocal previous_line_processed
+                if context.line_processed > previous_line_processed:
+                    backoff = common_utils.Backoff(initial_backoff,
+                                                   max_backoff_factor)
+                    previous_line_processed = context.line_processed
+                    consecutive_failed_count = 0
+                else:
+                    consecutive_failed_count += 1
+
+            while consecutive_failed_count < max_retries:
                 try:
                     return func(*args, **kwargs)
                 # Occurs when the server proactively interrupts the request
                 # during rolling update, we can retry immediately on the
                 # new replica.
                 except exceptions.RequestInterruptedError:
+                    _handle_exception()
                    logger.debug('Request interrupted. Retry immediately.')
                     continue
                 except Exception as e:  # pylint: disable=broad-except
-
+                    _handle_exception()
+                    if consecutive_failed_count >= max_retries:
                         # Retries exhausted.
                         raise
                     if not is_transient_error(e):
                         # Permanent error, no need to retry.
                         raise
-                    logger.debug(
-
-
+                    logger.debug(
+                        f'Retry {func.__name__} due to {e}, '
+                        f'attempt {consecutive_failed_count}/{max_retries}')
+                    # Only sleep if this is not the first retry.
+                    # The idea is that if the function made progress on a
+                    # retry, we should try again immediately to reduce the
+                    # waiting time.
+                    if consecutive_failed_count > 0:
+                        time.sleep(backoff.current_backoff())
 
     return cast(F, wrapper)
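The shape of the new retry loop is easier to see outside the decorator machinery. A toy sketch of the same idea (none of these names are SkyPilot APIs): a stream that keeps advancing between failures never exhausts its retry budget, while a stalled stream burns it down with exponential backoff.

import time

def stream_with_retry(read_from, max_retries=3, base_delay=0.5):
    lines_seen = 0
    failures = 0
    while True:
        before = lines_seen
        try:
            # Resume from where the previous attempt stopped.
            for line in read_from(start=lines_seen):
                lines_seen += 1
                print(line)
            return
        except ConnectionError:
            if lines_seen > before:
                failures = 0   # made progress: reset the budget, retry now
            else:
                failures += 1  # stalled: burn the budget
                if failures >= max_retries:
                    raise
                time.sleep(base_delay * (2 ** failures))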
sky/server/server.py
CHANGED
@@ -1185,10 +1185,6 @@ async def logs(
     # TODO(zhwu): This should wait for the request on the cluster, e.g., async
     # launch, to finish, so that a user does not need to manually pull the
     # request status.
-    # Only initialize the context in logs handler to limit the scope of this
-    # experimental change.
-    # TODO(aylei): init in lifespan() to enable SkyPilot context in all APIs.
-    context.initialize()
     request_task = executor.prepare_request(
         request_id=request.state.request_id,
         request_name='logs',
@@ -1198,8 +1194,14 @@ async def logs(
     )
     task = asyncio.create_task(executor.execute_request_coroutine(request_task))
 
-    def cancel_task():
-
+    async def cancel_task():
+        try:
+            logger.info('Client disconnected for request: '
+                        f'{request.state.request_id}')
+            task.cancel()
+            await task
+        except asyncio.CancelledError:
+            pass
 
     # Cancel the task after the request is done or client disconnects
     background_tasks.add_task(cancel_task)
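The rewritten cancel_task follows the standard asyncio cancellation idiom: cancel, then await the task so its cleanup (finally blocks, context managers) actually runs, and swallow the expected CancelledError. The idiom in isolation:

import asyncio

async def main():
    async def worker():
        try:
            await asyncio.sleep(3600)  # stands in for streaming logs
        finally:
            print('cleanup ran')       # finally blocks execute on cancellation

    task = asyncio.create_task(worker())
    await asyncio.sleep(0)  # let the worker start
    task.cancel()
    try:
        await task          # wait for the cancellation to complete
    except asyncio.CancelledError:
        pass                # expected outcome of awaiting a cancelled task

asyncio.run(main())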
sky/skypilot_config.py
CHANGED
@@ -514,10 +514,10 @@ def parse_and_validate_config_file(config_path: str) -> config_utils.Config:
 
 
 def _parse_dotlist(dotlist: List[str]) -> config_utils.Config:
-    """Parse a
+    """Parse a single key-value pair into a dictionary.
 
     Args:
-        dotlist: A
+        dotlist: A single key-value pair.
 
     Returns:
         A config_utils.Config object with the parsed key-value pairs.
@@ -788,7 +788,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
     """Composes the skypilot CLI config.
     CLI config can either be:
     - A path to a config file
-    - A
+    - A single key-value pair
     """
 
     if not cli_config:
@@ -804,7 +804,7 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
             config_source = maybe_config_path
             # cli_config is a path to a config file
             parsed_config = parse_and_validate_config_file(maybe_config_path)
-        else:  # cli_config is a
+        else:  # cli_config is a single key-value pair
             parsed_config = _parse_dotlist(cli_config)
         _validate_config(parsed_config, config_source)
     except ValueError as e:
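Both accepted forms of the CLI config, sketched against the pre-existing --config flag; the dotted key below is illustrative, not a documented setting:

# a path to a config file:
#   sky launch --config ~/my-config.yaml task.yaml
# or a single dotted key=value pair, handled by _parse_dotlist():
#   sky launch --config some.nested.key=value task.yaml
from sky import skypilot_config

cfg = skypilot_config._parse_dotlist(['some.nested.key=value'])  # hypothetical key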
sky/users/permission.py
CHANGED
@@ -46,7 +46,8 @@ class PermissionService:
             engine = global_user_state.initialize_and_get_db()
             db_utils.add_all_tables_to_db_sqlalchemy(
                 sqlalchemy_adapter.Base.metadata, engine)
-            adapter = sqlalchemy_adapter.Adapter(
+            adapter = sqlalchemy_adapter.Adapter(
+                engine, db_class=sqlalchemy_adapter.CasbinRule)
             model_path = os.path.join(os.path.dirname(__file__),
                                       'model.conf')
             enforcer = casbin.Enforcer(model_path, adapter)
@@ -67,7 +68,7 @@ class PermissionService:
                 username.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
             user_info = global_user_state.get_user(user_hash)
             if user_info:
-                logger.
+                logger.debug(f'Basic auth user {username} already exists')
                 return
             global_user_state.add_or_update_user(
                 models.User(id=user_hash, name=username, password=password))
@@ -168,8 +169,6 @@ class PermissionService:
         """
         user_roles = self.enforcer.get_roles_for_user(user_id)
         if not user_roles:
-            logger.info(f'User {user_id} has no roles, adding'
-                        f' default role {rbac.get_default_role()}')
             self.enforcer.add_grouping_policy(user_id, rbac.get_default_role())
             return True
         return False
@@ -183,7 +182,7 @@ class PermissionService:
         # Avoid calling get_user_roles, as it will require the lock.
         current_roles = self.enforcer.get_roles_for_user(user_id)
         if not current_roles:
-            logger.
+            logger.debug(f'User {user_id} has no roles')
             return
         self.enforcer.remove_grouping_policy(user_id, current_roles[0])
         self.enforcer.save_policy()
@@ -197,12 +196,12 @@ class PermissionService:
         # Avoid calling get_user_roles, as it will require the lock.
         current_roles = self.enforcer.get_roles_for_user(user_id)
         if not current_roles:
-            logger.
+            logger.debug(f'User {user_id} has no roles')
         else:
             # TODO(hailong): how to handle multiple roles?
             current_role = current_roles[0]
             if current_role == new_role:
-                logger.
+                logger.debug(f'User {user_id} already has role {new_role}')
                 return
             self.enforcer.remove_grouping_policy(user_id, current_role)
 
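Passing db_class pins the Casbin rule model instead of letting the adapter declare its own; since the tables were just registered on the shared engine via sqlalchemy_adapter.Base.metadata, this keeps the enforcer reading and writing the same casbin_rule table. A standalone sketch with the casbin-sqlalchemy-adapter package (the SQLite URL is illustrative; the adapter also accepts an engine, as in the diff):

import casbin_sqlalchemy_adapter

adapter = casbin_sqlalchemy_adapter.Adapter(
    'sqlite:///rbac.db',
    db_class=casbin_sqlalchemy_adapter.CasbinRule)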
{skypilot_nightly-1.0.0.dev20250820.dist-info → skypilot_nightly-1.0.0.dev20250822.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250820
+Version: 1.0.0.dev20250822
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -253,7 +253,7 @@ Dynamic: summary
 
 ----
 
-SkyPilot is a system
+SkyPilot is a system to run, manage, and scale AI workloads on any AI infrastructure.
 
 SkyPilot gives **AI teams** a simple interface to run jobs on any infra.
 **Infra teams** get a unified control plane to manage any AI compute — with advanced scheduling, scaling, and orchestration.