skypilot-nightly 1.0.0.dev20251012__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +4 -2
- sky/adaptors/shadeform.py +89 -0
- sky/authentication.py +52 -2
- sky/backends/backend_utils.py +35 -25
- sky/backends/cloud_vm_ray_backend.py +5 -5
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/catalog/shadeform_catalog.py +165 -0
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/clouds/__init__.py +2 -0
- sky/clouds/shadeform.py +393 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/__init__.py +1 -0
- sky/provision/kubernetes/utils.py +44 -39
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +25 -3
- sky/server/server.py +9 -3
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +23 -6
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +15 -11
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +39 -38
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +63 -54
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{yOfMelBaFp8uL5F9atyAK → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
sky/templates/shadeform-ray.yml.j2 ADDED
@@ -0,0 +1,72 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.shadeform
+  region: "{{region}}"
+  disable_launch_config_check: true
+
+auth:
+  ssh_user: shadeform
+  ssh_private_key: {{ssh_private_key}}
+  ssh_key_id: {{ssh_key_id}}
+
+available_node_types:
+  ray_head_default:
+    {%- if custom_resources %}
+    resources: {{custom_resources}}
+    {%- else %}
+    resources: {}
+    {%- endif %}
+    node_config:
+      InstanceType: {{instance_type}}
+      PublicKey: |-
+        skypilot:ssh_public_key_content
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+#   current num items (num SSH connections): 1
+setup_commands:
+  # Create ~/.ssh/config file in case the file does not exist in the image.
+  # Line 'rm ..': there is another installation of pip.
+  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
+  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
+  # Line 'mkdir -p ..': disable host key check
+  # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    mkdir -p ~/.ssh; touch ~/.ssh/config; which patch > /dev/null || sudo apt install -y patch;
+    {{ conda_installation_commands }}
+    {{ ray_skypilot_installation_commands }}
+    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
+    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
+    (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n" >> ~/.ssh/config;
+    {{ ssh_max_sessions_config }}
+
+# Command to start ray clusters are now placed in `sky.provision.instance_setup`.
+# We do not need to list it here anymore.
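This template backs the new Shadeform provisioner (see sky/provision/shadeform/) and is filled in at launch time to produce the Ray cluster YAML. As a minimal sketch of how such a Jinja2 template is rendered, the snippet below uses purely hypothetical placeholder values; SkyPilot computes the real values internally:

# Minimal sketch: rendering the new template with Jinja2. Every value in
# render_vars is a hypothetical placeholder, not what SkyPilot actually passes.
import jinja2

render_vars = {
    'cluster_name_on_cloud': 'sky-abc1-alice',
    'num_nodes': 2,
    'region': 'us-east',
    'instance_type': 'A100_80G_1x',
    'ssh_private_key': '~/.ssh/sky-key',
    'ssh_key_id': 'key-1234',
    'custom_resources': None,
    'sky_ray_yaml_remote_path': '~/.sky/sky_ray.yml',
    'sky_ray_yaml_local_path': '/tmp/sky_ray.yml',
    'sky_remote_path': '~/.sky/wheels',
    'sky_wheel_hash': 'abcd1234',
    'sky_local_path': '/tmp/skypilot-wheel',
    'credentials': {},
    'initial_setup_commands': [],
    'conda_installation_commands': '',
    'ray_skypilot_installation_commands': '',
    'ssh_max_sessions_config': '',
}

with open('sky/templates/shadeform-ray.yml.j2', encoding='utf-8') as f:
    rendered_yaml = jinja2.Template(f.read()).render(**render_vars)
print(rendered_yaml)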
sky/utils/common.py CHANGED
sky/utils/context.py CHANGED
@@ -5,13 +5,12 @@ from collections.abc import Mapping
 import contextvars
 import copy
 import functools
-import inspect
 import os
 import pathlib
 import subprocess
 import sys
-from typing import (Callable, Dict, Iterator, MutableMapping,
-                    TYPE_CHECKING, TypeVar)
+from typing import (Any, Callable, Coroutine, Dict, Iterator, MutableMapping,
+                    Optional, TextIO, TYPE_CHECKING, TypeVar)
 
 from typing_extensions import ParamSpec
 
@@ -19,7 +18,7 @@ if TYPE_CHECKING:
     from sky.skypilot_config import ConfigContext
 
 
-class Context(object):
+class SkyPilotContext(object):
     """SkyPilot typed context vars for threads and coroutines.
 
     This is a wrapper around `contextvars.ContextVar` that provides a typed
@@ -114,7 +113,14 @@ class Context(object):
         self._log_file_handle.close()
         self._log_file_handle = None
 
-    def copy(self) -> 'Context':
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        del exc_type, exc_val, exc_tb
+        self.cleanup()
+
+    def copy(self) -> 'SkyPilotContext':
         """Create a copy of the context.
 
         Changes to the current context after this call will not affect the copy.
@@ -123,18 +129,18 @@ class Context(object):
         The new context will get an independent copy of the config context.
         Cancellation of the current context will not be propagated to the copy.
         """
-        new_context = Context()
+        new_context = SkyPilotContext()
         new_context.redirect_log(self._log_file)
         new_context.env_overrides = self.env_overrides.copy()
         new_context.config_context = copy.deepcopy(self.config_context)
         return new_context
 
 
-_CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
-                                                     default=None)
+_CONTEXT = contextvars.ContextVar[Optional[SkyPilotContext]]('sky_context',
+                                                             default=None)
 
 
-def get() -> Optional[Context]:
+def get() -> Optional[SkyPilotContext]:
     """Get the current SkyPilot context.
 
     If the context is not initialized, get() will return None. This helps
@@ -200,7 +206,7 @@ class ContextualEnviron(MutableMapping[str, str]):
 
     def __iter__(self) -> Iterator[str]:
 
-        def iter_from_context(ctx: Context) -> Iterator[str]:
+        def iter_from_context(ctx: SkyPilotContext) -> Iterator[str]:
             deleted_keys = set()
             for key, value in ctx.env_overrides.items():
                 if value is None:
@@ -311,56 +317,56 @@ def contextual(func: Callable[P, T]) -> Callable[P, T]:
     context that inherits the values from the existing context.
     """
 
+    def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Within the new contextvars Context, set up the SkyPilotContext.
+        original_ctx = get()
+        with initialize(original_ctx):
+            return func(*args, **kwargs)
+
     @functools.wraps(func)
     def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Create a copy of the current contextvars Context so that setting the
+        # SkyPilotContext does not affect the caller's context in async
+        # environments.
+        context = contextvars.copy_context()
+        return context.run(run_in_context, *args, **kwargs)
+
+    return wrapper
+
+
+def contextual_async(
+    func: Callable[P, Coroutine[Any, Any, T]]
+) -> Callable[P, Coroutine[Any, Any, T]]:
+    """Decorator to initialize a context before executing the function.
+
+    If a context is already initialized, this decorator will create a new
+    context that inherits the values from the existing context.
+    """
+
+    async def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Within the new contextvars Context, set up the SkyPilotContext.
         original_ctx = get()
-        initialize(original_ctx)
-
-
-
-
-
-
-
-
-
-        # ValueError: <Token ... at ...> was created in a different
-        # Context
-        # We must make sure this happens because otherwise we may try to
-        # write to the wrong log.
-        _CONTEXT.set(original_ctx)
-
-        # There are two cases:
-        # 1. The function is synchronous (that is, return type is not awaitable)
-        #    In this case, we use a finally block to cleanup the context.
-        # 2. The function is asynchronous (that is, return type is awaitable)
-        #    In this case, we need to construct an async def wrapper and await
-        #    the value, then call the cleanup function in the finally block.
-
-        async def await_with_cleanup(awaitable):
-            try:
-                return await awaitable
-            finally:
-                cleanup()
-
-        try:
-            ret = func(*args, **kwargs)
-            if inspect.isawaitable(ret):
-                cleanup_after_await = True
-                return await_with_cleanup(ret)
-            else:
-                return ret
-        finally:
-            if not cleanup_after_await:
-                cleanup()
+        with initialize(original_ctx):
+            return await func(*args, **kwargs)
+
+    @functools.wraps(func)
+    async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
+        # Create a copy of the current contextvars Context so that setting the
+        # SkyPilotContext does not affect the caller's context in async
+        # environments.
+        context = contextvars.copy_context()
+        return await context.run(run_in_context, *args, **kwargs)
 
     return wrapper
 
 
-def initialize(base_context: Optional[Context] = None) -> None:
+def initialize(
+    base_context: Optional[SkyPilotContext] = None) -> SkyPilotContext:
     """Initialize the current SkyPilot context."""
-    new_context = base_context.copy() if base_context is not None else Context()
+    new_context = base_context.copy(
+    ) if base_context is not None else SkyPilotContext()
     _CONTEXT.set(new_context)
+    return new_context
 
 
 class _ContextualStream:
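The rewrite above replaces the manual set-and-cleanup dance (with its "Token ... was created in a different Context" failure mode) by running the wrapped function inside a copied contextvars.Context, relying on the new SkyPilotContext __enter__/__exit__ for cleanup. A self-contained sketch of the underlying copy-and-run isolation pattern, using a stand-in ContextVar rather than SkyPilot's _CONTEXT:

# Standalone sketch of the copy-and-run pattern used by the new @contextual;
# _VAR stands in for SkyPilot's _CONTEXT. Illustrative, not SkyPilot code.
import contextvars
import functools

_VAR: contextvars.ContextVar[str] = contextvars.ContextVar('var',
                                                           default='caller')


def contextual(func):

    def run_in_context(*args, **kwargs):
        # Mutations here land in the copied Context only.
        _VAR.set('callee')
        return func(*args, **kwargs)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Copy the current Context so the set() above never leaks to the caller.
        return contextvars.copy_context().run(run_in_context, *args, **kwargs)

    return wrapper


@contextual
def current() -> str:
    return _VAR.get()


assert current() == 'callee'
assert _VAR.get() == 'caller'  # the caller's context is untouched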
sky/utils/context_utils.py CHANGED
@@ -1,5 +1,6 @@
 """Utilities for SkyPilot context."""
 import asyncio
+import concurrent.futures
 import contextvars
 import functools
 import io
@@ -59,7 +60,7 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
 
 
 def pipe_and_wait_process(
-        ctx: context.Context,
+        ctx: context.SkyPilotContext,
         proc: subprocess.Popen,
         poll_interval: float = 0.5,
        cancel_callback: Optional[Callable[[], None]] = None,
@@ -112,7 +113,7 @@ def pipe_and_wait_process(
     return stdout, stderr
 
 
-def wait_process(ctx: context.Context,
+def wait_process(ctx: context.SkyPilotContext,
                  proc: subprocess.Popen,
                  poll_interval: float = 0.5,
                  cancel_callback: Optional[Callable[[], None]] = None):
@@ -191,14 +192,17 @@ def to_thread(func: Callable[P, T], /, *args: P.args,
 
     This is same as asyncio.to_thread added in python 3.9
     """
+    return to_thread_with_executor(None, func, *args, **kwargs)
+
+
+def to_thread_with_executor(executor: Optional[concurrent.futures.Executor],
+                            func: Callable[P, T], /, *args: P.args,
+                            **kwargs: P.kwargs) -> 'asyncio.Future[T]':
+    """Asynchronously run function *func* in a separate thread with
+    a custom executor."""
+
     loop = asyncio.get_running_loop()
-    # This is critical to pass the current coroutine context to the new thread
     pyctx = contextvars.copy_context()
-    func_call: Callable[..., T] = functools.partial(
-
-
-        pyctx.run,  # type: ignore
-        func,
-        *args,
-        **kwargs)
-    return loop.run_in_executor(None, func_call)
+    func_call: Callable[..., T] = functools.partial(pyctx.run, func, *args,
+                                                    **kwargs)
+    return loop.run_in_executor(executor, func_call)
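to_thread is now a thin wrapper over the new to_thread_with_executor, which accepts any concurrent.futures.Executor while still copying the caller's contextvars into the worker thread. A hedged usage sketch; the pool size and blocking_task below are illustrative, not part of SkyPilot:

# Hypothetical usage of to_thread_with_executor.
import asyncio
import concurrent.futures

from sky.utils import context_utils

_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=4)


def blocking_task(x: int) -> int:
    # Stands in for blocking I/O that must not run on the event loop.
    return x * x


async def main() -> None:
    # Runs blocking_task in _POOL instead of the loop's default executor,
    # with the caller's contextvars copied into the worker thread.
    result = await context_utils.to_thread_with_executor(
        _POOL, blocking_task, 7)
    print(result)  # 49


asyncio.run(main())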
sky/utils/controller_utils.py CHANGED
@@ -72,7 +72,8 @@ class _ControllerSpec:
     """Spec for skypilot controllers."""
     controller_type: str
     name: str
-    cluster_name: str
+    _cluster_name_func: Callable[[], str]
+    _cluster_name_from_server: Optional[str]  # For client-side only
     in_progress_hint: Callable[[bool], str]
     decline_cancel_hint: str
     _decline_down_when_failed_to_fetch_status_hint: str
@@ -93,6 +94,24 @@ class _ControllerSpec:
         return self._check_cluster_name_hint.format(
             cluster_name=self.cluster_name)
 
+    @property
+    def cluster_name(self) -> str:
+        """The cluster name of the controller.
+
+        On the server-side, the cluster name is the actual cluster name,
+        which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
+
+        On the client-side, the cluster name may not be accurate,
+        as we may not know the exact name, because we are missing
+        the server-side common.SERVER_ID. We have to wait until
+        we get the actual cluster name from the server.
+        """
+        return (self._cluster_name_from_server if self._cluster_name_from_server
+                is not None else self._cluster_name_func())
+
+    def set_cluster_name_from_server(self, cluster_name: str) -> None:
+        self._cluster_name_from_server = cluster_name
+
 
 # TODO: refactor controller class to not be an enum.
 class Controllers(enum.Enum):
@@ -102,7 +121,8 @@ class Controllers(enum.Enum):
     JOBS_CONTROLLER = _ControllerSpec(
         controller_type='jobs',
         name='managed jobs controller',
-        cluster_name=common.JOB_CONTROLLER_NAME,
+        _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
+        _cluster_name_from_server=None,
         in_progress_hint=lambda _:
         ('* {job_info}To see all managed jobs: '
          f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
@@ -133,7 +153,8 @@ class Controllers(enum.Enum):
     SKY_SERVE_CONTROLLER = _ControllerSpec(
         controller_type='serve',
         name='serve controller',
-        cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+        _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
+        _cluster_name_from_server=None,
         in_progress_hint=(
             lambda pool:
             (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
@@ -166,7 +187,9 @@ class Controllers(enum.Enum):
         default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
 
     @classmethod
-    def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
+    def from_name(cls,
+                  name: Optional[str],
+                  expect_exact_match: bool = True) -> Optional['Controllers']:
         """Check if the cluster name is a controller name.
 
         Returns:
@@ -187,7 +210,11 @@ class Controllers(enum.Enum):
         elif name.startswith(common.JOB_CONTROLLER_PREFIX):
             controller = cls.JOBS_CONTROLLER
             prefix = common.JOB_CONTROLLER_PREFIX
-        if controller is not None and name != controller.value.cluster_name:
+
+        if controller is not None and expect_exact_match:
+            assert name == controller.value.cluster_name, (
+                name, controller.value.cluster_name)
+        elif controller is not None and name != controller.value.cluster_name:
             # The client-side cluster_name is not accurate. Assume that `name`
             # is the actual cluster name, so need to set the controller's
             # cluster name to the input name.
@@ -201,7 +228,7 @@ class Controllers(enum.Enum):
                 prefix)
 
             # Update the cluster name.
-            controller.value.cluster_name = name
+            controller.value.set_cluster_name_from_server(name)
         return controller
 
     @classmethod
@@ -228,7 +255,7 @@ def get_controller_for_pool(pool: bool) -> Controllers:
 def high_availability_specified(cluster_name: Optional[str]) -> bool:
     """Check if the controller high availability is specified in user config.
     """
-    controller = Controllers.from_name(cluster_name)
+    controller = Controllers.from_name(cluster_name, expect_exact_match=False)
     if controller is None:
         return False
 
@@ -411,7 +438,7 @@ def check_cluster_name_not_controller(
     Returns:
         None, if the cluster name is not a controller name.
     """
-    controller = Controllers.from_name(cluster_name)
+    controller = Controllers.from_name(cluster_name, expect_exact_match=False)
     if controller is not None:
         msg = controller.value.check_cluster_name_hint
         if operation_str is not None:
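_ControllerSpec now resolves its cluster name lazily: a callable computes the local default, and a name reported by the server, once set via set_cluster_name_from_server, takes precedence. A standalone sketch of this resolve-late pattern; the class and names here are illustrative, not SkyPilot's actual code:

# Illustrative sketch of lazy cluster-name resolution.
import dataclasses
from typing import Callable, Optional


@dataclasses.dataclass
class Spec:
    _cluster_name_func: Callable[[], str]
    _cluster_name_from_server: Optional[str] = None

    @property
    def cluster_name(self) -> str:
        # Prefer the authoritative name once the server has reported it.
        if self._cluster_name_from_server is not None:
            return self._cluster_name_from_server
        return self._cluster_name_func()

    def set_cluster_name_from_server(self, cluster_name: str) -> None:
        self._cluster_name_from_server = cluster_name


spec = Spec(_cluster_name_func=lambda: 'sky-jobs-controller-clientguess')
print(spec.cluster_name)  # client-side guess
spec.set_cluster_name_from_server('sky-jobs-controller-2ea485ea')
print(spec.cluster_name)  # server-provided name wins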
sky/utils/locks.py CHANGED
@@ -243,6 +243,7 @@ class PostgresLock(DistributedLock):
         if not self._acquired or not self._connection:
             return
 
+        connection_lost = False
         try:
             cursor = self._connection.cursor()
             cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
@@ -252,8 +253,11 @@ class PostgresLock(DistributedLock):
             # Lost connection to the database, likely the lock is force unlocked
             # by other routines.
             logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
+            connection_lost = True
         finally:
-            self._close_connection()
+            # Invalidate if connection was lost to prevent SQLAlchemy from
+            # trying to reset a dead connection
+            self._close_connection(invalidate=connection_lost)
 
     def force_unlock(self) -> None:
         """Force unlock the postgres advisory lock."""
@@ -292,13 +296,24 @@ class PostgresLock(DistributedLock):
         finally:
             self._close_connection()
 
-    def _close_connection(self) -> None:
-        """Close the postgres connection."""
+    def _close_connection(self, invalidate: bool = False) -> None:
+        """Close the postgres connection.
+
+        Args:
+            invalidate: If True, invalidate connection instead of closing it.
+                Use this when the connection might be broken (e.g., after
+                pg_terminate_backend) to prevent SQLAlchemy from trying to
+                reset it (which would result in an error being logged).
+        """
         if self._connection:
             try:
-                self._connection.close()
+                if invalidate:
+                    self._connection.invalidate()
+                else:
+                    self._connection.close()
             except Exception as e:  # pylint: disable=broad-except
-                logger.debug(f'Failed to close postgres connection: {e}')
+                logger.debug(
+                    f'Failed to invalidate or close postgres connection: {e}')
             self._connection = None
 
     def is_locked(self) -> bool:
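The release path now distinguishes a healthy connection (closed and returned to the pool) from one whose backend may already be gone, for example after a force-unlock via pg_terminate_backend; the latter is invalidated so SQLAlchemy discards it rather than trying to reset it. A sketch of the close-versus-invalidate distinction, assuming a SQLAlchemy engine; the DSN and lock key are illustrative:

# Close vs. invalidate on a SQLAlchemy connection (illustrative).
import sqlalchemy
from sqlalchemy import exc, text

engine = sqlalchemy.create_engine('postgresql://localhost/sky')  # assumed DSN
LOCK_KEY = 42  # illustrative advisory-lock key

conn = engine.connect()
try:
    conn.execute(text('SELECT pg_advisory_lock(:k)'), {'k': LOCK_KEY})
    # ... critical section ...
    conn.execute(text('SELECT pg_advisory_unlock(:k)'), {'k': LOCK_KEY})
    conn.close()  # healthy: return the connection to the pool
except exc.OperationalError:
    # Backend likely terminated (e.g. a force-unlock used
    # pg_terminate_backend): discard the raw DBAPI connection instead of
    # letting the pool try to reset it.
    conn.invalidate()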
sky/utils/subprocess_utils.py CHANGED
@@ -10,7 +10,8 @@ import sys
 import threading
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
+from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
+                    Union)
 
 import colorama
 
@@ -107,7 +108,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
 
 
 def run_in_parallel(func: Callable,
-                    args: List[Any],
+                    args: Union[List[Any], Set[Any]],
                     num_threads: Optional[int] = None) -> List[Any]:
     """Run a function in parallel on a list of arguments.
 
@@ -128,7 +129,7 @@ def run_in_parallel(func: Callable,
     if len(args) == 0:
         return []
     if len(args) == 1:
-        return [func(args[0])]
+        return [func(list(args)[0])]
 
     processes = (num_threads
                  if num_threads is not None else get_parallel_threads())
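run_in_parallel now accepts a Set as well as a List; the single-argument fast path converts to a list first because sets do not support indexing. A tiny illustration (the set contents are placeholders):

# Why the fast path uses list(args)[0]: a set supports iteration and len(),
# but not subscripting.
args = {'cluster-a'}  # a one-element set of illustrative cluster names
# args[0] would raise TypeError: 'set' object is not subscriptable
first = list(args)[0]  # works for both lists and sets
print(first)  # cluster-a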
{skypilot_nightly-1.0.0.dev20251012.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20251012
+Version: 1.0.0.dev20251014
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -154,52 +154,53 @@
 Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "server"
 Requires-Dist: aiosqlite; extra == "server"
 Requires-Dist: greenlet; extra == "server"
+Provides-Extra: shadeform
 Provides-Extra: all
+Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
+Requires-Dist: botocore>=1.29.10; extra == "all"
+Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
+Requires-Dist: passlib; extra == "all"
+Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
+Requires-Dist: awscli>=1.27.10; extra == "all"
+Requires-Dist: azure-common; extra == "all"
+Requires-Dist: azure-core>=1.24.0; extra == "all"
+Requires-Dist: colorama<0.4.5; extra == "all"
+Requires-Dist: pyjwt; extra == "all"
+Requires-Dist: sqlalchemy_adapter; extra == "all"
+Requires-Dist: ray[default]>=2.6.1; extra == "all"
+Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
+Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
+Requires-Dist: websockets; extra == "all"
 Requires-Dist: anyio; extra == "all"
-Requires-Dist:
+Requires-Dist: azure-identity>=1.19.0; extra == "all"
+Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
 Requires-Dist: ecsapi>=0.2.0; extra == "all"
-Requires-Dist:
+Requires-Dist: msgraph-sdk; extra == "all"
 Requires-Dist: python-dateutil; extra == "all"
+Requires-Dist: msrestazure; extra == "all"
+Requires-Dist: nebius>=0.2.47; extra == "all"
+Requires-Dist: ibm-vpc; extra == "all"
+Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
+Requires-Dist: ibm-cloud-sdk-core; extra == "all"
 Requires-Dist: azure-core>=1.31.0; extra == "all"
-Requires-Dist: aiosqlite; extra == "all"
-Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
-Requires-Dist: cudo-compute>=0.1.10; extra == "all"
-Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: casbin; extra == "all"
-Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
-Requires-Dist: boto3>=1.26.1; extra == "all"
-Requires-Dist: sqlalchemy_adapter; extra == "all"
-Requires-Dist: passlib; extra == "all"
-Requires-Dist: greenlet; extra == "all"
-Requires-Dist: msrestazure; extra == "all"
-Requires-Dist: colorama<0.4.5; extra == "all"
-Requires-Dist: azure-common; extra == "all"
-Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
-Requires-Dist: websockets; extra == "all"
-Requires-Dist: tomli; python_version < "3.11" and extra == "all"
-Requires-Dist: ray[default]>=2.6.1; extra == "all"
-Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
-Requires-Dist: google-cloud-storage; extra == "all"
-Requires-Dist: docker; extra == "all"
 Requires-Dist: grpcio>=1.63.0; extra == "all"
-Requires-Dist:
-Requires-Dist: ibm-vpc; extra == "all"
-Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
-Requires-Dist: pyjwt; extra == "all"
-Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
-Requires-Dist: botocore>=1.29.10; extra == "all"
-Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: docker; extra == "all"
 Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
+Requires-Dist: aiohttp; extra == "all"
+Requires-Dist: tomli; python_version < "3.11" and extra == "all"
+Requires-Dist: google-cloud-storage; extra == "all"
+Requires-Dist: pydo>=0.3.0; extra == "all"
 Requires-Dist: oci; extra == "all"
-Requires-Dist: awscli>=1.27.10; extra == "all"
-Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
-Requires-Dist: azure-core>=1.24.0; extra == "all"
-Requires-Dist: azure-identity>=1.19.0; extra == "all"
 Requires-Dist: runpod>=1.6.1; extra == "all"
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
+Requires-Dist: cudo-compute>=0.1.10; extra == "all"
+Requires-Dist: ibm-cos-sdk; extra == "all"
+Requires-Dist: aiosqlite; extra == "all"
+Requires-Dist: boto3>=1.26.1; extra == "all"
+Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
+Requires-Dist: greenlet; extra == "all"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
@@ -249,7 +250,7 @@ Dynamic: summary
 ----
 
 :fire: *News* :fire:
-- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./
+- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
 - [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)
 - [Sep 2025] Network and Storage Benchmarks for LLM training on the cloud: [**blog**](https://maknee.github.io/blog/2025/Network-And-Storage-Training-Skypilot/)
 - [Aug 2025] Serve and finetune **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**serve**](./llm/gpt-oss/) + [**LoRA and full finetuning**](./llm/gpt-oss-finetuning/)