skypilot-nightly 1.0.0.dev20251013__py3-none-any.whl → 1.0.0.dev20251015__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/authentication.py +9 -2
- sky/backends/backend_utils.py +62 -40
- sky/backends/cloud_vm_ray_backend.py +8 -6
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/common.py +2 -0
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/instance.py +34 -10
- sky/provision/kubernetes/utils.py +53 -39
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +3 -1
- sky/server/requests/preconditions.py +2 -4
- sky/server/requests/requests.py +13 -23
- sky/server/server.py +5 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +22 -5
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/utils/asyncio_utils.py +18 -0
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +2 -2
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/METADATA +38 -37
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/RECORD +57 -56
- /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → -bih7JVStsXyeasac-dvQ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → -bih7JVStsXyeasac-dvQ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
|
@@ -226,7 +226,9 @@ RAY_INSTALLATION_COMMANDS = (
|
|
|
226
226
|
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
|
227
227
|
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
|
|
228
228
|
f'|| {RAY_STATUS} || '
|
|
229
|
-
|
|
229
|
+
# The pydantic-core==2.41.3 for arm seems corrupted
|
|
230
|
+
# so we need to avoid that specific version.
|
|
231
|
+
f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
|
|
230
232
|
# In some envs, e.g. pip does not have permission to write under /opt/conda
|
|
231
233
|
# ray package will be installed under ~/.local/bin. If the user's PATH does
|
|
232
234
|
# not include ~/.local/bin (the pip install will have the output: `WARNING:
|
|
@@ -402,10 +404,25 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
402
404
|
]
|
|
403
405
|
# When overriding the SkyPilot configs on the API server with the client one,
|
|
404
406
|
# we skip the following keys because they are meant to be client-side configs.
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
407
|
+
# Also, we skip the consolidation mode config as those should be only set on
|
|
408
|
+
# the API server side.
|
|
409
|
+
SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
|
|
410
|
+
('api_server',),
|
|
411
|
+
('allowed_clouds',),
|
|
412
|
+
('workspaces',),
|
|
413
|
+
('db',),
|
|
414
|
+
('daemons',),
|
|
415
|
+
# TODO(kevin,tian): Override the whole controller config once our test
|
|
416
|
+
# infrastructure supports setting dynamic server side configs.
|
|
417
|
+
# Tests that are affected:
|
|
418
|
+
# - test_managed_jobs_ha_kill_starting
|
|
419
|
+
# - test_managed_jobs_ha_kill_running
|
|
420
|
+
# - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
|
|
421
|
+
# LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
|
|
422
|
+
# but the configs won't be applied)
|
|
423
|
+
('jobs', 'controller', 'consolidation_mode'),
|
|
424
|
+
('serve', 'controller', 'consolidation_mode'),
|
|
425
|
+
]
|
|
409
426
|
|
|
410
427
|
# Constants for Azure blob storage
|
|
411
428
|
WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
|
sky/skylet/log_lib.py
CHANGED
|
@@ -271,7 +271,6 @@ def run_with_log(
|
|
|
271
271
|
stdout, stderr = context_utils.pipe_and_wait_process(
|
|
272
272
|
ctx,
|
|
273
273
|
proc,
|
|
274
|
-
cancel_callback=subprocess_utils.kill_children_processes,
|
|
275
274
|
stdout_stream_handler=stdout_stream_handler,
|
|
276
275
|
stderr_stream_handler=stderr_stream_handler)
|
|
277
276
|
elif process_stream:
|
sky/skylet/log_lib.pyi
CHANGED
sky/utils/asyncio_utils.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Asyncio utilities."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import functools
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def shield(func):
|
|
8
|
+
"""Shield the decorated async function from cancellation.
|
|
9
|
+
|
|
10
|
+
Note that filelock.AsyncFileLock is not cancellation safe, thus the
|
|
11
|
+
function calls filelock.AsyncFileLock must be shielded.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
@functools.wraps(func)
|
|
15
|
+
async def async_wrapper(*args, **kwargs):
|
|
16
|
+
return await asyncio.shield(func(*args, **kwargs))
|
|
17
|
+
|
|
18
|
+
return async_wrapper
|
sky/utils/common.py
CHANGED
sky/utils/context.py
CHANGED
|
@@ -5,13 +5,12 @@ from collections.abc import Mapping
|
|
|
5
5
|
import contextvars
|
|
6
6
|
import copy
|
|
7
7
|
import functools
|
|
8
|
-
import inspect
|
|
9
8
|
import os
|
|
10
9
|
import pathlib
|
|
11
10
|
import subprocess
|
|
12
11
|
import sys
|
|
13
|
-
from typing import (Callable, Dict, Iterator, MutableMapping,
|
|
14
|
-
TYPE_CHECKING, TypeVar)
|
|
12
|
+
from typing import (Any, Callable, Coroutine, Dict, Iterator, MutableMapping,
|
|
13
|
+
Optional, TextIO, TYPE_CHECKING, TypeVar)
|
|
15
14
|
|
|
16
15
|
from typing_extensions import ParamSpec
|
|
17
16
|
|
|
@@ -19,7 +18,7 @@ if TYPE_CHECKING:
|
|
|
19
18
|
from sky.skypilot_config import ConfigContext
|
|
20
19
|
|
|
21
20
|
|
|
22
|
-
class Context(object):
|
|
21
|
+
class SkyPilotContext(object):
|
|
23
22
|
"""SkyPilot typed context vars for threads and coroutines.
|
|
24
23
|
|
|
25
24
|
This is a wrapper around `contextvars.ContextVar` that provides a typed
|
|
@@ -114,7 +113,14 @@ class Context(object):
|
|
|
114
113
|
self._log_file_handle.close()
|
|
115
114
|
self._log_file_handle = None
|
|
116
115
|
|
|
117
|
-
def copy(self) -> 'Context':
|
|
116
|
+
def __enter__(self):
|
|
117
|
+
return self
|
|
118
|
+
|
|
119
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
120
|
+
del exc_type, exc_val, exc_tb
|
|
121
|
+
self.cleanup()
|
|
122
|
+
|
|
123
|
+
def copy(self) -> 'SkyPilotContext':
|
|
118
124
|
"""Create a copy of the context.
|
|
119
125
|
|
|
120
126
|
Changes to the current context after this call will not affect the copy.
|
|
@@ -123,18 +129,18 @@ class Context(object):
|
|
|
123
129
|
The new context will get an independent copy of the config context.
|
|
124
130
|
Cancellation of the current context will not be propagated to the copy.
|
|
125
131
|
"""
|
|
126
|
-
new_context = Context()
|
|
132
|
+
new_context = SkyPilotContext()
|
|
127
133
|
new_context.redirect_log(self._log_file)
|
|
128
134
|
new_context.env_overrides = self.env_overrides.copy()
|
|
129
135
|
new_context.config_context = copy.deepcopy(self.config_context)
|
|
130
136
|
return new_context
|
|
131
137
|
|
|
132
138
|
|
|
133
|
-
_CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
|
|
134
|
-
default=None)
|
|
139
|
+
_CONTEXT = contextvars.ContextVar[Optional[SkyPilotContext]]('sky_context',
|
|
140
|
+
default=None)
|
|
135
141
|
|
|
136
142
|
|
|
137
|
-
def get() -> Optional[Context]:
|
|
143
|
+
def get() -> Optional[SkyPilotContext]:
|
|
138
144
|
"""Get the current SkyPilot context.
|
|
139
145
|
|
|
140
146
|
If the context is not initialized, get() will return None. This helps
|
|
@@ -200,7 +206,7 @@ class ContextualEnviron(MutableMapping[str, str]):
|
|
|
200
206
|
|
|
201
207
|
def __iter__(self) -> Iterator[str]:
|
|
202
208
|
|
|
203
|
-
def iter_from_context(ctx: Context) -> Iterator[str]:
|
|
209
|
+
def iter_from_context(ctx: SkyPilotContext) -> Iterator[str]:
|
|
204
210
|
deleted_keys = set()
|
|
205
211
|
for key, value in ctx.env_overrides.items():
|
|
206
212
|
if value is None:
|
|
@@ -311,56 +317,56 @@ def contextual(func: Callable[P, T]) -> Callable[P, T]:
|
|
|
311
317
|
context that inherits the values from the existing context.
|
|
312
318
|
"""
|
|
313
319
|
|
|
320
|
+
def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
321
|
+
# Within the new contextvars Context, set up the SkyPilotContext.
|
|
322
|
+
original_ctx = get()
|
|
323
|
+
with initialize(original_ctx):
|
|
324
|
+
return func(*args, **kwargs)
|
|
325
|
+
|
|
314
326
|
@functools.wraps(func)
|
|
315
327
|
def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
328
|
+
# Create a copy of the current contextvars Context so that setting the
|
|
329
|
+
# SkyPilotContext does not affect the caller's context in async
|
|
330
|
+
# environments.
|
|
331
|
+
context = contextvars.copy_context()
|
|
332
|
+
return context.run(run_in_context, *args, **kwargs)
|
|
333
|
+
|
|
334
|
+
return wrapper
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def contextual_async(
|
|
338
|
+
func: Callable[P, Coroutine[Any, Any, T]]
|
|
339
|
+
) -> Callable[P, Coroutine[Any, Any, T]]:
|
|
340
|
+
"""Decorator to initialize a context before executing the function.
|
|
341
|
+
|
|
342
|
+
If a context is already initialized, this decorator will create a new
|
|
343
|
+
context that inherits the values from the existing context.
|
|
344
|
+
"""
|
|
345
|
+
|
|
346
|
+
async def run_in_context(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
347
|
+
# Within the new contextvars Context, set up the SkyPilotContext.
|
|
316
348
|
original_ctx = get()
|
|
317
|
-
initialize(original_ctx)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
# ValueError: <Token ... at ...> was created in a different
|
|
328
|
-
# Context
|
|
329
|
-
# We must make sure this happens because otherwise we may try to
|
|
330
|
-
# write to the wrong log.
|
|
331
|
-
_CONTEXT.set(original_ctx)
|
|
332
|
-
|
|
333
|
-
# There are two cases:
|
|
334
|
-
# 1. The function is synchronous (that is, return type is not awaitable)
|
|
335
|
-
# In this case, we use a finally block to cleanup the context.
|
|
336
|
-
# 2. The function is asynchronous (that is, return type is awaitable)
|
|
337
|
-
# In this case, we need to construct an async def wrapper and await
|
|
338
|
-
# the value, then call the cleanup function in the finally block.
|
|
339
|
-
|
|
340
|
-
async def await_with_cleanup(awaitable):
|
|
341
|
-
try:
|
|
342
|
-
return await awaitable
|
|
343
|
-
finally:
|
|
344
|
-
cleanup()
|
|
345
|
-
|
|
346
|
-
try:
|
|
347
|
-
ret = func(*args, **kwargs)
|
|
348
|
-
if inspect.isawaitable(ret):
|
|
349
|
-
cleanup_after_await = True
|
|
350
|
-
return await_with_cleanup(ret)
|
|
351
|
-
else:
|
|
352
|
-
return ret
|
|
353
|
-
finally:
|
|
354
|
-
if not cleanup_after_await:
|
|
355
|
-
cleanup()
|
|
349
|
+
with initialize(original_ctx):
|
|
350
|
+
return await func(*args, **kwargs)
|
|
351
|
+
|
|
352
|
+
@functools.wraps(func)
|
|
353
|
+
async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
354
|
+
# Create a copy of the current contextvars Context so that setting the
|
|
355
|
+
# SkyPilotContext does not affect the caller's context in async
|
|
356
|
+
# environments.
|
|
357
|
+
context = contextvars.copy_context()
|
|
358
|
+
return await context.run(run_in_context, *args, **kwargs)
|
|
356
359
|
|
|
357
360
|
return wrapper
|
|
358
361
|
|
|
359
362
|
|
|
360
|
-
def initialize(
|
|
363
|
+
def initialize(
|
|
364
|
+
base_context: Optional[SkyPilotContext] = None) -> SkyPilotContext:
|
|
361
365
|
"""Initialize the current SkyPilot context."""
|
|
362
|
-
new_context = base_context.copy(
|
|
366
|
+
new_context = base_context.copy(
|
|
367
|
+
) if base_context is not None else SkyPilotContext()
|
|
363
368
|
_CONTEXT.set(new_context)
|
|
369
|
+
return new_context
|
|
364
370
|
|
|
365
371
|
|
|
366
372
|
class _ContextualStream:
|
sky/utils/context_utils.py
CHANGED
|
@@ -60,7 +60,7 @@ def passthrough_stream_handler(in_stream: IO[Any], out_stream: IO[Any]) -> str:
|
|
|
60
60
|
|
|
61
61
|
|
|
62
62
|
def pipe_and_wait_process(
|
|
63
|
-
ctx: context.Context,
|
|
63
|
+
ctx: context.SkyPilotContext,
|
|
64
64
|
proc: subprocess.Popen,
|
|
65
65
|
poll_interval: float = 0.5,
|
|
66
66
|
cancel_callback: Optional[Callable[[], None]] = None,
|
|
@@ -113,7 +113,7 @@ def pipe_and_wait_process(
|
|
|
113
113
|
return stdout, stderr
|
|
114
114
|
|
|
115
115
|
|
|
116
|
-
def wait_process(ctx: context.Context,
|
|
116
|
+
def wait_process(ctx: context.SkyPilotContext,
|
|
117
117
|
proc: subprocess.Popen,
|
|
118
118
|
poll_interval: float = 0.5,
|
|
119
119
|
cancel_callback: Optional[Callable[[], None]] = None):
|
sky/utils/controller_utils.py
CHANGED
|
@@ -72,7 +72,8 @@ class _ControllerSpec:
|
|
|
72
72
|
"""Spec for skypilot controllers."""
|
|
73
73
|
controller_type: str
|
|
74
74
|
name: str
|
|
75
|
-
|
|
75
|
+
_cluster_name_func: Callable[[], str]
|
|
76
|
+
_cluster_name_from_server: Optional[str] # For client-side only
|
|
76
77
|
in_progress_hint: Callable[[bool], str]
|
|
77
78
|
decline_cancel_hint: str
|
|
78
79
|
_decline_down_when_failed_to_fetch_status_hint: str
|
|
@@ -93,6 +94,24 @@ class _ControllerSpec:
|
|
|
93
94
|
return self._check_cluster_name_hint.format(
|
|
94
95
|
cluster_name=self.cluster_name)
|
|
95
96
|
|
|
97
|
+
@property
|
|
98
|
+
def cluster_name(self) -> str:
|
|
99
|
+
"""The cluster name of the controller.
|
|
100
|
+
|
|
101
|
+
On the server-side, the cluster name is the actual cluster name,
|
|
102
|
+
which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
|
|
103
|
+
|
|
104
|
+
On the client-side, the cluster name may not be accurate,
|
|
105
|
+
as we may not know the exact name, because we are missing
|
|
106
|
+
the server-side common.SERVER_ID. We have to wait until
|
|
107
|
+
we get the actual cluster name from the server.
|
|
108
|
+
"""
|
|
109
|
+
return (self._cluster_name_from_server if self._cluster_name_from_server
|
|
110
|
+
is not None else self._cluster_name_func())
|
|
111
|
+
|
|
112
|
+
def set_cluster_name_from_server(self, cluster_name: str) -> None:
|
|
113
|
+
self._cluster_name_from_server = cluster_name
|
|
114
|
+
|
|
96
115
|
|
|
97
116
|
# TODO: refactor controller class to not be an enum.
|
|
98
117
|
class Controllers(enum.Enum):
|
|
@@ -102,7 +121,8 @@ class Controllers(enum.Enum):
|
|
|
102
121
|
JOBS_CONTROLLER = _ControllerSpec(
|
|
103
122
|
controller_type='jobs',
|
|
104
123
|
name='managed jobs controller',
|
|
105
|
-
|
|
124
|
+
_cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
|
|
125
|
+
_cluster_name_from_server=None,
|
|
106
126
|
in_progress_hint=lambda _:
|
|
107
127
|
('* {job_info}To see all managed jobs: '
|
|
108
128
|
f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
|
|
@@ -133,7 +153,8 @@ class Controllers(enum.Enum):
|
|
|
133
153
|
SKY_SERVE_CONTROLLER = _ControllerSpec(
|
|
134
154
|
controller_type='serve',
|
|
135
155
|
name='serve controller',
|
|
136
|
-
|
|
156
|
+
_cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
|
|
157
|
+
_cluster_name_from_server=None,
|
|
137
158
|
in_progress_hint=(
|
|
138
159
|
lambda pool:
|
|
139
160
|
(f'* To see detailed pool status: {colorama.Style.BRIGHT}'
|
|
@@ -166,7 +187,9 @@ class Controllers(enum.Enum):
|
|
|
166
187
|
default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
|
|
167
188
|
|
|
168
189
|
@classmethod
|
|
169
|
-
def from_name(cls,
|
|
190
|
+
def from_name(cls,
|
|
191
|
+
name: Optional[str],
|
|
192
|
+
expect_exact_match: bool = True) -> Optional['Controllers']:
|
|
170
193
|
"""Check if the cluster name is a controller name.
|
|
171
194
|
|
|
172
195
|
Returns:
|
|
@@ -187,7 +210,11 @@ class Controllers(enum.Enum):
|
|
|
187
210
|
elif name.startswith(common.JOB_CONTROLLER_PREFIX):
|
|
188
211
|
controller = cls.JOBS_CONTROLLER
|
|
189
212
|
prefix = common.JOB_CONTROLLER_PREFIX
|
|
190
|
-
|
|
213
|
+
|
|
214
|
+
if controller is not None and expect_exact_match:
|
|
215
|
+
assert name == controller.value.cluster_name, (
|
|
216
|
+
name, controller.value.cluster_name)
|
|
217
|
+
elif controller is not None and name != controller.value.cluster_name:
|
|
191
218
|
# The client-side cluster_name is not accurate. Assume that `name`
|
|
192
219
|
# is the actual cluster name, so need to set the controller's
|
|
193
220
|
# cluster name to the input name.
|
|
@@ -201,7 +228,7 @@ class Controllers(enum.Enum):
|
|
|
201
228
|
prefix)
|
|
202
229
|
|
|
203
230
|
# Update the cluster name.
|
|
204
|
-
controller.value.
|
|
231
|
+
controller.value.set_cluster_name_from_server(name)
|
|
205
232
|
return controller
|
|
206
233
|
|
|
207
234
|
@classmethod
|
|
@@ -228,7 +255,7 @@ def get_controller_for_pool(pool: bool) -> Controllers:
|
|
|
228
255
|
def high_availability_specified(cluster_name: Optional[str]) -> bool:
|
|
229
256
|
"""Check if the controller high availability is specified in user config.
|
|
230
257
|
"""
|
|
231
|
-
controller = Controllers.from_name(cluster_name)
|
|
258
|
+
controller = Controllers.from_name(cluster_name, expect_exact_match=False)
|
|
232
259
|
if controller is None:
|
|
233
260
|
return False
|
|
234
261
|
|
|
@@ -411,7 +438,7 @@ def check_cluster_name_not_controller(
|
|
|
411
438
|
Returns:
|
|
412
439
|
None, if the cluster name is not a controller name.
|
|
413
440
|
"""
|
|
414
|
-
controller = Controllers.from_name(cluster_name)
|
|
441
|
+
controller = Controllers.from_name(cluster_name, expect_exact_match=False)
|
|
415
442
|
if controller is not None:
|
|
416
443
|
msg = controller.value.check_cluster_name_hint
|
|
417
444
|
if operation_str is not None:
|
sky/utils/locks.py
CHANGED
|
@@ -243,6 +243,7 @@ class PostgresLock(DistributedLock):
|
|
|
243
243
|
if not self._acquired or not self._connection:
|
|
244
244
|
return
|
|
245
245
|
|
|
246
|
+
connection_lost = False
|
|
246
247
|
try:
|
|
247
248
|
cursor = self._connection.cursor()
|
|
248
249
|
cursor.execute('SELECT pg_advisory_unlock(%s)', (self._lock_key,))
|
|
@@ -252,8 +253,11 @@ class PostgresLock(DistributedLock):
|
|
|
252
253
|
# Lost connection to the database, likely the lock is force unlocked
|
|
253
254
|
# by other routines.
|
|
254
255
|
logger.debug(f'Failed to release postgres lock {self.lock_id}: {e}')
|
|
256
|
+
connection_lost = True
|
|
255
257
|
finally:
|
|
256
|
-
|
|
258
|
+
# Invalidate if connection was lost to prevent SQLAlchemy from
|
|
259
|
+
# trying to reset a dead connection
|
|
260
|
+
self._close_connection(invalidate=connection_lost)
|
|
257
261
|
|
|
258
262
|
def force_unlock(self) -> None:
|
|
259
263
|
"""Force unlock the postgres advisory lock."""
|
|
@@ -292,13 +296,24 @@ class PostgresLock(DistributedLock):
|
|
|
292
296
|
finally:
|
|
293
297
|
self._close_connection()
|
|
294
298
|
|
|
295
|
-
def _close_connection(self) -> None:
|
|
296
|
-
"""Close the postgres connection.
|
|
299
|
+
def _close_connection(self, invalidate: bool = False) -> None:
|
|
300
|
+
"""Close the postgres connection.
|
|
301
|
+
|
|
302
|
+
Args:
|
|
303
|
+
invalidate: If True, invalidate connection instead of closing it.
|
|
304
|
+
Use this when the connection might be broken (e.g., after
|
|
305
|
+
pg_terminate_backend) to prevent SQLAlchemy from trying to
|
|
306
|
+
reset it (which would result in an error being logged).
|
|
307
|
+
"""
|
|
297
308
|
if self._connection:
|
|
298
309
|
try:
|
|
299
|
-
|
|
310
|
+
if invalidate:
|
|
311
|
+
self._connection.invalidate()
|
|
312
|
+
else:
|
|
313
|
+
self._connection.close()
|
|
300
314
|
except Exception as e: # pylint: disable=broad-except
|
|
301
|
-
logger.debug(
|
|
315
|
+
logger.debug(
|
|
316
|
+
f'Failed to invalidate or close postgres connection: {e}')
|
|
302
317
|
self._connection = None
|
|
303
318
|
|
|
304
319
|
def is_locked(self) -> bool:
|
sky/utils/subprocess_utils.py
CHANGED
|
@@ -10,7 +10,8 @@ import sys
|
|
|
10
10
|
import threading
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple,
|
|
13
|
+
from typing import (Any, Callable, Dict, List, Optional, Protocol, Set, Tuple,
|
|
14
|
+
Union)
|
|
14
15
|
|
|
15
16
|
import colorama
|
|
16
17
|
|
|
@@ -107,7 +108,7 @@ def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
|
|
|
107
108
|
|
|
108
109
|
|
|
109
110
|
def run_in_parallel(func: Callable,
|
|
110
|
-
args: List[Any],
|
|
111
|
+
args: Union[List[Any], Set[Any]],
|
|
111
112
|
num_threads: Optional[int] = None) -> List[Any]:
|
|
112
113
|
"""Run a function in parallel on a list of arguments.
|
|
113
114
|
|
|
@@ -128,7 +129,7 @@ def run_in_parallel(func: Callable,
|
|
|
128
129
|
if len(args) == 0:
|
|
129
130
|
return []
|
|
130
131
|
if len(args) == 1:
|
|
131
|
-
return [func(args[0])]
|
|
132
|
+
return [func(list(args)[0])]
|
|
132
133
|
|
|
133
134
|
processes = (num_threads
|
|
134
135
|
if num_threads is not None else get_parallel_threads())
|
|
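{skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@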
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.dev20251013
|
|
3
|
+
Version: 1.0.0.dev20251015
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -156,51 +156,51 @@ Requires-Dist: aiosqlite; extra == "server"
|
|
|
156
156
|
Requires-Dist: greenlet; extra == "server"
|
|
157
157
|
Provides-Extra: shadeform
|
|
158
158
|
Provides-Extra: all
|
|
159
|
-
Requires-Dist:
|
|
160
|
-
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
159
|
+
Requires-Dist: pydo>=0.3.0; extra == "all"
|
|
161
160
|
Requires-Dist: tomli; python_version < "3.11" and extra == "all"
|
|
162
|
-
Requires-Dist:
|
|
163
|
-
Requires-Dist:
|
|
164
|
-
Requires-Dist:
|
|
165
|
-
Requires-Dist:
|
|
161
|
+
Requires-Dist: ibm-platform-services>=0.48.0; extra == "all"
|
|
162
|
+
Requires-Dist: aiohttp; extra == "all"
|
|
163
|
+
Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
|
|
164
|
+
Requires-Dist: google-cloud-storage; extra == "all"
|
|
166
165
|
Requires-Dist: pyjwt; extra == "all"
|
|
167
|
-
Requires-Dist:
|
|
168
|
-
Requires-Dist:
|
|
169
|
-
Requires-Dist:
|
|
166
|
+
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
|
167
|
+
Requires-Dist: websockets; extra == "all"
|
|
168
|
+
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
169
|
+
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
170
|
+
Requires-Dist: msgraph-sdk; extra == "all"
|
|
171
|
+
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
170
172
|
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
|
171
|
-
Requires-Dist:
|
|
172
|
-
Requires-Dist:
|
|
173
|
+
Requires-Dist: aiosqlite; extra == "all"
|
|
174
|
+
Requires-Dist: ibm-vpc; extra == "all"
|
|
173
175
|
Requires-Dist: ibm-cloud-sdk-core; extra == "all"
|
|
174
|
-
Requires-Dist:
|
|
175
|
-
Requires-Dist:
|
|
176
|
+
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
|
177
|
+
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
178
|
+
Requires-Dist: oci; extra == "all"
|
|
176
179
|
Requires-Dist: grpcio>=1.63.0; extra == "all"
|
|
177
|
-
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
178
|
-
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
179
|
-
Requires-Dist: greenlet; extra == "all"
|
|
180
|
-
Requires-Dist: azure-common; extra == "all"
|
|
181
|
-
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
182
180
|
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
|
183
|
-
Requires-Dist: ibm-vpc; extra == "all"
|
|
184
|
-
Requires-Dist: oci; extra == "all"
|
|
185
|
-
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
|
186
|
-
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
187
|
-
Requires-Dist: ibm-cos-sdk; extra == "all"
|
|
188
|
-
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
|
189
181
|
Requires-Dist: colorama<0.4.5; extra == "all"
|
|
182
|
+
Requires-Dist: python-dateutil; extra == "all"
|
|
183
|
+
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
|
184
|
+
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
185
|
+
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
190
186
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
191
|
-
Requires-Dist:
|
|
192
|
-
Requires-Dist: aiohttp; extra == "all"
|
|
187
|
+
Requires-Dist: passlib; extra == "all"
|
|
193
188
|
Requires-Dist: msrestazure; extra == "all"
|
|
194
|
-
Requires-Dist: protobuf<7.0.0,>=5.26.1; extra == "all"
|
|
195
|
-
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
196
|
-
Requires-Dist: google-cloud-storage; extra == "all"
|
|
197
|
-
Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
198
|
-
Requires-Dist: sqlalchemy_adapter; extra == "all"
|
|
199
|
-
Requires-Dist: runpod>=1.6.1; extra == "all"
|
|
200
|
-
Requires-Dist: msgraph-sdk; extra == "all"
|
|
201
189
|
Requires-Dist: ray[default]>=2.6.1; extra == "all"
|
|
202
|
-
Requires-Dist:
|
|
203
|
-
Requires-Dist:
|
|
190
|
+
Requires-Dist: greenlet; extra == "all"
|
|
191
|
+
Requires-Dist: docker; extra == "all"
|
|
192
|
+
Requires-Dist: ecsapi>=0.2.0; extra == "all"
|
|
193
|
+
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
|
194
|
+
Requires-Dist: ibm-cos-sdk; extra == "all"
|
|
195
|
+
Requires-Dist: pyopenssl<24.3.0,>=23.2.0; extra == "all"
|
|
196
|
+
Requires-Dist: casbin; extra == "all"
|
|
197
|
+
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
198
|
+
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
|
199
|
+
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
200
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
|
201
|
+
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
|
202
|
+
Requires-Dist: azure-common; extra == "all"
|
|
203
|
+
Requires-Dist: anyio; extra == "all"
|
|
204
204
|
Dynamic: author
|
|
205
205
|
Dynamic: classifier
|
|
206
206
|
Dynamic: description
|
|
@@ -250,7 +250,8 @@ Dynamic: summary
|
|
|
250
250
|
----
|
|
251
251
|
|
|
252
252
|
:fire: *News* :fire:
|
|
253
|
-
- [Oct 2025]
|
|
253
|
+
- [Oct 2025] Train and serve [Andrej Karpathy's](https://x.com/karpathy/status/1977755427569111362) **nanochat** - the best ChatGPT that $100 can buy: [**example**](./llm/nanochat)
|
|
254
|
+
- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
|
|
254
255
|
- [Sep 2025] Scaling AI infrastructure at Abridge - **10x faster development** with SkyPilot: [**blog**](https://blog.skypilot.co/abridge/)
|
|
255
256
|
- [Sep 2025] Network and Storage Benchmarks for LLM training on the cloud: [**blog**](https://maknee.github.io/blog/2025/Network-And-Storage-Training-Skypilot/)
|
|
256
257
|
- [Aug 2025] Serve and finetune **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**serve**](./llm/gpt-oss/) + [**LoRA and full finetuning**](./llm/gpt-oss-finetuning/)
|