skypilot-nightly 1.0.0.dev20250801__py3-none-any.whl → 1.0.0.dev20250804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +10 -2
- sky/backends/cloud_vm_ray_backend.py +2 -1
- sky/catalog/data_fetchers/fetch_nebius.py +31 -7
- sky/client/cli/command.py +42 -20
- sky/client/cli/flags.py +15 -0
- sky/client/sdk.py +80 -10
- sky/client/sdk.pyi +4 -0
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{f2fEsZwJxryJVOYRNtNKE → KiGGm4fK0CpmN6BT17jkh}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +11 -0
- sky/dashboard/out/_next/static/chunks/{webpack-42cd1b19a6b01078.js → webpack-13145516b19858fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +5 -3
- sky/jobs/client/sdk.py +5 -1
- sky/provision/kubernetes/utils.py +32 -2
- sky/resources.py +17 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +3 -0
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/autostop_lib.py +96 -8
- sky/skylet/constants.py +3 -2
- sky/skylet/events.py +27 -13
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/utils/schemas.py +6 -0
- {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/METADATA +4 -3
- {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/RECORD +48 -48
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +0 -11
- /sky/dashboard/out/_next/static/{f2fEsZwJxryJVOYRNtNKE → KiGGm4fK0CpmN6BT17jkh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/top_level.txt +0 -0
sky/execution.py
CHANGED
|
@@ -15,6 +15,7 @@ from sky import global_user_state
|
|
|
15
15
|
from sky import optimizer
|
|
16
16
|
from sky import sky_logging
|
|
17
17
|
from sky.backends import backend_utils
|
|
18
|
+
from sky.skylet import autostop_lib
|
|
18
19
|
from sky.usage import usage_lib
|
|
19
20
|
from sky.utils import admin_policy_utils
|
|
20
21
|
from sky.utils import common
|
|
@@ -309,11 +310,13 @@ def _execute_dag(
|
|
|
309
310
|
|
|
310
311
|
idle_minutes_to_autostop: Optional[int] = None
|
|
311
312
|
down = False
|
|
313
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
312
314
|
if resource_autostop_config is not None:
|
|
313
315
|
if resource_autostop_config.enabled:
|
|
314
316
|
idle_minutes_to_autostop = (
|
|
315
317
|
resource_autostop_config.idle_minutes)
|
|
316
318
|
down = resource_autostop_config.down
|
|
319
|
+
wait_for = resource_autostop_config.wait_for
|
|
317
320
|
else:
|
|
318
321
|
# Autostop is explicitly disabled, so cancel it if it's
|
|
319
322
|
# already set.
|
|
@@ -450,9 +453,8 @@ def _execute_dag(
|
|
|
450
453
|
if idle_minutes_to_autostop is not None:
|
|
451
454
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
452
455
|
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
453
|
-
backend.set_autostop(handle,
|
|
454
|
-
|
|
455
|
-
down=down)
|
|
456
|
+
backend.set_autostop(handle, idle_minutes_to_autostop, wait_for,
|
|
457
|
+
down)
|
|
456
458
|
|
|
457
459
|
if Stage.EXEC in stages:
|
|
458
460
|
try:
|
sky/jobs/client/sdk.py
CHANGED
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
4
|
from typing import Dict, List, Optional, Union
|
|
5
|
-
import webbrowser
|
|
6
5
|
|
|
7
6
|
import click
|
|
8
7
|
|
|
9
8
|
from sky import sky_logging
|
|
9
|
+
from sky.adaptors import common as adaptors_common
|
|
10
10
|
from sky.client import common as client_common
|
|
11
11
|
from sky.client import sdk
|
|
12
12
|
from sky.serve.client import impl
|
|
@@ -23,9 +23,13 @@ from sky.utils import dag_utils
|
|
|
23
23
|
|
|
24
24
|
if typing.TYPE_CHECKING:
|
|
25
25
|
import io
|
|
26
|
+
import webbrowser
|
|
26
27
|
|
|
27
28
|
import sky
|
|
28
29
|
from sky.serve import serve_utils
|
|
30
|
+
else:
|
|
31
|
+
# only used in dashboard()
|
|
32
|
+
webbrowser = adaptors_common.LazyImport('webbrowser')
|
|
29
33
|
|
|
30
34
|
logger = sky_logging.init_logger(__name__)
|
|
31
35
|
|
|
@@ -2642,6 +2642,7 @@ def combine_pod_config_fields(
|
|
|
2642
2642
|
|
|
2643
2643
|
|
|
2644
2644
|
def combine_metadata_fields(cluster_yaml_path: str,
|
|
2645
|
+
cluster_config_overrides: Dict[str, Any],
|
|
2645
2646
|
context: Optional[str] = None) -> None:
|
|
2646
2647
|
"""Updates the metadata for all Kubernetes objects created by SkyPilot with
|
|
2647
2648
|
fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
|
|
@@ -2652,12 +2653,25 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2652
2653
|
with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
|
|
2653
2654
|
yaml_content = f.read()
|
|
2654
2655
|
yaml_obj = yaml.safe_load(yaml_content)
|
|
2656
|
+
|
|
2657
|
+
# Get custom_metadata from global config
|
|
2655
2658
|
custom_metadata = skypilot_config.get_effective_region_config(
|
|
2656
2659
|
cloud='kubernetes',
|
|
2657
2660
|
region=context,
|
|
2658
2661
|
keys=('custom_metadata',),
|
|
2659
2662
|
default_value={})
|
|
2660
2663
|
|
|
2664
|
+
# Get custom_metadata from task-level config overrides
|
|
2665
|
+
override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
|
|
2666
|
+
dict_config=cluster_config_overrides,
|
|
2667
|
+
cloud='kubernetes',
|
|
2668
|
+
region=context,
|
|
2669
|
+
keys=('custom_metadata',),
|
|
2670
|
+
default_value={})
|
|
2671
|
+
|
|
2672
|
+
# Merge task-level overrides with global config
|
|
2673
|
+
config_utils.merge_k8s_configs(custom_metadata, override_custom_metadata)
|
|
2674
|
+
|
|
2661
2675
|
# List of objects in the cluster YAML to be updated
|
|
2662
2676
|
combination_destinations = [
|
|
2663
2677
|
# Service accounts
|
|
@@ -2679,17 +2693,33 @@ def combine_metadata_fields(cluster_yaml_path: str,
|
|
|
2679
2693
|
common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2680
2694
|
|
|
2681
2695
|
|
|
2682
|
-
def merge_custom_metadata(
|
|
2683
|
-
|
|
2696
|
+
def merge_custom_metadata(
|
|
2697
|
+
original_metadata: Dict[str, Any],
|
|
2698
|
+
context: Optional[str] = None,
|
|
2699
|
+
cluster_config_overrides: Optional[Dict[str, Any]] = None) -> None:
|
|
2684
2700
|
"""Merges original metadata with custom_metadata from config
|
|
2685
2701
|
|
|
2686
2702
|
Merge is done in-place, so return is not required
|
|
2687
2703
|
"""
|
|
2704
|
+
# Get custom_metadata from global config
|
|
2688
2705
|
custom_metadata = skypilot_config.get_effective_region_config(
|
|
2689
2706
|
cloud='kubernetes',
|
|
2690
2707
|
region=context,
|
|
2691
2708
|
keys=('custom_metadata',),
|
|
2692
2709
|
default_value={})
|
|
2710
|
+
|
|
2711
|
+
# Get custom_metadata from task-level config overrides if available
|
|
2712
|
+
if cluster_config_overrides is not None:
|
|
2713
|
+
override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
|
|
2714
|
+
dict_config=cluster_config_overrides,
|
|
2715
|
+
cloud='kubernetes',
|
|
2716
|
+
region=context,
|
|
2717
|
+
keys=('custom_metadata',),
|
|
2718
|
+
default_value={})
|
|
2719
|
+
# Merge task-level overrides with global config
|
|
2720
|
+
config_utils.merge_k8s_configs(custom_metadata,
|
|
2721
|
+
override_custom_metadata)
|
|
2722
|
+
|
|
2693
2723
|
config_utils.merge_k8s_configs(original_metadata, custom_metadata)
|
|
2694
2724
|
|
|
2695
2725
|
|
sky/resources.py
CHANGED
|
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
|
|
|
20
20
|
from sky.provision.gcp import constants as gcp_constants
|
|
21
21
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
22
22
|
from sky.provision.nebius import constants as nebius_constants
|
|
23
|
+
from sky.skylet import autostop_lib
|
|
23
24
|
from sky.skylet import constants
|
|
24
25
|
from sky.utils import accelerator_registry
|
|
25
26
|
from sky.utils import annotations
|
|
@@ -69,14 +70,18 @@ class AutostopConfig:
|
|
|
69
70
|
# flags.
|
|
70
71
|
idle_minutes: int = 0
|
|
71
72
|
down: bool = False
|
|
73
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
72
74
|
|
|
73
75
|
def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
|
|
74
76
|
if not self.enabled:
|
|
75
77
|
return False
|
|
76
|
-
|
|
78
|
+
config: Dict[str, Any] = {
|
|
77
79
|
'idle_minutes': self.idle_minutes,
|
|
78
80
|
'down': self.down,
|
|
79
81
|
}
|
|
82
|
+
if self.wait_for is not None:
|
|
83
|
+
config['wait_for'] = self.wait_for.value
|
|
84
|
+
return config
|
|
80
85
|
|
|
81
86
|
@classmethod
|
|
82
87
|
def from_yaml_config(
|
|
@@ -104,6 +109,9 @@ class AutostopConfig:
|
|
|
104
109
|
autostop_config.idle_minutes = config['idle_minutes']
|
|
105
110
|
if 'down' in config:
|
|
106
111
|
autostop_config.down = config['down']
|
|
112
|
+
if 'wait_for' in config:
|
|
113
|
+
autostop_config.wait_for = (
|
|
114
|
+
autostop_lib.AutostopWaitFor.from_str(config['wait_for']))
|
|
107
115
|
return autostop_config
|
|
108
116
|
|
|
109
117
|
return None
|
|
@@ -958,15 +966,18 @@ class Resources:
|
|
|
958
966
|
valid_volumes.append(volume)
|
|
959
967
|
self._volumes = valid_volumes
|
|
960
968
|
|
|
961
|
-
def override_autostop_config(
|
|
962
|
-
|
|
963
|
-
|
|
969
|
+
def override_autostop_config(
|
|
970
|
+
self,
|
|
971
|
+
down: bool = False,
|
|
972
|
+
idle_minutes: Optional[int] = None,
|
|
973
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None) -> None:
|
|
964
974
|
"""Override autostop config to the resource.
|
|
965
975
|
|
|
966
976
|
Args:
|
|
967
977
|
down: If true, override the autostop config to use autodown.
|
|
968
978
|
idle_minutes: If not None, override the idle minutes to autostop or
|
|
969
979
|
autodown.
|
|
980
|
+
wait_for: If not None, override the wait mode.
|
|
970
981
|
"""
|
|
971
982
|
if not down and idle_minutes is None:
|
|
972
983
|
return
|
|
@@ -976,6 +987,8 @@ class Resources:
|
|
|
976
987
|
self._autostop_config.down = down
|
|
977
988
|
if idle_minutes is not None:
|
|
978
989
|
self._autostop_config.idle_minutes = idle_minutes
|
|
990
|
+
if wait_for is not None:
|
|
991
|
+
self._autostop_config.wait_for = wait_for
|
|
979
992
|
|
|
980
993
|
def is_launchable(self) -> bool:
|
|
981
994
|
"""Returns whether the resource is launchable."""
|
sky/server/constants.py
CHANGED
|
@@ -10,7 +10,7 @@ from sky.skylet import constants
|
|
|
10
10
|
# based on version info is needed.
|
|
11
11
|
# For more details and code guidelines, refer to:
|
|
12
12
|
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
|
13
|
-
API_VERSION =
|
|
13
|
+
API_VERSION = 13
|
|
14
14
|
|
|
15
15
|
# The minimum peer API version that the code should still work with.
|
|
16
16
|
# Notes (dev):
|
sky/server/requests/payloads.py
CHANGED
|
@@ -33,6 +33,7 @@ from sky import sky_logging
|
|
|
33
33
|
from sky import skypilot_config
|
|
34
34
|
from sky.adaptors import common as adaptors_common
|
|
35
35
|
from sky.server import common
|
|
36
|
+
from sky.skylet import autostop_lib
|
|
36
37
|
from sky.skylet import constants
|
|
37
38
|
from sky.usage import constants as usage_constants
|
|
38
39
|
from sky.usage import usage_lib
|
|
@@ -312,6 +313,7 @@ class StartBody(RequestBody):
|
|
|
312
313
|
"""The request body for the start endpoint."""
|
|
313
314
|
cluster_name: str
|
|
314
315
|
idle_minutes_to_autostop: Optional[int] = None
|
|
316
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
315
317
|
retry_until_up: bool = False
|
|
316
318
|
down: bool = False
|
|
317
319
|
force: bool = False
|
|
@@ -321,6 +323,7 @@ class AutostopBody(RequestBody):
|
|
|
321
323
|
"""The request body for the autostop endpoint."""
|
|
322
324
|
cluster_name: str
|
|
323
325
|
idle_minutes: int
|
|
326
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
324
327
|
down: bool = False
|
|
325
328
|
|
|
326
329
|
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -177,7 +177,7 @@ extras_require: Dict[str, List[str]] = {
|
|
|
177
177
|
# 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
|
|
178
178
|
],
|
|
179
179
|
'nebius': [
|
|
180
|
-
'nebius>=0.2.
|
|
180
|
+
'nebius>=0.2.47',
|
|
181
181
|
] + aws_dependencies,
|
|
182
182
|
'hyperbolic': [], # No dependencies needed for hyperbolic
|
|
183
183
|
'server': server_dependencies,
|
sky/skylet/autostop_lib.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""Autostop utilities."""
|
|
2
|
+
import enum
|
|
2
3
|
import pickle
|
|
3
4
|
import shlex
|
|
5
|
+
import subprocess
|
|
4
6
|
import time
|
|
5
7
|
import typing
|
|
6
8
|
from typing import List, Optional
|
|
@@ -10,6 +12,7 @@ from sky.adaptors import common as adaptors_common
|
|
|
10
12
|
from sky.skylet import configs
|
|
11
13
|
from sky.skylet import constants
|
|
12
14
|
from sky.utils import message_utils
|
|
15
|
+
from sky.utils import ux_utils
|
|
13
16
|
|
|
14
17
|
if typing.TYPE_CHECKING:
|
|
15
18
|
import psutil
|
|
@@ -30,6 +33,55 @@ _AUTOSTOP_LAST_ACTIVE_TIME = 'autostop_last_active_time'
|
|
|
30
33
|
_AUTOSTOP_INDICATOR = 'autostop_indicator'
|
|
31
34
|
|
|
32
35
|
|
|
36
|
+
class AutostopWaitFor(enum.Enum):
|
|
37
|
+
"""Enum for the Autostop behaviour.
|
|
38
|
+
|
|
39
|
+
JOBS: Wait for jobs to finish.
|
|
40
|
+
JOBS_AND_SSH: Wait for jobs to finish and all SSH sessions to be closed.
|
|
41
|
+
NONE: Unconditionally stop the cluster after the idle time.
|
|
42
|
+
"""
|
|
43
|
+
JOBS_AND_SSH = 'jobs_and_ssh'
|
|
44
|
+
JOBS = 'jobs'
|
|
45
|
+
NONE = 'none'
|
|
46
|
+
|
|
47
|
+
@classmethod
|
|
48
|
+
def supported_modes(cls) -> List[str]:
|
|
49
|
+
return [mode.value for mode in cls]
|
|
50
|
+
|
|
51
|
+
@classmethod
|
|
52
|
+
def cli_help_message(cls, pair: str) -> str:
|
|
53
|
+
return f"""\
|
|
54
|
+
Determines the condition for resetting the idleness timer.
|
|
55
|
+
This option works in conjunction with ``--{pair}``. Options:
|
|
56
|
+
|
|
57
|
+
\b
|
|
58
|
+
1. ``jobs_and_ssh`` (default): Wait for all jobs to complete AND all SSH
|
|
59
|
+
sessions to disconnect.
|
|
60
|
+
2. ``jobs``: Wait for all jobs to complete.
|
|
61
|
+
3. ``none``: Stop immediately after idle time expires, regardless of running
|
|
62
|
+
jobs or SSH connections."""
|
|
63
|
+
|
|
64
|
+
@classmethod
|
|
65
|
+
def from_str(cls, mode: str) -> 'AutostopWaitFor':
|
|
66
|
+
"""Returns the enum value for the given string."""
|
|
67
|
+
if mode.lower() == cls.JOBS.value:
|
|
68
|
+
return cls.JOBS
|
|
69
|
+
elif mode.lower() == cls.JOBS_AND_SSH.value:
|
|
70
|
+
return cls.JOBS_AND_SSH
|
|
71
|
+
elif mode.lower() == cls.NONE.value:
|
|
72
|
+
return cls.NONE
|
|
73
|
+
else:
|
|
74
|
+
with ux_utils.print_exception_no_traceback():
|
|
75
|
+
raise ValueError(f'Unsupported autostop wait mode: '
|
|
76
|
+
f'{mode}. The mode must be either '
|
|
77
|
+
f'\'{cls.JOBS_AND_SSH.value}\', '
|
|
78
|
+
f'\'{cls.JOBS.value}\', or '
|
|
79
|
+
f'\'{cls.NONE.value}\'. ')
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
|
|
83
|
+
|
|
84
|
+
|
|
33
85
|
class AutostopConfig:
|
|
34
86
|
"""Autostop configuration."""
|
|
35
87
|
|
|
@@ -37,12 +89,14 @@ class AutostopConfig:
|
|
|
37
89
|
autostop_idle_minutes: int,
|
|
38
90
|
boot_time: float,
|
|
39
91
|
backend: Optional[str],
|
|
92
|
+
wait_for: AutostopWaitFor,
|
|
40
93
|
down: bool = False):
|
|
41
94
|
assert autostop_idle_minutes < 0 or backend is not None, (
|
|
42
95
|
autostop_idle_minutes, backend)
|
|
43
96
|
self.autostop_idle_minutes = autostop_idle_minutes
|
|
44
97
|
self.boot_time = boot_time
|
|
45
98
|
self.backend = backend
|
|
99
|
+
self.wait_for = wait_for
|
|
46
100
|
self.down = down
|
|
47
101
|
|
|
48
102
|
def __setstate__(self, state: dict):
|
|
@@ -53,15 +107,18 @@ class AutostopConfig:
|
|
|
53
107
|
def get_autostop_config() -> AutostopConfig:
|
|
54
108
|
config_str = configs.get_config(_AUTOSTOP_CONFIG_KEY)
|
|
55
109
|
if config_str is None:
|
|
56
|
-
return AutostopConfig(-1, -1, None)
|
|
110
|
+
return AutostopConfig(-1, -1, None, DEFAULT_AUTOSTOP_WAIT_FOR)
|
|
57
111
|
return pickle.loads(config_str)
|
|
58
112
|
|
|
59
113
|
|
|
60
|
-
def set_autostop(idle_minutes: int, backend: Optional[str],
|
|
114
|
+
def set_autostop(idle_minutes: int, backend: Optional[str],
|
|
115
|
+
wait_for: AutostopWaitFor, down: bool) -> None:
|
|
61
116
|
boot_time = psutil.boot_time()
|
|
62
|
-
autostop_config = AutostopConfig(idle_minutes, boot_time, backend,
|
|
117
|
+
autostop_config = AutostopConfig(idle_minutes, boot_time, backend, wait_for,
|
|
118
|
+
down)
|
|
63
119
|
configs.set_config(_AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config))
|
|
64
|
-
logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}
|
|
120
|
+
logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}, '
|
|
121
|
+
f'wait_for {wait_for.value}.')
|
|
65
122
|
# Reset timer whenever an autostop setting is submitted, i.e. the idle
|
|
66
123
|
# time will be counted from now.
|
|
67
124
|
set_last_active_time_to_now()
|
|
@@ -107,6 +164,28 @@ def set_last_active_time_to_now() -> None:
|
|
|
107
164
|
configs.set_config(_AUTOSTOP_LAST_ACTIVE_TIME, str(time.time()))
|
|
108
165
|
|
|
109
166
|
|
|
167
|
+
def has_active_ssh_sessions() -> bool:
|
|
168
|
+
"""Returns True if there are any active SSH sessions on the node."""
|
|
169
|
+
try:
|
|
170
|
+
# /dev/pts is a virtual filesystem that contains the pseudo-terminal
|
|
171
|
+
# devices. ptmx is the pseudo-terminal multiplexer, which is the
|
|
172
|
+
# "master" device that creates new pseudo-terminal devices, so we
|
|
173
|
+
# exclude it from the count.
|
|
174
|
+
proc = subprocess.run('ls /dev/pts | grep -v ptmx | wc -l',
|
|
175
|
+
capture_output=True,
|
|
176
|
+
text=True,
|
|
177
|
+
check=False,
|
|
178
|
+
shell=True)
|
|
179
|
+
if proc.returncode != 0:
|
|
180
|
+
logger.warning(f'SSH session check command failed with return code '
|
|
181
|
+
f'{proc.returncode}.')
|
|
182
|
+
return False
|
|
183
|
+
return int(proc.stdout.strip()) > 0
|
|
184
|
+
except Exception as e: # pylint: disable=broad-except
|
|
185
|
+
logger.warning(f'Error checking active SSH sessions: {e}.')
|
|
186
|
+
return False
|
|
187
|
+
|
|
188
|
+
|
|
110
189
|
class AutostopCodeGen:
|
|
111
190
|
"""Code generator for autostop utility functions.
|
|
112
191
|
|
|
@@ -114,13 +193,22 @@ class AutostopCodeGen:
|
|
|
114
193
|
|
|
115
194
|
>> codegen = AutostopCodeGen.set_autostop(...)
|
|
116
195
|
"""
|
|
117
|
-
_PREFIX = ['from sky.skylet import autostop_lib']
|
|
196
|
+
_PREFIX = ['from sky.skylet import autostop_lib, constants']
|
|
118
197
|
|
|
119
198
|
@classmethod
|
|
120
|
-
def set_autostop(cls,
|
|
199
|
+
def set_autostop(cls,
|
|
200
|
+
idle_minutes: int,
|
|
201
|
+
backend: str,
|
|
202
|
+
wait_for: Optional[AutostopWaitFor],
|
|
203
|
+
down: bool = False) -> str:
|
|
204
|
+
if wait_for is None:
|
|
205
|
+
wait_for = DEFAULT_AUTOSTOP_WAIT_FOR
|
|
121
206
|
code = [
|
|
122
|
-
f'
|
|
123
|
-
f' {down})'
|
|
207
|
+
f'\nif getattr(constants, "SKYLET_LIB_VERSION", 1) < 4: '
|
|
208
|
+
f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, {down})'
|
|
209
|
+
f'\nelse: '
|
|
210
|
+
f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, '
|
|
211
|
+
f'autostop_lib.{wait_for}, {down})',
|
|
124
212
|
]
|
|
125
213
|
return cls._build(code)
|
|
126
214
|
|
sky/skylet/constants.py
CHANGED
|
@@ -90,11 +90,11 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
90
90
|
# cluster yaml is updated.
|
|
91
91
|
#
|
|
92
92
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
93
|
-
SKYLET_VERSION = '
|
|
93
|
+
SKYLET_VERSION = '16'
|
|
94
94
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
95
95
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
96
96
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
97
|
-
SKYLET_LIB_VERSION =
|
|
97
|
+
SKYLET_LIB_VERSION = 4
|
|
98
98
|
SKYLET_VERSION_FILE = '~/.sky/skylet_version'
|
|
99
99
|
|
|
100
100
|
# Docker default options
|
|
@@ -369,6 +369,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
369
369
|
('docker', 'run_options'),
|
|
370
370
|
('nvidia_gpus', 'disable_ecc'),
|
|
371
371
|
('ssh', 'pod_config'),
|
|
372
|
+
('kubernetes', 'custom_metadata'),
|
|
372
373
|
('kubernetes', 'pod_config'),
|
|
373
374
|
('kubernetes', 'provision_timeout'),
|
|
374
375
|
('kubernetes', 'dws'),
|
sky/skylet/events.py
CHANGED
|
@@ -137,23 +137,37 @@ class AutostopEvent(SkyletEvent):
|
|
|
137
137
|
logger.debug('autostop_config not set. Skipped.')
|
|
138
138
|
return
|
|
139
139
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
140
|
+
ignore_idle_check = (
|
|
141
|
+
autostop_config.wait_for == autostop_lib.AutostopWaitFor.NONE)
|
|
142
|
+
is_idle = True
|
|
143
|
+
if not ignore_idle_check:
|
|
144
|
+
if not job_lib.is_cluster_idle(
|
|
145
|
+
) or managed_job_state.get_num_alive_jobs() or (
|
|
146
|
+
autostop_config.wait_for
|
|
147
|
+
== autostop_lib.AutostopWaitFor.JOBS_AND_SSH and
|
|
148
|
+
autostop_lib.has_active_ssh_sessions()):
|
|
149
|
+
is_idle = False
|
|
150
|
+
|
|
151
|
+
if ignore_idle_check or is_idle:
|
|
152
|
+
minutes_since_last_active = (
|
|
153
|
+
time.time() - autostop_lib.get_last_active_time()) // 60
|
|
144
154
|
logger.debug(
|
|
145
|
-
f'
|
|
146
|
-
f'AutoStop
|
|
155
|
+
f'Minutes since last active: {minutes_since_last_active}, '
|
|
156
|
+
f'AutoStop idle minutes: '
|
|
157
|
+
f'{autostop_config.autostop_idle_minutes}, '
|
|
158
|
+
f'Wait for: {autostop_config.wait_for.value}')
|
|
147
159
|
else:
|
|
148
160
|
autostop_lib.set_last_active_time_to_now()
|
|
149
|
-
|
|
150
|
-
logger.debug(
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
161
|
+
minutes_since_last_active = -1
|
|
162
|
+
logger.debug('Not idle. Reset idle minutes. '
|
|
163
|
+
f'AutoStop idle minutes: '
|
|
164
|
+
f'{autostop_config.autostop_idle_minutes}, '
|
|
165
|
+
f'Wait for: {autostop_config.wait_for.value}')
|
|
166
|
+
if minutes_since_last_active >= autostop_config.autostop_idle_minutes:
|
|
154
167
|
logger.info(
|
|
155
|
-
f'{
|
|
156
|
-
f'{autostop_config.autostop_idle_minutes} minutes.
|
|
168
|
+
f'{minutes_since_last_active} minute(s) since last active; '
|
|
169
|
+
f'threshold: {autostop_config.autostop_idle_minutes} minutes. '
|
|
170
|
+
f'Stopping.')
|
|
157
171
|
self._stop_cluster(autostop_config)
|
|
158
172
|
|
|
159
173
|
def _stop_cluster(self, autostop_config):
|
|
@@ -12,6 +12,8 @@ service_spec:
|
|
|
12
12
|
{%- for key, value in annotations.items() %}
|
|
13
13
|
{{ key }}: {{ value|tojson }}
|
|
14
14
|
{%- endfor %}
|
|
15
|
+
{# Note: It's ok to add cloud-specific annotations here since they will be ignored by other clouds #}
|
|
16
|
+
service.beta.kubernetes.io/coreweave-load-balancer-type: public
|
|
15
17
|
spec:
|
|
16
18
|
type: LoadBalancer
|
|
17
19
|
selector:
|
sky/utils/schemas.py
CHANGED
|
@@ -6,6 +6,7 @@ https://json-schema.org/
|
|
|
6
6
|
import enum
|
|
7
7
|
from typing import Any, Dict, List, Tuple
|
|
8
8
|
|
|
9
|
+
from sky.skylet import autostop_lib
|
|
9
10
|
from sky.skylet import constants
|
|
10
11
|
from sky.utils import kubernetes_enums
|
|
11
12
|
|
|
@@ -65,6 +66,11 @@ _AUTOSTOP_SCHEMA = {
|
|
|
65
66
|
'down': {
|
|
66
67
|
'type': 'boolean',
|
|
67
68
|
},
|
|
69
|
+
'wait_for': {
|
|
70
|
+
'type': 'string',
|
|
71
|
+
'case_insensitive_enum':
|
|
72
|
+
autostop_lib.AutostopWaitFor.supported_modes(),
|
|
73
|
+
}
|
|
68
74
|
},
|
|
69
75
|
},
|
|
70
76
|
],
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: skypilot-nightly
|
|
3
|
-
Version: 1.0.0.
|
|
3
|
+
Version: 1.0.0.dev20250804
|
|
4
4
|
Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
|
|
5
5
|
Author: SkyPilot Team
|
|
6
6
|
License: Apache 2.0
|
|
@@ -121,7 +121,7 @@ Requires-Dist: vastai-sdk>=0.1.12; extra == "vast"
|
|
|
121
121
|
Provides-Extra: vsphere
|
|
122
122
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
|
|
123
123
|
Provides-Extra: nebius
|
|
124
|
-
Requires-Dist: nebius>=0.2.
|
|
124
|
+
Requires-Dist: nebius>=0.2.47; extra == "nebius"
|
|
125
125
|
Requires-Dist: awscli>=1.27.10; extra == "nebius"
|
|
126
126
|
Requires-Dist: botocore>=1.29.10; extra == "nebius"
|
|
127
127
|
Requires-Dist: boto3>=1.26.1; extra == "nebius"
|
|
@@ -177,7 +177,7 @@ Requires-Dist: azure-core>=1.24.0; extra == "all"
|
|
|
177
177
|
Requires-Dist: azure-common; extra == "all"
|
|
178
178
|
Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
|
|
179
179
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
|
180
|
-
Requires-Dist: nebius>=0.2.
|
|
180
|
+
Requires-Dist: nebius>=0.2.47; extra == "all"
|
|
181
181
|
Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
182
182
|
Requires-Dist: botocore>=1.29.10; extra == "all"
|
|
183
183
|
Requires-Dist: boto3>=1.26.1; extra == "all"
|
|
@@ -235,6 +235,7 @@ Dynamic: summary
|
|
|
235
235
|
----
|
|
236
236
|
|
|
237
237
|
:fire: *News* :fire:
|
|
238
|
+
- [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
|
|
238
239
|
- [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
|
|
239
240
|
- [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
|
|
240
241
|
- [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
|