skypilot-nightly 1.0.0.dev20250801__py3-none-any.whl → 1.0.0.dev20250804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. (The source page links to more details here; the link target is not preserved in this text.)

Files changed (51)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +10 -2
  3. sky/backends/cloud_vm_ray_backend.py +2 -1
  4. sky/catalog/data_fetchers/fetch_nebius.py +31 -7
  5. sky/client/cli/command.py +42 -20
  6. sky/client/cli/flags.py +15 -0
  7. sky/client/sdk.py +80 -10
  8. sky/client/sdk.pyi +4 -0
  9. sky/core.py +10 -2
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{f2fEsZwJxryJVOYRNtNKE → KiGGm4fK0CpmN6BT17jkh}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +11 -0
  15. sky/dashboard/out/_next/static/chunks/{webpack-42cd1b19a6b01078.js → webpack-13145516b19858fb.js} +1 -1
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/volumes.html +1 -1
  27. sky/dashboard/out/workspace/new.html +1 -1
  28. sky/dashboard/out/workspaces/[name].html +1 -1
  29. sky/dashboard/out/workspaces.html +1 -1
  30. sky/execution.py +5 -3
  31. sky/jobs/client/sdk.py +5 -1
  32. sky/provision/kubernetes/utils.py +32 -2
  33. sky/resources.py +17 -4
  34. sky/server/constants.py +1 -1
  35. sky/server/requests/payloads.py +3 -0
  36. sky/setup_files/dependencies.py +1 -1
  37. sky/skylet/autostop_lib.py +96 -8
  38. sky/skylet/constants.py +3 -2
  39. sky/skylet/events.py +27 -13
  40. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  41. sky/utils/schemas.py +6 -0
  42. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/METADATA +4 -3
  43. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/RECORD +48 -48
  44. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +0 -6
  45. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +0 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +0 -11
  47. sky/dashboard/out/_next/static/{f2fEsZwJxryJVOYRNtNKE → KiGGm4fK0CpmN6BT17jkh}/_ssgManifest.js +0 -0
  48. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/WHEEL +0 -0
  49. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/entry_points.txt +0 -0
  50. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/licenses/LICENSE +0 -0
  51. {skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/top_level.txt +0 -0
sky/execution.py CHANGED
@@ -15,6 +15,7 @@ from sky import global_user_state
15
15
  from sky import optimizer
16
16
  from sky import sky_logging
17
17
  from sky.backends import backend_utils
18
+ from sky.skylet import autostop_lib
18
19
  from sky.usage import usage_lib
19
20
  from sky.utils import admin_policy_utils
20
21
  from sky.utils import common
@@ -309,11 +310,13 @@ def _execute_dag(
309
310
 
310
311
  idle_minutes_to_autostop: Optional[int] = None
311
312
  down = False
313
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
312
314
  if resource_autostop_config is not None:
313
315
  if resource_autostop_config.enabled:
314
316
  idle_minutes_to_autostop = (
315
317
  resource_autostop_config.idle_minutes)
316
318
  down = resource_autostop_config.down
319
+ wait_for = resource_autostop_config.wait_for
317
320
  else:
318
321
  # Autostop is explicitly disabled, so cancel it if it's
319
322
  # already set.
@@ -450,9 +453,8 @@ def _execute_dag(
450
453
  if idle_minutes_to_autostop is not None:
451
454
  assert isinstance(backend, backends.CloudVmRayBackend)
452
455
  assert isinstance(handle, backends.CloudVmRayResourceHandle)
453
- backend.set_autostop(handle,
454
- idle_minutes_to_autostop,
455
- down=down)
456
+ backend.set_autostop(handle, idle_minutes_to_autostop, wait_for,
457
+ down)
456
458
 
457
459
  if Stage.EXEC in stages:
458
460
  try:
sky/jobs/client/sdk.py CHANGED
@@ -2,11 +2,11 @@
2
2
  import json
3
3
  import typing
4
4
  from typing import Dict, List, Optional, Union
5
- import webbrowser
6
5
 
7
6
  import click
8
7
 
9
8
  from sky import sky_logging
9
+ from sky.adaptors import common as adaptors_common
10
10
  from sky.client import common as client_common
11
11
  from sky.client import sdk
12
12
  from sky.serve.client import impl
@@ -23,9 +23,13 @@ from sky.utils import dag_utils
23
23
 
24
24
  if typing.TYPE_CHECKING:
25
25
  import io
26
+ import webbrowser
26
27
 
27
28
  import sky
28
29
  from sky.serve import serve_utils
30
+ else:
31
+ # only used in dashboard()
32
+ webbrowser = adaptors_common.LazyImport('webbrowser')
29
33
 
30
34
  logger = sky_logging.init_logger(__name__)
31
35
 
sky/provision/kubernetes/utils.py CHANGED
@@ -2642,6 +2642,7 @@ def combine_pod_config_fields(
2642
2642
 
2643
2643
 
2644
2644
  def combine_metadata_fields(cluster_yaml_path: str,
2645
+ cluster_config_overrides: Dict[str, Any],
2645
2646
  context: Optional[str] = None) -> None:
2646
2647
  """Updates the metadata for all Kubernetes objects created by SkyPilot with
2647
2648
  fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
@@ -2652,12 +2653,25 @@ def combine_metadata_fields(cluster_yaml_path: str,
2652
2653
  with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
2653
2654
  yaml_content = f.read()
2654
2655
  yaml_obj = yaml.safe_load(yaml_content)
2656
+
2657
+ # Get custom_metadata from global config
2655
2658
  custom_metadata = skypilot_config.get_effective_region_config(
2656
2659
  cloud='kubernetes',
2657
2660
  region=context,
2658
2661
  keys=('custom_metadata',),
2659
2662
  default_value={})
2660
2663
 
2664
+ # Get custom_metadata from task-level config overrides
2665
+ override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
2666
+ dict_config=cluster_config_overrides,
2667
+ cloud='kubernetes',
2668
+ region=context,
2669
+ keys=('custom_metadata',),
2670
+ default_value={})
2671
+
2672
+ # Merge task-level overrides with global config
2673
+ config_utils.merge_k8s_configs(custom_metadata, override_custom_metadata)
2674
+
2661
2675
  # List of objects in the cluster YAML to be updated
2662
2676
  combination_destinations = [
2663
2677
  # Service accounts
@@ -2679,17 +2693,33 @@ def combine_metadata_fields(cluster_yaml_path: str,
2679
2693
  common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
2680
2694
 
2681
2695
 
2682
- def merge_custom_metadata(original_metadata: Dict[str, Any],
2683
- context: Optional[str] = None) -> None:
2696
+ def merge_custom_metadata(
2697
+ original_metadata: Dict[str, Any],
2698
+ context: Optional[str] = None,
2699
+ cluster_config_overrides: Optional[Dict[str, Any]] = None) -> None:
2684
2700
  """Merges original metadata with custom_metadata from config
2685
2701
 
2686
2702
  Merge is done in-place, so return is not required
2687
2703
  """
2704
+ # Get custom_metadata from global config
2688
2705
  custom_metadata = skypilot_config.get_effective_region_config(
2689
2706
  cloud='kubernetes',
2690
2707
  region=context,
2691
2708
  keys=('custom_metadata',),
2692
2709
  default_value={})
2710
+
2711
+ # Get custom_metadata from task-level config overrides if available
2712
+ if cluster_config_overrides is not None:
2713
+ override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
2714
+ dict_config=cluster_config_overrides,
2715
+ cloud='kubernetes',
2716
+ region=context,
2717
+ keys=('custom_metadata',),
2718
+ default_value={})
2719
+ # Merge task-level overrides with global config
2720
+ config_utils.merge_k8s_configs(custom_metadata,
2721
+ override_custom_metadata)
2722
+
2693
2723
  config_utils.merge_k8s_configs(original_metadata, custom_metadata)
2694
2724
 
2695
2725
 
sky/resources.py CHANGED
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
20
20
  from sky.provision.gcp import constants as gcp_constants
21
21
  from sky.provision.kubernetes import utils as kubernetes_utils
22
22
  from sky.provision.nebius import constants as nebius_constants
23
+ from sky.skylet import autostop_lib
23
24
  from sky.skylet import constants
24
25
  from sky.utils import accelerator_registry
25
26
  from sky.utils import annotations
@@ -69,14 +70,18 @@ class AutostopConfig:
69
70
  # flags.
70
71
  idle_minutes: int = 0
71
72
  down: bool = False
73
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
72
74
 
73
75
  def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
74
76
  if not self.enabled:
75
77
  return False
76
- return {
78
+ config: Dict[str, Any] = {
77
79
  'idle_minutes': self.idle_minutes,
78
80
  'down': self.down,
79
81
  }
82
+ if self.wait_for is not None:
83
+ config['wait_for'] = self.wait_for.value
84
+ return config
80
85
 
81
86
  @classmethod
82
87
  def from_yaml_config(
@@ -104,6 +109,9 @@ class AutostopConfig:
104
109
  autostop_config.idle_minutes = config['idle_minutes']
105
110
  if 'down' in config:
106
111
  autostop_config.down = config['down']
112
+ if 'wait_for' in config:
113
+ autostop_config.wait_for = (
114
+ autostop_lib.AutostopWaitFor.from_str(config['wait_for']))
107
115
  return autostop_config
108
116
 
109
117
  return None
@@ -958,15 +966,18 @@ class Resources:
958
966
  valid_volumes.append(volume)
959
967
  self._volumes = valid_volumes
960
968
 
961
- def override_autostop_config(self,
962
- down: bool = False,
963
- idle_minutes: Optional[int] = None) -> None:
969
+ def override_autostop_config(
970
+ self,
971
+ down: bool = False,
972
+ idle_minutes: Optional[int] = None,
973
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None) -> None:
964
974
  """Override autostop config to the resource.
965
975
 
966
976
  Args:
967
977
  down: If true, override the autostop config to use autodown.
968
978
  idle_minutes: If not None, override the idle minutes to autostop or
969
979
  autodown.
980
+ wait_for: If not None, override the wait mode.
970
981
  """
971
982
  if not down and idle_minutes is None:
972
983
  return
@@ -976,6 +987,8 @@ class Resources:
976
987
  self._autostop_config.down = down
977
988
  if idle_minutes is not None:
978
989
  self._autostop_config.idle_minutes = idle_minutes
990
+ if wait_for is not None:
991
+ self._autostop_config.wait_for = wait_for
979
992
 
980
993
  def is_launchable(self) -> bool:
981
994
  """Returns whether the resource is launchable."""
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
10
10
  # based on version info is needed.
11
11
  # For more details and code guidelines, refer to:
12
12
  # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
- API_VERSION = 12
13
+ API_VERSION = 13
14
14
 
15
15
  # The minimum peer API version that the code should still work with.
16
16
  # Notes (dev):
sky/server/requests/payloads.py CHANGED
@@ -33,6 +33,7 @@ from sky import sky_logging
33
33
  from sky import skypilot_config
34
34
  from sky.adaptors import common as adaptors_common
35
35
  from sky.server import common
36
+ from sky.skylet import autostop_lib
36
37
  from sky.skylet import constants
37
38
  from sky.usage import constants as usage_constants
38
39
  from sky.usage import usage_lib
@@ -312,6 +313,7 @@ class StartBody(RequestBody):
312
313
  """The request body for the start endpoint."""
313
314
  cluster_name: str
314
315
  idle_minutes_to_autostop: Optional[int] = None
316
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
315
317
  retry_until_up: bool = False
316
318
  down: bool = False
317
319
  force: bool = False
@@ -321,6 +323,7 @@ class AutostopBody(RequestBody):
321
323
  """The request body for the autostop endpoint."""
322
324
  cluster_name: str
323
325
  idle_minutes: int
326
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
324
327
  down: bool = False
325
328
 
326
329
 
sky/setup_files/dependencies.py CHANGED
@@ -177,7 +177,7 @@ extras_require: Dict[str, List[str]] = {
177
177
  # 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
178
178
  ],
179
179
  'nebius': [
180
- 'nebius>=0.2.37',
180
+ 'nebius>=0.2.47',
181
181
  ] + aws_dependencies,
182
182
  'hyperbolic': [], # No dependencies needed for hyperbolic
183
183
  'server': server_dependencies,
sky/skylet/autostop_lib.py CHANGED
@@ -1,6 +1,8 @@
1
1
  """Autostop utilities."""
2
+ import enum
2
3
  import pickle
3
4
  import shlex
5
+ import subprocess
4
6
  import time
5
7
  import typing
6
8
  from typing import List, Optional
@@ -10,6 +12,7 @@ from sky.adaptors import common as adaptors_common
10
12
  from sky.skylet import configs
11
13
  from sky.skylet import constants
12
14
  from sky.utils import message_utils
15
+ from sky.utils import ux_utils
13
16
 
14
17
  if typing.TYPE_CHECKING:
15
18
  import psutil
@@ -30,6 +33,55 @@ _AUTOSTOP_LAST_ACTIVE_TIME = 'autostop_last_active_time'
30
33
  _AUTOSTOP_INDICATOR = 'autostop_indicator'
31
34
 
32
35
 
36
+ class AutostopWaitFor(enum.Enum):
37
+ """Enum for the Autostop behaviour.
38
+
39
+ JOBS: Wait for jobs to finish.
40
+ JOBS_AND_SSH: Wait for jobs to finish and all SSH sessions to be closed.
41
+ NONE: Unconditionally stop the cluster after the idle time.
42
+ """
43
+ JOBS_AND_SSH = 'jobs_and_ssh'
44
+ JOBS = 'jobs'
45
+ NONE = 'none'
46
+
47
+ @classmethod
48
+ def supported_modes(cls) -> List[str]:
49
+ return [mode.value for mode in cls]
50
+
51
+ @classmethod
52
+ def cli_help_message(cls, pair: str) -> str:
53
+ return f"""\
54
+ Determines the condition for resetting the idleness timer.
55
+ This option works in conjunction with ``--{pair}``. Options:
56
+
57
+ \b
58
+ 1. ``jobs_and_ssh`` (default): Wait for all jobs to complete AND all SSH
59
+ sessions to disconnect.
60
+ 2. ``jobs``: Wait for all jobs to complete.
61
+ 3. ``none``: Stop immediately after idle time expires, regardless of running
62
+ jobs or SSH connections."""
63
+
64
+ @classmethod
65
+ def from_str(cls, mode: str) -> 'AutostopWaitFor':
66
+ """Returns the enum value for the given string."""
67
+ if mode.lower() == cls.JOBS.value:
68
+ return cls.JOBS
69
+ elif mode.lower() == cls.JOBS_AND_SSH.value:
70
+ return cls.JOBS_AND_SSH
71
+ elif mode.lower() == cls.NONE.value:
72
+ return cls.NONE
73
+ else:
74
+ with ux_utils.print_exception_no_traceback():
75
+ raise ValueError(f'Unsupported autostop wait mode: '
76
+ f'{mode}. The mode must be either '
77
+ f'\'{cls.JOBS_AND_SSH.value}\', '
78
+ f'\'{cls.JOBS.value}\', or '
79
+ f'\'{cls.NONE.value}\'. ')
80
+
81
+
82
+ DEFAULT_AUTOSTOP_WAIT_FOR: AutostopWaitFor = AutostopWaitFor.JOBS_AND_SSH
83
+
84
+
33
85
  class AutostopConfig:
34
86
  """Autostop configuration."""
35
87
 
@@ -37,12 +89,14 @@ class AutostopConfig:
37
89
  autostop_idle_minutes: int,
38
90
  boot_time: float,
39
91
  backend: Optional[str],
92
+ wait_for: AutostopWaitFor,
40
93
  down: bool = False):
41
94
  assert autostop_idle_minutes < 0 or backend is not None, (
42
95
  autostop_idle_minutes, backend)
43
96
  self.autostop_idle_minutes = autostop_idle_minutes
44
97
  self.boot_time = boot_time
45
98
  self.backend = backend
99
+ self.wait_for = wait_for
46
100
  self.down = down
47
101
 
48
102
  def __setstate__(self, state: dict):
@@ -53,15 +107,18 @@ class AutostopConfig:
53
107
  def get_autostop_config() -> AutostopConfig:
54
108
  config_str = configs.get_config(_AUTOSTOP_CONFIG_KEY)
55
109
  if config_str is None:
56
- return AutostopConfig(-1, -1, None)
110
+ return AutostopConfig(-1, -1, None, DEFAULT_AUTOSTOP_WAIT_FOR)
57
111
  return pickle.loads(config_str)
58
112
 
59
113
 
60
- def set_autostop(idle_minutes: int, backend: Optional[str], down: bool) -> None:
114
+ def set_autostop(idle_minutes: int, backend: Optional[str],
115
+ wait_for: AutostopWaitFor, down: bool) -> None:
61
116
  boot_time = psutil.boot_time()
62
- autostop_config = AutostopConfig(idle_minutes, boot_time, backend, down)
117
+ autostop_config = AutostopConfig(idle_minutes, boot_time, backend, wait_for,
118
+ down)
63
119
  configs.set_config(_AUTOSTOP_CONFIG_KEY, pickle.dumps(autostop_config))
64
- logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}.')
120
+ logger.debug(f'set_autostop(): idle_minutes {idle_minutes}, down {down}, '
121
+ f'wait_for {wait_for.value}.')
65
122
  # Reset timer whenever an autostop setting is submitted, i.e. the idle
66
123
  # time will be counted from now.
67
124
  set_last_active_time_to_now()
@@ -107,6 +164,28 @@ def set_last_active_time_to_now() -> None:
107
164
  configs.set_config(_AUTOSTOP_LAST_ACTIVE_TIME, str(time.time()))
108
165
 
109
166
 
167
+ def has_active_ssh_sessions() -> bool:
168
+ """Returns True if there are any active SSH sessions on the node."""
169
+ try:
170
+ # /dev/pts is a virtual filesystem that contains the pseudo-terminal
171
+ # devices. ptmx is the pseudo-terminal multiplexer, which is the
172
+ # "master" device that creates new pseudo-terminal devices, so we
173
+ # exclude it from the count.
174
+ proc = subprocess.run('ls /dev/pts | grep -v ptmx | wc -l',
175
+ capture_output=True,
176
+ text=True,
177
+ check=False,
178
+ shell=True)
179
+ if proc.returncode != 0:
180
+ logger.warning(f'SSH session check command failed with return code '
181
+ f'{proc.returncode}.')
182
+ return False
183
+ return int(proc.stdout.strip()) > 0
184
+ except Exception as e: # pylint: disable=broad-except
185
+ logger.warning(f'Error checking active SSH sessions: {e}.')
186
+ return False
187
+
188
+
110
189
  class AutostopCodeGen:
111
190
  """Code generator for autostop utility functions.
112
191
 
@@ -114,13 +193,22 @@ class AutostopCodeGen:
114
193
 
115
194
  >> codegen = AutostopCodeGen.set_autostop(...)
116
195
  """
117
- _PREFIX = ['from sky.skylet import autostop_lib']
196
+ _PREFIX = ['from sky.skylet import autostop_lib, constants']
118
197
 
119
198
  @classmethod
120
- def set_autostop(cls, idle_minutes: int, backend: str, down: bool) -> str:
199
+ def set_autostop(cls,
200
+ idle_minutes: int,
201
+ backend: str,
202
+ wait_for: Optional[AutostopWaitFor],
203
+ down: bool = False) -> str:
204
+ if wait_for is None:
205
+ wait_for = DEFAULT_AUTOSTOP_WAIT_FOR
121
206
  code = [
122
- f'autostop_lib.set_autostop({idle_minutes}, {backend!r},'
123
- f' {down})',
207
+ f'\nif getattr(constants, "SKYLET_LIB_VERSION", 1) < 4: '
208
+ f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, {down})'
209
+ f'\nelse: '
210
+ f'\n autostop_lib.set_autostop({idle_minutes}, {backend!r}, '
211
+ f'autostop_lib.{wait_for}, {down})',
124
212
  ]
125
213
  return cls._build(code)
126
214
 
sky/skylet/constants.py CHANGED
@@ -90,11 +90,11 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
90
90
  # cluster yaml is updated.
91
91
  #
92
92
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
93
- SKYLET_VERSION = '15'
93
+ SKYLET_VERSION = '16'
94
94
  # The version of the lib files that skylet/jobs use. Whenever there is an API
95
95
  # change for the job_lib or log_lib, we need to bump this version, so that the
96
96
  # user can be notified to update their SkyPilot version on the remote cluster.
97
- SKYLET_LIB_VERSION = 3
97
+ SKYLET_LIB_VERSION = 4
98
98
  SKYLET_VERSION_FILE = '~/.sky/skylet_version'
99
99
 
100
100
  # Docker default options
@@ -369,6 +369,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
369
369
  ('docker', 'run_options'),
370
370
  ('nvidia_gpus', 'disable_ecc'),
371
371
  ('ssh', 'pod_config'),
372
+ ('kubernetes', 'custom_metadata'),
372
373
  ('kubernetes', 'pod_config'),
373
374
  ('kubernetes', 'provision_timeout'),
374
375
  ('kubernetes', 'dws'),
sky/skylet/events.py CHANGED
@@ -137,23 +137,37 @@ class AutostopEvent(SkyletEvent):
137
137
  logger.debug('autostop_config not set. Skipped.')
138
138
  return
139
139
 
140
- if (job_lib.is_cluster_idle() and
141
- not managed_job_state.get_num_alive_jobs()):
142
- idle_minutes = (time.time() -
143
- autostop_lib.get_last_active_time()) // 60
140
+ ignore_idle_check = (
141
+ autostop_config.wait_for == autostop_lib.AutostopWaitFor.NONE)
142
+ is_idle = True
143
+ if not ignore_idle_check:
144
+ if not job_lib.is_cluster_idle(
145
+ ) or managed_job_state.get_num_alive_jobs() or (
146
+ autostop_config.wait_for
147
+ == autostop_lib.AutostopWaitFor.JOBS_AND_SSH and
148
+ autostop_lib.has_active_ssh_sessions()):
149
+ is_idle = False
150
+
151
+ if ignore_idle_check or is_idle:
152
+ minutes_since_last_active = (
153
+ time.time() - autostop_lib.get_last_active_time()) // 60
144
154
  logger.debug(
145
- f'Idle minutes: {idle_minutes}, '
146
- f'AutoStop config: {autostop_config.autostop_idle_minutes}')
155
+ f'Minutes since last active: {minutes_since_last_active}, '
156
+ f'AutoStop idle minutes: '
157
+ f'{autostop_config.autostop_idle_minutes}, '
158
+ f'Wait for: {autostop_config.wait_for.value}')
147
159
  else:
148
160
  autostop_lib.set_last_active_time_to_now()
149
- idle_minutes = -1
150
- logger.debug(
151
- 'Not idle. Reset idle minutes.'
152
- f'AutoStop config: {autostop_config.autostop_idle_minutes}')
153
- if idle_minutes >= autostop_config.autostop_idle_minutes:
161
+ minutes_since_last_active = -1
162
+ logger.debug('Not idle. Reset idle minutes. '
163
+ f'AutoStop idle minutes: '
164
+ f'{autostop_config.autostop_idle_minutes}, '
165
+ f'Wait for: {autostop_config.wait_for.value}')
166
+ if minutes_since_last_active >= autostop_config.autostop_idle_minutes:
154
167
  logger.info(
155
- f'{idle_minutes} idle minutes reached; threshold: '
156
- f'{autostop_config.autostop_idle_minutes} minutes. Stopping.')
168
+ f'{minutes_since_last_active} minute(s) since last active; '
169
+ f'threshold: {autostop_config.autostop_idle_minutes} minutes. '
170
+ f'Stopping.')
157
171
  self._stop_cluster(autostop_config)
158
172
 
159
173
  def _stop_cluster(self, autostop_config):
sky/templates/kubernetes-loadbalancer.yml.j2 CHANGED
@@ -12,6 +12,8 @@ service_spec:
12
12
  {%- for key, value in annotations.items() %}
13
13
  {{ key }}: {{ value|tojson }}
14
14
  {%- endfor %}
15
+ {# Note: It's ok to add cloud-specific annotations here since they will be ignored by other clouds #}
16
+ service.beta.kubernetes.io/coreweave-load-balancer-type: public
15
17
  spec:
16
18
  type: LoadBalancer
17
19
  selector:
sky/utils/schemas.py CHANGED
@@ -6,6 +6,7 @@ https://json-schema.org/
6
6
  import enum
7
7
  from typing import Any, Dict, List, Tuple
8
8
 
9
+ from sky.skylet import autostop_lib
9
10
  from sky.skylet import constants
10
11
  from sky.utils import kubernetes_enums
11
12
 
@@ -65,6 +66,11 @@ _AUTOSTOP_SCHEMA = {
65
66
  'down': {
66
67
  'type': 'boolean',
67
68
  },
69
+ 'wait_for': {
70
+ 'type': 'string',
71
+ 'case_insensitive_enum':
72
+ autostop_lib.AutostopWaitFor.supported_modes(),
73
+ }
68
74
  },
69
75
  },
70
76
  ],
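{skypilot_nightly-1.0.0.dev20250801.dist-info → skypilot_nightly-1.0.0.dev20250804.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@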
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250801
3
+ Version: 1.0.0.dev20250804
4
4
  Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -121,7 +121,7 @@ Requires-Dist: vastai-sdk>=0.1.12; extra == "vast"
121
121
  Provides-Extra: vsphere
122
122
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
123
123
  Provides-Extra: nebius
124
- Requires-Dist: nebius>=0.2.37; extra == "nebius"
124
+ Requires-Dist: nebius>=0.2.47; extra == "nebius"
125
125
  Requires-Dist: awscli>=1.27.10; extra == "nebius"
126
126
  Requires-Dist: botocore>=1.29.10; extra == "nebius"
127
127
  Requires-Dist: boto3>=1.26.1; extra == "nebius"
@@ -177,7 +177,7 @@ Requires-Dist: azure-core>=1.24.0; extra == "all"
177
177
  Requires-Dist: azure-common; extra == "all"
178
178
  Requires-Dist: vastai-sdk>=0.1.12; extra == "all"
179
179
  Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
180
- Requires-Dist: nebius>=0.2.37; extra == "all"
180
+ Requires-Dist: nebius>=0.2.47; extra == "all"
181
181
  Requires-Dist: awscli>=1.27.10; extra == "all"
182
182
  Requires-Dist: botocore>=1.29.10; extra == "all"
183
183
  Requires-Dist: boto3>=1.26.1; extra == "all"
@@ -235,6 +235,7 @@ Dynamic: summary
235
235
  ----
236
236
 
237
237
  :fire: *News* :fire:
238
+ - [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
238
239
  - [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
239
240
  - [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
240
241
  - [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)