skypilot-nightly 1.0.0.dev20250731__py3-none-any.whl → 1.0.0.dev20250802__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +6 -1
- sky/backends/cloud_vm_ray_backend.py +2 -1
- sky/catalog/data_fetchers/fetch_nebius.py +31 -7
- sky/client/cli/command.py +40 -14
- sky/client/cli/flags.py +15 -0
- sky/client/sdk.py +80 -10
- sky/client/sdk.pyi +4 -0
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{oKqDxFQ88cquF4nQGE_0w → 2JNCZ4daQBotwWRNGi6aE}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +11 -0
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-13145516b19858fb.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/execution.py +5 -3
- sky/jobs/client/sdk.py +5 -1
- sky/provision/runpod/utils.py +27 -12
- sky/resources.py +17 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +2 -124
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/setup_files/dependencies.py +1 -1
- sky/sky_logging.py +30 -0
- sky/skylet/autostop_lib.py +96 -8
- sky/skylet/constants.py +4 -3
- sky/skylet/events.py +27 -13
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/utils/schemas.py +29 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/METADATA +4 -3
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/RECORD +55 -54
- sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +0 -11
- /sky/dashboard/out/_next/static/{oKqDxFQ88cquF4nQGE_0w → 2JNCZ4daQBotwWRNGi6aE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250731.dist-info → skypilot_nightly-1.0.0.dev20250802.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py
CHANGED
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
import json
|
|
3
3
|
import typing
|
|
4
4
|
from typing import Dict, List, Optional, Union
|
|
5
|
-
import webbrowser
|
|
6
5
|
|
|
7
6
|
import click
|
|
8
7
|
|
|
9
8
|
from sky import sky_logging
|
|
9
|
+
from sky.adaptors import common as adaptors_common
|
|
10
10
|
from sky.client import common as client_common
|
|
11
11
|
from sky.client import sdk
|
|
12
12
|
from sky.serve.client import impl
|
|
@@ -23,9 +23,13 @@ from sky.utils import dag_utils
|
|
|
23
23
|
|
|
24
24
|
if typing.TYPE_CHECKING:
|
|
25
25
|
import io
|
|
26
|
+
import webbrowser
|
|
26
27
|
|
|
27
28
|
import sky
|
|
28
29
|
from sky.serve import serve_utils
|
|
30
|
+
else:
|
|
31
|
+
# only used in dashboard()
|
|
32
|
+
webbrowser = adaptors_common.LazyImport('webbrowser')
|
|
29
33
|
|
|
30
34
|
logger = sky_logging.init_logger(__name__)
|
|
31
35
|
|
sky/provision/runpod/utils.py
CHANGED
|
@@ -270,18 +270,17 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
270
270
|
docker_login_config: Optional[Dict[str, str]]) -> str:
|
|
271
271
|
"""Launches an instance with the given parameters.
|
|
272
272
|
|
|
273
|
-
|
|
274
|
-
|
|
273
|
+
For CPU instances, we directly use the instance_type for launching the
|
|
274
|
+
instance.
|
|
275
|
+
|
|
276
|
+
For GPU instances, we convert the instance_type to the RunPod GPU name,
|
|
277
|
+
and finds the specs for the GPU, before launching the instance.
|
|
275
278
|
|
|
276
279
|
Returns:
|
|
277
280
|
instance_id: The instance ID.
|
|
278
281
|
"""
|
|
279
282
|
name = f'{cluster_name}-{node_type}'
|
|
280
|
-
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
|
281
|
-
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
|
282
|
-
cloud_type = instance_type.split('_')[2]
|
|
283
283
|
|
|
284
|
-
gpu_specs = runpod.runpod.get_gpu(gpu_type)
|
|
285
284
|
# TODO(zhwu): keep this align with setups in
|
|
286
285
|
# `provision.kuberunetes.instance.py`
|
|
287
286
|
setup_cmd = (
|
|
@@ -329,12 +328,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
329
328
|
params = {
|
|
330
329
|
'name': name,
|
|
331
330
|
'image_name': image_name_formatted,
|
|
332
|
-
'gpu_type_id': gpu_type,
|
|
333
|
-
'cloud_type': cloud_type,
|
|
334
331
|
'container_disk_in_gb': disk_size,
|
|
335
|
-
'min_vcpu_count': 4 * gpu_quantity,
|
|
336
|
-
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
|
337
|
-
'gpu_count': gpu_quantity,
|
|
338
332
|
'country_code': region,
|
|
339
333
|
'data_center_id': zone,
|
|
340
334
|
'ports': ports_str,
|
|
@@ -343,12 +337,33 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
|
|
|
343
337
|
'template_id': template_id,
|
|
344
338
|
}
|
|
345
339
|
|
|
340
|
+
# GPU instance types start with f'{gpu_count}x',
|
|
341
|
+
# CPU instance types start with 'cpu'.
|
|
342
|
+
is_cpu_instance = instance_type.startswith('cpu')
|
|
343
|
+
if is_cpu_instance:
|
|
344
|
+
# RunPod CPU instances can be uniquely identified by the instance_id.
|
|
345
|
+
params.update({
|
|
346
|
+
'instance_id': instance_type,
|
|
347
|
+
})
|
|
348
|
+
else:
|
|
349
|
+
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
|
|
350
|
+
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
|
|
351
|
+
cloud_type = instance_type.split('_')[2]
|
|
352
|
+
gpu_specs = runpod.runpod.get_gpu(gpu_type)
|
|
353
|
+
params.update({
|
|
354
|
+
'gpu_type_id': gpu_type,
|
|
355
|
+
'cloud_type': cloud_type,
|
|
356
|
+
'min_vcpu_count': 4 * gpu_quantity,
|
|
357
|
+
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
|
|
358
|
+
'gpu_count': gpu_quantity,
|
|
359
|
+
})
|
|
360
|
+
|
|
346
361
|
if preemptible is None or not preemptible:
|
|
347
362
|
new_instance = runpod.runpod.create_pod(**params)
|
|
348
363
|
else:
|
|
349
364
|
new_instance = runpod_commands.create_spot_pod(
|
|
350
365
|
bid_per_gpu=bid_per_gpu,
|
|
351
|
-
**params,
|
|
366
|
+
**params, # type: ignore[arg-type]
|
|
352
367
|
)
|
|
353
368
|
|
|
354
369
|
return new_instance['id']
|
sky/resources.py
CHANGED
|
@@ -20,6 +20,7 @@ from sky.provision import docker_utils
|
|
|
20
20
|
from sky.provision.gcp import constants as gcp_constants
|
|
21
21
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
22
22
|
from sky.provision.nebius import constants as nebius_constants
|
|
23
|
+
from sky.skylet import autostop_lib
|
|
23
24
|
from sky.skylet import constants
|
|
24
25
|
from sky.utils import accelerator_registry
|
|
25
26
|
from sky.utils import annotations
|
|
@@ -69,14 +70,18 @@ class AutostopConfig:
|
|
|
69
70
|
# flags.
|
|
70
71
|
idle_minutes: int = 0
|
|
71
72
|
down: bool = False
|
|
73
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
72
74
|
|
|
73
75
|
def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
|
|
74
76
|
if not self.enabled:
|
|
75
77
|
return False
|
|
76
|
-
|
|
78
|
+
config: Dict[str, Any] = {
|
|
77
79
|
'idle_minutes': self.idle_minutes,
|
|
78
80
|
'down': self.down,
|
|
79
81
|
}
|
|
82
|
+
if self.wait_for is not None:
|
|
83
|
+
config['wait_for'] = self.wait_for.value
|
|
84
|
+
return config
|
|
80
85
|
|
|
81
86
|
@classmethod
|
|
82
87
|
def from_yaml_config(
|
|
@@ -104,6 +109,9 @@ class AutostopConfig:
|
|
|
104
109
|
autostop_config.idle_minutes = config['idle_minutes']
|
|
105
110
|
if 'down' in config:
|
|
106
111
|
autostop_config.down = config['down']
|
|
112
|
+
if 'wait_for' in config:
|
|
113
|
+
autostop_config.wait_for = (
|
|
114
|
+
autostop_lib.AutostopWaitFor.from_str(config['wait_for']))
|
|
107
115
|
return autostop_config
|
|
108
116
|
|
|
109
117
|
return None
|
|
@@ -958,15 +966,18 @@ class Resources:
|
|
|
958
966
|
valid_volumes.append(volume)
|
|
959
967
|
self._volumes = valid_volumes
|
|
960
968
|
|
|
961
|
-
def override_autostop_config(
|
|
962
|
-
|
|
963
|
-
|
|
969
|
+
def override_autostop_config(
|
|
970
|
+
self,
|
|
971
|
+
down: bool = False,
|
|
972
|
+
idle_minutes: Optional[int] = None,
|
|
973
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None) -> None:
|
|
964
974
|
"""Override autostop config to the resource.
|
|
965
975
|
|
|
966
976
|
Args:
|
|
967
977
|
down: If true, override the autostop config to use autodown.
|
|
968
978
|
idle_minutes: If not None, override the idle minutes to autostop or
|
|
969
979
|
autodown.
|
|
980
|
+
wait_for: If not None, override the wait mode.
|
|
970
981
|
"""
|
|
971
982
|
if not down and idle_minutes is None:
|
|
972
983
|
return
|
|
@@ -976,6 +987,8 @@ class Resources:
|
|
|
976
987
|
self._autostop_config.down = down
|
|
977
988
|
if idle_minutes is not None:
|
|
978
989
|
self._autostop_config.idle_minutes = idle_minutes
|
|
990
|
+
if wait_for is not None:
|
|
991
|
+
self._autostop_config.wait_for = wait_for
|
|
979
992
|
|
|
980
993
|
def is_launchable(self) -> bool:
|
|
981
994
|
"""Returns whether the resource is launchable."""
|
sky/server/constants.py
CHANGED
|
@@ -10,7 +10,7 @@ from sky.skylet import constants
|
|
|
10
10
|
# based on version info is needed.
|
|
11
11
|
# For more details and code guidelines, refer to:
|
|
12
12
|
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
|
|
13
|
-
API_VERSION = 12
|
|
13
|
+
API_VERSION = 13
|
|
14
14
|
|
|
15
15
|
# The minimum peer API version that the code should still work with.
|
|
16
16
|
# Notes (dev):
|
sky/server/daemons.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Internal server daemons that run in the background."""
|
|
2
|
+
import dataclasses
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from typing import Callable
|
|
6
|
+
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
from sky import skypilot_config
|
|
9
|
+
from sky.server import constants as server_constants
|
|
10
|
+
from sky.utils import common
|
|
11
|
+
from sky.utils import env_options
|
|
12
|
+
from sky.utils import ux_utils
|
|
13
|
+
|
|
14
|
+
logger = sky_logging.init_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclasses.dataclass
|
|
18
|
+
class InternalRequestDaemon:
|
|
19
|
+
"""Internal daemon that runs an event in the background."""
|
|
20
|
+
|
|
21
|
+
id: str
|
|
22
|
+
name: str
|
|
23
|
+
event_fn: Callable[[], None]
|
|
24
|
+
default_log_level: str = 'INFO'
|
|
25
|
+
|
|
26
|
+
def refresh_log_level(self) -> int:
|
|
27
|
+
# pylint: disable=import-outside-toplevel
|
|
28
|
+
import logging
|
|
29
|
+
|
|
30
|
+
try:
|
|
31
|
+
# Refresh config within the while loop.
|
|
32
|
+
# Since this is a long running daemon,
|
|
33
|
+
# reload_config_for_new_request()
|
|
34
|
+
# is not called in between the event runs.
|
|
35
|
+
skypilot_config.safe_reload_config()
|
|
36
|
+
# Get the configured log level for the daemon inside the event loop
|
|
37
|
+
# in case the log level changes after the API server is started.
|
|
38
|
+
level_str = skypilot_config.get_nested(
|
|
39
|
+
('daemons', self.id, 'log_level'), self.default_log_level)
|
|
40
|
+
return getattr(logging, level_str.upper())
|
|
41
|
+
except AttributeError:
|
|
42
|
+
# Bad level should be rejected by
|
|
43
|
+
# schema validation, just in case.
|
|
44
|
+
logger.warning(f'Invalid log level: {level_str}, using DEBUG')
|
|
45
|
+
return logging.DEBUG
|
|
46
|
+
except Exception as e: # pylint: disable=broad-except
|
|
47
|
+
logger.exception(f'Error refreshing log level for {self.id}: {e}')
|
|
48
|
+
return logging.DEBUG
|
|
49
|
+
|
|
50
|
+
def run_event(self):
|
|
51
|
+
"""Run the event."""
|
|
52
|
+
|
|
53
|
+
# Disable logging for periodic refresh to avoid the usage message being
|
|
54
|
+
# sent multiple times.
|
|
55
|
+
os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
|
|
56
|
+
|
|
57
|
+
level = self.refresh_log_level()
|
|
58
|
+
while True:
|
|
59
|
+
try:
|
|
60
|
+
with ux_utils.enable_traceback(), \
|
|
61
|
+
sky_logging.set_sky_logging_levels(level):
|
|
62
|
+
sky_logging.reload_logger()
|
|
63
|
+
level = self.refresh_log_level()
|
|
64
|
+
self.event_fn()
|
|
65
|
+
except Exception: # pylint: disable=broad-except
|
|
66
|
+
# It is OK to fail to run the event, as the event is not
|
|
67
|
+
# critical, but we should log the error.
|
|
68
|
+
logger.exception(
|
|
69
|
+
f'Error running {self.name} event. '
|
|
70
|
+
f'Restarting in '
|
|
71
|
+
f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
|
|
72
|
+
'seconds...')
|
|
73
|
+
time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def refresh_cluster_status_event():
|
|
77
|
+
"""Periodically refresh the cluster status."""
|
|
78
|
+
# pylint: disable=import-outside-toplevel
|
|
79
|
+
from sky import core
|
|
80
|
+
|
|
81
|
+
logger.info('=== Refreshing cluster status ===')
|
|
82
|
+
# This periodically refresh will hold the lock for the cluster being
|
|
83
|
+
# refreshed, but it is OK because other operations will just wait for
|
|
84
|
+
# the lock and get the just refreshed status without refreshing again.
|
|
85
|
+
core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
|
|
86
|
+
logger.info('Status refreshed. Sleeping '
|
|
87
|
+
f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
|
88
|
+
' seconds for the next refresh...\n')
|
|
89
|
+
time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def refresh_volume_status_event():
|
|
93
|
+
"""Periodically refresh the volume status."""
|
|
94
|
+
# pylint: disable=import-outside-toplevel
|
|
95
|
+
from sky.volumes.server import core
|
|
96
|
+
|
|
97
|
+
# Disable logging for periodic refresh to avoid the usage message being
|
|
98
|
+
# sent multiple times.
|
|
99
|
+
os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
|
|
100
|
+
|
|
101
|
+
logger.info('=== Refreshing volume status ===')
|
|
102
|
+
core.volume_refresh()
|
|
103
|
+
logger.info('Volume status refreshed. Sleeping '
|
|
104
|
+
f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
|
105
|
+
' seconds for the next refresh...\n')
|
|
106
|
+
time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def managed_job_status_refresh_event():
|
|
110
|
+
"""Refresh the managed job status for controller consolidation mode."""
|
|
111
|
+
# pylint: disable=import-outside-toplevel
|
|
112
|
+
from sky.jobs import utils as managed_job_utils
|
|
113
|
+
if not managed_job_utils.is_consolidation_mode():
|
|
114
|
+
return
|
|
115
|
+
# We run the recovery logic before starting the event loop as those two are
|
|
116
|
+
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
|
117
|
+
from sky.utils import controller_utils
|
|
118
|
+
if controller_utils.high_availability_specified(
|
|
119
|
+
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
|
|
120
|
+
managed_job_utils.ha_recovery_for_consolidation_mode()
|
|
121
|
+
# After recovery, we start the event loop.
|
|
122
|
+
from sky.skylet import events
|
|
123
|
+
refresh_event = events.ManagedJobEvent()
|
|
124
|
+
scheduling_event = events.ManagedJobSchedulingEvent()
|
|
125
|
+
logger.info('=== Running managed job event ===')
|
|
126
|
+
refresh_event.run()
|
|
127
|
+
scheduling_event.run()
|
|
128
|
+
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def sky_serve_status_refresh_event():
|
|
132
|
+
"""Refresh the sky serve status for controller consolidation mode."""
|
|
133
|
+
# pylint: disable=import-outside-toplevel
|
|
134
|
+
from sky.serve import serve_utils
|
|
135
|
+
if not serve_utils.is_consolidation_mode():
|
|
136
|
+
return
|
|
137
|
+
# TODO(tian): Add HA recovery logic.
|
|
138
|
+
from sky.skylet import events
|
|
139
|
+
event = events.ServiceUpdateEvent()
|
|
140
|
+
logger.info('=== Running serve status refresh event ===')
|
|
141
|
+
event.run()
|
|
142
|
+
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Register the events to run in the background.
|
|
146
|
+
INTERNAL_REQUEST_DAEMONS = [
|
|
147
|
+
# This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
|
|
148
|
+
# set to updated status automatically, without showing users the hint of
|
|
149
|
+
# cluster being stopped or down when `sky status -r` is called.
|
|
150
|
+
InternalRequestDaemon(id='skypilot-status-refresh-daemon',
|
|
151
|
+
name='status',
|
|
152
|
+
event_fn=refresh_cluster_status_event,
|
|
153
|
+
default_log_level='DEBUG'),
|
|
154
|
+
# Volume status refresh daemon to update the volume status periodically.
|
|
155
|
+
InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
|
|
156
|
+
name='volume',
|
|
157
|
+
event_fn=refresh_volume_status_event),
|
|
158
|
+
InternalRequestDaemon(id='managed-job-status-refresh-daemon',
|
|
159
|
+
name='managed-job-status',
|
|
160
|
+
event_fn=managed_job_status_refresh_event),
|
|
161
|
+
InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
|
|
162
|
+
name='sky-serve-status',
|
|
163
|
+
event_fn=sky_serve_status_refresh_event),
|
|
164
|
+
]
|
sky/server/requests/payloads.py
CHANGED
|
@@ -33,6 +33,7 @@ from sky import sky_logging
|
|
|
33
33
|
from sky import skypilot_config
|
|
34
34
|
from sky.adaptors import common as adaptors_common
|
|
35
35
|
from sky.server import common
|
|
36
|
+
from sky.skylet import autostop_lib
|
|
36
37
|
from sky.skylet import constants
|
|
37
38
|
from sky.usage import constants as usage_constants
|
|
38
39
|
from sky.usage import usage_lib
|
|
@@ -312,6 +313,7 @@ class StartBody(RequestBody):
|
|
|
312
313
|
"""The request body for the start endpoint."""
|
|
313
314
|
cluster_name: str
|
|
314
315
|
idle_minutes_to_autostop: Optional[int] = None
|
|
316
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
315
317
|
retry_until_up: bool = False
|
|
316
318
|
down: bool = False
|
|
317
319
|
force: bool = False
|
|
@@ -321,6 +323,7 @@ class AutostopBody(RequestBody):
|
|
|
321
323
|
"""The request body for the autostop endpoint."""
|
|
322
324
|
cluster_name: str
|
|
323
325
|
idle_minutes: int
|
|
326
|
+
wait_for: Optional[autostop_lib.AutostopWaitFor] = None
|
|
324
327
|
down: bool = False
|
|
325
328
|
|
|
326
329
|
|
sky/server/requests/requests.py
CHANGED
|
@@ -24,12 +24,11 @@ from sky import sky_logging
|
|
|
24
24
|
from sky import skypilot_config
|
|
25
25
|
from sky.server import common as server_common
|
|
26
26
|
from sky.server import constants as server_constants
|
|
27
|
+
from sky.server import daemons
|
|
27
28
|
from sky.server.requests import payloads
|
|
28
29
|
from sky.server.requests.serializers import decoders
|
|
29
30
|
from sky.server.requests.serializers import encoders
|
|
30
|
-
from sky.utils import common
|
|
31
31
|
from sky.utils import common_utils
|
|
32
|
-
from sky.utils import env_options
|
|
33
32
|
from sky.utils import subprocess_utils
|
|
34
33
|
from sky.utils import ux_utils
|
|
35
34
|
from sky.utils.db import db_utils
|
|
@@ -307,127 +306,6 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
|
|
|
307
306
|
kill_requests(request_ids)
|
|
308
307
|
|
|
309
308
|
|
|
310
|
-
def refresh_cluster_status_event():
|
|
311
|
-
"""Periodically refresh the cluster status."""
|
|
312
|
-
# pylint: disable=import-outside-toplevel
|
|
313
|
-
from sky import core
|
|
314
|
-
|
|
315
|
-
# Disable logging for periodic refresh to avoid the usage message being
|
|
316
|
-
# sent multiple times.
|
|
317
|
-
os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
|
|
318
|
-
|
|
319
|
-
while True:
|
|
320
|
-
logger.info('=== Refreshing cluster status ===')
|
|
321
|
-
# This periodically refresh will hold the lock for the cluster being
|
|
322
|
-
# refreshed, but it is OK because other operations will just wait for
|
|
323
|
-
# the lock and get the just refreshed status without refreshing again.
|
|
324
|
-
core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
|
|
325
|
-
logger.info(
|
|
326
|
-
'Status refreshed. Sleeping '
|
|
327
|
-
f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
|
328
|
-
' seconds for the next refresh...\n')
|
|
329
|
-
time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
def refresh_volume_status_event():
|
|
333
|
-
"""Periodically refresh the volume status."""
|
|
334
|
-
# pylint: disable=import-outside-toplevel
|
|
335
|
-
from sky.volumes.server import core
|
|
336
|
-
|
|
337
|
-
# Disable logging for periodic refresh to avoid the usage message being
|
|
338
|
-
# sent multiple times.
|
|
339
|
-
os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
|
|
340
|
-
|
|
341
|
-
while True:
|
|
342
|
-
logger.info('=== Refreshing volume status ===')
|
|
343
|
-
core.volume_refresh()
|
|
344
|
-
logger.info('Volume status refreshed. Sleeping '
|
|
345
|
-
f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
|
|
346
|
-
' seconds for the next refresh...\n')
|
|
347
|
-
time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
def managed_job_status_refresh_event():
|
|
351
|
-
"""Refresh the managed job status for controller consolidation mode."""
|
|
352
|
-
# pylint: disable=import-outside-toplevel
|
|
353
|
-
from sky.jobs import utils as managed_job_utils
|
|
354
|
-
if not managed_job_utils.is_consolidation_mode():
|
|
355
|
-
return
|
|
356
|
-
# We run the recovery logic before starting the event loop as those two are
|
|
357
|
-
# conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
|
|
358
|
-
from sky.utils import controller_utils
|
|
359
|
-
if controller_utils.high_availability_specified(
|
|
360
|
-
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
|
|
361
|
-
managed_job_utils.ha_recovery_for_consolidation_mode()
|
|
362
|
-
# After recovery, we start the event loop.
|
|
363
|
-
from sky.skylet import events
|
|
364
|
-
refresh_event = events.ManagedJobEvent()
|
|
365
|
-
scheduling_event = events.ManagedJobSchedulingEvent()
|
|
366
|
-
while True:
|
|
367
|
-
logger.info('=== Running managed job event ===')
|
|
368
|
-
refresh_event.run()
|
|
369
|
-
scheduling_event.run()
|
|
370
|
-
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
def sky_serve_status_refresh_event():
|
|
374
|
-
"""Refresh the managed job status for controller consolidation mode."""
|
|
375
|
-
# pylint: disable=import-outside-toplevel
|
|
376
|
-
from sky.serve import serve_utils
|
|
377
|
-
if not serve_utils.is_consolidation_mode():
|
|
378
|
-
return
|
|
379
|
-
# TODO(tian): Add HA recovery logic.
|
|
380
|
-
from sky.skylet import events
|
|
381
|
-
event = events.ServiceUpdateEvent()
|
|
382
|
-
while True:
|
|
383
|
-
time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
|
|
384
|
-
event.run()
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
@dataclasses.dataclass
|
|
388
|
-
class InternalRequestDaemon:
|
|
389
|
-
"""Internal daemon that runs an event in the background."""
|
|
390
|
-
|
|
391
|
-
id: str
|
|
392
|
-
name: str
|
|
393
|
-
event_fn: Callable[[], None]
|
|
394
|
-
|
|
395
|
-
def run_event(self):
|
|
396
|
-
"""Run the event."""
|
|
397
|
-
while True:
|
|
398
|
-
with ux_utils.enable_traceback():
|
|
399
|
-
try:
|
|
400
|
-
self.event_fn()
|
|
401
|
-
break
|
|
402
|
-
except Exception: # pylint: disable=broad-except
|
|
403
|
-
# It is OK to fail to run the event, as the event is not
|
|
404
|
-
# critical, but we should log the error.
|
|
405
|
-
logger.exception(
|
|
406
|
-
f'Error running {self.name} event. '
|
|
407
|
-
f'Restarting in '
|
|
408
|
-
f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
|
|
409
|
-
'seconds...')
|
|
410
|
-
time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
# Register the events to run in the background.
|
|
414
|
-
INTERNAL_REQUEST_DAEMONS = [
|
|
415
|
-
# This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
|
|
416
|
-
# set to updated status automatically, without showing users the hint of
|
|
417
|
-
# cluster being stopped or down when `sky status -r` is called.
|
|
418
|
-
InternalRequestDaemon(id='skypilot-status-refresh-daemon',
|
|
419
|
-
name='status',
|
|
420
|
-
event_fn=refresh_cluster_status_event),
|
|
421
|
-
# Volume status refresh daemon to update the volume status periodically.
|
|
422
|
-
InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
|
|
423
|
-
name='volume',
|
|
424
|
-
event_fn=refresh_volume_status_event),
|
|
425
|
-
InternalRequestDaemon(id='managed-job-status-refresh-daemon',
|
|
426
|
-
name='managed-job-status',
|
|
427
|
-
event_fn=managed_job_status_refresh_event),
|
|
428
|
-
]
|
|
429
|
-
|
|
430
|
-
|
|
431
309
|
def kill_requests(request_ids: Optional[List[str]] = None,
|
|
432
310
|
user_id: Optional[str] = None) -> List[str]:
|
|
433
311
|
"""Kill a SkyPilot API request and set its status to cancelled.
|
|
@@ -458,7 +336,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
|
|
|
458
336
|
# Skip internal requests. The internal requests are scheduled with
|
|
459
337
|
# request_id in range(len(INTERNAL_REQUEST_EVENTS)).
|
|
460
338
|
if request_record.request_id in set(
|
|
461
|
-
event.id for event in INTERNAL_REQUEST_DAEMONS):
|
|
339
|
+
event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
|
|
462
340
|
continue
|
|
463
341
|
if request_record.status > RequestStatus.RUNNING:
|
|
464
342
|
logger.debug(f'Request {request_id} already finished')
|
sky/server/server.py
CHANGED
|
@@ -46,6 +46,7 @@ from sky.serve.server import server as serve_rest
|
|
|
46
46
|
from sky.server import common
|
|
47
47
|
from sky.server import config as server_config
|
|
48
48
|
from sky.server import constants as server_constants
|
|
49
|
+
from sky.server import daemons
|
|
49
50
|
from sky.server import metrics
|
|
50
51
|
from sky.server import state
|
|
51
52
|
from sky.server import stream_utils
|
|
@@ -482,7 +483,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
|
|
|
482
483
|
"""FastAPI lifespan context manager."""
|
|
483
484
|
del app # unused
|
|
484
485
|
# Startup: Run background tasks
|
|
485
|
-
for event in requests_lib.INTERNAL_REQUEST_DAEMONS:
|
|
486
|
+
for event in daemons.INTERNAL_REQUEST_DAEMONS:
|
|
486
487
|
try:
|
|
487
488
|
executor.schedule_request(
|
|
488
489
|
request_id=event.id,
|
sky/server/uvicorn.py
CHANGED
|
@@ -16,6 +16,7 @@ import uvicorn
|
|
|
16
16
|
from uvicorn.supervisors import multiprocess
|
|
17
17
|
|
|
18
18
|
from sky import sky_logging
|
|
19
|
+
from sky.server import daemons
|
|
19
20
|
from sky.server import state
|
|
20
21
|
from sky.server.requests import requests as requests_lib
|
|
21
22
|
from sky.skylet import constants
|
|
@@ -120,7 +121,7 @@ class Server(uvicorn.Server):
|
|
|
120
121
|
# Proactively cancel internal requests and logs requests since
|
|
121
122
|
# they can run for infinite time.
|
|
122
123
|
internal_request_ids = [
|
|
123
|
-
d.id for d in requests_lib.INTERNAL_REQUEST_DAEMONS
|
|
124
|
+
d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
|
|
124
125
|
]
|
|
125
126
|
if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
|
|
126
127
|
logger.warning('Timeout waiting for on-going requests to '
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -177,7 +177,7 @@ extras_require: Dict[str, List[str]] = {
|
|
|
177
177
|
# 'vsphere-automation-sdk @ git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.1.0' pylint: disable=line-too-long
|
|
178
178
|
],
|
|
179
179
|
'nebius': [
|
|
180
|
-
'nebius>=0.2.
|
|
180
|
+
'nebius>=0.2.47',
|
|
181
181
|
] + aws_dependencies,
|
|
182
182
|
'hyperbolic': [], # No dependencies needed for hyperbolic
|
|
183
183
|
'server': server_dependencies,
|
sky/sky_logging.py
CHANGED
|
@@ -171,6 +171,36 @@ def set_logging_level(logger: str, level: int):
|
|
|
171
171
|
logger.setLevel(original_level)
|
|
172
172
|
|
|
173
173
|
|
|
174
|
+
@contextlib.contextmanager
|
|
175
|
+
def set_sky_logging_levels(level: int):
|
|
176
|
+
"""Set the logging level for all loggers."""
|
|
177
|
+
# Turn off logger
|
|
178
|
+
previous_levels = {}
|
|
179
|
+
for logger_name in logging.Logger.manager.loggerDict:
|
|
180
|
+
if logger_name.startswith('sky'):
|
|
181
|
+
logger = logging.getLogger(logger_name)
|
|
182
|
+
previous_levels[logger_name] = logger.level
|
|
183
|
+
logger.setLevel(level)
|
|
184
|
+
if level == logging.DEBUG:
|
|
185
|
+
previous_show_debug_info = env_options.Options.SHOW_DEBUG_INFO.get()
|
|
186
|
+
os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
|
|
187
|
+
try:
|
|
188
|
+
yield
|
|
189
|
+
finally:
|
|
190
|
+
# Restore logger
|
|
191
|
+
for logger_name in logging.Logger.manager.loggerDict:
|
|
192
|
+
if logger_name.startswith('sky'):
|
|
193
|
+
logger = logging.getLogger(logger_name)
|
|
194
|
+
try:
|
|
195
|
+
logger.setLevel(previous_levels[logger_name])
|
|
196
|
+
except KeyError:
|
|
197
|
+
# New loggers maybe initialized after the context manager,
|
|
198
|
+
# no need to restore the level.
|
|
199
|
+
pass
|
|
200
|
+
if level == logging.DEBUG and not previous_show_debug_info:
|
|
201
|
+
os.environ.pop(env_options.Options.SHOW_DEBUG_INFO.env_key)
|
|
202
|
+
|
|
203
|
+
|
|
174
204
|
def logging_enabled(logger: logging.Logger, level: int) -> bool:
|
|
175
205
|
return logger.level <= level
|
|
176
206
|
|