skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/serve/service.py
CHANGED
@@ -222,7 +222,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
             requested_resources_str=backend_utils.get_task_resources_str(task),
             load_balancing_policy=service_spec.load_balancing_policy,
             status=serve_state.ServiceStatus.CONTROLLER_INIT,
-            tls_encrypted=service_spec.tls_credential is not None
+            tls_encrypted=service_spec.tls_credential is not None,
+            pool=service_spec.pool)
         # Directly throw an error here. See sky/serve/api.py::up
         # for more details.
         if not success:
@@ -292,14 +293,17 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     # TODO(tian): Probably we could enable multiple ports specified in
     # service spec and we could start multiple load balancers.
     # After that, we will have a mapping from replica port to endpoint.
-
-
-
-
-
-
-
-
+    # NOTE(tian): We don't need the load balancer for cluster pool.
+    # Skip the load balancer process for cluster pool.
+    if not service_spec.pool:
+        load_balancer_process = multiprocessing.Process(
+            target=ux_utils.RedirectOutputForProcess(
+                load_balancer.run_load_balancer,
+                load_balancer_log_file).run,
+            args=(controller_addr, load_balancer_port,
+                  service_spec.load_balancing_policy,
+                  service_spec.tls_credential))
+        load_balancer_process.start()

     if not is_recovery:
         serve_state.set_service_load_balancer_port(
sky/serve/service_spec.py
CHANGED
@@ -43,7 +43,33 @@ class SkyServiceSpec:
         upscale_delay_seconds: Optional[int] = None,
         downscale_delay_seconds: Optional[int] = None,
         load_balancing_policy: Optional[str] = None,
+        pool: Optional[bool] = None,
     ) -> None:
+        if pool:
+            for unsupported_field in [
+                    'max_replicas',
+                    'num_overprovision',
+                    'target_qps_per_replica',
+                    'upscale_delay_seconds',
+                    'downscale_delay_seconds',
+                    'base_ondemand_fallback_replicas',
+                    'dynamic_ondemand_fallback',
+                    'spot_placer',
+                    'load_balancing_policy',
+                    'ports',
+                    'post_data',
+                    'tls_credential',
+                    'readiness_headers',
+            ]:
+                if locals()[unsupported_field] is not None:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'{unsupported_field} is not supported for pool.')
+            if max_replicas is not None and max_replicas != min_replicas:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Autoscaling is not supported for pool '
+                                     'for now.')
+
         if max_replicas is not None and max_replicas < min_replicas:
             with ux_utils.print_exception_no_traceback():
                 raise ValueError('max_replicas must be greater than or '
@@ -96,6 +122,7 @@ class SkyServiceSpec:
         self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
         self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
         self._load_balancing_policy: Optional[str] = load_balancing_policy
+        self._pool: Optional[bool] = pool

         self._use_ondemand_fallback: bool = (
             self.dynamic_ondemand_fallback is not None and
@@ -115,7 +142,7 @@ class SkyServiceSpec:

         service_config: Dict[str, Any] = {}

-        readiness_section = config
+        readiness_section = config.get('readiness_probe', '/')
         if isinstance(readiness_section, str):
             service_config['readiness_path'] = readiness_section
             initial_delay_seconds = None
@@ -157,8 +184,29 @@ class SkyServiceSpec:
                 raise ValueError('Port must be between 1 and 65535.')
         service_config['ports'] = str(ports) if ports is not None else None

+        pool_config = config.get('pool', None)
+        if pool_config is not None:
+            service_config['pool'] = pool_config
+
         policy_section = config.get('replica_policy', None)
+        if policy_section is not None and pool_config:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify `replica_policy` for cluster '
+                                 'pool. Only `workers: <num>` is supported '
+                                 'for cluster pool now.')
+
         simplified_policy_section = config.get('replicas', None)
+        workers_config = config.get('workers', None)
+        if simplified_policy_section is not None and workers_config is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify both `replicas` and `workers`.'
+                                 ' Please use one of them.')
+        if simplified_policy_section is not None and pool_config:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify `replicas` for cluster pool. '
+                                 'Please use `workers` instead.')
+        if simplified_policy_section is None:
+            simplified_policy_section = workers_config
         if policy_section is None or simplified_policy_section is not None:
             if simplified_policy_section is not None:
                 min_replicas = simplified_policy_section
@@ -239,6 +287,13 @@ class SkyServiceSpec:
                 config[section] = dict()
             config[section][key] = value

+        add_if_not_none('pool', None, self._pool)
+
+        if self.pool:
+            # For pool, currently only `workers: <num>` is supported.
+            add_if_not_none('workers', None, self.min_replicas)
+            return config
+
         add_if_not_none('readiness_probe', 'path', self.readiness_path)
         add_if_not_none('readiness_probe', 'initial_delay_seconds',
                         self.initial_delay_seconds)
@@ -306,10 +361,14 @@ class SkyServiceSpec:
         return ' '.join(policy_strs)

     def autoscaling_policy_str(self):
+        if self.pool:
+            # We only support fixed-size pool for now.
+            return f'Fixed-size ({self.min_replicas} workers)'
         # TODO(MaoZiming): Update policy_str
+        noun = 'worker' if self.pool else 'replica'
         min_plural = '' if self.min_replicas == 1 else 's'
         if self.max_replicas == self.min_replicas or self.max_replicas is None:
-            return f'Fixed {self.min_replicas}
+            return f'Fixed {self.min_replicas} {noun}{min_plural}'
         # Already checked in __init__.
         assert self.target_qps_per_replica is not None
         # TODO(tian): Refactor to contain more information
@@ -319,8 +378,8 @@ class SkyServiceSpec:
             overprovision_str = (
                 f' with {self.num_overprovision} overprovisioned replicas')
         return (f'Autoscaling from {self.min_replicas} to {self.max_replicas} '
-                f'
-                f'
+                f'{noun}{max_plural}{overprovision_str} (target QPS per '
+                f'{noun}: {self.target_qps_per_replica})')

     def set_ports(self, ports: str) -> None:
         self._ports = ports
@@ -332,6 +391,10 @@ class SkyServiceSpec:
                 f'Certfile: {self.tls_credential.certfile}')

     def __repr__(self) -> str:
+        if self.pool:
+            return textwrap.dedent(f"""\
+                Worker policy: {self.autoscaling_policy_str()}
+                """)
         return textwrap.dedent(f"""\
             Readiness probe method: {self.probe_str()}
             Readiness initial delay seconds: {self.initial_delay_seconds}
@@ -420,3 +483,10 @@ class SkyServiceSpec:
     def load_balancing_policy(self) -> str:
         return lb_policies.LoadBalancingPolicy.make_policy_name(
             self._load_balancing_policy)
+
+    @property
+    def pool(self) -> bool:
+        # This can happen for backward compatibility.
+        if not hasattr(self, '_pool'):
+            return False
+        return bool(self._pool)
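For orientation, the config parsing above now recognizes two new top-level keys, `pool` and `workers`. Below is a minimal sketch of the accepted shape, inferred only from the `config.get(...)` calls in this diff; the semantics described in the comments are assumptions, not documented behavior.

# Sketch only: a cluster-pool service config as the parsing code above reads it.
# Keys come from the diff; the values are invented for illustration.
pool_service_config = {
    'pool': True,   # marks the spec as a cluster pool; the load balancer is skipped
    'workers': 4,   # fixed pool size; `replicas` and `replica_policy` are rejected
}
# With `pool` set, SkyServiceSpec.__init__ raises ValueError for replica-only
# fields such as `ports`, `tls_credential`, `load_balancing_policy` or any
# autoscaling knob, and autoscaling_policy_str() reports 'Fixed-size (<n> workers)'.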
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 12

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/daemons.py
ADDED
@@ -0,0 +1,164 @@
"""Internal server daemons that run in the background."""
import dataclasses
import os
import time
from typing import Callable

from sky import sky_logging
from sky import skypilot_config
from sky.server import constants as server_constants
from sky.utils import common
from sky.utils import env_options
from sky.utils import ux_utils

logger = sky_logging.init_logger(__name__)


@dataclasses.dataclass
class InternalRequestDaemon:
    """Internal daemon that runs an event in the background."""

    id: str
    name: str
    event_fn: Callable[[], None]
    default_log_level: str = 'INFO'

    def refresh_log_level(self) -> int:
        # pylint: disable=import-outside-toplevel
        import logging

        try:
            # Refresh config within the while loop.
            # Since this is a long running daemon,
            # reload_config_for_new_request()
            # is not called in between the event runs.
            skypilot_config.safe_reload_config()
            # Get the configured log level for the daemon inside the event loop
            # in case the log level changes after the API server is started.
            level_str = skypilot_config.get_nested(
                ('daemons', self.id, 'log_level'), self.default_log_level)
            return getattr(logging, level_str.upper())
        except AttributeError:
            # Bad level should be rejected by
            # schema validation, just in case.
            logger.warning(f'Invalid log level: {level_str}, using DEBUG')
            return logging.DEBUG
        except Exception as e:  # pylint: disable=broad-except
            logger.exception(f'Error refreshing log level for {self.id}: {e}')
            return logging.DEBUG

    def run_event(self):
        """Run the event."""

        # Disable logging for periodic refresh to avoid the usage message being
        # sent multiple times.
        os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

        level = self.refresh_log_level()
        while True:
            try:
                with ux_utils.enable_traceback(), \
                    sky_logging.set_sky_logging_levels(level):
                    sky_logging.reload_logger()
                    level = self.refresh_log_level()
                    self.event_fn()
            except Exception:  # pylint: disable=broad-except
                # It is OK to fail to run the event, as the event is not
                # critical, but we should log the error.
                logger.exception(
                    f'Error running {self.name} event. '
                    f'Restarting in '
                    f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                    'seconds...')
                time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)


def refresh_cluster_status_event():
    """Periodically refresh the cluster status."""
    # pylint: disable=import-outside-toplevel
    from sky import core

    logger.info('=== Refreshing cluster status ===')
    # This periodically refresh will hold the lock for the cluster being
    # refreshed, but it is OK because other operations will just wait for
    # the lock and get the just refreshed status without refreshing again.
    core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
    logger.info('Status refreshed. Sleeping '
                f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
                ' seconds for the next refresh...\n')
    time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)


def refresh_volume_status_event():
    """Periodically refresh the volume status."""
    # pylint: disable=import-outside-toplevel
    from sky.volumes.server import core

    # Disable logging for periodic refresh to avoid the usage message being
    # sent multiple times.
    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

    logger.info('=== Refreshing volume status ===')
    core.volume_refresh()
    logger.info('Volume status refreshed. Sleeping '
                f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
                ' seconds for the next refresh...\n')
    time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)


def managed_job_status_refresh_event():
    """Refresh the managed job status for controller consolidation mode."""
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils
    if not managed_job_utils.is_consolidation_mode():
        return
    # We run the recovery logic before starting the event loop as those two are
    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    from sky.utils import controller_utils
    if controller_utils.high_availability_specified(
            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
        managed_job_utils.ha_recovery_for_consolidation_mode()
    # After recovery, we start the event loop.
    from sky.skylet import events
    refresh_event = events.ManagedJobEvent()
    scheduling_event = events.ManagedJobSchedulingEvent()
    logger.info('=== Running managed job event ===')
    refresh_event.run()
    scheduling_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)


def sky_serve_status_refresh_event():
    """Refresh the sky serve status for controller consolidation mode."""
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils
    if not serve_utils.is_consolidation_mode():
        return
    # TODO(tian): Add HA recovery logic.
    from sky.skylet import events
    event = events.ServiceUpdateEvent()
    logger.info('=== Running serve status refresh event ===')
    event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)


# Register the events to run in the background.
INTERNAL_REQUEST_DAEMONS = [
    # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
    # set to updated status automatically, without showing users the hint of
    # cluster being stopped or down when `sky status -r` is called.
    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
                          name='status',
                          event_fn=refresh_cluster_status_event,
                          default_log_level='DEBUG'),
    # Volume status refresh daemon to update the volume status periodically.
    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
                          name='volume',
                          event_fn=refresh_volume_status_event),
    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
                          name='managed-job-status',
                          event_fn=managed_job_status_refresh_event),
    InternalRequestDaemon(id='sky-serve-status-refresh-daemon',
                          name='sky-serve-status',
                          event_fn=sky_serve_status_refresh_event),
]
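To make the new module concrete, here is a hypothetical sketch of what one entry in `INTERNAL_REQUEST_DAEMONS` looks like; the daemon id, name, and event function below are invented and only illustrate the dataclass introduced above.

# Hypothetical example (not part of the diff): the shape of a daemon entry.
from sky.server import daemons


def example_refresh_event() -> None:
    # One iteration of work. run_event() loops forever, re-reads the configured
    # log level from `daemons.<id>.log_level`, and restarts after failures.
    pass


example_daemon = daemons.InternalRequestDaemon(
    id='example-refresh-daemon',    # hypothetical id
    name='example-refresh',         # hypothetical name
    event_fn=example_refresh_event,
    default_log_level='INFO')

At API server startup (see sky/server/server.py below), each registered daemon is scheduled as an internal request whose request_id equals the daemon's id, which is also how kill_requests() knows to skip them.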
sky/server/requests/payloads.py
CHANGED
@@ -478,6 +478,8 @@ class JobsLaunchBody(RequestBody):
     """The request body for the jobs launch endpoint."""
     task: str
     name: Optional[str]
+    pool: Optional[str] = None
+    num_jobs: Optional[int] = None

     def to_kwargs(self) -> Dict[str, Any]:
         kwargs = super().to_kwargs()
@@ -500,6 +502,7 @@ class JobsCancelBody(RequestBody):
     job_ids: Optional[List[int]] = None
     all: bool = False
     all_users: bool = False
+    pool: Optional[str] = None


 class JobsLogsBody(RequestBody):
@@ -671,6 +674,36 @@ class JobsDownloadLogsBody(RequestBody):
     local_dir: str = constants.SKY_LOGS_DIRECTORY


+class JobsPoolApplyBody(RequestBody):
+    """The request body for the jobs pool apply endpoint."""
+    task: str
+    pool_name: str
+    mode: serve.UpdateMode
+
+    def to_kwargs(self) -> Dict[str, Any]:
+        kwargs = super().to_kwargs()
+        dag = common.process_mounts_in_task_on_api_server(self.task,
+                                                          self.env_vars,
+                                                          workdir_only=False)
+        assert len(
+            dag.tasks) == 1, ('Must only specify one task in the DAG for '
+                              'a pool.', dag)
+        kwargs['task'] = dag.tasks[0]
+        return kwargs
+
+
+class JobsPoolDownBody(RequestBody):
+    """The request body for the jobs pool down endpoint."""
+    pool_names: Optional[Union[str, List[str]]]
+    all: bool = False
+    purge: bool = False
+
+
+class JobsPoolStatusBody(RequestBody):
+    """The request body for the jobs pool status endpoint."""
+    pool_names: Optional[Union[str, List[str]]]
+
+
 class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
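As a rough illustration of the new fields on the launch body (field names come from the diff; the reading of `num_jobs` is an assumption), a jobs-launch request might carry:

# Illustrative payload shape only; values are invented.
jobs_launch_payload = {
    'task': '<serialized task sent to the API server>',  # unchanged field
    'name': 'my-job',      # hypothetical job name
    'pool': 'my-pool',     # new: submit the job into an existing cluster pool
    'num_jobs': 3,         # new: presumably the number of jobs to submit at once
}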
sky/server/requests/requests.py
CHANGED
@@ -24,12 +24,11 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.server import common as server_common
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.server.requests import payloads
 from sky.server.requests.serializers import decoders
 from sky.server.requests.serializers import encoders
-from sky.utils import common
 from sky.utils import common_utils
-from sky.utils import env_options
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 from sky.utils.db import db_utils
@@ -307,110 +306,6 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
     kill_requests(request_ids)


-def refresh_cluster_status_event():
-    """Periodically refresh the cluster status."""
-    # pylint: disable=import-outside-toplevel
-    from sky import core
-
-    # Disable logging for periodic refresh to avoid the usage message being
-    # sent multiple times.
-    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-    while True:
-        logger.info('=== Refreshing cluster status ===')
-        # This periodically refresh will hold the lock for the cluster being
-        # refreshed, but it is OK because other operations will just wait for
-        # the lock and get the just refreshed status without refreshing again.
-        core.status(refresh=common.StatusRefreshMode.FORCE, all_users=True)
-        logger.info(
-            'Status refreshed. Sleeping '
-            f'{server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS}'
-            ' seconds for the next refresh...\n')
-        time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
-def refresh_volume_status_event():
-    """Periodically refresh the volume status."""
-    # pylint: disable=import-outside-toplevel
-    from sky.volumes.server import core
-
-    # Disable logging for periodic refresh to avoid the usage message being
-    # sent multiple times.
-    os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'
-
-    while True:
-        logger.info('=== Refreshing volume status ===')
-        core.volume_refresh()
-        logger.info('Volume status refreshed. Sleeping '
-                    f'{server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS}'
-                    ' seconds for the next refresh...\n')
-        time.sleep(server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS)
-
-
-def managed_job_status_refresh_event():
-    """Refresh the managed job status for controller consolidation mode."""
-    # pylint: disable=import-outside-toplevel
-    from sky.jobs import utils as managed_job_utils
-    if not managed_job_utils.is_consolidation_mode():
-        return
-    # We run the recovery logic before starting the event loop as those two are
-    # conflicting. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
-    from sky.utils import controller_utils
-    if controller_utils.high_availability_specified(
-            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name):
-        managed_job_utils.ha_recovery_for_consolidation_mode()
-    # After recovery, we start the event loop.
-    from sky.skylet import events
-    event = events.ManagedJobEvent()
-    while True:
-        time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
-        event.run()
-
-
-@dataclasses.dataclass
-class InternalRequestDaemon:
-    """Internal daemon that runs an event in the background."""
-
-    id: str
-    name: str
-    event_fn: Callable[[], None]
-
-    def run_event(self):
-        """Run the event."""
-        while True:
-            with ux_utils.enable_traceback():
-                try:
-                    self.event_fn()
-                    break
-                except Exception:  # pylint: disable=broad-except
-                    # It is OK to fail to run the event, as the event is not
-                    # critical, but we should log the error.
-                    logger.exception(
-                        f'Error running {self.name} event. '
-                        f'Restarting in '
-                        f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
-                        'seconds...')
-                    time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
-
-
-# Register the events to run in the background.
-INTERNAL_REQUEST_DAEMONS = [
-    # This status refresh daemon can cause the autostopp'ed/autodown'ed cluster
-    # set to updated status automatically, without showing users the hint of
-    # cluster being stopped or down when `sky status -r` is called.
-    InternalRequestDaemon(id='skypilot-status-refresh-daemon',
-                          name='status',
-                          event_fn=refresh_cluster_status_event),
-    # Volume status refresh daemon to update the volume status periodically.
-    InternalRequestDaemon(id='skypilot-volume-status-refresh-daemon',
-                          name='volume',
-                          event_fn=refresh_volume_status_event),
-    InternalRequestDaemon(id='managed-job-status-refresh-daemon',
-                          name='managed-job-status',
-                          event_fn=managed_job_status_refresh_event),
-]
-
-
 def kill_requests(request_ids: Optional[List[str]] = None,
                   user_id: Optional[str] = None) -> List[str]:
     """Kill a SkyPilot API request and set its status to cancelled.
@@ -441,7 +336,7 @@ def kill_requests(request_ids: Optional[List[str]] = None,
         # Skip internal requests. The internal requests are scheduled with
         # request_id in range(len(INTERNAL_REQUEST_EVENTS)).
         if request_record.request_id in set(
-                event.id for event in INTERNAL_REQUEST_DAEMONS):
+                event.id for event in daemons.INTERNAL_REQUEST_DAEMONS):
             continue
         if request_record.status > RequestStatus.RUNNING:
             logger.debug(f'Request {request_id} already finished')
sky/server/requests/serializers/decoders.py
CHANGED
@@ -109,9 +109,8 @@ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
     return jobs


-
-
-    service_statuses = return_value
+def _decode_serve_status(
+        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     for service_status in service_statuses:
         service_status['status'] = serve_state.ServiceStatus(
             service_status['status'])
@@ -122,6 +121,16 @@ def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
     return service_statuses


+@register_decoders('serve.status')
+def decode_serve_status(return_value: List[dict]) -> List[Dict[str, Any]]:
+    return _decode_serve_status(return_value)
+
+
+@register_decoders('jobs.pool_status')
+def decode_jobs_pool_status(return_value: List[dict]) -> List[Dict[str, Any]]:
+    return _decode_serve_status(return_value)
+
+
 @register_decoders('cost_report')
 def decode_cost_report(
         return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
sky/server/requests/serializers/encoders.py
CHANGED
@@ -112,8 +112,7 @@ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
     return jobs


-
-def encode_serve_status(
+def _encode_serve_status(
         service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     for service_status in service_statuses:
         service_status['status'] = service_status['status'].value
@@ -123,6 +122,18 @@ def encode_serve_status(
     return service_statuses


+@register_encoder('serve.status')
+def encode_serve_status(
+        service_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    return _encode_serve_status(service_statuses)
+
+
+@register_encoder('jobs.pool_status')
+def encode_jobs_pool_status(
+        pool_statuses: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    return _encode_serve_status(pool_statuses)
+
+
 @register_encoder('cost_report')
 def encode_cost_report(
         cost_report: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
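The new `jobs.pool_status` handlers reuse the serve-status serializers: the encoder flattens the enum status to a plain value for transport and the decoder restores it. Below is a minimal sketch of that per-record status round trip (illustrative only; it uses just the names that appear in this diff).

from sky.serve import serve_state

# What _encode_serve_status does to each record's status field...
record = {'status': serve_state.ServiceStatus.CONTROLLER_INIT}
wire = dict(record, status=record['status'].value)
# ...and what _decode_serve_status does on the way back.
restored = dict(wire, status=serve_state.ServiceStatus(wire['status']))
assert restored['status'] is record['status']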
sky/server/server.py
CHANGED
@@ -46,6 +46,7 @@ from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
 from sky.server import constants as server_constants
+from sky.server import daemons
 from sky.server import metrics
 from sky.server import state
 from sky.server import stream_utils
@@ -482,7 +483,7 @@ async def lifespan(app: fastapi.FastAPI):  # pylint: disable=redefined-outer-nam
     """FastAPI lifespan context manager."""
     del app  # unused
     # Startup: Run background tasks
-    for event in
+    for event in daemons.INTERNAL_REQUEST_DAEMONS:
         try:
             executor.schedule_request(
                 request_id=event.id,
sky/server/uvicorn.py
CHANGED
@@ -16,6 +16,7 @@ import uvicorn
 from uvicorn.supervisors import multiprocess

 from sky import sky_logging
+from sky.server import daemons
 from sky.server import state
 from sky.server.requests import requests as requests_lib
 from sky.skylet import constants
@@ -120,7 +121,7 @@ class Server(uvicorn.Server):
                 # Proactively cancel internal requests and logs requests since
                 # they can run for infinite time.
                 internal_request_ids = [
-                    d.id for d in
+                    d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
                 ]
                 if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
                     logger.warning('Timeout waiting for on-going requests to '
sky/sky_logging.py
CHANGED
@@ -171,6 +171,36 @@ def set_logging_level(logger: str, level: int):
         logger.setLevel(original_level)


+@contextlib.contextmanager
+def set_sky_logging_levels(level: int):
+    """Set the logging level for all loggers."""
+    # Turn off logger
+    previous_levels = {}
+    for logger_name in logging.Logger.manager.loggerDict:
+        if logger_name.startswith('sky'):
+            logger = logging.getLogger(logger_name)
+            previous_levels[logger_name] = logger.level
+            logger.setLevel(level)
+    if level == logging.DEBUG:
+        previous_show_debug_info = env_options.Options.SHOW_DEBUG_INFO.get()
+        os.environ[env_options.Options.SHOW_DEBUG_INFO.env_key] = '1'
+    try:
+        yield
+    finally:
+        # Restore logger
+        for logger_name in logging.Logger.manager.loggerDict:
+            if logger_name.startswith('sky'):
+                logger = logging.getLogger(logger_name)
+                try:
+                    logger.setLevel(previous_levels[logger_name])
+                except KeyError:
+                    # New loggers maybe initialized after the context manager,
+                    # no need to restore the level.
+                    pass
+        if level == logging.DEBUG and not previous_show_debug_info:
+            os.environ.pop(env_options.Options.SHOW_DEBUG_INFO.env_key)
+
+
 def logging_enabled(logger: logging.Logger, level: int) -> bool:
     return logger.level <= level
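A short usage sketch of the new context manager (illustrative only; the daemon loop in sky/server/daemons.py combines it with reload_logger() and refresh_log_level()):

import logging

from sky import sky_logging

# Temporarily raise every `sky.*` logger to DEBUG; the previous levels (and the
# SHOW_DEBUG_INFO environment toggle) are restored when the block exits.
with sky_logging.set_sky_logging_levels(logging.DEBUG):
    sky_logging.init_logger('sky.example').debug('verbose diagnostics enabled')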