skypilot-nightly 1.0.0.dev20250802__py3-none-any.whl → 1.0.0.dev20250806__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/catalog/kubernetes_catalog.py +8 -0
- sky/catalog/nebius_catalog.py +0 -1
- sky/client/cli/command.py +32 -13
- sky/client/sdk.py +16 -8
- sky/client/sdk.pyi +6 -5
- sky/client/sdk_async.py +811 -0
- sky/clouds/kubernetes.py +6 -1
- sky/clouds/nebius.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +11 -0
- sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-2a43ea3241bbdacd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa63e8b1d203f298.js → [job]-7cb24da04ca00956.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9e7df5fc761c95a7.js → [cluster]-1e95993124dbfc57.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-47f1ddae13a2f8e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-2a44e70b500b6b70.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-22faac9325016d83.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-b90c865a690bfe84.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-7af733f5d7b6ed1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-4d41c9023287f59a.js → [name]-35e0de5bca55e594.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/client/sdk_async.py +135 -0
- sky/jobs/utils.py +3 -1
- sky/provision/kubernetes/utils.py +62 -6
- sky/provision/nebius/instance.py +1 -0
- sky/provision/nebius/utils.py +9 -1
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +2 -1
- sky/serve/controller.py +2 -1
- sky/serve/load_balancer.py +3 -1
- sky/serve/serve_state.py +70 -5
- sky/serve/serve_utils.py +124 -22
- sky/serve/server/impl.py +22 -21
- sky/serve/service.py +8 -1
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +46 -0
- sky/server/auth/oauth2_proxy.py +185 -0
- sky/server/common.py +108 -17
- sky/server/constants.py +1 -1
- sky/server/daemons.py +60 -11
- sky/server/rest.py +114 -0
- sky/server/server.py +44 -40
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +5 -1
- sky/skylet/skylet.py +3 -1
- sky/task.py +43 -10
- sky/templates/kubernetes-ray.yml.j2 +4 -0
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/utils/controller_utils.py +7 -0
- sky/utils/rich_utils.py +120 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/METADATA +5 -1
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/RECORD +87 -82
- sky/dashboard/out/_next/static/2JNCZ4daQBotwWRNGi6aE/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
- sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
- /sky/dashboard/out/_next/static/{2JNCZ4daQBotwWRNGi6aE → Gelsd19kVxXcX7aQQGsGu}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-7e17c195296e2ea9.js → 1871-ced1c14230cad6e1.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-2d7ed3350659d073.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{6601-234b1cf963c7280b.js → 6601-2109d22e7861861c.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-40d15b6261ec8dc1.js → 938-bda2685db5eae6cf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250802.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Async SDK for SkyServe."""
|
|
2
|
+
import typing
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from sky.client import sdk_async
|
|
6
|
+
from sky.serve.client import sdk
|
|
7
|
+
from sky.usage import usage_lib
|
|
8
|
+
from sky.utils import context_utils
|
|
9
|
+
|
|
10
|
+
if typing.TYPE_CHECKING:
|
|
11
|
+
import io
|
|
12
|
+
|
|
13
|
+
import sky
|
|
14
|
+
from sky.serve import serve_utils
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@usage_lib.entrypoint
async def up(
    task: Union['sky.Task', 'sky.Dag'],
    service_name: str,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
) -> Tuple[str, str]:
    """Async counterpart of ``sdk.up()`` that spins up a service.

    The synchronous ``sdk.up`` call is dispatched through
    ``context_utils.to_thread``; the request ID it yields is then awaited.

    Args:
        task: Task or DAG forwarded to ``sdk.up``.
        service_name: Service name forwarded to ``sdk.up``.
        _need_confirmation: Internal only; forwarded to ``sdk.up``.
        stream_logs: Stream configuration handed to
            ``sdk_async._stream_and_get``. Pass ``None`` to wait on the
            request with ``sdk_async.get`` instead.

    Returns:
        The result obtained by awaiting the request.
    """
    request_id = await context_utils.to_thread(sdk.up, task, service_name,
                                               _need_confirmation)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    # pylint: disable=protected-access
    return await sdk_async._stream_and_get(request_id, stream_logs)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@usage_lib.entrypoint
async def update(
    task: Union['sky.Task', 'sky.Dag'],
    service_name: str,
    mode: 'serve_utils.UpdateMode',
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
) -> None:
    """Async counterpart of ``sdk.update()`` that updates a service.

    The synchronous ``sdk.update`` call is dispatched through
    ``context_utils.to_thread``; the request ID it yields is then awaited.

    Args:
        task: Task or DAG forwarded to ``sdk.update``.
        service_name: Name of the service to update.
        mode: Update mode forwarded to ``sdk.update``.
        _need_confirmation: Internal only; forwarded to ``sdk.update``.
        stream_logs: Stream configuration handed to
            ``sdk_async._stream_and_get``. Pass ``None`` to wait on the
            request with ``sdk_async.get`` instead.
    """
    request_id = await context_utils.to_thread(sdk.update, task, service_name,
                                               mode, _need_confirmation)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    # pylint: disable=protected-access
    return await sdk_async._stream_and_get(request_id, stream_logs)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@usage_lib.entrypoint
async def down(
    service_names: Optional[Union[str, List[str]]],
    all: bool = False,  # pylint: disable=redefined-builtin
    purge: bool = False,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
) -> None:
    """Async counterpart of ``sdk.down()`` that tears down services.

    The synchronous ``sdk.down`` call is dispatched through
    ``context_utils.to_thread``; the request ID it yields is then awaited.

    Args:
        service_names: One service name, a list of names, or ``None``;
            forwarded to ``sdk.down``.
        all: Forwarded to ``sdk.down``.
        purge: Forwarded to ``sdk.down``.
        stream_logs: Stream configuration handed to
            ``sdk_async._stream_and_get``. Pass ``None`` to wait on the
            request with ``sdk_async.get`` instead.
    """
    request_id = await context_utils.to_thread(sdk.down, service_names, all,
                                               purge)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    # pylint: disable=protected-access
    return await sdk_async._stream_and_get(request_id, stream_logs)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@usage_lib.entrypoint
async def terminate_replica(
    service_name: str,
    replica_id: int,
    purge: bool,
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
) -> None:
    """Async counterpart of ``sdk.terminate_replica()`` for one replica.

    The synchronous ``sdk.terminate_replica`` call is dispatched through
    ``context_utils.to_thread``; the request ID it yields is then awaited.

    Args:
        service_name: Name of the service owning the replica.
        replica_id: ID of the replica to tear down.
        purge: Forwarded to ``sdk.terminate_replica``.
        stream_logs: Stream configuration handed to
            ``sdk_async._stream_and_get``. Pass ``None`` to wait on the
            request with ``sdk_async.get`` instead.
    """
    request_id = await context_utils.to_thread(sdk.terminate_replica,
                                               service_name, replica_id, purge)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    # pylint: disable=protected-access
    return await sdk_async._stream_and_get(request_id, stream_logs)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@usage_lib.entrypoint
async def status(
    service_names: Optional[Union[str, List[str]]],
    stream_logs: Optional[
        sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
) -> List[Dict[str, Any]]:
    """Async counterpart of ``sdk.status()`` that gets service statuses.

    The synchronous ``sdk.status`` call is dispatched through
    ``context_utils.to_thread``; the request ID it yields is then awaited.

    Args:
        service_names: One service name, a list of names, or ``None``;
            forwarded to ``sdk.status``.
        stream_logs: Stream configuration handed to
            ``sdk_async._stream_and_get``. Pass ``None`` to wait on the
            request with ``sdk_async.get`` instead.

    Returns:
        The result obtained by awaiting the request.
    """
    request_id = await context_utils.to_thread(sdk.status, service_names)
    if stream_logs is None:
        return await sdk_async.get(request_id)
    # pylint: disable=protected-access
    return await sdk_async._stream_and_get(request_id, stream_logs)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@usage_lib.entrypoint
async def tail_logs(service_name: str,
                    target: Union[str, 'serve_utils.ServiceComponent'],
                    replica_id: Optional[int] = None,
                    follow: bool = True,
                    output_stream: Optional['io.TextIOBase'] = None) -> None:
    """Async counterpart of ``sdk.tail_logs()`` for tailing service logs.

    All arguments are forwarded positionally, in declaration order, to the
    synchronous ``sdk.tail_logs`` via ``context_utils.to_thread``.

    Args:
        service_name: Name of the service.
        target: Component to tail, as a string or ``ServiceComponent``.
        replica_id: Optional replica ID forwarded to ``sdk.tail_logs``.
        follow: Forwarded to ``sdk.tail_logs``.
        output_stream: Optional text stream forwarded to ``sdk.tail_logs``.
    """
    forwarded_args = (service_name, target, replica_id, follow, output_stream)
    return await context_utils.to_thread(sdk.tail_logs, *forwarded_args)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@usage_lib.entrypoint
async def sync_down_logs(service_name: str,
                         local_dir: str,
                         *,
                         targets: Optional[Union[
                             str, 'serve_utils.ServiceComponent', List[Union[
                                 str, 'serve_utils.ServiceComponent']]]] = None,
                         replica_ids: Optional[List[int]] = None) -> None:
    """Async counterpart of ``sdk.sync_down_logs()`` for service components.

    The synchronous ``sdk.sync_down_logs`` call is dispatched through
    ``context_utils.to_thread``.

    Args:
        service_name: Name of the service.
        local_dir: Local directory forwarded to ``sdk.sync_down_logs``.
        targets: Optional component or list of components, forwarded as a
            keyword argument.
        replica_ids: Optional list of replica IDs, forwarded as a keyword
            argument.
    """
    selection = {'targets': targets, 'replica_ids': replica_ids}
    return await context_utils.to_thread(sdk.sync_down_logs, service_name,
                                         local_dir, **selection)
|
sky/serve/constants.py
CHANGED
|
@@ -105,7 +105,8 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
|
|
|
105
105
|
# v1.0 - Introduce rolling update.
|
|
106
106
|
# v2.0 - Added template-replica feature.
|
|
107
107
|
# v3.0 - Added cluster pool.
|
|
108
|
-
|
|
108
|
+
# v4.0 - Added pool argument to wait_service_registration.
|
|
109
|
+
SERVE_VERSION = 4
|
|
109
110
|
|
|
110
111
|
TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
|
|
111
112
|
'The version of service is outdated and does not support manually '
|
sky/serve/controller.py
CHANGED
|
@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
|
|
|
4
4
|
"""
|
|
5
5
|
import contextlib
|
|
6
6
|
import logging
|
|
7
|
+
import os
|
|
7
8
|
import threading
|
|
8
9
|
import time
|
|
9
10
|
import traceback
|
|
@@ -242,7 +243,7 @@ class SkyServeController:
|
|
|
242
243
|
threading.Thread(target=self._run_autoscaler).start()
|
|
243
244
|
|
|
244
245
|
logger.info('SkyServe Controller started on '
|
|
245
|
-
f'http://{self._host}:{self._port}')
|
|
246
|
+
f'http://{self._host}:{self._port}. PID: {os.getpid()}')
|
|
246
247
|
|
|
247
248
|
uvicorn.run(self._app, host=self._host, port=self._port)
|
|
248
249
|
|
sky/serve/load_balancer.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""LoadBalancer: Distribute any incoming request to all ready replicas."""
|
|
2
2
|
import asyncio
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
import threading
|
|
5
6
|
import traceback
|
|
6
7
|
from typing import Dict, List, Optional, Union
|
|
@@ -254,7 +255,8 @@ class SkyServeLoadBalancer:
|
|
|
254
255
|
protocol = 'https' if self._tls_credential is not None else 'http'
|
|
255
256
|
|
|
256
257
|
logger.info('SkyServe Load Balancer started on '
|
|
257
|
-
f'{protocol}://0.0.0.0:{self._load_balancer_port}'
|
|
258
|
+
f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
|
|
259
|
+
f'PID: {os.getpid()}')
|
|
258
260
|
|
|
259
261
|
uvicorn.run(self._app,
|
|
260
262
|
host='0.0.0.0',
|
sky/serve/serve_state.py
CHANGED
|
@@ -47,6 +47,10 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
|
|
|
47
47
|
service_name TEXT,
|
|
48
48
|
spec BLOB,
|
|
49
49
|
PRIMARY KEY (service_name, version))""")
|
|
50
|
+
cursor.execute("""\
|
|
51
|
+
CREATE TABLE IF NOT EXISTS ha_recovery_script (
|
|
52
|
+
service_name TEXT PRIMARY KEY,
|
|
53
|
+
script TEXT)""")
|
|
50
54
|
conn.commit()
|
|
51
55
|
|
|
52
56
|
# Backward compatibility.
|
|
@@ -71,6 +75,13 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
|
|
|
71
75
|
# Whether the service is a cluster pool.
|
|
72
76
|
db_utils.add_column_to_table(cursor, conn, 'services', 'pool',
|
|
73
77
|
'INTEGER DEFAULT 0')
|
|
78
|
+
# Add controller_pid for status tracking.
|
|
79
|
+
db_utils.add_column_to_table(cursor,
|
|
80
|
+
conn,
|
|
81
|
+
'services',
|
|
82
|
+
'controller_pid',
|
|
83
|
+
'INTEGER DEFAULT NULL',
|
|
84
|
+
value_to_replace_existing_entries=-1)
|
|
74
85
|
conn.commit()
|
|
75
86
|
|
|
76
87
|
|
|
@@ -272,7 +283,8 @@ _SERVICE_STATUS_TO_COLOR = {
|
|
|
272
283
|
@init_db
|
|
273
284
|
def add_service(name: str, controller_job_id: int, policy: str,
|
|
274
285
|
requested_resources_str: str, load_balancing_policy: str,
|
|
275
|
-
status: ServiceStatus, tls_encrypted: bool, pool: bool
|
|
286
|
+
status: ServiceStatus, tls_encrypted: bool, pool: bool,
|
|
287
|
+
controller_pid: int) -> bool:
|
|
276
288
|
"""Add a service in the database.
|
|
277
289
|
|
|
278
290
|
Returns:
|
|
@@ -287,11 +299,11 @@ def add_service(name: str, controller_job_id: int, policy: str,
|
|
|
287
299
|
INSERT INTO services
|
|
288
300
|
(name, controller_job_id, status, policy,
|
|
289
301
|
requested_resources_str, load_balancing_policy, tls_encrypted,
|
|
290
|
-
pool)
|
|
291
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
302
|
+
pool, controller_pid)
|
|
303
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
|
292
304
|
(name, controller_job_id, status.value, policy,
|
|
293
305
|
requested_resources_str, load_balancing_policy,
|
|
294
|
-
int(tls_encrypted), int(pool)))
|
|
306
|
+
int(tls_encrypted), int(pool), controller_pid))
|
|
295
307
|
|
|
296
308
|
except sqlite3.IntegrityError as e:
|
|
297
309
|
if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:
|
|
@@ -300,6 +312,22 @@ def add_service(name: str, controller_job_id: int, policy: str,
|
|
|
300
312
|
return True
|
|
301
313
|
|
|
302
314
|
|
|
315
|
+
@init_db
def update_service_controller_pid(service_name: str,
                                  controller_pid: int) -> None:
    """Store a new controller PID on the row of an existing service.

    This is used to update the controller pid of a service on HA recovery.

    Args:
        service_name: Value matched against the ``name`` column of
            ``services``.
        controller_pid: Value written to the ``controller_pid`` column.
    """
    assert _DB_PATH is not None
    sql_params = (controller_pid, service_name)
    with db_utils.safe_cursor(_DB_PATH) as cursor:
        cursor.execute(
            """\
            UPDATE services SET
            controller_pid=(?) WHERE name=(?)""", sql_params)
|
|
329
|
+
|
|
330
|
+
|
|
303
331
|
@init_db
|
|
304
332
|
def remove_service(service_name: str) -> None:
|
|
305
333
|
"""Removes a service from the database."""
|
|
@@ -368,7 +396,8 @@ def set_service_load_balancer_port(service_name: str,
|
|
|
368
396
|
def _get_service_from_row(row) -> Dict[str, Any]:
|
|
369
397
|
(current_version, name, controller_job_id, controller_port,
|
|
370
398
|
load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
|
|
371
|
-
_, active_versions, load_balancing_policy, tls_encrypted, pool
|
|
399
|
+
_, active_versions, load_balancing_policy, tls_encrypted, pool,
|
|
400
|
+
controller_pid) = row[:17]
|
|
372
401
|
record = {
|
|
373
402
|
'name': name,
|
|
374
403
|
'controller_job_id': controller_job_id,
|
|
@@ -388,6 +417,7 @@ def _get_service_from_row(row) -> Dict[str, Any]:
|
|
|
388
417
|
'load_balancing_policy': load_balancing_policy,
|
|
389
418
|
'tls_encrypted': bool(tls_encrypted),
|
|
390
419
|
'pool': bool(pool),
|
|
420
|
+
'controller_pid': controller_pid,
|
|
391
421
|
}
|
|
392
422
|
latest_spec = get_spec(name, current_version)
|
|
393
423
|
if latest_spec is not None:
|
|
@@ -666,3 +696,38 @@ def get_service_load_balancer_port(service_name: str) -> int:
|
|
|
666
696
|
if row is None:
|
|
667
697
|
raise ValueError(f'Service {service_name} does not exist.')
|
|
668
698
|
return row[0]
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
@init_db
def get_ha_recovery_script(service_name: str) -> Optional[str]:
    """Look up the HA recovery script stored for a service.

    Args:
        service_name: Key into the ``ha_recovery_script`` table.

    Returns:
        The stored script, or ``None`` when the table has no row for
        ``service_name``.
    """
    assert _DB_PATH is not None
    query = 'SELECT script FROM ha_recovery_script WHERE service_name = ?'
    with db_utils.safe_cursor(_DB_PATH) as cursor:
        cursor.execute(query, (service_name,))
        row = cursor.fetchone()
    return None if row is None else row[0]
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
@init_db
def set_ha_recovery_script(service_name: str, script: str) -> None:
    """Store the HA recovery script for a service.

    Uses ``INSERT OR REPLACE``, so any script previously stored under the
    same ``service_name`` is overwritten.

    Args:
        service_name: Primary key of the ``ha_recovery_script`` row.
        script: Script text to store.
    """
    assert _DB_PATH is not None
    row_values = (service_name, script)
    with db_utils.safe_cursor(_DB_PATH) as cursor:
        cursor.execute(
            """\
            INSERT OR REPLACE INTO ha_recovery_script
            (service_name, script)
            VALUES (?, ?)""", row_values)
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
@init_db
def remove_ha_recovery_script(service_name: str) -> None:
    """Delete the stored HA recovery script of a service.

    Args:
        service_name: Key of the ``ha_recovery_script`` row to delete.
    """
    assert _DB_PATH is not None
    statement = 'DELETE FROM ha_recovery_script WHERE service_name = ?'
    with db_utils.safe_cursor(_DB_PATH) as cursor:
        cursor.execute(statement, (service_name,))
|
sky/serve/serve_utils.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import base64
|
|
3
3
|
import collections
|
|
4
4
|
import dataclasses
|
|
5
|
+
import datetime
|
|
5
6
|
import enum
|
|
6
7
|
import os
|
|
7
8
|
import pathlib
|
|
@@ -33,6 +34,7 @@ from sky.serve import spot_placer
|
|
|
33
34
|
from sky.skylet import constants as skylet_constants
|
|
34
35
|
from sky.skylet import job_lib
|
|
35
36
|
from sky.utils import annotations
|
|
37
|
+
from sky.utils import command_runner
|
|
36
38
|
from sky.utils import common_utils
|
|
37
39
|
from sky.utils import log_utils
|
|
38
40
|
from sky.utils import message_utils
|
|
@@ -258,13 +260,76 @@ def get_service_filelock_path(pool: str) -> str:
|
|
|
258
260
|
|
|
259
261
|
|
|
260
262
|
@annotations.lru_cache(scope='request', maxsize=1)
def is_consolidation_mode(pool: bool = False) -> bool:
    """Read the ``consolidation_mode`` flag from the SkyPilot config.

    Args:
        pool: When true, the flag is read from the ``jobs`` controller
            section; otherwise from the ``serve`` controller section.

    Returns:
        The configured value, or ``False`` when the key is not set.
    """
    # Use jobs config for pool consolidation mode.
    config_section = 'jobs' if pool else 'serve'
    config_key = (config_section, 'controller', 'consolidation_mode')
    # NOTE: a consistency check on the value read here
    # (_check_consolidation_mode_consistency) is currently disabled.
    return skypilot_config.get_nested(config_key, default_value=False)
|
|
266
271
|
|
|
267
272
|
|
|
273
|
+
def ha_recovery_for_consolidation_mode(pool: bool):
    """Re-run stored recovery scripts for services without a live controller.

    Progress is written to the file at
    ``skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH`` formatted with the
    prefix ``'pool_'`` or ``'serve_'``. For each service name:

    * it is skipped when ``_get_service_status`` returns ``None``;
    * it is skipped when its recorded controller PID passes
      ``_controller_process_alive``; an exception raised by that check is
      logged and recovery proceeds;
    * it is skipped when no recovery script is stored for it;
    * otherwise the stored script is run locally, and a non-zero return code
      is logged together with the captured output.

    Args:
        pool: Selects the pool (``True``) or serve (``False``) wording and log
            prefix; also forwarded to ``_get_service_status``.
    """
    # No setup recovery is needed in consolidation mode, as the API server
    # already has all runtime installed. Directly start jobs recovery here.
    # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
    runner = command_runner.LocalProcessCommandRunner()
    noun = 'pool' if pool else 'serve'
    capnoun = noun.capitalize()
    log_path = skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(
        f'{noun}_')
    with open(log_path, 'w', encoding='utf-8') as log_file:
        started_at = time.time()
        log_file.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
        for service_name in serve_state.get_glob_service_names(None):
            record = _get_service_status(service_name,
                                         pool=pool,
                                         with_replica_info=False)
            if record is None:
                continue

            controller_pid = record['controller_pid']
            if controller_pid is not None:
                try:
                    if _controller_process_alive(controller_pid, service_name):
                        log_file.write(
                            f'Controller pid {controller_pid} for '
                            f'{noun} {service_name} is still running. '
                            'Skipping recovery.\n')
                        continue
                except Exception:  # pylint: disable=broad-except
                    # _controller_process_alive may raise if psutil fails; we
                    # should not crash the recovery logic because of this.
                    log_file.write(
                        'Error checking controller pid '
                        f'{controller_pid} for {noun} {service_name}\n')

            script = serve_state.get_ha_recovery_script(service_name)
            if script is None:
                log_file.write(
                    f'{capnoun} {service_name}\'s recovery script does '
                    'not exist. Skipping recovery.\n')
                continue

            returncode, stdout, stderr = runner.run(script,
                                                    require_outputs=True)
            if returncode:
                log_file.write(f'Recovery script returned {returncode}. '
                               f'Output: {stdout}\nError: {stderr}\n')
            # Written regardless of the script's return code.
            log_file.write(f'{capnoun} {service_name} completed recovery at '
                           f'{datetime.datetime.now()}\n')
        log_file.write(
            f'HA recovery completed at {datetime.datetime.now()}\n')
        log_file.write(
            f'Total recovery time: {time.time() - started_at} seconds\n')
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _cmdline_has_service_name(cmdline: List[str], service_name: str) -> bool:
    """Return True iff ``--service-name <service_name>`` appears as tokens.

    The command line is joined and re-split on whitespace so that the flag is
    found both when it is a separate argv entry and when it is embedded in a
    single shell string (e.g. ``bash -c '...'``).
    """
    tokens = ' '.join(cmdline).split()
    return any(flag == '--service-name' and value == service_name
               for flag, value in zip(tokens, tokens[1:]))


def _controller_process_alive(pid: int, service_name: str) -> bool:
    """Check if the controller process of ``service_name`` is alive.

    The process must both be running and carry ``--service-name
    <service_name>`` on its command line. The name is compared as a whole
    token rather than as a substring, so a PID now owned by the controller of
    another service whose name merely starts with ``service_name`` (e.g.
    ``svc`` vs. ``svc-2``) is not mistaken for this service's controller.

    Args:
        pid: Recorded PID of the controller process.
        service_name: Name of the service the controller belongs to.

    Returns:
        True if the process is running for this service; False if it is not,
        or if ``psutil`` reports that no such process exists.

    Raises:
        psutil.Error: Errors other than ``psutil.NoSuchProcess`` raised while
            inspecting the process are propagated to the caller.
    """
    try:
        process = psutil.Process(pid)
        cmdline = process.cmdline()
        return process.is_running() and _cmdline_has_service_name(
            cmdline, service_name)
    except psutil.NoSuchProcess:
        return False
|
|
331
|
+
|
|
332
|
+
|
|
268
333
|
def validate_service_task(task: 'sky.Task', pool: bool) -> None:
|
|
269
334
|
"""Validate the task for Sky Serve.
|
|
270
335
|
|
|
@@ -460,22 +525,53 @@ def set_service_status_and_active_versions_from_replica(
|
|
|
460
525
|
active_versions=active_versions)
|
|
461
526
|
|
|
462
527
|
|
|
463
|
-
def update_service_status() -> None:
|
|
464
|
-
if
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
528
|
+
def update_service_status(pool: bool) -> None:
    """Mark services whose controller is no longer running as failed.

    Every service name is looked up with ``_get_service_status``. Names for
    which that returns ``None``, and services in ``SHUTTING_DOWN``, are left
    untouched. For the rest, the controller is considered gone when:

    * no controller PID is recorded;
    * the PID is negative (a service recorded before PID tracking) and the
      controller job status is missing or terminal; or
    * the PID is non-negative and ``_controller_process_alive`` is false.

    Such services are set to ``CONTROLLER_FAILED``.

    Args:
        pool: Selects the pool (``True``) or serve (``False``) wording in log
            messages; also forwarded to ``_get_service_status``.
    """
    noun = 'pool' if pool else 'serve'
    capnoun = noun.capitalize()
    for service_name in serve_state.get_glob_service_names(None):
        record = _get_service_status(service_name,
                                     pool=pool,
                                     with_replica_info=False)
        if record is None:
            continue
        service_status = record['status']
        if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
            # Skip services that is shutting down.
            continue

        logger.info(f'Update {noun} status for {service_name!r} '
                    f'with status {service_status}')

        controller_pid = record['controller_pid']
        if controller_pid is None:
            logger.info(f'{capnoun} {service_name!r} controller pid is None. '
                        f'Unexpected status {service_status}. Set to failure.')
        elif controller_pid >= 0:
            if _controller_process_alive(controller_pid, service_name):
                # The controller is still running.
                continue
            logger.info(f'{capnoun} {service_name!r} controller pid '
                        f'{controller_pid} is not alive. Set to failure.')
        else:
            # Backwards compatibility: this service was submitted when ray was
            # still used for controller process management. We set the
            # value_to_replace_existing_entries to -1 to indicate historical
            # services.
            # TODO(tian): Remove before 0.13.0.
            controller_job_id = record['controller_job_id']
            assert controller_job_id is not None
            controller_status = job_lib.get_status(controller_job_id)
            job_still_active = (controller_status is not None and
                                not controller_status.is_terminal())
            if job_still_active:
                continue
            logger.info(f'Updating {noun} {service_name!r} in old version. '
                        f'SkyPilot job status: {controller_status}. '
                        'Set to failure.')

        # If controller job is not running, set it as controller failed.
        serve_state.set_service_status_and_active_versions(
            service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)
|
|
479
575
|
|
|
480
576
|
|
|
481
577
|
def update_service_encoded(service_name: str, version: int, mode: str,
|
|
@@ -754,9 +850,11 @@ def _terminate_failed_services(
|
|
|
754
850
|
shutil.rmtree(service_dir)
|
|
755
851
|
serve_state.remove_service(service_name)
|
|
756
852
|
serve_state.delete_all_versions(service_name)
|
|
853
|
+
serve_state.remove_ha_recovery_script(service_name)
|
|
757
854
|
|
|
758
855
|
if not remaining_replica_clusters:
|
|
759
856
|
return None
|
|
857
|
+
# TODO(tian): Try to terminate those replica clusters.
|
|
760
858
|
remaining_identity = ', '.join(remaining_replica_clusters)
|
|
761
859
|
return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
|
|
762
860
|
f'failed status ({service_status}). This may indicate a resource '
|
|
@@ -845,7 +943,8 @@ def terminate_services(service_names: Optional[List[str]], purge: bool,
|
|
|
845
943
|
return '\n'.join(messages)
|
|
846
944
|
|
|
847
945
|
|
|
848
|
-
def wait_service_registration(service_name: str, job_id: int
|
|
946
|
+
def wait_service_registration(service_name: str, job_id: int,
|
|
947
|
+
pool: bool) -> str:
|
|
849
948
|
"""Util function to call at the end of `sky.serve.up()`.
|
|
850
949
|
|
|
851
950
|
This function will:
|
|
@@ -862,7 +961,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
|
|
|
862
961
|
setup_completed = False
|
|
863
962
|
while True:
|
|
864
963
|
# TODO(tian): PID-based tracking.
|
|
865
|
-
if not is_consolidation_mode():
|
|
964
|
+
if not is_consolidation_mode(pool):
|
|
866
965
|
job_status = job_lib.get_status(job_id)
|
|
867
966
|
if job_status is None or job_status < job_lib.JobStatus.RUNNING:
|
|
868
967
|
# Wait for the controller process to finish setting up. It
|
|
@@ -888,7 +987,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
|
|
|
888
987
|
record = serve_state.get_service_from_name(service_name)
|
|
889
988
|
if record is not None:
|
|
890
989
|
# TODO(tian): PID-based tracking.
|
|
891
|
-
if (not is_consolidation_mode() and
|
|
990
|
+
if (not is_consolidation_mode(pool) and
|
|
892
991
|
job_id != record['controller_job_id']):
|
|
893
992
|
with ux_utils.print_exception_no_traceback():
|
|
894
993
|
raise ValueError(
|
|
@@ -1420,10 +1519,13 @@ class ServeCodeGen:
|
|
|
1420
1519
|
return cls._build(code)
|
|
1421
1520
|
|
|
1422
1521
|
@classmethod
def wait_service_registration(cls, service_name: str, job_id: int,
                              pool: bool) -> str:
    """Build the code that calls ``serve_utils.wait_service_registration``.

    The generated code passes ``pool`` as a keyword argument only when its
    ``serve_version`` is at least 4, then prints the returned message
    without a trailing newline.

    Args:
        service_name: Service name embedded (via ``repr``) in the call.
        job_id: Controller job ID embedded in the call.
        pool: Value embedded in the ``pool`` keyword argument.

    Returns:
        The result of ``cls._build`` on the generated lines.
    """
    kwargs_line = f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}'
    call_line = ('msg = serve_utils.wait_service_registration('
                 f'{service_name!r}, {job_id}, **kwargs)')
    print_line = 'print(msg, end="", flush=True)'
    return cls._build([kwargs_line, call_line, print_line])
|
|
1429
1531
|
|
sky/serve/server/impl.py
CHANGED
|
@@ -102,10 +102,10 @@ def up(
|
|
|
102
102
|
pool: bool = False,
|
|
103
103
|
) -> Tuple[str, str]:
|
|
104
104
|
"""Spins up a service or a pool."""
|
|
105
|
-
if pool and not serve_utils.is_consolidation_mode():
|
|
105
|
+
if pool and not serve_utils.is_consolidation_mode(pool):
|
|
106
106
|
raise ValueError(
|
|
107
107
|
'Pool is only supported in consolidation mode. To fix, set '
|
|
108
|
-
'`
|
|
108
|
+
'`jobs.controller.consolidation_mode: true` in SkyPilot config.')
|
|
109
109
|
task.validate()
|
|
110
110
|
serve_utils.validate_service_task(task, pool=pool)
|
|
111
111
|
assert task.service is not None
|
|
@@ -174,7 +174,8 @@ def up(
|
|
|
174
174
|
prefix=f'controller-task-{service_name}-',
|
|
175
175
|
mode='w',
|
|
176
176
|
) as controller_file:
|
|
177
|
-
|
|
177
|
+
controller = controller_utils.get_controller_for_pool(pool)
|
|
178
|
+
controller_name = controller.value.cluster_name
|
|
178
179
|
task_config = task.to_yaml_config()
|
|
179
180
|
common_utils.dump_yaml(service_file.name, task_config)
|
|
180
181
|
remote_tmp_task_yaml_path = (
|
|
@@ -187,7 +188,7 @@ def up(
|
|
|
187
188
|
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
|
188
189
|
task_resources=task.resources)
|
|
189
190
|
controller_job_id = None
|
|
190
|
-
if serve_utils.is_consolidation_mode():
|
|
191
|
+
if serve_utils.is_consolidation_mode(pool):
|
|
191
192
|
controller_job_id = 0
|
|
192
193
|
|
|
193
194
|
vars_to_fill = {
|
|
@@ -238,7 +239,7 @@ def up(
|
|
|
238
239
|
# for the first time; otherwise it is a name conflict.
|
|
239
240
|
# Since the controller may be shared among multiple users, launch the
|
|
240
241
|
# controller with the API server's user hash.
|
|
241
|
-
if not serve_utils.is_consolidation_mode():
|
|
242
|
+
if not serve_utils.is_consolidation_mode(pool):
|
|
242
243
|
print(f'{colorama.Fore.YELLOW}Launching controller for '
|
|
243
244
|
f'{service_name!r}...{colorama.Style.RESET_ALL}')
|
|
244
245
|
with common.with_server_user():
|
|
@@ -251,9 +252,9 @@ def up(
|
|
|
251
252
|
_disable_controller_check=True,
|
|
252
253
|
)
|
|
253
254
|
else:
|
|
255
|
+
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
254
256
|
controller_handle = backend_utils.is_controller_accessible(
|
|
255
|
-
controller=
|
|
256
|
-
stopped_message='')
|
|
257
|
+
controller=controller_type, stopped_message='')
|
|
257
258
|
backend = backend_utils.get_backend_from_handle(controller_handle)
|
|
258
259
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
259
260
|
backend.sync_file_mounts(
|
|
@@ -270,10 +271,8 @@ def up(
|
|
|
270
271
|
]
|
|
271
272
|
run_script = '\n'.join(env_cmds + [run_script])
|
|
272
273
|
# Dump script for high availability recovery.
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
# managed_job_state.set_ha_recovery_script(
|
|
276
|
-
# consolidation_mode_job_id, run_script)
|
|
274
|
+
if controller_utils.high_availability_specified(controller_name):
|
|
275
|
+
serve_state.set_ha_recovery_script(service_name, run_script)
|
|
277
276
|
backend.run_on_head(controller_handle, run_script)
|
|
278
277
|
|
|
279
278
|
style = colorama.Style
|
|
@@ -289,7 +288,7 @@ def up(
|
|
|
289
288
|
# and return the endpoint if the job id matches. Otherwise it will
|
|
290
289
|
# return None.
|
|
291
290
|
code = serve_utils.ServeCodeGen.wait_service_registration(
|
|
292
|
-
service_name, controller_job_id)
|
|
291
|
+
service_name, controller_job_id, pool)
|
|
293
292
|
backend = backend_utils.get_backend_from_handle(controller_handle)
|
|
294
293
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
295
294
|
assert isinstance(controller_handle,
|
|
@@ -304,7 +303,7 @@ def up(
|
|
|
304
303
|
returncode, code, f'Failed to wait for {noun} initialization',
|
|
305
304
|
lb_port_payload)
|
|
306
305
|
except exceptions.CommandError:
|
|
307
|
-
if serve_utils.is_consolidation_mode():
|
|
306
|
+
if serve_utils.is_consolidation_mode(pool):
|
|
308
307
|
with ux_utils.print_exception_no_traceback():
|
|
309
308
|
raise RuntimeError(
|
|
310
309
|
f'Failed to wait for {noun} initialization. '
|
|
@@ -339,7 +338,7 @@ def up(
|
|
|
339
338
|
else:
|
|
340
339
|
lb_port = serve_utils.load_service_initialization_result(
|
|
341
340
|
lb_port_payload)
|
|
342
|
-
if not serve_utils.is_consolidation_mode():
|
|
341
|
+
if not serve_utils.is_consolidation_mode(pool):
|
|
343
342
|
socket_endpoint = backend_utils.get_endpoints(
|
|
344
343
|
controller_handle.cluster_name,
|
|
345
344
|
lb_port,
|
|
@@ -442,8 +441,9 @@ def update(
|
|
|
442
441
|
'effect. To update TLS keyfile and certfile, please '
|
|
443
442
|
'tear down the service and spin up a new one.')
|
|
444
443
|
|
|
444
|
+
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
445
445
|
handle = backend_utils.is_controller_accessible(
|
|
446
|
-
controller=
|
|
446
|
+
controller=controller_type,
|
|
447
447
|
stopped_message=
|
|
448
448
|
'Service controller is stopped. There is no service to update. '
|
|
449
449
|
f'To spin up a new service, use {ux_utils.BOLD}'
|
|
@@ -572,9 +572,9 @@ def apply(
|
|
|
572
572
|
"""Applies the config to the service or pool."""
|
|
573
573
|
with filelock.FileLock(serve_utils.get_service_filelock_path(service_name)):
|
|
574
574
|
try:
|
|
575
|
+
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
575
576
|
handle = backend_utils.is_controller_accessible(
|
|
576
|
-
controller=
|
|
577
|
-
stopped_message='')
|
|
577
|
+
controller=controller_type, stopped_message='')
|
|
578
578
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
579
579
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
580
580
|
service_record = _get_service_record(service_name, pool, handle,
|
|
@@ -598,8 +598,9 @@ def down(
|
|
|
598
598
|
service_names = []
|
|
599
599
|
if isinstance(service_names, str):
|
|
600
600
|
service_names = [service_names]
|
|
601
|
+
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
601
602
|
handle = backend_utils.is_controller_accessible(
|
|
602
|
-
controller=
|
|
603
|
+
controller=controller_type,
|
|
603
604
|
stopped_message=f'All {noun}s should have terminated.')
|
|
604
605
|
|
|
605
606
|
service_names_str = ','.join(service_names)
|
|
@@ -624,7 +625,7 @@ def down(
|
|
|
624
625
|
except exceptions.FetchClusterInfoError as e:
|
|
625
626
|
raise RuntimeError(
|
|
626
627
|
'Failed to fetch controller IP. Please refresh controller status '
|
|
627
|
-
f'by `sky status -r {
|
|
628
|
+
f'by `sky status -r {controller_type.value.cluster_name}` '
|
|
628
629
|
'and try again.') from e
|
|
629
630
|
|
|
630
631
|
try:
|
|
@@ -654,7 +655,7 @@ def status(
|
|
|
654
655
|
raise RuntimeError(f'Failed to refresh {noun}s status '
|
|
655
656
|
'due to network error.') from e
|
|
656
657
|
|
|
657
|
-
controller_type = controller_utils.
|
|
658
|
+
controller_type = controller_utils.get_controller_for_pool(pool)
|
|
658
659
|
handle = backend_utils.is_controller_accessible(
|
|
659
660
|
controller=controller_type,
|
|
660
661
|
stopped_message=controller_type.value.default_hint_if_non_existent.
|
|
@@ -690,7 +691,7 @@ def status(
|
|
|
690
691
|
if service_record['load_balancer_port'] is not None:
|
|
691
692
|
try:
|
|
692
693
|
lb_port = service_record['load_balancer_port']
|
|
693
|
-
if not serve_utils.is_consolidation_mode():
|
|
694
|
+
if not serve_utils.is_consolidation_mode(pool):
|
|
694
695
|
endpoint = backend_utils.get_endpoints(
|
|
695
696
|
cluster=common.SKY_SERVE_CONTROLLER_NAME,
|
|
696
697
|
port=lb_port).get(lb_port, None)
|