skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250806__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (103)
  1. sky/__init__.py +2 -2
  2. sky/catalog/kubernetes_catalog.py +8 -0
  3. sky/catalog/nebius_catalog.py +0 -1
  4. sky/client/cli/command.py +26 -7
  5. sky/client/sdk.py +16 -8
  6. sky/client/sdk.pyi +6 -5
  7. sky/client/sdk_async.py +811 -0
  8. sky/clouds/kubernetes.py +6 -1
  9. sky/clouds/nebius.py +1 -4
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-2a43ea3241bbdacd.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa63e8b1d203f298.js → [job]-7cb24da04ca00956.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9e7df5fc761c95a7.js → [cluster]-1e95993124dbfc57.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters-47f1ddae13a2f8e4.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-2a44e70b500b6b70.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/infra-22faac9325016d83.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +11 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/users-b90c865a690bfe84.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/volumes-7af733f5d7b6ed1c.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-4d41c9023287f59a.js → [name]-35e0de5bca55e594.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +1 -0
  32. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  33. sky/dashboard/out/clusters/[cluster].html +1 -1
  34. sky/dashboard/out/clusters.html +1 -1
  35. sky/dashboard/out/config.html +1 -1
  36. sky/dashboard/out/index.html +1 -1
  37. sky/dashboard/out/infra/[context].html +1 -1
  38. sky/dashboard/out/infra.html +1 -1
  39. sky/dashboard/out/jobs/[job].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/volumes.html +1 -1
  43. sky/dashboard/out/workspace/new.html +1 -1
  44. sky/dashboard/out/workspaces/[name].html +1 -1
  45. sky/dashboard/out/workspaces.html +1 -1
  46. sky/jobs/client/sdk_async.py +135 -0
  47. sky/jobs/utils.py +3 -1
  48. sky/provision/kubernetes/utils.py +30 -4
  49. sky/provision/nebius/instance.py +1 -0
  50. sky/provision/nebius/utils.py +9 -1
  51. sky/serve/client/sdk_async.py +130 -0
  52. sky/serve/constants.py +2 -1
  53. sky/serve/controller.py +2 -1
  54. sky/serve/load_balancer.py +3 -1
  55. sky/serve/serve_state.py +70 -5
  56. sky/serve/serve_utils.py +124 -22
  57. sky/serve/server/impl.py +22 -21
  58. sky/serve/service.py +8 -1
  59. sky/server/auth/__init__.py +0 -0
  60. sky/server/auth/authn.py +46 -0
  61. sky/server/auth/oauth2_proxy.py +185 -0
  62. sky/server/common.py +108 -17
  63. sky/server/constants.py +1 -1
  64. sky/server/daemons.py +60 -11
  65. sky/server/rest.py +114 -0
  66. sky/server/server.py +44 -40
  67. sky/setup_files/dependencies.py +2 -0
  68. sky/skylet/constants.py +1 -1
  69. sky/skylet/events.py +5 -1
  70. sky/skylet/skylet.py +3 -1
  71. sky/task.py +43 -10
  72. sky/templates/kubernetes-ray.yml.j2 +4 -0
  73. sky/templates/nebius-ray.yml.j2 +1 -0
  74. sky/utils/controller_utils.py +7 -0
  75. sky/utils/rich_utils.py +120 -0
  76. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/METADATA +5 -1
  77. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/RECORD +86 -81
  78. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  81. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +0 -6
  85. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  90. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  94. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  95. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → Gelsd19kVxXcX7aQQGsGu}/_ssgManifest.js +0 -0
  96. /sky/dashboard/out/_next/static/chunks/{1871-7e17c195296e2ea9.js → 1871-ced1c14230cad6e1.js} +0 -0
  97. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-2d7ed3350659d073.js} +0 -0
  98. /sky/dashboard/out/_next/static/chunks/{6601-234b1cf963c7280b.js → 6601-2109d22e7861861c.js} +0 -0
  99. /sky/dashboard/out/_next/static/chunks/{938-40d15b6261ec8dc1.js → 938-bda2685db5eae6cf.js} +0 -0
  100. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/WHEEL +0 -0
  101. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/entry_points.txt +0 -0
  102. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/licenses/LICENSE +0 -0
  103. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250806.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,130 @@
1
+ """Async SDK for SkyServe."""
2
+ import typing
3
+ from typing import Any, Dict, List, Optional, Tuple, Union
4
+
5
+ from sky.client import sdk_async
6
+ from sky.serve.client import sdk
7
+ from sky.usage import usage_lib
8
+ from sky.utils import context_utils
9
+
10
+ if typing.TYPE_CHECKING:
11
+ import io
12
+
13
+ import sky
14
+ from sky.serve import serve_utils
15
+
16
+
17
+ @usage_lib.entrypoint
18
+ async def up(
19
+ task: Union['sky.Task', 'sky.Dag'],
20
+ service_name: str,
21
+ # Internal only:
22
+ # pylint: disable=invalid-name
23
+ _need_confirmation: bool = False,
24
+ stream_logs: Optional[
25
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
26
+ ) -> Tuple[str, str]:
27
+ """Async version of up() that spins up a service."""
28
+ request_id = await context_utils.to_thread(sdk.up, task, service_name,
29
+ _need_confirmation)
30
+ if stream_logs is not None:
31
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
32
+ else:
33
+ return await sdk_async.get(request_id)
34
+
35
+
36
+ @usage_lib.entrypoint
37
+ async def update(
38
+ task: Union['sky.Task', 'sky.Dag'],
39
+ service_name: str,
40
+ mode: 'serve_utils.UpdateMode',
41
+ # Internal only:
42
+ # pylint: disable=invalid-name
43
+ _need_confirmation: bool = False,
44
+ stream_logs: Optional[
45
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
46
+ ) -> None:
47
+ """Async version of update() that updates an existing service."""
48
+ request_id = await context_utils.to_thread(sdk.update, task, service_name,
49
+ mode, _need_confirmation)
50
+ if stream_logs is not None:
51
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
52
+ else:
53
+ return await sdk_async.get(request_id)
54
+
55
+
56
+ @usage_lib.entrypoint
57
+ async def down(
58
+ service_names: Optional[Union[str, List[str]]],
59
+ all: bool = False, # pylint: disable=redefined-builtin
60
+ purge: bool = False,
61
+ stream_logs: Optional[
62
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
63
+ ) -> None:
64
+ """Async version of down() that tears down a service."""
65
+ request_id = await context_utils.to_thread(sdk.down, service_names, all,
66
+ purge)
67
+ if stream_logs is not None:
68
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
69
+ else:
70
+ return await sdk_async.get(request_id)
71
+
72
+
73
+ @usage_lib.entrypoint
74
+ async def terminate_replica(
75
+ service_name: str,
76
+ replica_id: int,
77
+ purge: bool,
78
+ stream_logs: Optional[
79
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
80
+ ) -> None:
81
+ """Async version of terminate_replica() that tears down a specific
82
+ replica."""
83
+ request_id = await context_utils.to_thread(sdk.terminate_replica,
84
+ service_name, replica_id, purge)
85
+ if stream_logs is not None:
86
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
87
+ else:
88
+ return await sdk_async.get(request_id)
89
+
90
+
91
+ @usage_lib.entrypoint
92
+ async def status(
93
+ service_names: Optional[Union[str, List[str]]],
94
+ stream_logs: Optional[
95
+ sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
96
+ ) -> List[Dict[str, Any]]:
97
+ """Async version of status() that gets service statuses."""
98
+ request_id = await context_utils.to_thread(sdk.status, service_names)
99
+ if stream_logs is not None:
100
+ return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
101
+ else:
102
+ return await sdk_async.get(request_id)
103
+
104
+
105
+ @usage_lib.entrypoint
106
+ async def tail_logs(service_name: str,
107
+ target: Union[str, 'serve_utils.ServiceComponent'],
108
+ replica_id: Optional[int] = None,
109
+ follow: bool = True,
110
+ output_stream: Optional['io.TextIOBase'] = None) -> None:
111
+ """Async version of tail_logs() that tails logs for a service."""
112
+ return await context_utils.to_thread(sdk.tail_logs, service_name, target,
113
+ replica_id, follow, output_stream)
114
+
115
+
116
+ @usage_lib.entrypoint
117
+ async def sync_down_logs(service_name: str,
118
+ local_dir: str,
119
+ *,
120
+ targets: Optional[Union[
121
+ str, 'serve_utils.ServiceComponent', List[Union[
122
+ str, 'serve_utils.ServiceComponent']]]] = None,
123
+ replica_ids: Optional[List[int]] = None) -> None:
124
+ """Async version of sync_down_logs() that syncs down logs from service
125
+ components."""
126
+ return await context_utils.to_thread(sdk.sync_down_logs,
127
+ service_name,
128
+ local_dir,
129
+ targets=targets,
130
+ replica_ids=replica_ids)
sky/serve/constants.py CHANGED
@@ -105,7 +105,8 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
105
105
  # v1.0 - Introduce rolling update.
106
106
  # v2.0 - Added template-replica feature.
107
107
  # v3.0 - Added cluster pool.
108
- SERVE_VERSION = 3
108
+ # v4.0 - Added pool argument to wait_service_registration.
109
+ SERVE_VERSION = 4
109
110
 
110
111
  TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
111
112
  'The version of service is outdated and does not support manually '
sky/serve/controller.py CHANGED
@@ -4,6 +4,7 @@ Responsible for autoscaling and replica management.
4
4
  """
5
5
  import contextlib
6
6
  import logging
7
+ import os
7
8
  import threading
8
9
  import time
9
10
  import traceback
@@ -242,7 +243,7 @@ class SkyServeController:
242
243
  threading.Thread(target=self._run_autoscaler).start()
243
244
 
244
245
  logger.info('SkyServe Controller started on '
245
- f'http://{self._host}:{self._port}')
246
+ f'http://{self._host}:{self._port}. PID: {os.getpid()}')
246
247
 
247
248
  uvicorn.run(self._app, host=self._host, port=self._port)
248
249
 
@@ -1,6 +1,7 @@
1
1
  """LoadBalancer: Distribute any incoming request to all ready replicas."""
2
2
  import asyncio
3
3
  import logging
4
+ import os
4
5
  import threading
5
6
  import traceback
6
7
  from typing import Dict, List, Optional, Union
@@ -254,7 +255,8 @@ class SkyServeLoadBalancer:
254
255
  protocol = 'https' if self._tls_credential is not None else 'http'
255
256
 
256
257
  logger.info('SkyServe Load Balancer started on '
257
- f'{protocol}://0.0.0.0:{self._load_balancer_port}')
258
+ f'{protocol}://0.0.0.0:{self._load_balancer_port}. '
259
+ f'PID: {os.getpid()}')
258
260
 
259
261
  uvicorn.run(self._app,
260
262
  host='0.0.0.0',
sky/serve/serve_state.py CHANGED
@@ -47,6 +47,10 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
47
47
  service_name TEXT,
48
48
  spec BLOB,
49
49
  PRIMARY KEY (service_name, version))""")
50
+ cursor.execute("""\
51
+ CREATE TABLE IF NOT EXISTS ha_recovery_script (
52
+ service_name TEXT PRIMARY KEY,
53
+ script TEXT)""")
50
54
  conn.commit()
51
55
 
52
56
  # Backward compatibility.
@@ -71,6 +75,13 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
71
75
  # Whether the service is a cluster pool.
72
76
  db_utils.add_column_to_table(cursor, conn, 'services', 'pool',
73
77
  'INTEGER DEFAULT 0')
78
+ # Add controller_pid for status tracking.
79
+ db_utils.add_column_to_table(cursor,
80
+ conn,
81
+ 'services',
82
+ 'controller_pid',
83
+ 'INTEGER DEFAULT NULL',
84
+ value_to_replace_existing_entries=-1)
74
85
  conn.commit()
75
86
 
76
87
 
@@ -272,7 +283,8 @@ _SERVICE_STATUS_TO_COLOR = {
272
283
  @init_db
273
284
  def add_service(name: str, controller_job_id: int, policy: str,
274
285
  requested_resources_str: str, load_balancing_policy: str,
275
- status: ServiceStatus, tls_encrypted: bool, pool: bool) -> bool:
286
+ status: ServiceStatus, tls_encrypted: bool, pool: bool,
287
+ controller_pid: int) -> bool:
276
288
  """Add a service in the database.
277
289
 
278
290
  Returns:
@@ -287,11 +299,11 @@ def add_service(name: str, controller_job_id: int, policy: str,
287
299
  INSERT INTO services
288
300
  (name, controller_job_id, status, policy,
289
301
  requested_resources_str, load_balancing_policy, tls_encrypted,
290
- pool)
291
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
302
+ pool, controller_pid)
303
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
292
304
  (name, controller_job_id, status.value, policy,
293
305
  requested_resources_str, load_balancing_policy,
294
- int(tls_encrypted), int(pool)))
306
+ int(tls_encrypted), int(pool), controller_pid))
295
307
 
296
308
  except sqlite3.IntegrityError as e:
297
309
  if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:
@@ -300,6 +312,22 @@ def add_service(name: str, controller_job_id: int, policy: str,
300
312
  return True
301
313
 
302
314
 
315
+ @init_db
316
+ def update_service_controller_pid(service_name: str,
317
+ controller_pid: int) -> None:
318
+ """Updates the controller pid of a service.
319
+
320
+ This is used to update the controller pid of a service on ha recovery.
321
+ """
322
+ assert _DB_PATH is not None
323
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
324
+ cursor.execute(
325
+ """\
326
+ UPDATE services SET
327
+ controller_pid=(?) WHERE name=(?)""",
328
+ (controller_pid, service_name))
329
+
330
+
303
331
  @init_db
304
332
  def remove_service(service_name: str) -> None:
305
333
  """Removes a service from the database."""
@@ -368,7 +396,8 @@ def set_service_load_balancer_port(service_name: str,
368
396
  def _get_service_from_row(row) -> Dict[str, Any]:
369
397
  (current_version, name, controller_job_id, controller_port,
370
398
  load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
371
- _, active_versions, load_balancing_policy, tls_encrypted, pool) = row[:16]
399
+ _, active_versions, load_balancing_policy, tls_encrypted, pool,
400
+ controller_pid) = row[:17]
372
401
  record = {
373
402
  'name': name,
374
403
  'controller_job_id': controller_job_id,
@@ -388,6 +417,7 @@ def _get_service_from_row(row) -> Dict[str, Any]:
388
417
  'load_balancing_policy': load_balancing_policy,
389
418
  'tls_encrypted': bool(tls_encrypted),
390
419
  'pool': bool(pool),
420
+ 'controller_pid': controller_pid,
391
421
  }
392
422
  latest_spec = get_spec(name, current_version)
393
423
  if latest_spec is not None:
@@ -666,3 +696,38 @@ def get_service_load_balancer_port(service_name: str) -> int:
666
696
  if row is None:
667
697
  raise ValueError(f'Service {service_name} does not exist.')
668
698
  return row[0]
699
+
700
+
701
+ @init_db
702
+ def get_ha_recovery_script(service_name: str) -> Optional[str]:
703
+ """Gets the HA recovery script for a service."""
704
+ assert _DB_PATH is not None
705
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
706
+ cursor.execute(
707
+ 'SELECT script FROM ha_recovery_script WHERE service_name = ?',
708
+ (service_name,))
709
+ row = cursor.fetchone()
710
+ if row is None:
711
+ return None
712
+ return row[0]
713
+
714
+
715
+ @init_db
716
+ def set_ha_recovery_script(service_name: str, script: str) -> None:
717
+ """Sets the HA recovery script for a service."""
718
+ assert _DB_PATH is not None
719
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
720
+ cursor.execute(
721
+ """\
722
+ INSERT OR REPLACE INTO ha_recovery_script
723
+ (service_name, script)
724
+ VALUES (?, ?)""", (service_name, script))
725
+
726
+
727
+ @init_db
728
+ def remove_ha_recovery_script(service_name: str) -> None:
729
+ """Removes the HA recovery script for a service."""
730
+ assert _DB_PATH is not None
731
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
732
+ cursor.execute('DELETE FROM ha_recovery_script WHERE service_name = ?',
733
+ (service_name,))
sky/serve/serve_utils.py CHANGED
@@ -2,6 +2,7 @@
2
2
  import base64
3
3
  import collections
4
4
  import dataclasses
5
+ import datetime
5
6
  import enum
6
7
  import os
7
8
  import pathlib
@@ -33,6 +34,7 @@ from sky.serve import spot_placer
33
34
  from sky.skylet import constants as skylet_constants
34
35
  from sky.skylet import job_lib
35
36
  from sky.utils import annotations
37
+ from sky.utils import command_runner
36
38
  from sky.utils import common_utils
37
39
  from sky.utils import log_utils
38
40
  from sky.utils import message_utils
@@ -258,13 +260,76 @@ def get_service_filelock_path(pool: str) -> str:
258
260
 
259
261
 
260
262
  @annotations.lru_cache(scope='request', maxsize=1)
261
- def is_consolidation_mode() -> bool:
263
+ def is_consolidation_mode(pool: bool = False) -> bool:
264
+ # Use jobs config for pool consolidation mode.
265
+ controller_type = 'jobs' if pool else 'serve'
262
266
  consolidation_mode = skypilot_config.get_nested(
263
- ('serve', 'controller', 'consolidation_mode'), default_value=False)
264
- # _check_consolidation_mode_consistency(consolidation_mode)
267
+ (controller_type, 'controller', 'consolidation_mode'),
268
+ default_value=False)
269
+ # _check_consolidation_mode_consistency(consolidation_mode, pool)
265
270
  return consolidation_mode
266
271
 
267
272
 
273
+ def ha_recovery_for_consolidation_mode(pool: bool):
274
+ """Recovery logic for HA mode."""
275
+ # No setup recovery is needed in consolidation mode, as the API server
276
+ # already has all runtime installed. Directly start jobs recovery here.
277
+ # Refer to sky/templates/kubernetes-ray.yml.j2 for more details.
278
+ runner = command_runner.LocalProcessCommandRunner()
279
+ noun = 'pool' if pool else 'serve'
280
+ capnoun = noun.capitalize()
281
+ prefix = f'{noun}_'
282
+ with open(skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(prefix),
283
+ 'w',
284
+ encoding='utf-8') as f:
285
+ start = time.time()
286
+ f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
287
+ for service_name in serve_state.get_glob_service_names(None):
288
+ svc = _get_service_status(service_name,
289
+ pool=pool,
290
+ with_replica_info=False)
291
+ if svc is None:
292
+ continue
293
+ controller_pid = svc['controller_pid']
294
+ if controller_pid is not None:
295
+ try:
296
+ if _controller_process_alive(controller_pid, service_name):
297
+ f.write(f'Controller pid {controller_pid} for '
298
+ f'{noun} {service_name} is still running. '
299
+ 'Skipping recovery.\n')
300
+ continue
301
+ except Exception: # pylint: disable=broad-except
302
+ # _controller_process_alive may raise if psutil fails; we
303
+ # should not crash the recovery logic because of this.
304
+ f.write('Error checking controller pid '
305
+ f'{controller_pid} for {noun} {service_name}\n')
306
+
307
+ script = serve_state.get_ha_recovery_script(service_name)
308
+ if script is None:
309
+ f.write(f'{capnoun} {service_name}\'s recovery script does '
310
+ 'not exist. Skipping recovery.\n')
311
+ continue
312
+ rc, out, err = runner.run(script, require_outputs=True)
313
+ if rc:
314
+ f.write(f'Recovery script returned {rc}. '
315
+ f'Output: {out}\nError: {err}\n')
316
+ f.write(f'{capnoun} {service_name} completed recovery at '
317
+ f'{datetime.datetime.now()}\n')
318
+ f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
319
+ f.write(f'Total recovery time: {time.time() - start} seconds\n')
320
+
321
+
322
+ def _controller_process_alive(pid: int, service_name: str) -> bool:
323
+ """Check if the controller process is alive."""
324
+ try:
325
+ process = psutil.Process(pid)
326
+ cmd_str = ' '.join(process.cmdline())
327
+ return process.is_running(
328
+ ) and f'--service-name {service_name}' in cmd_str
329
+ except psutil.NoSuchProcess:
330
+ return False
331
+
332
+
268
333
  def validate_service_task(task: 'sky.Task', pool: bool) -> None:
269
334
  """Validate the task for Sky Serve.
270
335
 
@@ -460,22 +525,53 @@ def set_service_status_and_active_versions_from_replica(
460
525
  active_versions=active_versions)
461
526
 
462
527
 
463
- def update_service_status() -> None:
464
- if is_consolidation_mode():
465
- # TODO(tian): PID-based tracking.
466
- return
467
- services = serve_state.get_services()
468
- for record in services:
469
- if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
528
+ def update_service_status(pool: bool) -> None:
529
+ noun = 'pool' if pool else 'serve'
530
+ capnoun = noun.capitalize()
531
+ service_names = serve_state.get_glob_service_names(None)
532
+ for service_name in service_names:
533
+ record = _get_service_status(service_name,
534
+ pool=pool,
535
+ with_replica_info=False)
536
+ if record is None:
537
+ continue
538
+ service_status = record['status']
539
+ if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
470
540
  # Skip services that are shutting down.
471
541
  continue
472
- controller_job_id = record['controller_job_id']
473
- assert controller_job_id is not None
474
- controller_status = job_lib.get_status(controller_job_id)
475
- if controller_status is None or controller_status.is_terminal():
476
- # If controller job is not running, set it as controller failed.
477
- serve_state.set_service_status_and_active_versions(
478
- record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)
542
+
543
+ logger.info(f'Update {noun} status for {service_name!r} '
544
+ f'with status {service_status}')
545
+
546
+ controller_pid = record['controller_pid']
547
+ if controller_pid is None:
548
+ logger.info(f'{capnoun} {service_name!r} controller pid is None. '
549
+ f'Unexpected status {service_status}. Set to failure.')
550
+ elif controller_pid < 0:
551
+ # Backwards compatibility: this service was submitted when ray was
552
+ # still used for controller process management. We set the
553
+ # value_to_replace_existing_entries to -1 to indicate historical
554
+ # services.
555
+ # TODO(tian): Remove before 0.13.0.
556
+ controller_job_id = record['controller_job_id']
557
+ assert controller_job_id is not None
558
+ controller_status = job_lib.get_status(controller_job_id)
559
+ if (controller_status is not None and
560
+ not controller_status.is_terminal()):
561
+ continue
562
+ logger.info(f'Updating {noun} {service_name!r} in old version. '
563
+ f'SkyPilot job status: {controller_status}. '
564
+ 'Set to failure.')
565
+ else:
566
+ if _controller_process_alive(controller_pid, service_name):
567
+ # The controller is still running.
568
+ continue
569
+ logger.info(f'{capnoun} {service_name!r} controller pid '
570
+ f'{controller_pid} is not alive. Set to failure.')
571
+
572
+ # If controller job is not running, set it as controller failed.
573
+ serve_state.set_service_status_and_active_versions(
574
+ service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)
479
575
 
480
576
 
481
577
  def update_service_encoded(service_name: str, version: int, mode: str,
@@ -754,9 +850,11 @@ def _terminate_failed_services(
754
850
  shutil.rmtree(service_dir)
755
851
  serve_state.remove_service(service_name)
756
852
  serve_state.delete_all_versions(service_name)
853
+ serve_state.remove_ha_recovery_script(service_name)
757
854
 
758
855
  if not remaining_replica_clusters:
759
856
  return None
857
+ # TODO(tian): Try to terminate those replica clusters.
760
858
  remaining_identity = ', '.join(remaining_replica_clusters)
761
859
  return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
762
860
  f'failed status ({service_status}). This may indicate a resource '
@@ -845,7 +943,8 @@ def terminate_services(service_names: Optional[List[str]], purge: bool,
845
943
  return '\n'.join(messages)
846
944
 
847
945
 
848
- def wait_service_registration(service_name: str, job_id: int) -> str:
946
+ def wait_service_registration(service_name: str, job_id: int,
947
+ pool: bool) -> str:
849
948
  """Util function to call at the end of `sky.serve.up()`.
850
949
 
851
950
  This function will:
@@ -862,7 +961,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
862
961
  setup_completed = False
863
962
  while True:
864
963
  # TODO(tian): PID-based tracking.
865
- if not is_consolidation_mode():
964
+ if not is_consolidation_mode(pool):
866
965
  job_status = job_lib.get_status(job_id)
867
966
  if job_status is None or job_status < job_lib.JobStatus.RUNNING:
868
967
  # Wait for the controller process to finish setting up. It
@@ -888,7 +987,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
888
987
  record = serve_state.get_service_from_name(service_name)
889
988
  if record is not None:
890
989
  # TODO(tian): PID-based tracking.
891
- if (not is_consolidation_mode() and
990
+ if (not is_consolidation_mode(pool) and
892
991
  job_id != record['controller_job_id']):
893
992
  with ux_utils.print_exception_no_traceback():
894
993
  raise ValueError(
@@ -1420,10 +1519,13 @@ class ServeCodeGen:
1420
1519
  return cls._build(code)
1421
1520
 
1422
1521
  @classmethod
1423
- def wait_service_registration(cls, service_name: str, job_id: int) -> str:
1522
+ def wait_service_registration(cls, service_name: str, job_id: int,
1523
+ pool: bool) -> str:
1424
1524
  code = [
1525
+ f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}',
1425
1526
  'msg = serve_utils.wait_service_registration('
1426
- f'{service_name!r}, {job_id})', 'print(msg, end="", flush=True)'
1527
+ f'{service_name!r}, {job_id}, **kwargs)',
1528
+ 'print(msg, end="", flush=True)'
1427
1529
  ]
1428
1530
  return cls._build(code)
1429
1531
 
sky/serve/server/impl.py CHANGED
@@ -102,10 +102,10 @@ def up(
102
102
  pool: bool = False,
103
103
  ) -> Tuple[str, str]:
104
104
  """Spins up a service or a pool."""
105
- if pool and not serve_utils.is_consolidation_mode():
105
+ if pool and not serve_utils.is_consolidation_mode(pool):
106
106
  raise ValueError(
107
107
  'Pool is only supported in consolidation mode. To fix, set '
108
- '`serve.controller.consolidation_mode: true` in SkyPilot config.')
108
+ '`jobs.controller.consolidation_mode: true` in SkyPilot config.')
109
109
  task.validate()
110
110
  serve_utils.validate_service_task(task, pool=pool)
111
111
  assert task.service is not None
@@ -174,7 +174,8 @@ def up(
174
174
  prefix=f'controller-task-{service_name}-',
175
175
  mode='w',
176
176
  ) as controller_file:
177
- controller_name = common.SKY_SERVE_CONTROLLER_NAME
177
+ controller = controller_utils.get_controller_for_pool(pool)
178
+ controller_name = controller.value.cluster_name
178
179
  task_config = task.to_yaml_config()
179
180
  common_utils.dump_yaml(service_file.name, task_config)
180
181
  remote_tmp_task_yaml_path = (
@@ -187,7 +188,7 @@ def up(
187
188
  controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
188
189
  task_resources=task.resources)
189
190
  controller_job_id = None
190
- if serve_utils.is_consolidation_mode():
191
+ if serve_utils.is_consolidation_mode(pool):
191
192
  controller_job_id = 0
192
193
 
193
194
  vars_to_fill = {
@@ -238,7 +239,7 @@ def up(
238
239
  # for the first time; otherwise it is a name conflict.
239
240
  # Since the controller may be shared among multiple users, launch the
240
241
  # controller with the API server's user hash.
241
- if not serve_utils.is_consolidation_mode():
242
+ if not serve_utils.is_consolidation_mode(pool):
242
243
  print(f'{colorama.Fore.YELLOW}Launching controller for '
243
244
  f'{service_name!r}...{colorama.Style.RESET_ALL}')
244
245
  with common.with_server_user():
@@ -251,9 +252,9 @@ def up(
251
252
  _disable_controller_check=True,
252
253
  )
253
254
  else:
255
+ controller_type = controller_utils.get_controller_for_pool(pool)
254
256
  controller_handle = backend_utils.is_controller_accessible(
255
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
256
- stopped_message='')
257
+ controller=controller_type, stopped_message='')
257
258
  backend = backend_utils.get_backend_from_handle(controller_handle)
258
259
  assert isinstance(backend, backends.CloudVmRayBackend)
259
260
  backend.sync_file_mounts(
@@ -270,10 +271,8 @@ def up(
270
271
  ]
271
272
  run_script = '\n'.join(env_cmds + [run_script])
272
273
  # Dump script for high availability recovery.
273
- # if controller_utils.high_availability_specified(
274
- # controller_name):
275
- # managed_job_state.set_ha_recovery_script(
276
- # consolidation_mode_job_id, run_script)
274
+ if controller_utils.high_availability_specified(controller_name):
275
+ serve_state.set_ha_recovery_script(service_name, run_script)
277
276
  backend.run_on_head(controller_handle, run_script)
278
277
 
279
278
  style = colorama.Style
@@ -289,7 +288,7 @@ def up(
289
288
  # and return the endpoint if the job id matches. Otherwise it will
290
289
  # return None.
291
290
  code = serve_utils.ServeCodeGen.wait_service_registration(
292
- service_name, controller_job_id)
291
+ service_name, controller_job_id, pool)
293
292
  backend = backend_utils.get_backend_from_handle(controller_handle)
294
293
  assert isinstance(backend, backends.CloudVmRayBackend)
295
294
  assert isinstance(controller_handle,
@@ -304,7 +303,7 @@ def up(
304
303
  returncode, code, f'Failed to wait for {noun} initialization',
305
304
  lb_port_payload)
306
305
  except exceptions.CommandError:
307
- if serve_utils.is_consolidation_mode():
306
+ if serve_utils.is_consolidation_mode(pool):
308
307
  with ux_utils.print_exception_no_traceback():
309
308
  raise RuntimeError(
310
309
  f'Failed to wait for {noun} initialization. '
@@ -339,7 +338,7 @@ def up(
339
338
  else:
340
339
  lb_port = serve_utils.load_service_initialization_result(
341
340
  lb_port_payload)
342
- if not serve_utils.is_consolidation_mode():
341
+ if not serve_utils.is_consolidation_mode(pool):
343
342
  socket_endpoint = backend_utils.get_endpoints(
344
343
  controller_handle.cluster_name,
345
344
  lb_port,
@@ -442,8 +441,9 @@ def update(
442
441
  'effect. To update TLS keyfile and certfile, please '
443
442
  'tear down the service and spin up a new one.')
444
443
 
444
+ controller_type = controller_utils.get_controller_for_pool(pool)
445
445
  handle = backend_utils.is_controller_accessible(
446
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
446
+ controller=controller_type,
447
447
  stopped_message=
448
448
  'Service controller is stopped. There is no service to update. '
449
449
  f'To spin up a new service, use {ux_utils.BOLD}'
@@ -572,9 +572,9 @@ def apply(
572
572
  """Applies the config to the service or pool."""
573
573
  with filelock.FileLock(serve_utils.get_service_filelock_path(service_name)):
574
574
  try:
575
+ controller_type = controller_utils.get_controller_for_pool(pool)
575
576
  handle = backend_utils.is_controller_accessible(
576
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
577
- stopped_message='')
577
+ controller=controller_type, stopped_message='')
578
578
  backend = backend_utils.get_backend_from_handle(handle)
579
579
  assert isinstance(backend, backends.CloudVmRayBackend)
580
580
  service_record = _get_service_record(service_name, pool, handle,
@@ -598,8 +598,9 @@ def down(
598
598
  service_names = []
599
599
  if isinstance(service_names, str):
600
600
  service_names = [service_names]
601
+ controller_type = controller_utils.get_controller_for_pool(pool)
601
602
  handle = backend_utils.is_controller_accessible(
602
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
603
+ controller=controller_type,
603
604
  stopped_message=f'All {noun}s should have terminated.')
604
605
 
605
606
  service_names_str = ','.join(service_names)
@@ -624,7 +625,7 @@ def down(
624
625
  except exceptions.FetchClusterInfoError as e:
625
626
  raise RuntimeError(
626
627
  'Failed to fetch controller IP. Please refresh controller status '
627
- f'by `sky status -r {common.SKY_SERVE_CONTROLLER_NAME}` '
628
+ f'by `sky status -r {controller_type.value.cluster_name}` '
628
629
  'and try again.') from e
629
630
 
630
631
  try:
@@ -654,7 +655,7 @@ def status(
654
655
  raise RuntimeError(f'Failed to refresh {noun}s status '
655
656
  'due to network error.') from e
656
657
 
657
- controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
658
+ controller_type = controller_utils.get_controller_for_pool(pool)
658
659
  handle = backend_utils.is_controller_accessible(
659
660
  controller=controller_type,
660
661
  stopped_message=controller_type.value.default_hint_if_non_existent.
@@ -690,7 +691,7 @@ def status(
690
691
  if service_record['load_balancer_port'] is not None:
691
692
  try:
692
693
  lb_port = service_record['load_balancer_port']
693
- if not serve_utils.is_consolidation_mode():
694
+ if not serve_utils.is_consolidation_mode(pool):
694
695
  endpoint = backend_utils.get_endpoints(
695
696
  cluster=common.SKY_SERVE_CONTROLLER_NAME,
696
697
  port=lb_port).get(lb_port, None)