skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (151)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/catalog/kubernetes_catalog.py +8 -0
  4. sky/catalog/nebius_catalog.py +0 -1
  5. sky/check.py +11 -1
  6. sky/client/cli/command.py +234 -100
  7. sky/client/sdk.py +30 -9
  8. sky/client/sdk_async.py +815 -0
  9. sky/clouds/kubernetes.py +6 -1
  10. sky/clouds/nebius.py +1 -4
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3698-7874720877646365.js → 3850-ff4a9a69d978632b.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  26. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9025.7937c16bc8623516.js → 9025.a1bef12d672bb66d.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  30. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-1e6de35d15a8d432.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{config-8620d099cbef8608.js → config-dfb9bf07b13045f4.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  49. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  50. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  51. sky/dashboard/out/clusters/[cluster].html +1 -1
  52. sky/dashboard/out/clusters.html +1 -1
  53. sky/dashboard/out/config.html +1 -1
  54. sky/dashboard/out/index.html +1 -1
  55. sky/dashboard/out/infra/[context].html +1 -1
  56. sky/dashboard/out/infra.html +1 -1
  57. sky/dashboard/out/jobs/[job].html +1 -1
  58. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -1
  61. sky/dashboard/out/volumes.html +1 -1
  62. sky/dashboard/out/workspace/new.html +1 -1
  63. sky/dashboard/out/workspaces/[name].html +1 -1
  64. sky/dashboard/out/workspaces.html +1 -1
  65. sky/global_user_state.py +14 -2
  66. sky/jobs/__init__.py +2 -0
  67. sky/jobs/client/sdk.py +43 -2
  68. sky/jobs/client/sdk_async.py +135 -0
  69. sky/jobs/server/core.py +48 -1
  70. sky/jobs/server/server.py +52 -3
  71. sky/jobs/state.py +5 -1
  72. sky/jobs/utils.py +3 -1
  73. sky/provision/kubernetes/utils.py +30 -4
  74. sky/provision/nebius/instance.py +1 -0
  75. sky/provision/nebius/utils.py +9 -1
  76. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  77. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  78. sky/serve/client/impl.py +85 -1
  79. sky/serve/client/sdk.py +16 -47
  80. sky/serve/client/sdk_async.py +130 -0
  81. sky/serve/constants.py +3 -1
  82. sky/serve/controller.py +6 -3
  83. sky/serve/load_balancer.py +3 -1
  84. sky/serve/serve_state.py +93 -5
  85. sky/serve/serve_utils.py +200 -67
  86. sky/serve/server/core.py +13 -197
  87. sky/serve/server/impl.py +261 -23
  88. sky/serve/service.py +15 -3
  89. sky/server/auth/__init__.py +0 -0
  90. sky/server/auth/authn.py +46 -0
  91. sky/server/auth/oauth2_proxy.py +185 -0
  92. sky/server/common.py +119 -21
  93. sky/server/constants.py +1 -1
  94. sky/server/daemons.py +60 -11
  95. sky/server/requests/executor.py +5 -3
  96. sky/server/requests/payloads.py +19 -0
  97. sky/server/rest.py +114 -0
  98. sky/server/server.py +44 -40
  99. sky/setup_files/dependencies.py +2 -0
  100. sky/skylet/constants.py +1 -1
  101. sky/skylet/events.py +5 -1
  102. sky/skylet/skylet.py +3 -1
  103. sky/task.py +61 -21
  104. sky/templates/kubernetes-ray.yml.j2 +9 -0
  105. sky/templates/nebius-ray.yml.j2 +1 -0
  106. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  107. sky/usage/usage_lib.py +8 -6
  108. sky/utils/annotations.py +8 -3
  109. sky/utils/common_utils.py +11 -1
  110. sky/utils/controller_utils.py +7 -0
  111. sky/utils/db/migration_utils.py +2 -2
  112. sky/utils/rich_utils.py +120 -0
  113. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +22 -13
  114. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +120 -112
  115. sky/client/sdk.pyi +0 -300
  116. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  119. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  126. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  138. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  145. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  146. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  147. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-85426374db04811e.js} +0 -0
  148. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  149. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  150. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  151. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/serve/serve_state.py CHANGED
@@ -9,6 +9,7 @@ import sqlite3
9
9
  import threading
10
10
  import typing
11
11
  from typing import Any, Dict, List, Optional, Tuple
12
+ import uuid
12
13
 
13
14
  import colorama
14
15
 
@@ -47,6 +48,10 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
47
48
  service_name TEXT,
48
49
  spec BLOB,
49
50
  PRIMARY KEY (service_name, version))""")
51
+ cursor.execute("""\
52
+ CREATE TABLE IF NOT EXISTS ha_recovery_script (
53
+ service_name TEXT PRIMARY KEY,
54
+ script TEXT)""")
50
55
  conn.commit()
51
56
 
52
57
  # Backward compatibility.
@@ -71,6 +76,20 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
71
76
  # Whether the service is a cluster pool.
72
77
  db_utils.add_column_to_table(cursor, conn, 'services', 'pool',
73
78
  'INTEGER DEFAULT 0')
79
+ # Add controller_pid for status tracking.
80
+ db_utils.add_column_to_table(cursor,
81
+ conn,
82
+ 'services',
83
+ 'controller_pid',
84
+ 'INTEGER DEFAULT NULL',
85
+ value_to_replace_existing_entries=-1)
86
+ # The service hash. Unique for each service, even if the service name is
87
+ # the same.
88
+ db_utils.add_column_to_table(cursor, conn, 'services', 'hash',
89
+ 'TEXT DEFAULT NULL')
90
+ # Entrypoint to launch the service.
91
+ db_utils.add_column_to_table(cursor, conn, 'services', 'entrypoint',
92
+ 'TEXT DEFAULT NULL')
74
93
  conn.commit()
75
94
 
76
95
 
@@ -272,7 +291,8 @@ _SERVICE_STATUS_TO_COLOR = {
272
291
  @init_db
273
292
  def add_service(name: str, controller_job_id: int, policy: str,
274
293
  requested_resources_str: str, load_balancing_policy: str,
275
- status: ServiceStatus, tls_encrypted: bool, pool: bool) -> bool:
294
+ status: ServiceStatus, tls_encrypted: bool, pool: bool,
295
+ controller_pid: int, entrypoint: str) -> bool:
276
296
  """Add a service in the database.
277
297
 
278
298
  Returns:
@@ -287,11 +307,12 @@ def add_service(name: str, controller_job_id: int, policy: str,
287
307
  INSERT INTO services
288
308
  (name, controller_job_id, status, policy,
289
309
  requested_resources_str, load_balancing_policy, tls_encrypted,
290
- pool)
291
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
310
+ pool, controller_pid, hash, entrypoint)
311
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
292
312
  (name, controller_job_id, status.value, policy,
293
313
  requested_resources_str, load_balancing_policy,
294
- int(tls_encrypted), int(pool)))
314
+ int(tls_encrypted), int(pool), controller_pid, str(
315
+ uuid.uuid4()), entrypoint))
295
316
 
296
317
  except sqlite3.IntegrityError as e:
297
318
  if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:
@@ -300,6 +321,22 @@ def add_service(name: str, controller_job_id: int, policy: str,
300
321
  return True
301
322
 
302
323
 
324
+ @init_db
325
+ def update_service_controller_pid(service_name: str,
326
+ controller_pid: int) -> None:
327
+ """Updates the controller pid of a service.
328
+
329
+ This is used to update the controller pid of a service on ha recovery.
330
+ """
331
+ assert _DB_PATH is not None
332
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
333
+ cursor.execute(
334
+ """\
335
+ UPDATE services SET
336
+ controller_pid=(?) WHERE name=(?)""",
337
+ (controller_pid, service_name))
338
+
339
+
303
340
  @init_db
304
341
  def remove_service(service_name: str) -> None:
305
342
  """Removes a service from the database."""
@@ -368,7 +405,8 @@ def set_service_load_balancer_port(service_name: str,
368
405
  def _get_service_from_row(row) -> Dict[str, Any]:
369
406
  (current_version, name, controller_job_id, controller_port,
370
407
  load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
371
- _, active_versions, load_balancing_policy, tls_encrypted, pool) = row[:16]
408
+ _, active_versions, load_balancing_policy, tls_encrypted, pool,
409
+ controller_pid, svc_hash, entrypoint) = row[:19]
372
410
  record = {
373
411
  'name': name,
374
412
  'controller_job_id': controller_job_id,
@@ -388,6 +426,9 @@ def _get_service_from_row(row) -> Dict[str, Any]:
388
426
  'load_balancing_policy': load_balancing_policy,
389
427
  'tls_encrypted': bool(tls_encrypted),
390
428
  'pool': bool(pool),
429
+ 'controller_pid': controller_pid,
430
+ 'hash': svc_hash,
431
+ 'entrypoint': entrypoint,
391
432
  }
392
433
  latest_spec = get_spec(name, current_version)
393
434
  if latest_spec is not None:
@@ -429,6 +470,18 @@ def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
429
470
  return None
430
471
 
431
472
 
473
+ @init_db
474
+ def get_service_hash(service_name: str) -> Optional[str]:
475
+ """Get the hash of a service."""
476
+ assert _DB_PATH is not None
477
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
478
+ rows = cursor.execute('SELECT hash FROM services WHERE name=(?)',
479
+ (service_name,)).fetchall()
480
+ for row in rows:
481
+ return row[0]
482
+ return None
483
+
484
+
432
485
  @init_db
433
486
  def get_service_versions(service_name: str) -> List[int]:
434
487
  """Gets all versions of a service."""
@@ -666,3 +719,38 @@ def get_service_load_balancer_port(service_name: str) -> int:
666
719
  if row is None:
667
720
  raise ValueError(f'Service {service_name} does not exist.')
668
721
  return row[0]
722
+
723
+
724
+ @init_db
725
+ def get_ha_recovery_script(service_name: str) -> Optional[str]:
726
+ """Gets the HA recovery script for a service."""
727
+ assert _DB_PATH is not None
728
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
729
+ cursor.execute(
730
+ 'SELECT script FROM ha_recovery_script WHERE service_name = ?',
731
+ (service_name,))
732
+ row = cursor.fetchone()
733
+ if row is None:
734
+ return None
735
+ return row[0]
736
+
737
+
738
+ @init_db
739
+ def set_ha_recovery_script(service_name: str, script: str) -> None:
740
+ """Sets the HA recovery script for a service."""
741
+ assert _DB_PATH is not None
742
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
743
+ cursor.execute(
744
+ """\
745
+ INSERT OR REPLACE INTO ha_recovery_script
746
+ (service_name, script)
747
+ VALUES (?, ?)""", (service_name, script))
748
+
749
+
750
+ @init_db
751
+ def remove_ha_recovery_script(service_name: str) -> None:
752
+ """Removes the HA recovery script for a service."""
753
+ assert _DB_PATH is not None
754
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
755
+ cursor.execute('DELETE FROM ha_recovery_script WHERE service_name = ?',
756
+ (service_name,))
sky/serve/serve_utils.py CHANGED
@@ -2,6 +2,7 @@
2
2
  import base64
3
3
  import collections
4
4
  import dataclasses
5
+ import datetime
5
6
  import enum
6
7
  import os
7
8
  import pathlib
@@ -19,6 +20,7 @@ import uuid
19
20
 
20
21
  import colorama
21
22
  import filelock
23
+ import yaml
22
24
 
23
25
  from sky import backends
24
26
  from sky import exceptions
@@ -33,6 +35,7 @@ from sky.serve import spot_placer
33
35
  from sky.skylet import constants as skylet_constants
34
36
  from sky.skylet import job_lib
35
37
  from sky.utils import annotations
38
+ from sky.utils import command_runner
36
39
  from sky.utils import common_utils
37
40
  from sky.utils import log_utils
38
41
  from sky.utils import message_utils
@@ -63,13 +66,12 @@ def get_num_service_threshold():
63
66
 
64
67
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
65
68
 
66
- # NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
67
- # and always appear after a space. Be careful when changing UX as this
68
- # assumption is used to expand some log files while ignoring others.
69
- _SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
70
- _SKYPILOT_PROVISION_LOG_PATTERN = (
71
- fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
72
- _SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
69
+ # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
70
+ # when changing UX as this assumption is used to expand some log files while
71
+ # ignoring others.
72
+ _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
73
+ _SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
74
+ _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
73
75
 
74
76
  # TODO(tian): Find all existing replica id and print here.
75
77
  _FAILED_TO_FIND_REPLICA_MSG = (
@@ -258,13 +260,76 @@ def get_service_filelock_path(pool: str) -> str:
258
260
 
259
261
 
260
262
  @annotations.lru_cache(scope='request', maxsize=1)
261
- def is_consolidation_mode() -> bool:
263
+ def is_consolidation_mode(pool: bool = False) -> bool:
264
+ # Use jobs config for pool consolidation mode.
265
+ controller_type = 'jobs' if pool else 'serve'
262
266
  consolidation_mode = skypilot_config.get_nested(
263
- ('serve', 'controller', 'consolidation_mode'), default_value=False)
264
- # _check_consolidation_mode_consistency(consolidation_mode)
267
+ (controller_type, 'controller', 'consolidation_mode'),
268
+ default_value=False)
269
+ # _check_consolidation_mode_consistency(consolidation_mode, pool)
265
270
  return consolidation_mode
266
271
 
267
272
 
273
+ def ha_recovery_for_consolidation_mode(pool: bool):
274
+ """Recovery logic for HA mode."""
275
+ # No setup recovery is needed in consolidation mode, as the API server
276
+ # already has all runtime installed. Directly start jobs recovery here.
277
+ # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
278
+ runner = command_runner.LocalProcessCommandRunner()
279
+ noun = 'pool' if pool else 'serve'
280
+ capnoun = noun.capitalize()
281
+ prefix = f'{noun}_'
282
+ with open(skylet_constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(prefix),
283
+ 'w',
284
+ encoding='utf-8') as f:
285
+ start = time.time()
286
+ f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
287
+ for service_name in serve_state.get_glob_service_names(None):
288
+ svc = _get_service_status(service_name,
289
+ pool=pool,
290
+ with_replica_info=False)
291
+ if svc is None:
292
+ continue
293
+ controller_pid = svc['controller_pid']
294
+ if controller_pid is not None:
295
+ try:
296
+ if _controller_process_alive(controller_pid, service_name):
297
+ f.write(f'Controller pid {controller_pid} for '
298
+ f'{noun} {service_name} is still running. '
299
+ 'Skipping recovery.\n')
300
+ continue
301
+ except Exception: # pylint: disable=broad-except
302
+ # _controller_process_alive may raise if psutil fails; we
303
+ # should not crash the recovery logic because of this.
304
+ f.write('Error checking controller pid '
305
+ f'{controller_pid} for {noun} {service_name}\n')
306
+
307
+ script = serve_state.get_ha_recovery_script(service_name)
308
+ if script is None:
309
+ f.write(f'{capnoun} {service_name}\'s recovery script does '
310
+ 'not exist. Skipping recovery.\n')
311
+ continue
312
+ rc, out, err = runner.run(script, require_outputs=True)
313
+ if rc:
314
+ f.write(f'Recovery script returned {rc}. '
315
+ f'Output: {out}\nError: {err}\n')
316
+ f.write(f'{capnoun} {service_name} completed recovery at '
317
+ f'{datetime.datetime.now()}\n')
318
+ f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
319
+ f.write(f'Total recovery time: {time.time() - start} seconds\n')
320
+
321
+
322
+ def _controller_process_alive(pid: int, service_name: str) -> bool:
323
+ """Check if the controller process is alive."""
324
+ try:
325
+ process = psutil.Process(pid)
326
+ cmd_str = ' '.join(process.cmdline())
327
+ return process.is_running(
328
+ ) and f'--service-name {service_name}' in cmd_str
329
+ except psutil.NoSuchProcess:
330
+ return False
331
+
332
+
268
333
  def validate_service_task(task: 'sky.Task', pool: bool) -> None:
269
334
  """Validate the task for Sky Serve.
270
335
 
@@ -460,22 +525,53 @@ def set_service_status_and_active_versions_from_replica(
460
525
  active_versions=active_versions)
461
526
 
462
527
 
463
- def update_service_status() -> None:
464
- if is_consolidation_mode():
465
- # TODO(tian): PID-based tracking.
466
- return
467
- services = serve_state.get_services()
468
- for record in services:
469
- if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
528
+ def update_service_status(pool: bool) -> None:
529
+ noun = 'pool' if pool else 'serve'
530
+ capnoun = noun.capitalize()
531
+ service_names = serve_state.get_glob_service_names(None)
532
+ for service_name in service_names:
533
+ record = _get_service_status(service_name,
534
+ pool=pool,
535
+ with_replica_info=False)
536
+ if record is None:
537
+ continue
538
+ service_status = record['status']
539
+ if service_status == serve_state.ServiceStatus.SHUTTING_DOWN:
470
540
  # Skip services that is shutting down.
471
541
  continue
472
- controller_job_id = record['controller_job_id']
473
- assert controller_job_id is not None
474
- controller_status = job_lib.get_status(controller_job_id)
475
- if controller_status is None or controller_status.is_terminal():
476
- # If controller job is not running, set it as controller failed.
477
- serve_state.set_service_status_and_active_versions(
478
- record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)
542
+
543
+ logger.info(f'Update {noun} status for {service_name!r} '
544
+ f'with status {service_status}')
545
+
546
+ controller_pid = record['controller_pid']
547
+ if controller_pid is None:
548
+ logger.info(f'{capnoun} {service_name!r} controller pid is None. '
549
+ f'Unexpected status {service_status}. Set to failure.')
550
+ elif controller_pid < 0:
551
+ # Backwards compatibility: this service was submitted when ray was
552
+ # still used for controller process management. We set the
553
+ # value_to_replace_existing_entries to -1 to indicate historical
554
+ # services.
555
+ # TODO(tian): Remove before 0.13.0.
556
+ controller_job_id = record['controller_job_id']
557
+ assert controller_job_id is not None
558
+ controller_status = job_lib.get_status(controller_job_id)
559
+ if (controller_status is not None and
560
+ not controller_status.is_terminal()):
561
+ continue
562
+ logger.info(f'Updating {noun} {service_name!r} in old version. '
563
+ f'SkyPilot job status: {controller_status}. '
564
+ 'Set to failure.')
565
+ else:
566
+ if _controller_process_alive(controller_pid, service_name):
567
+ # The controller is still running.
568
+ continue
569
+ logger.info(f'{capnoun} {service_name!r} controller pid '
570
+ f'{controller_pid} is not alive. Set to failure.')
571
+
572
+ # If controller job is not running, set it as controller failed.
573
+ serve_state.set_service_status_and_active_versions(
574
+ service_name, serve_state.ServiceStatus.CONTROLLER_FAILED)
479
575
 
480
576
 
481
577
  def update_service_encoded(service_name: str, version: int, mode: str,
@@ -572,12 +668,18 @@ def _get_service_status(
572
668
  if record['pool']:
573
669
  latest_yaml_path = generate_task_yaml_file_name(service_name,
574
670
  record['version'])
575
- original_config = common_utils.read_yaml(latest_yaml_path)
576
- original_config.pop('run', None)
577
- svc: Dict[str, Any] = original_config.pop('service')
578
- if svc is not None:
579
- svc.pop('pool', None)
580
- original_config['pool'] = svc
671
+ raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
672
+ original_config = raw_yaml_config.get('_user_specified_yaml')
673
+ if original_config is None:
674
+ # Fall back to old display format.
675
+ original_config = raw_yaml_config
676
+ original_config.pop('run', None)
677
+ svc: Dict[str, Any] = original_config.pop('service')
678
+ if svc is not None:
679
+ svc.pop('pool', None) # Remove pool from service config
680
+ original_config['pool'] = svc # Add pool to root config
681
+ else:
682
+ original_config = yaml.safe_load(original_config)
581
683
  record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
582
684
 
583
685
  record['target_num_replicas'] = 0
@@ -754,9 +856,11 @@ def _terminate_failed_services(
754
856
  shutil.rmtree(service_dir)
755
857
  serve_state.remove_service(service_name)
756
858
  serve_state.delete_all_versions(service_name)
859
+ serve_state.remove_ha_recovery_script(service_name)
757
860
 
758
861
  if not remaining_replica_clusters:
759
862
  return None
863
+ # TODO(tian): Try to terminate those replica clusters.
760
864
  remaining_identity = ', '.join(remaining_replica_clusters)
761
865
  return (f'{colorama.Fore.YELLOW}terminate service {service_name!r} with '
762
866
  f'failed status ({service_status}). This may indicate a resource '
@@ -845,7 +949,8 @@ def terminate_services(service_names: Optional[List[str]], purge: bool,
845
949
  return '\n'.join(messages)
846
950
 
847
951
 
848
- def wait_service_registration(service_name: str, job_id: int) -> str:
952
+ def wait_service_registration(service_name: str, job_id: int,
953
+ pool: bool) -> str:
849
954
  """Util function to call at the end of `sky.serve.up()`.
850
955
 
851
956
  This function will:
@@ -860,9 +965,11 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
860
965
  """
861
966
  start_time = time.time()
862
967
  setup_completed = False
968
+ noun = 'pool' if pool else 'service'
863
969
  while True:
864
- # TODO(tian): PID-based tracking.
865
- if not is_consolidation_mode():
970
+ # Only do this check for non-consolidation mode as consolidation mode
971
+ # has no setup process.
972
+ if not is_consolidation_mode(pool):
866
973
  job_status = job_lib.get_status(job_id)
867
974
  if job_status is None or job_status < job_lib.JobStatus.RUNNING:
868
975
  # Wait for the controller process to finish setting up. It
@@ -872,7 +979,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
872
979
  with ux_utils.print_exception_no_traceback():
873
980
  raise RuntimeError(
874
981
  f'Failed to start the controller process for '
875
- f'the service {service_name!r} within '
982
+ f'the {noun} {service_name!r} within '
876
983
  f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
877
984
  f' seconds.')
878
985
  # No need to check the service status as the controller process
@@ -880,22 +987,26 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
880
987
  time.sleep(1)
881
988
  continue
882
989
 
883
- if not setup_completed:
884
- setup_completed = True
885
- # Reset the start time to wait for the service to be registered.
886
- start_time = time.time()
990
+ if not setup_completed:
991
+ setup_completed = True
992
+ # Reset the start time to wait for the service to be registered.
993
+ start_time = time.time()
887
994
 
888
- record = serve_state.get_service_from_name(service_name)
995
+ record = _get_service_status(service_name,
996
+ pool=pool,
997
+ with_replica_info=False)
889
998
  if record is not None:
890
- # TODO(tian): PID-based tracking.
891
- if (not is_consolidation_mode() and
892
- job_id != record['controller_job_id']):
999
+ if job_id != record['controller_job_id']:
1000
+ if pool:
1001
+ command_to_run = 'sky jobs pool apply --pool'
1002
+ else:
1003
+ command_to_run = 'sky serve update'
893
1004
  with ux_utils.print_exception_no_traceback():
894
1005
  raise ValueError(
895
- f'The service {service_name!r} is already running. '
896
- 'Please specify a different name for your service. '
897
- 'To update an existing service, run: sky serve update '
898
- f'{service_name} <new-service-yaml>')
1006
+ f'The {noun} {service_name!r} is already running. '
1007
+ f'Please specify a different name for your {noun}. '
1008
+ f'To update an existing {noun}, run: {command_to_run}'
1009
+ f' {service_name} <new-{noun}-yaml>')
899
1010
  lb_port = record['load_balancer_port']
900
1011
  if lb_port is not None:
901
1012
  return message_utils.encode_payload(lb_port)
@@ -924,12 +1035,16 @@ def load_service_initialization_result(payload: str) -> int:
924
1035
  return message_utils.decode_payload(payload)
925
1036
 
926
1037
 
927
- def check_service_status_healthy(service_name: str) -> Optional[str]:
928
- service_record = serve_state.get_service_from_name(service_name)
1038
+ def _check_service_status_healthy(service_name: str,
1039
+ pool: bool) -> Optional[str]:
1040
+ service_record = _get_service_status(service_name,
1041
+ pool,
1042
+ with_replica_info=False)
1043
+ capnoun = 'Service' if not pool else 'Pool'
929
1044
  if service_record is None:
930
- return f'Service {service_name!r} does not exist.'
1045
+ return f'{capnoun} {service_name!r} does not exist.'
931
1046
  if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
932
- return (f'Service {service_name!r} is still initializing its '
1047
+ return (f'{capnoun} {service_name!r} is still initializing its '
933
1048
  'controller. Please try again later.')
934
1049
  return None
935
1050
 
@@ -968,7 +1083,10 @@ def _process_line(line: str,
968
1083
  log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
969
1084
 
970
1085
  if provision_log_prompt is not None:
971
- nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
1086
+ log_path = provision_log_prompt.group(1)
1087
+ nested_log_path = pathlib.Path(
1088
+ skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
1089
+ log_path).resolve()
972
1090
 
973
1091
  try:
974
1092
  with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
@@ -1060,12 +1178,14 @@ def _capped_follow_logs_with_provision_expanding(
1060
1178
 
1061
1179
 
1062
1180
  def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1063
- tail: Optional[int]) -> str:
1064
- msg = check_service_status_healthy(service_name)
1181
+ tail: Optional[int], pool: bool) -> str:
1182
+ msg = _check_service_status_healthy(service_name, pool=pool)
1065
1183
  if msg is not None:
1066
1184
  return msg
1185
+ repnoun = 'worker' if pool else 'replica'
1186
+ caprepnoun = repnoun.capitalize()
1067
1187
  print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
1068
- f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
1188
+ f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
1069
1189
  log_file_name = generate_replica_log_file_name(service_name, replica_id)
1070
1190
  if os.path.exists(log_file_name):
1071
1191
  if tail is not None:
@@ -1082,7 +1202,7 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1082
1202
  launch_log_file_name = generate_replica_launch_log_file_name(
1083
1203
  service_name, replica_id)
1084
1204
  if not os.path.exists(launch_log_file_name):
1085
- return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.'
1205
+ return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
1086
1206
  f'{colorama.Style.RESET_ALL}')
1087
1207
 
1088
1208
  replica_cluster_name = generate_replica_cluster_name(
@@ -1132,6 +1252,10 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1132
1252
  print(line, end='', flush=True)
1133
1253
  return ''
1134
1254
 
1255
+ # For pools, we don't stream the job logs as the run section is ignored.
1256
+ if pool:
1257
+ return ''
1258
+
1135
1259
  backend = backends.CloudVmRayBackend()
1136
1260
  handle = global_user_state.get_handle_from_cluster_name(
1137
1261
  replica_cluster_name)
@@ -1146,13 +1270,13 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1146
1270
 
1147
1271
  # Notify user here to make sure user won't think the log is finished.
1148
1272
  print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
1149
- f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
1273
+ f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
1150
1274
 
1151
1275
  # Always tail the latest logs, which represent user setup & run.
1152
1276
  if tail is None:
1153
1277
  returncode = backend.tail_logs(handle, job_id=None, follow=follow)
1154
1278
  if returncode != 0:
1155
- return (f'{colorama.Fore.RED}Failed to stream logs for replica '
1279
+ return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
1156
1280
  f'{replica_id}.{colorama.Style.RESET_ALL}')
1157
1281
  elif not follow and tail > 0:
1158
1282
  final = backend.tail_logs(handle,
@@ -1179,8 +1303,9 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1179
1303
 
1180
1304
 
1181
1305
  def stream_serve_process_logs(service_name: str, stream_controller: bool,
1182
- follow: bool, tail: Optional[int]) -> str:
1183
- msg = check_service_status_healthy(service_name)
1306
+ follow: bool, tail: Optional[int],
1307
+ pool: bool) -> str:
1308
+ msg = _check_service_status_healthy(service_name, pool)
1184
1309
  if msg is not None:
1185
1310
  return msg
1186
1311
  if stream_controller:
@@ -1189,7 +1314,9 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
1189
1314
  log_file = generate_remote_load_balancer_log_file_name(service_name)
1190
1315
 
1191
1316
  def _service_is_terminal() -> bool:
1192
- record = serve_state.get_service_from_name(service_name)
1317
+ record = _get_service_status(service_name,
1318
+ pool,
1319
+ with_replica_info=False)
1193
1320
  if record is None:
1194
1321
  return True
1195
1322
  return record['status'] in serve_state.ServiceStatus.failed_statuses()
@@ -1420,30 +1547,36 @@ class ServeCodeGen:
1420
1547
  return cls._build(code)
1421
1548
 
1422
1549
  @classmethod
1423
- def wait_service_registration(cls, service_name: str, job_id: int) -> str:
1550
+ def wait_service_registration(cls, service_name: str, job_id: int,
1551
+ pool: bool) -> str:
1424
1552
  code = [
1553
+ f'kwargs={{}} if serve_version < 4 else {{"pool": {pool}}}',
1425
1554
  'msg = serve_utils.wait_service_registration('
1426
- f'{service_name!r}, {job_id})', 'print(msg, end="", flush=True)'
1555
+ f'{service_name!r}, {job_id}, **kwargs)',
1556
+ 'print(msg, end="", flush=True)'
1427
1557
  ]
1428
1558
  return cls._build(code)
1429
1559
 
1430
1560
  @classmethod
1431
1561
  def stream_replica_logs(cls, service_name: str, replica_id: int,
1432
- follow: bool, tail: Optional[int]) -> str:
1562
+ follow: bool, tail: Optional[int],
1563
+ pool: bool) -> str:
1433
1564
  code = [
1565
+ f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
1434
1566
  'msg = serve_utils.stream_replica_logs('
1435
- f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})',
1436
- 'print(msg, flush=True)'
1567
+ f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
1568
+ '**kwargs)', 'print(msg, flush=True)'
1437
1569
  ]
1438
1570
  return cls._build(code)
1439
1571
 
1440
1572
  @classmethod
1441
1573
  def stream_serve_process_logs(cls, service_name: str,
1442
1574
  stream_controller: bool, follow: bool,
1443
- tail: Optional[int]) -> str:
1575
+ tail: Optional[int], pool: bool) -> str:
1444
1576
  code = [
1577
+ f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
1445
1578
  f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
1446
- f'{stream_controller}, follow={follow}, tail={tail})',
1579
+ f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
1447
1580
  'print(msg, flush=True)'
1448
1581
  ]
1449
1582
  return cls._build(code)