skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/check.py +11 -1
  4. sky/client/cli/command.py +208 -93
  5. sky/client/sdk.py +14 -1
  6. sky/client/sdk_async.py +4 -0
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  10. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  11. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
  24. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  26. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-1e6de35d15a8d432.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
  35. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  36. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  37. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  44. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  45. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  46. sky/dashboard/out/clusters/[cluster].html +1 -1
  47. sky/dashboard/out/clusters.html +1 -1
  48. sky/dashboard/out/config.html +1 -1
  49. sky/dashboard/out/index.html +1 -1
  50. sky/dashboard/out/infra/[context].html +1 -1
  51. sky/dashboard/out/infra.html +1 -1
  52. sky/dashboard/out/jobs/[job].html +1 -1
  53. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  54. sky/dashboard/out/jobs.html +1 -1
  55. sky/dashboard/out/users.html +1 -1
  56. sky/dashboard/out/volumes.html +1 -1
  57. sky/dashboard/out/workspace/new.html +1 -1
  58. sky/dashboard/out/workspaces/[name].html +1 -1
  59. sky/dashboard/out/workspaces.html +1 -1
  60. sky/global_user_state.py +14 -2
  61. sky/jobs/__init__.py +2 -0
  62. sky/jobs/client/sdk.py +43 -2
  63. sky/jobs/server/core.py +48 -1
  64. sky/jobs/server/server.py +52 -3
  65. sky/jobs/state.py +5 -1
  66. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  67. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  68. sky/serve/client/impl.py +85 -1
  69. sky/serve/client/sdk.py +16 -47
  70. sky/serve/constants.py +2 -1
  71. sky/serve/controller.py +4 -2
  72. sky/serve/serve_state.py +28 -5
  73. sky/serve/serve_utils.py +77 -46
  74. sky/serve/server/core.py +13 -197
  75. sky/serve/server/impl.py +239 -2
  76. sky/serve/service.py +8 -3
  77. sky/server/common.py +11 -4
  78. sky/server/constants.py +1 -1
  79. sky/server/requests/executor.py +5 -3
  80. sky/server/requests/payloads.py +19 -0
  81. sky/task.py +18 -11
  82. sky/templates/kubernetes-ray.yml.j2 +5 -0
  83. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  84. sky/usage/usage_lib.py +8 -6
  85. sky/utils/annotations.py +8 -3
  86. sky/utils/common_utils.py +11 -1
  87. sky/utils/db/migration_utils.py +2 -2
  88. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +18 -13
  89. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +95 -92
  90. sky/client/sdk.pyi +0 -301
  91. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
  94. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  101. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  102. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
  107. sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
  109. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
  117. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  118. /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  119. /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
  120. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/serve/client/sdk.py CHANGED
@@ -1,9 +1,8 @@
1
1
  """SDK for SkyServe."""
2
2
  import json
3
3
  import typing
4
- from typing import List, Optional, Union
4
+ from typing import List, Optional, Sequence, Union
5
5
 
6
- from sky.client import common as client_common
7
6
  from sky.serve.client import impl
8
7
  from sky.server import common as server_common
9
8
  from sky.server import rest
@@ -290,27 +289,13 @@ def tail_logs(service_name: str,
290
289
  sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
291
290
  ValueError: arguments not valid, or failed to tail the logs.
292
291
  """
293
- # Avoid circular import.
294
- from sky.client import sdk # pylint: disable=import-outside-toplevel
295
-
296
- body = payloads.ServeLogsBody(
297
- service_name=service_name,
298
- target=target,
299
- replica_id=replica_id,
300
- follow=follow,
301
- tail=tail,
302
- )
303
- response = server_common.make_authenticated_request(
304
- 'POST',
305
- '/serve/logs',
306
- json=json.loads(body.model_dump_json()),
307
- timeout=(5, None),
308
- stream=True)
309
- request_id = server_common.get_request_id(response)
310
- return sdk.stream_response(request_id=request_id,
311
- response=response,
312
- output_stream=output_stream,
313
- resumable=True)
292
+ return impl.tail_logs(service_name,
293
+ target,
294
+ replica_id,
295
+ follow,
296
+ output_stream,
297
+ tail,
298
+ pool=False)
314
299
 
315
300
 
316
301
  @usage_lib.entrypoint
@@ -320,8 +305,8 @@ def sync_down_logs(service_name: str,
320
305
  *,
321
306
  targets: Optional[Union[
322
307
  str, 'serve_utils.ServiceComponent',
323
- List[Union[str,
324
- 'serve_utils.ServiceComponent']]]] = None,
308
+ Sequence[Union[str,
309
+ 'serve_utils.ServiceComponent']]]] = None,
325
310
  replica_ids: Optional[List[int]] = None,
326
311
  tail: Optional[int] = None) -> None:
327
312
  """Sync down logs from the service components to a local directory.
@@ -352,25 +337,9 @@ def sync_down_logs(service_name: str,
352
337
  sky.exceptions.ClusterNotUpError: If the controller is not up.
353
338
  ValueError: Arguments not valid.
354
339
  """
355
- # Avoid circular import.
356
- from sky.client import sdk # pylint: disable=import-outside-toplevel
357
-
358
- body = payloads.ServeDownloadLogsBody(
359
- service_name=service_name,
360
- # No need to set here, since the server will override it
361
- # to a directory on the API server.
362
- local_dir=local_dir,
363
- targets=targets,
364
- replica_ids=replica_ids,
365
- tail=tail,
366
- )
367
- response = server_common.make_authenticated_request(
368
- 'POST',
369
- '/serve/sync-down-logs',
370
- json=json.loads(body.model_dump_json()),
371
- timeout=(5, None))
372
- remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
373
-
374
- # Download from API server paths to the client's local_dir
375
- client_common.download_logs_from_api_server([remote_dir], remote_dir,
376
- local_dir)
340
+ return impl.sync_down_logs(service_name,
341
+ local_dir,
342
+ targets=targets,
343
+ replica_ids=replica_ids,
344
+ tail=tail,
345
+ pool=False)
sky/serve/constants.py CHANGED
@@ -106,7 +106,8 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
106
106
  # v2.0 - Added template-replica feature.
107
107
  # v3.0 - Added cluster pool.
108
108
  # v4.0 - Added pool argument to wait_service_registration.
109
- SERVE_VERSION = 4
109
+ # v5.0 - Added pool argument to stream_serve_process_logs & stream_replica_logs.
110
+ SERVE_VERSION = 5
110
111
 
111
112
  TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
112
113
  'The version of service is outdated and does not support manually '
sky/serve/controller.py CHANGED
@@ -27,11 +27,12 @@ from sky.utils import ux_utils
27
27
  logger = sky_logging.init_logger(__name__)
28
28
 
29
29
 
30
- class SuppressSuccessGetAccessLogsFilter(logging.Filter):
30
+ class AutoscalerInfoFilter(logging.Filter):
31
31
 
32
32
  def filter(self, record: logging.LogRecord) -> bool:
33
33
  message = record.getMessage()
34
- return not ('GET' in message and '200' in message)
34
+ return not ('GET' in message and '200' in message and
35
+ '/autoscaler/info' in message)
35
36
 
36
37
 
37
38
  class SkyServeController:
@@ -61,6 +62,7 @@ class SkyServeController:
61
62
  uvicorn_access_logger = logging.getLogger('uvicorn.access')
62
63
  for handler in uvicorn_access_logger.handlers:
63
64
  handler.setFormatter(sky_logging.FORMATTER)
65
+ handler.addFilter(AutoscalerInfoFilter())
64
66
  yield
65
67
 
66
68
  def _run_autoscaler(self):
sky/serve/serve_state.py CHANGED
@@ -9,6 +9,7 @@ import sqlite3
9
9
  import threading
10
10
  import typing
11
11
  from typing import Any, Dict, List, Optional, Tuple
12
+ import uuid
12
13
 
13
14
  import colorama
14
15
 
@@ -82,6 +83,13 @@ def create_table(cursor: 'sqlite3.Cursor', conn: 'sqlite3.Connection') -> None:
82
83
  'controller_pid',
83
84
  'INTEGER DEFAULT NULL',
84
85
  value_to_replace_existing_entries=-1)
86
+ # The service hash. Unique for each service, even if the service name is
87
+ # the same.
88
+ db_utils.add_column_to_table(cursor, conn, 'services', 'hash',
89
+ 'TEXT DEFAULT NULL')
90
+ # Entrypoint to launch the service.
91
+ db_utils.add_column_to_table(cursor, conn, 'services', 'entrypoint',
92
+ 'TEXT DEFAULT NULL')
85
93
  conn.commit()
86
94
 
87
95
 
@@ -284,7 +292,7 @@ _SERVICE_STATUS_TO_COLOR = {
284
292
  def add_service(name: str, controller_job_id: int, policy: str,
285
293
  requested_resources_str: str, load_balancing_policy: str,
286
294
  status: ServiceStatus, tls_encrypted: bool, pool: bool,
287
- controller_pid: int) -> bool:
295
+ controller_pid: int, entrypoint: str) -> bool:
288
296
  """Add a service in the database.
289
297
 
290
298
  Returns:
@@ -299,11 +307,12 @@ def add_service(name: str, controller_job_id: int, policy: str,
299
307
  INSERT INTO services
300
308
  (name, controller_job_id, status, policy,
301
309
  requested_resources_str, load_balancing_policy, tls_encrypted,
302
- pool, controller_pid)
303
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
310
+ pool, controller_pid, hash, entrypoint)
311
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
304
312
  (name, controller_job_id, status.value, policy,
305
313
  requested_resources_str, load_balancing_policy,
306
- int(tls_encrypted), int(pool), controller_pid))
314
+ int(tls_encrypted), int(pool), controller_pid, str(
315
+ uuid.uuid4()), entrypoint))
307
316
 
308
317
  except sqlite3.IntegrityError as e:
309
318
  if str(e) != _UNIQUE_CONSTRAINT_FAILED_ERROR_MSG:
@@ -397,7 +406,7 @@ def _get_service_from_row(row) -> Dict[str, Any]:
397
406
  (current_version, name, controller_job_id, controller_port,
398
407
  load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
399
408
  _, active_versions, load_balancing_policy, tls_encrypted, pool,
400
- controller_pid) = row[:17]
409
+ controller_pid, svc_hash, entrypoint) = row[:19]
401
410
  record = {
402
411
  'name': name,
403
412
  'controller_job_id': controller_job_id,
@@ -418,6 +427,8 @@ def _get_service_from_row(row) -> Dict[str, Any]:
418
427
  'tls_encrypted': bool(tls_encrypted),
419
428
  'pool': bool(pool),
420
429
  'controller_pid': controller_pid,
430
+ 'hash': svc_hash,
431
+ 'entrypoint': entrypoint,
421
432
  }
422
433
  latest_spec = get_spec(name, current_version)
423
434
  if latest_spec is not None:
@@ -459,6 +470,18 @@ def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
459
470
  return None
460
471
 
461
472
 
473
+ @init_db
474
+ def get_service_hash(service_name: str) -> Optional[str]:
475
+ """Get the hash of a service."""
476
+ assert _DB_PATH is not None
477
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
478
+ rows = cursor.execute('SELECT hash FROM services WHERE name=(?)',
479
+ (service_name,)).fetchall()
480
+ for row in rows:
481
+ return row[0]
482
+ return None
483
+
484
+
462
485
  @init_db
463
486
  def get_service_versions(service_name: str) -> List[int]:
464
487
  """Gets all versions of a service."""
sky/serve/serve_utils.py CHANGED
@@ -20,6 +20,7 @@ import uuid
20
20
 
21
21
  import colorama
22
22
  import filelock
23
+ import yaml
23
24
 
24
25
  from sky import backends
25
26
  from sky import exceptions
@@ -65,13 +66,12 @@ def get_num_service_threshold():
65
66
 
66
67
  _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
67
68
 
68
- # NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
69
- # and always appear after a space. Be careful when changing UX as this
70
- # assumption is used to expand some log files while ignoring others.
71
- _SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
72
- _SKYPILOT_PROVISION_LOG_PATTERN = (
73
- fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
74
- _SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
69
+ # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
70
+ # when changing UX as this assumption is used to expand some log files while
71
+ # ignoring others.
72
+ _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
73
+ _SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
74
+ _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
75
75
 
76
76
  # TODO(tian): Find all existing replica id and print here.
77
77
  _FAILED_TO_FIND_REPLICA_MSG = (
@@ -668,12 +668,18 @@ def _get_service_status(
668
668
  if record['pool']:
669
669
  latest_yaml_path = generate_task_yaml_file_name(service_name,
670
670
  record['version'])
671
- original_config = common_utils.read_yaml(latest_yaml_path)
672
- original_config.pop('run', None)
673
- svc: Dict[str, Any] = original_config.pop('service')
674
- if svc is not None:
675
- svc.pop('pool', None)
676
- original_config['pool'] = svc
671
+ raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
672
+ original_config = raw_yaml_config.get('_user_specified_yaml')
673
+ if original_config is None:
674
+ # Fall back to old display format.
675
+ original_config = raw_yaml_config
676
+ original_config.pop('run', None)
677
+ svc: Dict[str, Any] = original_config.pop('service')
678
+ if svc is not None:
679
+ svc.pop('pool', None) # Remove pool from service config
680
+ original_config['pool'] = svc # Add pool to root config
681
+ else:
682
+ original_config = yaml.safe_load(original_config)
677
683
  record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
678
684
 
679
685
  record['target_num_replicas'] = 0
@@ -959,8 +965,10 @@ def wait_service_registration(service_name: str, job_id: int,
959
965
  """
960
966
  start_time = time.time()
961
967
  setup_completed = False
968
+ noun = 'pool' if pool else 'service'
962
969
  while True:
963
- # TODO(tian): PID-based tracking.
970
+ # Only do this check for non-consolidation mode as consolidation mode
971
+ # has no setup process.
964
972
  if not is_consolidation_mode(pool):
965
973
  job_status = job_lib.get_status(job_id)
966
974
  if job_status is None or job_status < job_lib.JobStatus.RUNNING:
@@ -971,7 +979,7 @@ def wait_service_registration(service_name: str, job_id: int,
971
979
  with ux_utils.print_exception_no_traceback():
972
980
  raise RuntimeError(
973
981
  f'Failed to start the controller process for '
974
- f'the service {service_name!r} within '
982
+ f'the {noun} {service_name!r} within '
975
983
  f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
976
984
  f' seconds.')
977
985
  # No need to check the service status as the controller process
@@ -979,22 +987,26 @@ def wait_service_registration(service_name: str, job_id: int,
979
987
  time.sleep(1)
980
988
  continue
981
989
 
982
- if not setup_completed:
983
- setup_completed = True
984
- # Reset the start time to wait for the service to be registered.
985
- start_time = time.time()
990
+ if not setup_completed:
991
+ setup_completed = True
992
+ # Reset the start time to wait for the service to be registered.
993
+ start_time = time.time()
986
994
 
987
- record = serve_state.get_service_from_name(service_name)
995
+ record = _get_service_status(service_name,
996
+ pool=pool,
997
+ with_replica_info=False)
988
998
  if record is not None:
989
- # TODO(tian): PID-based tracking.
990
- if (not is_consolidation_mode(pool) and
991
- job_id != record['controller_job_id']):
999
+ if job_id != record['controller_job_id']:
1000
+ if pool:
1001
+ command_to_run = 'sky jobs pool apply --pool'
1002
+ else:
1003
+ command_to_run = 'sky serve update'
992
1004
  with ux_utils.print_exception_no_traceback():
993
1005
  raise ValueError(
994
- f'The service {service_name!r} is already running. '
995
- 'Please specify a different name for your service. '
996
- 'To update an existing service, run: sky serve update '
997
- f'{service_name} <new-service-yaml>')
1006
+ f'The {noun} {service_name!r} is already running. '
1007
+ f'Please specify a different name for your {noun}. '
1008
+ f'To update an existing {noun}, run: {command_to_run}'
1009
+ f' {service_name} <new-{noun}-yaml>')
998
1010
  lb_port = record['load_balancer_port']
999
1011
  if lb_port is not None:
1000
1012
  return message_utils.encode_payload(lb_port)
@@ -1023,12 +1035,16 @@ def load_service_initialization_result(payload: str) -> int:
1023
1035
  return message_utils.decode_payload(payload)
1024
1036
 
1025
1037
 
1026
- def check_service_status_healthy(service_name: str) -> Optional[str]:
1027
- service_record = serve_state.get_service_from_name(service_name)
1038
+ def _check_service_status_healthy(service_name: str,
1039
+ pool: bool) -> Optional[str]:
1040
+ service_record = _get_service_status(service_name,
1041
+ pool,
1042
+ with_replica_info=False)
1043
+ capnoun = 'Service' if not pool else 'Pool'
1028
1044
  if service_record is None:
1029
- return f'Service {service_name!r} does not exist.'
1045
+ return f'{capnoun} {service_name!r} does not exist.'
1030
1046
  if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
1031
- return (f'Service {service_name!r} is still initializing its '
1047
+ return (f'{capnoun} {service_name!r} is still initializing its '
1032
1048
  'controller. Please try again later.')
1033
1049
  return None
1034
1050
 
@@ -1067,7 +1083,10 @@ def _process_line(line: str,
1067
1083
  log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
1068
1084
 
1069
1085
  if provision_log_prompt is not None:
1070
- nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
1086
+ log_path = provision_log_prompt.group(1)
1087
+ nested_log_path = pathlib.Path(
1088
+ skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
1089
+ log_path).resolve()
1071
1090
 
1072
1091
  try:
1073
1092
  with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
@@ -1159,12 +1178,14 @@ def _capped_follow_logs_with_provision_expanding(
1159
1178
 
1160
1179
 
1161
1180
  def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1162
- tail: Optional[int]) -> str:
1163
- msg = check_service_status_healthy(service_name)
1181
+ tail: Optional[int], pool: bool) -> str:
1182
+ msg = _check_service_status_healthy(service_name, pool=pool)
1164
1183
  if msg is not None:
1165
1184
  return msg
1185
+ repnoun = 'worker' if pool else 'replica'
1186
+ caprepnoun = repnoun.capitalize()
1166
1187
  print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
1167
- f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
1188
+ f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
1168
1189
  log_file_name = generate_replica_log_file_name(service_name, replica_id)
1169
1190
  if os.path.exists(log_file_name):
1170
1191
  if tail is not None:
@@ -1181,7 +1202,7 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1181
1202
  launch_log_file_name = generate_replica_launch_log_file_name(
1182
1203
  service_name, replica_id)
1183
1204
  if not os.path.exists(launch_log_file_name):
1184
- return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.'
1205
+ return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
1185
1206
  f'{colorama.Style.RESET_ALL}')
1186
1207
 
1187
1208
  replica_cluster_name = generate_replica_cluster_name(
@@ -1231,6 +1252,10 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1231
1252
  print(line, end='', flush=True)
1232
1253
  return ''
1233
1254
 
1255
+ # For pools, we don't stream the job logs as the run section is ignored.
1256
+ if pool:
1257
+ return ''
1258
+
1234
1259
  backend = backends.CloudVmRayBackend()
1235
1260
  handle = global_user_state.get_handle_from_cluster_name(
1236
1261
  replica_cluster_name)
@@ -1245,13 +1270,13 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1245
1270
 
1246
1271
  # Notify user here to make sure user won't think the log is finished.
1247
1272
  print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
1248
- f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
1273
+ f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
1249
1274
 
1250
1275
  # Always tail the latest logs, which represent user setup & run.
1251
1276
  if tail is None:
1252
1277
  returncode = backend.tail_logs(handle, job_id=None, follow=follow)
1253
1278
  if returncode != 0:
1254
- return (f'{colorama.Fore.RED}Failed to stream logs for replica '
1279
+ return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
1255
1280
  f'{replica_id}.{colorama.Style.RESET_ALL}')
1256
1281
  elif not follow and tail > 0:
1257
1282
  final = backend.tail_logs(handle,
@@ -1278,8 +1303,9 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
1278
1303
 
1279
1304
 
1280
1305
  def stream_serve_process_logs(service_name: str, stream_controller: bool,
1281
- follow: bool, tail: Optional[int]) -> str:
1282
- msg = check_service_status_healthy(service_name)
1306
+ follow: bool, tail: Optional[int],
1307
+ pool: bool) -> str:
1308
+ msg = _check_service_status_healthy(service_name, pool)
1283
1309
  if msg is not None:
1284
1310
  return msg
1285
1311
  if stream_controller:
@@ -1288,7 +1314,9 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
1288
1314
  log_file = generate_remote_load_balancer_log_file_name(service_name)
1289
1315
 
1290
1316
  def _service_is_terminal() -> bool:
1291
- record = serve_state.get_service_from_name(service_name)
1317
+ record = _get_service_status(service_name,
1318
+ pool,
1319
+ with_replica_info=False)
1292
1320
  if record is None:
1293
1321
  return True
1294
1322
  return record['status'] in serve_state.ServiceStatus.failed_statuses()
@@ -1531,21 +1559,24 @@ class ServeCodeGen:
1531
1559
 
1532
1560
  @classmethod
1533
1561
  def stream_replica_logs(cls, service_name: str, replica_id: int,
1534
- follow: bool, tail: Optional[int]) -> str:
1562
+ follow: bool, tail: Optional[int],
1563
+ pool: bool) -> str:
1535
1564
  code = [
1565
+ f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
1536
1566
  'msg = serve_utils.stream_replica_logs('
1537
- f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})',
1538
- 'print(msg, flush=True)'
1567
+ f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
1568
+ '**kwargs)', 'print(msg, flush=True)'
1539
1569
  ]
1540
1570
  return cls._build(code)
1541
1571
 
1542
1572
  @classmethod
1543
1573
  def stream_serve_process_logs(cls, service_name: str,
1544
1574
  stream_controller: bool, follow: bool,
1545
- tail: Optional[int]) -> str:
1575
+ tail: Optional[int], pool: bool) -> str:
1546
1576
  code = [
1577
+ f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
1547
1578
  f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
1548
- f'{stream_controller}, follow={follow}, tail={tail})',
1579
+ f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
1549
1580
  'print(msg, flush=True)'
1550
1581
  ]
1551
1582
  return cls._build(code)