skypilot-nightly 1.0.0.dev20250804__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff shows the changes between two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (151)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/catalog/kubernetes_catalog.py +8 -0
  4. sky/catalog/nebius_catalog.py +0 -1
  5. sky/check.py +11 -1
  6. sky/client/cli/command.py +234 -100
  7. sky/client/sdk.py +30 -9
  8. sky/client/sdk_async.py +815 -0
  9. sky/clouds/kubernetes.py +6 -1
  10. sky/clouds/nebius.py +1 -4
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  14. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3698-7874720877646365.js → 3850-ff4a9a69d978632b.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  26. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9025.7937c16bc8623516.js → 9025.a1bef12d672bb66d.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  30. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/{9984.78ee6d2c6fa4b0e8.js → 9984.c5564679e467d245.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/{_app-a67ae198457b9886.js → _app-1e6de35d15a8d432.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{config-8620d099cbef8608.js → config-dfb9bf07b13045f4.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  41. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  49. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  50. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  51. sky/dashboard/out/clusters/[cluster].html +1 -1
  52. sky/dashboard/out/clusters.html +1 -1
  53. sky/dashboard/out/config.html +1 -1
  54. sky/dashboard/out/index.html +1 -1
  55. sky/dashboard/out/infra/[context].html +1 -1
  56. sky/dashboard/out/infra.html +1 -1
  57. sky/dashboard/out/jobs/[job].html +1 -1
  58. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -1
  61. sky/dashboard/out/volumes.html +1 -1
  62. sky/dashboard/out/workspace/new.html +1 -1
  63. sky/dashboard/out/workspaces/[name].html +1 -1
  64. sky/dashboard/out/workspaces.html +1 -1
  65. sky/global_user_state.py +14 -2
  66. sky/jobs/__init__.py +2 -0
  67. sky/jobs/client/sdk.py +43 -2
  68. sky/jobs/client/sdk_async.py +135 -0
  69. sky/jobs/server/core.py +48 -1
  70. sky/jobs/server/server.py +52 -3
  71. sky/jobs/state.py +5 -1
  72. sky/jobs/utils.py +3 -1
  73. sky/provision/kubernetes/utils.py +30 -4
  74. sky/provision/nebius/instance.py +1 -0
  75. sky/provision/nebius/utils.py +9 -1
  76. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  77. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  78. sky/serve/client/impl.py +85 -1
  79. sky/serve/client/sdk.py +16 -47
  80. sky/serve/client/sdk_async.py +130 -0
  81. sky/serve/constants.py +3 -1
  82. sky/serve/controller.py +6 -3
  83. sky/serve/load_balancer.py +3 -1
  84. sky/serve/serve_state.py +93 -5
  85. sky/serve/serve_utils.py +200 -67
  86. sky/serve/server/core.py +13 -197
  87. sky/serve/server/impl.py +261 -23
  88. sky/serve/service.py +15 -3
  89. sky/server/auth/__init__.py +0 -0
  90. sky/server/auth/authn.py +46 -0
  91. sky/server/auth/oauth2_proxy.py +185 -0
  92. sky/server/common.py +119 -21
  93. sky/server/constants.py +1 -1
  94. sky/server/daemons.py +60 -11
  95. sky/server/requests/executor.py +5 -3
  96. sky/server/requests/payloads.py +19 -0
  97. sky/server/rest.py +114 -0
  98. sky/server/server.py +44 -40
  99. sky/setup_files/dependencies.py +2 -0
  100. sky/skylet/constants.py +1 -1
  101. sky/skylet/events.py +5 -1
  102. sky/skylet/skylet.py +3 -1
  103. sky/task.py +61 -21
  104. sky/templates/kubernetes-ray.yml.j2 +9 -0
  105. sky/templates/nebius-ray.yml.j2 +1 -0
  106. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  107. sky/usage/usage_lib.py +8 -6
  108. sky/utils/annotations.py +8 -3
  109. sky/utils/common_utils.py +11 -1
  110. sky/utils/controller_utils.py +7 -0
  111. sky/utils/db/migration_utils.py +2 -2
  112. sky/utils/rich_utils.py +120 -0
  113. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +22 -13
  114. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +120 -112
  115. sky/client/sdk.pyi +0 -300
  116. sky/dashboard/out/_next/static/KiGGm4fK0CpmN6BT17jkh/_buildManifest.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +0 -11
  119. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/1871-7e17c195296e2ea9.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  126. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9e7df5fc761c95a7.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6c5af4c86e6ab3d3.js +0 -11
  138. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/webpack-13145516b19858fb.js +0 -1
  145. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  146. /sky/dashboard/out/_next/static/{KiGGm4fK0CpmN6BT17jkh → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  147. /sky/dashboard/out/_next/static/chunks/{6135-d0e285ac5f3f2485.js → 6135-85426374db04811e.js} +0 -0
  148. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  149. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  150. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  151. {skypilot_nightly-1.0.0.dev20250804.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/serve/server/core.py CHANGED
@@ -1,9 +1,6 @@
 """SkyServe core APIs."""
-import pathlib
-import signal
-import threading
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from sky import backends
 from sky import exceptions
@@ -12,11 +9,8 @@ from sky.backends import backend_utils
 from sky.serve import serve_utils
 from sky.serve.server import impl
 from sky.usage import usage_lib
-from sky.utils import command_runner
 from sky.utils import controller_utils
-from sky.utils import rich_utils
 from sky.utils import subprocess_utils
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
@@ -24,42 +18,6 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
-def _get_all_replica_targets(
-    service_name: str, backend: backends.CloudVmRayBackend,
-    handle: backends.CloudVmRayResourceHandle
-) -> Set[serve_utils.ServiceComponentTarget]:
-    """Helper function to get targets for all live replicas."""
-    code = serve_utils.ServeCodeGen.get_service_status([service_name],
-                                                       pool=False)
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    if not service_records:
-        raise ValueError(f'Service {service_name!r} not found.')
-    assert len(service_records) == 1
-    service_record = service_records[0]
-
-    return {
-        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
-                                           replica_info['replica_id'])
-        for replica_info in service_record['replica_info']
-    }
-
-
 @usage_lib.entrypoint
 def up(
     task: 'sky.Task',
@@ -277,59 +235,12 @@ def tail_logs(
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-    if isinstance(target, str):
-        target = serve_utils.ServiceComponent(target)
-    if not isinstance(target, serve_utils.ServiceComponent):
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'`target` must be a string or '
-                             f'sky.serve.ServiceComponent, got {type(target)}.')
-
-    if target == serve_utils.ServiceComponent.REPLICA:
-        if replica_id is None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`replica_id` must be specified when using target=REPLICA.')
-    else:
-        if replica_id is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('`replica_id` must be None when using '
-                                 'target=CONTROLLER/LOAD_BALANCER.')
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend), backend
-
-    if target != serve_utils.ServiceComponent.REPLICA:
-        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
-            service_name,
-            stream_controller=(
-                target == serve_utils.ServiceComponent.CONTROLLER),
-            follow=follow,
-            tail=tail)
-    else:
-        assert replica_id is not None, service_name
-        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
-                                                            replica_id,
-                                                            follow,
-                                                            tail=tail)
-
-    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
-    # kill the process, so we need to handle it manually here.
-    if threading.current_thread() is threading.main_thread():
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-    # Refer to the notes in
-    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-    backend.run_on_head(handle,
-                        code,
-                        stream_logs=True,
-                        process_stream=False,
-                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+    return impl.tail_logs(service_name,
+                          target=target,
+                          replica_id=replica_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=False)
 
 
 @usage_lib.entrypoint
@@ -374,104 +285,9 @@ def sync_down_logs(
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-    # Step 0) get the controller handle
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Checking service status...')):
-        controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-        handle = backend_utils.is_controller_accessible(
-            controller=controller_type,
-            stopped_message=controller_type.value.default_hint_if_non_existent)
-        backend: backends.CloudVmRayBackend = (
-            backend_utils.get_backend_from_handle(handle))
-
-    requested_components: Set[serve_utils.ServiceComponent] = set()
-    if not targets:
-        # No targets specified -> request all components
-        requested_components = {
-            serve_utils.ServiceComponent.CONTROLLER,
-            serve_utils.ServiceComponent.LOAD_BALANCER,
-            serve_utils.ServiceComponent.REPLICA
-        }
-    else:
-        # Parse provided targets
-        if isinstance(targets, (str, serve_utils.ServiceComponent)):
-            requested_components = {serve_utils.ServiceComponent(targets)}
-        else:  # list
-            requested_components = {
-                serve_utils.ServiceComponent(t) for t in targets
-            }
-
-    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
-    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.CONTROLLER))
-    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.LOAD_BALANCER))
-    if serve_utils.ServiceComponent.REPLICA in requested_components:
-        with rich_utils.safe_status(
-                ux_utils.spinner_message('Getting live replica infos...')):
-            replica_targets = _get_all_replica_targets(service_name, backend,
-                                                       handle)
-        if not replica_ids:
-            # Replica target requested but no specific IDs
-            # -> Get all replica logs
-            normalized_targets.update(replica_targets)
-        else:
-            # Replica target requested with specific IDs
-            requested_replica_targets = [
-                serve_utils.ServiceComponentTarget(
-                    serve_utils.ServiceComponent.REPLICA, rid)
-                for rid in replica_ids
-            ]
-            for target in requested_replica_targets:
-                if target not in replica_targets:
-                    logger.warning(f'Replica ID {target.replica_id} not found '
-                                   f'for {service_name}. Skipping...')
-                else:
-                    normalized_targets.add(target)
-
-    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
-        component = target.component
-        # We need to set one side of the pipe to a logs stream, and the other
-        # side to a file.
-        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
-        stream_logs_code: str
-
-        if component == serve_utils.ServiceComponent.CONTROLLER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=True,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=False,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.REPLICA:
-            replica_id = target.replica_id
-            assert replica_id is not None, service_name
-            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow=False, tail=tail)
-        else:
-            assert False, component
-
-        # Refer to the notes in
-        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-        backend.run_on_head(handle,
-                            stream_logs_code,
-                            stream_logs=False,
-                            process_stream=False,
-                            ssh_mode=command_runner.SshMode.INTERACTIVE,
-                            log_path=log_path)
-
-    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
-                                     list(normalized_targets))
-
-    return local_dir
+    return impl.sync_down_logs(service_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
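The `tail_logs` body removed above (and re-added in `sky/serve/server/impl.py` below) only installs SIGINT/SIGTSTP handlers when it runs on the main thread, because `signal.signal` raises `ValueError` from any other thread. A minimal sketch of that guard, with a hypothetical placeholder handler standing in for `backend_utils.interrupt_handler` / `backend_utils.stop_handler`:

```python
import signal
import threading


def _interrupt_handler(signum, frame):
    # Hypothetical stand-in for backend_utils.interrupt_handler.
    raise KeyboardInterrupt()


def install_log_tail_handlers() -> None:
    # signal.signal() may only be called from the main thread of the main
    # interpreter; calling it from a worker thread raises ValueError, hence
    # the main-thread check before registering the handlers.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGINT, _interrupt_handler)
        signal.signal(signal.SIGTSTP, signal.SIG_DFL)  # stand-in for stop_handler
```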
sky/serve/server/impl.py CHANGED
@@ -1,7 +1,12 @@
 """Implementation of the SkyServe core APIs."""
+import pathlib
 import re
+import shlex
+import signal
 import tempfile
-from typing import Any, Dict, List, Optional, Tuple, Union
+import threading
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+import uuid
 
 import colorama
 import filelock
@@ -21,6 +26,7 @@ from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import admin_policy_utils
+from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
@@ -102,10 +108,10 @@ def up(
     pool: bool = False,
 ) -> Tuple[str, str]:
     """Spins up a service or a pool."""
-    if pool and not serve_utils.is_consolidation_mode():
+    if pool and not serve_utils.is_consolidation_mode(pool):
         raise ValueError(
             'Pool is only supported in consolidation mode. To fix, set '
-            '`serve.controller.consolidation_mode: true` in SkyPilot config.')
+            '`jobs.controller.consolidation_mode: true` in SkyPilot config.')
     task.validate()
     serve_utils.validate_service_task(task, pool=pool)
     assert task.service is not None
@@ -174,7 +180,8 @@ def up(
             prefix=f'controller-task-{service_name}-',
             mode='w',
     ) as controller_file:
-        controller_name = common.SKY_SERVE_CONTROLLER_NAME
+        controller = controller_utils.get_controller_for_pool(pool)
+        controller_name = controller.value.cluster_name
         task_config = task.to_yaml_config()
         common_utils.dump_yaml(service_file.name, task_config)
         remote_tmp_task_yaml_path = (
@@ -187,8 +194,13 @@ def up(
             controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
             task_resources=task.resources)
         controller_job_id = None
-        if serve_utils.is_consolidation_mode():
-            controller_job_id = 0
+        if serve_utils.is_consolidation_mode(pool):
+            # We need a unique integer per sky.serve.up call to avoid name
+            # conflict. Originally in non-consolidation mode, this is the ray
+            # job id; now we use the request id hash instead. Here we also
+            # make sure it is a 63-bit integer to avoid overflow on sqlalchemy.
+            rid = common_utils.get_current_request_id()
+            controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFFFFFFFFFF
 
         vars_to_fill = {
             'remote_task_yaml_path': remote_tmp_task_yaml_path,
@@ -200,6 +212,7 @@ def up(
             'modified_catalogs':
                 service_catalog_common.get_modified_catalog_file_mounts(),
             'consolidation_mode_job_id': controller_job_id,
+            'entrypoint': shlex.quote(common_utils.get_current_command()),
             **tls_template_vars,
             **controller_utils.shared_controller_vars_to_fill(
                 controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
@@ -238,7 +251,7 @@ def up(
         # for the first time; otherwise it is a name conflict.
         # Since the controller may be shared among multiple users, launch the
         # controller with the API server's user hash.
-        if not serve_utils.is_consolidation_mode():
+        if not serve_utils.is_consolidation_mode(pool):
             print(f'{colorama.Fore.YELLOW}Launching controller for '
                   f'{service_name!r}...{colorama.Style.RESET_ALL}')
             with common.with_server_user():
@@ -251,9 +264,9 @@ def up(
                     _disable_controller_check=True,
                 )
         else:
+            controller_type = controller_utils.get_controller_for_pool(pool)
             controller_handle = backend_utils.is_controller_accessible(
-                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-                stopped_message='')
+                controller=controller_type, stopped_message='')
             backend = backend_utils.get_backend_from_handle(controller_handle)
             assert isinstance(backend, backends.CloudVmRayBackend)
             backend.sync_file_mounts(
@@ -270,10 +283,8 @@ def up(
             ]
             run_script = '\n'.join(env_cmds + [run_script])
             # Dump script for high availability recovery.
-            # if controller_utils.high_availability_specified(
-            #         controller_name):
-            #     managed_job_state.set_ha_recovery_script(
-            #         consolidation_mode_job_id, run_script)
+            if controller_utils.high_availability_specified(controller_name):
+                serve_state.set_ha_recovery_script(service_name, run_script)
             backend.run_on_head(controller_handle, run_script)
 
     style = colorama.Style
@@ -289,7 +300,7 @@ def up(
     # and return the endpoint if the job id matches. Otherwise it will
     # return None.
     code = serve_utils.ServeCodeGen.wait_service_registration(
-        service_name, controller_job_id)
+        service_name, controller_job_id, pool)
     backend = backend_utils.get_backend_from_handle(controller_handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
     assert isinstance(controller_handle,
@@ -304,7 +315,7 @@ def up(
             returncode, code, f'Failed to wait for {noun} initialization',
             lb_port_payload)
     except exceptions.CommandError:
-        if serve_utils.is_consolidation_mode():
+        if serve_utils.is_consolidation_mode(pool):
             with ux_utils.print_exception_no_traceback():
                 raise RuntimeError(
                     f'Failed to wait for {noun} initialization. '
@@ -339,7 +350,7 @@ def up(
     else:
         lb_port = serve_utils.load_service_initialization_result(
            lb_port_payload)
-        if not serve_utils.is_consolidation_mode():
+        if not serve_utils.is_consolidation_mode(pool):
            socket_endpoint = backend_utils.get_endpoints(
                controller_handle.cluster_name,
                lb_port,
@@ -442,8 +453,9 @@ def update(
                 'effect. To update TLS keyfile and certfile, please '
                 'tear down the service and spin up a new one.')
 
+    controller_type = controller_utils.get_controller_for_pool(pool)
     handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
+        controller=controller_type,
         stopped_message=
         'Service controller is stopped. There is no service to update. '
         f'To spin up a new service, use {ux_utils.BOLD}'
@@ -572,9 +584,9 @@ def apply(
     """Applies the config to the service or pool."""
     with filelock.FileLock(serve_utils.get_service_filelock_path(service_name)):
         try:
+            controller_type = controller_utils.get_controller_for_pool(pool)
             handle = backend_utils.is_controller_accessible(
-                controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-                stopped_message='')
+                controller=controller_type, stopped_message='')
             backend = backend_utils.get_backend_from_handle(handle)
             assert isinstance(backend, backends.CloudVmRayBackend)
             service_record = _get_service_record(service_name, pool, handle,
@@ -598,8 +610,9 @@ def down(
         service_names = []
     if isinstance(service_names, str):
         service_names = [service_names]
+    controller_type = controller_utils.get_controller_for_pool(pool)
     handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
+        controller=controller_type,
         stopped_message=f'All {noun}s should have terminated.')
 
     service_names_str = ','.join(service_names)
@@ -624,7 +637,7 @@ def down(
     except exceptions.FetchClusterInfoError as e:
         raise RuntimeError(
             'Failed to fetch controller IP. Please refresh controller status '
-            f'by `sky status -r {common.SKY_SERVE_CONTROLLER_NAME}` '
+            f'by `sky status -r {controller_type.value.cluster_name}` '
             'and try again.') from e
 
     try:
@@ -654,7 +667,7 @@ def status(
         raise RuntimeError(f'Failed to refresh {noun}s status '
                            'due to network error.') from e
 
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
+    controller_type = controller_utils.get_controller_for_pool(pool)
     handle = backend_utils.is_controller_accessible(
         controller=controller_type,
         stopped_message=controller_type.value.default_hint_if_non_existent.
@@ -690,7 +703,7 @@ def status(
     if service_record['load_balancer_port'] is not None:
         try:
             lb_port = service_record['load_balancer_port']
-            if not serve_utils.is_consolidation_mode():
+            if not serve_utils.is_consolidation_mode(pool):
                 endpoint = backend_utils.get_endpoints(
                     cluster=common.SKY_SERVE_CONTROLLER_NAME,
                     port=lb_port).get(lb_port, None)
@@ -707,3 +720,228 @@ def status(
             service_record['endpoint'] = f'{protocol}://{endpoint}'
 
     return service_records
+
+
+ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
+
+
+def tail_logs(
+    service_name: str,
+    *,
+    target: ServiceComponentOrStr,
+    replica_id: Optional[int] = None,
+    follow: bool = True,
+    tail: Optional[int] = None,
+    pool: bool = False,
+) -> None:
+    """Tail logs of a service or pool."""
+    if isinstance(target, str):
+        target = serve_utils.ServiceComponent(target)
+
+    if pool and target == serve_utils.ServiceComponent.LOAD_BALANCER:
+        raise ValueError(f'Target {target} is not supported for pool.')
+
+    if target == serve_utils.ServiceComponent.REPLICA:
+        if replica_id is None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`replica_id` must be specified when using target=REPLICA.')
+    else:
+        if replica_id is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('`replica_id` must be None when using '
+                                 'target=CONTROLLER/LOAD_BALANCER.')
+
+    controller_type = controller_utils.get_controller_for_pool(pool)
+    handle = backend_utils.is_controller_accessible(
+        controller=controller_type,
+        stopped_message=controller_type.value.default_hint_if_non_existent)
+
+    backend = backend_utils.get_backend_from_handle(handle)
+    assert isinstance(backend, backends.CloudVmRayBackend), backend
+
+    if target != serve_utils.ServiceComponent.REPLICA:
+        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
+            service_name,
+            stream_controller=(
+                target == serve_utils.ServiceComponent.CONTROLLER),
+            follow=follow,
+            tail=tail,
+            pool=pool)
+    else:
+        assert replica_id is not None, service_name
+        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
+                                                            replica_id,
+                                                            follow,
+                                                            tail=tail,
+                                                            pool=pool)
+
+    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
+    # kill the process, so we need to handle it manually here.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+    # Refer to the notes in
+    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+    backend.run_on_head(handle,
+                        code,
+                        stream_logs=True,
+                        process_stream=False,
+                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+
+
+def _get_all_replica_targets(
+    service_name: str, backend: backends.CloudVmRayBackend,
+    handle: backends.CloudVmRayResourceHandle,
+    pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
+    """Helper function to get targets for all live replicas."""
+    code = serve_utils.ServeCodeGen.get_service_status([service_name],
+                                                       pool=pool)
+    returncode, serve_status_payload, stderr = backend.run_on_head(
+        handle,
+        code,
+        require_outputs=True,
+        stream_logs=False,
+        separate_stderr=True)
+
+    try:
+        subprocess_utils.handle_returncode(returncode,
+                                           code,
+                                           'Failed to fetch services',
+                                           stderr,
+                                           stream_logs=True)
+    except exceptions.CommandError as e:
+        raise RuntimeError(e.error_msg) from e
+
+    service_records = serve_utils.load_service_status(serve_status_payload)
+    if not service_records:
+        raise ValueError(f'Service {service_name!r} not found.')
+    assert len(service_records) == 1
+    service_record = service_records[0]
+
+    return {
+        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
+                                           replica_info['replica_id'])
+        for replica_info in service_record['replica_info']
+    }
+
+
+def sync_down_logs(
+    service_name: str,
+    *,
+    local_dir: str,
+    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
+                   None] = None,
+    replica_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
+    pool: bool = False,
+) -> str:
+    """Sync down logs of a service or pool."""
+    noun = 'pool' if pool else 'service'
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
+
+    # Step 0) get the controller handle
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(f'Checking {noun} status...')):
+        controller_type = controller_utils.get_controller_for_pool(pool)
+        handle = backend_utils.is_controller_accessible(
+            controller=controller_type,
+            stopped_message=controller_type.value.default_hint_if_non_existent)
+        backend: backends.CloudVmRayBackend = (
+            backend_utils.get_backend_from_handle(handle))
+
+    requested_components: Set[serve_utils.ServiceComponent] = set()
+    if not targets:
+        # No targets specified -> request all components
+        requested_components = {
+            serve_utils.ServiceComponent.CONTROLLER,
+            serve_utils.ServiceComponent.LOAD_BALANCER,
+            serve_utils.ServiceComponent.REPLICA
+        }
+    else:
+        # Parse provided targets
+        if isinstance(targets, (str, serve_utils.ServiceComponent)):
+            requested_components = {serve_utils.ServiceComponent(targets)}
+        else:  # list
+            requested_components = {
+                serve_utils.ServiceComponent(t) for t in targets
+            }
+
+    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
+    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.CONTROLLER))
+    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.LOAD_BALANCER))
+    if serve_utils.ServiceComponent.REPLICA in requested_components:
+        with rich_utils.safe_status(
+                ux_utils.spinner_message(f'Getting live {repnoun} infos...')):
+            replica_targets = _get_all_replica_targets(service_name, backend,
+                                                       handle, pool)
+        if not replica_ids:
+            # Replica target requested but no specific IDs
+            # -> Get all replica logs
+            normalized_targets.update(replica_targets)
+        else:
+            # Replica target requested with specific IDs
+            requested_replica_targets = [
+                serve_utils.ServiceComponentTarget(
+                    serve_utils.ServiceComponent.REPLICA, rid)
+                for rid in replica_ids
+            ]
+            for target in requested_replica_targets:
+                if target not in replica_targets:
+                    logger.warning(f'{caprepnoun} ID {target.replica_id} not '
+                                   f'found for {service_name}. Skipping...')
+                else:
+                    normalized_targets.add(target)
+
+    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
+        component = target.component
+        # We need to set one side of the pipe to a logs stream, and the other
+        # side to a file.
+        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
+        stream_logs_code: str
+
+        if component == serve_utils.ServiceComponent.CONTROLLER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name,
+                    stream_controller=True,
+                    follow=False,
+                    tail=tail,
+                    pool=pool))
+        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name,
+                    stream_controller=False,
+                    follow=False,
+                    tail=tail,
+                    pool=pool))
+        elif component == serve_utils.ServiceComponent.REPLICA:
+            replica_id = target.replica_id
+            assert replica_id is not None, service_name
+            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
+                service_name, replica_id, follow=False, tail=tail, pool=pool)
+        else:
+            assert False, component
+
+        # Refer to the notes in
+        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+        backend.run_on_head(handle,
+                            stream_logs_code,
+                            stream_logs=False,
+                            process_stream=False,
+                            ssh_mode=command_runner.SshMode.INTERACTIVE,
+                            log_path=log_path)
+
+    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
+                                     list(normalized_targets))
+
+    return local_dir
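The consolidation-mode branch of `up()` above derives `controller_job_id` by hashing the request id down to a non-negative 63-bit integer so it fits a signed 64-bit database column. A minimal sketch of that derivation, assuming the request id is a UUID-formatted string (as the `uuid.UUID` call in the diff implies):

```python
import uuid


def consolidation_mode_job_id(request_id: str) -> int:
    """Map a UUID-formatted request id to a non-negative 63-bit integer.

    Masking with 0x7FFFFFFFFFFFFFFF clears the sign bit, so the result always
    fits a signed 64-bit (BIGINT) column without overflowing.
    """
    return hash(uuid.UUID(request_id).int) & 0x7FFFFFFFFFFFFFFF


# Example: a fixed request id maps to a stable job id, since hash() of ints
# is not randomized by PYTHONHASHSEED.
print(consolidation_mode_job_id('12345678-1234-5678-1234-567812345678'))
```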