skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250807__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this version of skypilot-nightly has been flagged as a potentially problematic release.

Files changed (123)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +33 -4
  3. sky/check.py +11 -1
  4. sky/client/cli/command.py +208 -93
  5. sky/client/sdk.py +14 -1
  6. sky/client/sdk_async.py +4 -0
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/YAirOGsV1z6B2RJ0VIUmD/_buildManifest.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  10. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  11. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/6601-3e21152fe16da09c.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  18. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/8969-318c3dca725e8e5d.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
  24. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  26. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-1e6de35d15a8d432.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
  35. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  36. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  37. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +1 -0
  44. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  45. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  46. sky/dashboard/out/clusters/[cluster].html +1 -1
  47. sky/dashboard/out/clusters.html +1 -1
  48. sky/dashboard/out/config.html +1 -1
  49. sky/dashboard/out/index.html +1 -1
  50. sky/dashboard/out/infra/[context].html +1 -1
  51. sky/dashboard/out/infra.html +1 -1
  52. sky/dashboard/out/jobs/[job].html +1 -1
  53. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  54. sky/dashboard/out/jobs.html +1 -1
  55. sky/dashboard/out/users.html +1 -1
  56. sky/dashboard/out/volumes.html +1 -1
  57. sky/dashboard/out/workspace/new.html +1 -1
  58. sky/dashboard/out/workspaces/[name].html +1 -1
  59. sky/dashboard/out/workspaces.html +1 -1
  60. sky/global_user_state.py +14 -2
  61. sky/jobs/__init__.py +2 -0
  62. sky/jobs/client/sdk.py +43 -2
  63. sky/jobs/server/core.py +48 -1
  64. sky/jobs/server/server.py +52 -3
  65. sky/jobs/state.py +5 -1
  66. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  67. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  68. sky/serve/client/impl.py +85 -1
  69. sky/serve/client/sdk.py +16 -47
  70. sky/serve/constants.py +2 -1
  71. sky/serve/controller.py +4 -2
  72. sky/serve/serve_state.py +28 -5
  73. sky/serve/serve_utils.py +77 -46
  74. sky/serve/server/core.py +13 -197
  75. sky/serve/server/impl.py +239 -2
  76. sky/serve/service.py +8 -3
  77. sky/server/common.py +11 -4
  78. sky/server/constants.py +1 -1
  79. sky/server/requests/executor.py +5 -3
  80. sky/server/requests/payloads.py +19 -0
  81. sky/task.py +18 -11
  82. sky/templates/kubernetes-ray.yml.j2 +5 -0
  83. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  84. sky/usage/usage_lib.py +8 -6
  85. sky/utils/annotations.py +8 -3
  86. sky/utils/common_utils.py +11 -1
  87. sky/utils/db/migration_utils.py +2 -2
  88. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/METADATA +18 -13
  89. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/RECORD +95 -92
  90. sky/client/sdk.pyi +0 -301
  91. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
  92. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
  93. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
  94. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  95. sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
  96. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  97. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
  99. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  101. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  102. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
  107. sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
  109. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
  116. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
  117. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  118. /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → YAirOGsV1z6B2RJ0VIUmD}/_ssgManifest.js +0 -0
  119. /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
  120. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/WHEEL +0 -0
  121. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/entry_points.txt +0 -0
  122. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/licenses/LICENSE +0 -0
  123. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250807.dist-info}/top_level.txt +0 -0
sky/serve/server/core.py CHANGED
@@ -1,9 +1,6 @@
 """SkyServe core APIs."""
-import pathlib
-import signal
-import threading
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 from sky import backends
 from sky import exceptions
@@ -12,11 +9,8 @@ from sky.backends import backend_utils
 from sky.serve import serve_utils
 from sky.serve.server import impl
 from sky.usage import usage_lib
-from sky.utils import command_runner
 from sky.utils import controller_utils
-from sky.utils import rich_utils
 from sky.utils import subprocess_utils
-from sky.utils import ux_utils

 if typing.TYPE_CHECKING:
     import sky
@@ -24,42 +18,6 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)


-def _get_all_replica_targets(
-        service_name: str, backend: backends.CloudVmRayBackend,
-        handle: backends.CloudVmRayResourceHandle
-) -> Set[serve_utils.ServiceComponentTarget]:
-    """Helper function to get targets for all live replicas."""
-    code = serve_utils.ServeCodeGen.get_service_status([service_name],
-                                                       pool=False)
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    if not service_records:
-        raise ValueError(f'Service {service_name!r} not found.')
-    assert len(service_records) == 1
-    service_record = service_records[0]
-
-    return {
-        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
-                                           replica_info['replica_id'])
-        for replica_info in service_record['replica_info']
-    }
-
-
 @usage_lib.entrypoint
 def up(
     task: 'sky.Task',
@@ -277,59 +235,12 @@ def tail_logs(
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-    if isinstance(target, str):
-        target = serve_utils.ServiceComponent(target)
-    if not isinstance(target, serve_utils.ServiceComponent):
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'`target` must be a string or '
-                             f'sky.serve.ServiceComponent, got {type(target)}.')
-
-    if target == serve_utils.ServiceComponent.REPLICA:
-        if replica_id is None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`replica_id` must be specified when using target=REPLICA.')
-    else:
-        if replica_id is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('`replica_id` must be None when using '
-                                 'target=CONTROLLER/LOAD_BALANCER.')
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend), backend
-
-    if target != serve_utils.ServiceComponent.REPLICA:
-        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
-            service_name,
-            stream_controller=(
-                target == serve_utils.ServiceComponent.CONTROLLER),
-            follow=follow,
-            tail=tail)
-    else:
-        assert replica_id is not None, service_name
-        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
-                                                            replica_id,
-                                                            follow,
-                                                            tail=tail)
-
-    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
-    # kill the process, so we need to handle it manually here.
-    if threading.current_thread() is threading.main_thread():
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-    # Refer to the notes in
-    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-    backend.run_on_head(handle,
-                        code,
-                        stream_logs=True,
-                        process_stream=False,
-                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+    return impl.tail_logs(service_name,
+                          target=target,
+                          replica_id=replica_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=False)


 @usage_lib.entrypoint
@@ -374,104 +285,9 @@ def sync_down_logs(
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-    # Step 0) get the controller handle
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Checking service status...')):
-        controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-        handle = backend_utils.is_controller_accessible(
-            controller=controller_type,
-            stopped_message=controller_type.value.default_hint_if_non_existent)
-        backend: backends.CloudVmRayBackend = (
-            backend_utils.get_backend_from_handle(handle))
-
-    requested_components: Set[serve_utils.ServiceComponent] = set()
-    if not targets:
-        # No targets specified -> request all components
-        requested_components = {
-            serve_utils.ServiceComponent.CONTROLLER,
-            serve_utils.ServiceComponent.LOAD_BALANCER,
-            serve_utils.ServiceComponent.REPLICA
-        }
-    else:
-        # Parse provided targets
-        if isinstance(targets, (str, serve_utils.ServiceComponent)):
-            requested_components = {serve_utils.ServiceComponent(targets)}
-        else:  # list
-            requested_components = {
-                serve_utils.ServiceComponent(t) for t in targets
-            }
-
-    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
-    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.CONTROLLER))
-    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.LOAD_BALANCER))
-    if serve_utils.ServiceComponent.REPLICA in requested_components:
-        with rich_utils.safe_status(
-                ux_utils.spinner_message('Getting live replica infos...')):
-            replica_targets = _get_all_replica_targets(service_name, backend,
-                                                       handle)
-        if not replica_ids:
-            # Replica target requested but no specific IDs
-            # -> Get all replica logs
-            normalized_targets.update(replica_targets)
-        else:
-            # Replica target requested with specific IDs
-            requested_replica_targets = [
-                serve_utils.ServiceComponentTarget(
-                    serve_utils.ServiceComponent.REPLICA, rid)
-                for rid in replica_ids
-            ]
-            for target in requested_replica_targets:
-                if target not in replica_targets:
-                    logger.warning(f'Replica ID {target.replica_id} not found '
-                                   f'for {service_name}. Skipping...')
-                else:
-                    normalized_targets.add(target)
-
-    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
-        component = target.component
-        # We need to set one side of the pipe to a logs stream, and the other
-        # side to a file.
-        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
-        stream_logs_code: str
-
-        if component == serve_utils.ServiceComponent.CONTROLLER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=True,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=False,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.REPLICA:
-            replica_id = target.replica_id
-            assert replica_id is not None, service_name
-            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow=False, tail=tail)
-        else:
-            assert False, component
-
-        # Refer to the notes in
-        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-        backend.run_on_head(handle,
-                            stream_logs_code,
-                            stream_logs=False,
-                            process_stream=False,
-                            ssh_mode=command_runner.SshMode.INTERACTIVE,
-                            log_path=log_path)
-
-    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
-                                     list(normalized_targets))
-
-    return local_dir
+    return impl.sync_down_logs(service_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
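Note: the bodies removed above are not deleted from the package; they move into sky/serve/server/impl.py (next diff) behind a shared pool flag, while core.py keeps the public signatures and only delegates. A caller-facing sketch, assuming 'controller' remains a valid ServiceComponent string value:

import sky.serve.server.core as serve_core

# Behavior should be unchanged by the refactor: the public wrapper simply
# forwards to impl.tail_logs(..., pool=False).
serve_core.tail_logs('my-service', target='controller', follow=True, tail=100)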
sky/serve/server/impl.py CHANGED
@@ -1,7 +1,12 @@
 """Implementation of the SkyServe core APIs."""
+import pathlib
 import re
+import shlex
+import signal
 import tempfile
-from typing import Any, Dict, List, Optional, Tuple, Union
+import threading
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+import uuid

 import colorama
 import filelock
@@ -21,6 +26,7 @@ from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import admin_policy_utils
+from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
@@ -189,7 +195,12 @@ def up(
         task_resources=task.resources)
     controller_job_id = None
     if serve_utils.is_consolidation_mode(pool):
-        controller_job_id = 0
+        # We need a unique integer per sky.serve.up call to avoid name
+        # conflict. Originally in non-consolidation mode, this is the ray
+        # job id; now we use the request id hash instead. Here we also
+        # make sure it is a 63-bit integer to avoid overflow on sqlalchemy.
+        rid = common_utils.get_current_request_id()
+        controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFFFFFFFFFF

     vars_to_fill = {
         'remote_task_yaml_path': remote_tmp_task_yaml_path,
@@ -201,6 +212,7 @@
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
         'consolidation_mode_job_id': controller_job_id,
+        'entrypoint': shlex.quote(common_utils.get_current_command()),
         **tls_template_vars,
         **controller_utils.shared_controller_vars_to_fill(
             controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
@@ -708,3 +720,228 @@ def status(
         service_record['endpoint'] = f'{protocol}://{endpoint}'

     return service_records
+
+
+ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
+
+
+def tail_logs(
+    service_name: str,
+    *,
+    target: ServiceComponentOrStr,
+    replica_id: Optional[int] = None,
+    follow: bool = True,
+    tail: Optional[int] = None,
+    pool: bool = False,
+) -> None:
+    """Tail logs of a service or pool."""
+    if isinstance(target, str):
+        target = serve_utils.ServiceComponent(target)
+
+    if pool and target == serve_utils.ServiceComponent.LOAD_BALANCER:
+        raise ValueError(f'Target {target} is not supported for pool.')
+
+    if target == serve_utils.ServiceComponent.REPLICA:
+        if replica_id is None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`replica_id` must be specified when using target=REPLICA.')
+    else:
+        if replica_id is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('`replica_id` must be None when using '
+                                 'target=CONTROLLER/LOAD_BALANCER.')
+
+    controller_type = controller_utils.get_controller_for_pool(pool)
+    handle = backend_utils.is_controller_accessible(
+        controller=controller_type,
+        stopped_message=controller_type.value.default_hint_if_non_existent)
+
+    backend = backend_utils.get_backend_from_handle(handle)
+    assert isinstance(backend, backends.CloudVmRayBackend), backend
+
+    if target != serve_utils.ServiceComponent.REPLICA:
+        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
+            service_name,
+            stream_controller=(
+                target == serve_utils.ServiceComponent.CONTROLLER),
+            follow=follow,
+            tail=tail,
+            pool=pool)
+    else:
+        assert replica_id is not None, service_name
+        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
+                                                            replica_id,
+                                                            follow,
+                                                            tail=tail,
+                                                            pool=pool)
+
+    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
+    # kill the process, so we need to handle it manually here.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+    # Refer to the notes in
+    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+    backend.run_on_head(handle,
+                        code,
+                        stream_logs=True,
+                        process_stream=False,
+                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+
+
+def _get_all_replica_targets(
+        service_name: str, backend: backends.CloudVmRayBackend,
+        handle: backends.CloudVmRayResourceHandle,
+        pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
+    """Helper function to get targets for all live replicas."""
+    code = serve_utils.ServeCodeGen.get_service_status([service_name],
+                                                       pool=pool)
+    returncode, serve_status_payload, stderr = backend.run_on_head(
+        handle,
+        code,
+        require_outputs=True,
+        stream_logs=False,
+        separate_stderr=True)
+
+    try:
+        subprocess_utils.handle_returncode(returncode,
+                                           code,
+                                           'Failed to fetch services',
+                                           stderr,
+                                           stream_logs=True)
+    except exceptions.CommandError as e:
+        raise RuntimeError(e.error_msg) from e
+
+    service_records = serve_utils.load_service_status(serve_status_payload)
+    if not service_records:
+        raise ValueError(f'Service {service_name!r} not found.')
+    assert len(service_records) == 1
+    service_record = service_records[0]
+
+    return {
+        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
+                                           replica_info['replica_id'])
+        for replica_info in service_record['replica_info']
+    }
+
+
+def sync_down_logs(
+    service_name: str,
+    *,
+    local_dir: str,
+    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
+                   None] = None,
+    replica_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
+    pool: bool = False,
+) -> str:
+    """Sync down logs of a service or pool."""
+    noun = 'pool' if pool else 'service'
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
+
+    # Step 0) get the controller handle
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(f'Checking {noun} status...')):
+        controller_type = controller_utils.get_controller_for_pool(pool)
+        handle = backend_utils.is_controller_accessible(
+            controller=controller_type,
+            stopped_message=controller_type.value.default_hint_if_non_existent)
+        backend: backends.CloudVmRayBackend = (
+            backend_utils.get_backend_from_handle(handle))
+
+    requested_components: Set[serve_utils.ServiceComponent] = set()
+    if not targets:
+        # No targets specified -> request all components
+        requested_components = {
+            serve_utils.ServiceComponent.CONTROLLER,
+            serve_utils.ServiceComponent.LOAD_BALANCER,
+            serve_utils.ServiceComponent.REPLICA
+        }
+    else:
+        # Parse provided targets
+        if isinstance(targets, (str, serve_utils.ServiceComponent)):
+            requested_components = {serve_utils.ServiceComponent(targets)}
+        else:  # list
+            requested_components = {
+                serve_utils.ServiceComponent(t) for t in targets
+            }
+
+    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
+    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.CONTROLLER))
+    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.LOAD_BALANCER))
+    if serve_utils.ServiceComponent.REPLICA in requested_components:
+        with rich_utils.safe_status(
+                ux_utils.spinner_message(f'Getting live {repnoun} infos...')):
+            replica_targets = _get_all_replica_targets(service_name, backend,
+                                                       handle, pool)
+        if not replica_ids:
+            # Replica target requested but no specific IDs
+            # -> Get all replica logs
+            normalized_targets.update(replica_targets)
+        else:
+            # Replica target requested with specific IDs
+            requested_replica_targets = [
+                serve_utils.ServiceComponentTarget(
+                    serve_utils.ServiceComponent.REPLICA, rid)
+                for rid in replica_ids
+            ]
+            for target in requested_replica_targets:
+                if target not in replica_targets:
+                    logger.warning(f'{caprepnoun} ID {target.replica_id} not '
+                                   f'found for {service_name}. Skipping...')
+                else:
+                    normalized_targets.add(target)
+
+    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
+        component = target.component
+        # We need to set one side of the pipe to a logs stream, and the other
+        # side to a file.
+        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
+        stream_logs_code: str
+
+        if component == serve_utils.ServiceComponent.CONTROLLER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name,
+                    stream_controller=True,
+                    follow=False,
+                    tail=tail,
+                    pool=pool))
+        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name,
+                    stream_controller=False,
+                    follow=False,
+                    tail=tail,
+                    pool=pool))
+        elif component == serve_utils.ServiceComponent.REPLICA:
+            replica_id = target.replica_id
+            assert replica_id is not None, service_name
+            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
+                service_name, replica_id, follow=False, tail=tail, pool=pool)
+        else:
+            assert False, component
+
+        # Refer to the notes in
+        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+        backend.run_on_head(handle,
+                            stream_logs_code,
+                            stream_logs=False,
+                            process_stream=False,
+                            ssh_mode=command_runner.SshMode.INTERACTIVE,
+                            log_path=log_path)
+
+    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
+                                     list(normalized_targets))
+
+    return local_dir
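A note on the consolidation-mode job id introduced above: on CPython, hash() of an int does not depend on hash randomization (that only affects str/bytes), and masking with 0x7FFFFFFFFFFFFFFF clears the sign bit so the result always fits a signed 64-bit (BIGINT) database column. A minimal standalone sketch of the same trick, with an illustrative request id:

import uuid

SIGNED_63_BIT_MASK = 0x7FFFFFFFFFFFFFFF  # 2**63 - 1

def job_id_from_request_id(request_id: str) -> int:
    # uuid.UUID(...).int is a 128-bit integer; hash() folds it down to the
    # machine word size, and the mask keeps the result non-negative and
    # within a signed 64-bit column, as the diff's comment describes.
    return hash(uuid.UUID(request_id).int) & SIGNED_63_BIT_MASK

rid = str(uuid.uuid4())  # illustrative; the real value comes from the request
assert 0 <= job_id_from_request_id(rid) < 2**63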
sky/serve/service.py CHANGED
@@ -176,7 +176,7 @@ def _cleanup_task_run_script(job_id: int) -> None:
         logger.warning(f'Task run script {this_task_run_script} not found')


-def _start(service_name: str, tmp_task_yaml: str, job_id: int):
+def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
     """Starts the service.
     This including the controller and load balancer.
     """
@@ -228,7 +228,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
         status=serve_state.ServiceStatus.CONTROLLER_INIT,
         tls_encrypted=service_spec.tls_credential is not None,
         pool=service_spec.pool,
-        controller_pid=os.getpid())
+        controller_pid=os.getpid(),
+        entrypoint=entrypoint)
     # Directly throw an error here. See sky/serve/api.py::up
     # for more details.
     if not success:
@@ -365,8 +366,12 @@ if __name__ == '__main__':
                         required=True,
                         type=int,
                         help='Job id for the service job.')
+    parser.add_argument('--entrypoint',
+                        type=str,
+                        help='Entrypoint to launch the service',
+                        required=True)
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    _start(args.service_name, args.task_yaml, args.job_id)
+    _start(args.service_name, args.task_yaml, args.job_id, args.entrypoint)
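The new --entrypoint flag carries the user's original command into the controller process; on the template side (impl.py above) it is wrapped in shlex.quote so the whole command survives as a single argv element. A small sketch of why the quoting matters; the launch command shown is hypothetical:

import shlex

# A user command containing spaces and nested quotes; without shlex.quote the
# shell would split it into several argv elements.
entrypoint = "sky serve up service.yaml -n 'my service'"
launch_cmd = ('python -m sky.serve.service --service-name my-svc '
              '--task-yaml /tmp/task.yaml --job-id 42 '
              f'--entrypoint {shlex.quote(entrypoint)}')
print(launch_cmd)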
sky/server/common.py CHANGED
@@ -16,13 +16,15 @@ import tempfile
 import threading
 import time
 import typing
-from typing import Any, Dict, Literal, Optional, Tuple, Union
+from typing import (Any, Callable, cast, Dict, Literal, Optional, Tuple,
+                    TypeVar, Union)
 from urllib import parse
 import uuid

 import cachetools
 import colorama
 import filelock
+from typing_extensions import ParamSpec

 from sky import exceptions
 from sky import sky_logging
@@ -94,6 +96,9 @@ logger = sky_logging.init_logger(__name__)

 hinted_for_server_install_version_mismatch = False

+T = TypeVar('T')
+P = ParamSpec('P')
+

 class ApiServerStatus(enum.Enum):
     HEALTHY = 'healthy'
@@ -753,14 +758,14 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
                          metrics_port, enable_basic_auth)


-def check_server_healthy_or_start(func):
+def check_server_healthy_or_start(func: Callable[P, T]) -> Callable[P, T]:

     @functools.wraps(func)
     def wrapper(*args, deploy: bool = False, host: str = '127.0.0.1', **kwargs):
         check_server_healthy_or_start_fn(deploy, host)
         return func(*args, **kwargs)

-    return wrapper
+    return cast(Callable[P, T], wrapper)


 def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
@@ -878,7 +883,8 @@ def request_body_to_params(body: 'pydantic.BaseModel') -> Dict[str, Any]:

 def reload_for_new_request(client_entrypoint: Optional[str],
                            client_command: Optional[str],
-                           using_remote_api_server: bool, user: 'models.User'):
+                           using_remote_api_server: bool, user: 'models.User',
+                           request_id: str) -> None:
     """Reload modules, global variables, and usage message for a new request."""
     # This should be called first to make sure the logger is up-to-date.
     sky_logging.reload_logger()
@@ -892,6 +898,7 @@ def reload_for_new_request(client_entrypoint: Optional[str],
         client_command=client_command,
         using_remote_api_server=using_remote_api_server,
         user=user,
+        request_id=request_id,
     )

     # Clear cache should be called before reload_logger and usage reset,
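The typing change to check_server_healthy_or_start uses ParamSpec so the decorator stops erasing the wrapped function's signature for type checkers; the cast is needed because the wrapper adds deploy/host keyword arguments that are not part of P. A self-contained sketch of the same pattern, with hypothetical names:

import functools
from typing import Callable, TypeVar, cast

from typing_extensions import ParamSpec

P = ParamSpec('P')
T = TypeVar('T')


def ensure_ready(func: Callable[P, T]) -> Callable[P, T]:
    """Run a readiness check before func, keeping func's typed signature."""

    @functools.wraps(func)
    def wrapper(*args, verbose: bool = False, **kwargs):
        # The extra `verbose` kwarg widens the runtime signature beyond P,
        # so wrapper cannot be typed as Callable[P, T] directly; cast()
        # tells the type checker to treat it as such anyway.
        if verbose:
            print('checking readiness...')
        return func(*args, **kwargs)

    return cast(Callable[P, T], wrapper)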
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 14
+API_VERSION = 15

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/executor.py CHANGED
@@ -271,7 +271,8 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:

 @contextlib.contextmanager
 def override_request_env_and_config(
-        request_body: payloads.RequestBody) -> Generator[None, None, None]:
+        request_body: payloads.RequestBody,
+        request_id: str) -> Generator[None, None, None]:
     """Override the environment and SkyPilot config for a request."""
     original_env = os.environ.copy()
     os.environ.update(request_body.env_vars)
@@ -292,7 +293,8 @@
             client_entrypoint=request_body.entrypoint,
             client_command=request_body.entrypoint_command,
             using_remote_api_server=request_body.using_remote_api_server,
-            user=user)
+            user=user,
+            request_id=request_id)
     try:
         logger.debug(
             f'override path: {request_body.override_skypilot_config_path}')
@@ -376,7 +378,7 @@ def _request_execution_wrapper(request_id: str,
     # config, as there can be some logs during override that needs to be
     # captured in the log file.
     try:
-        with override_request_env_and_config(request_body), \
+        with override_request_env_and_config(request_body, request_id), \
                 tempstore.tempdir():
             if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
                 config = skypilot_config.to_dict()
sky/server/requests/payloads.py CHANGED
@@ -707,6 +707,25 @@ class JobsPoolStatusBody(RequestBody):
     pool_names: Optional[Union[str, List[str]]]


+class JobsPoolLogsBody(RequestBody):
+    """The request body for the jobs pool logs endpoint."""
+    pool_name: str
+    target: Union[str, serve.ServiceComponent]
+    worker_id: Optional[int] = None
+    follow: bool = True
+    tail: Optional[int] = None
+
+
+class JobsPoolDownloadLogsBody(RequestBody):
+    """The request body for the jobs pool download logs endpoint."""
+    pool_name: str
+    local_dir: str
+    targets: Optional[Union[str, serve.ServiceComponent,
+                            List[Union[str, serve.ServiceComponent]]]]
+    worker_ids: Optional[List[int]] = None
+    tail: Optional[int] = None
+
+
 class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
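For reference, the two new pool request bodies mirror the serve logs payloads but use worker terminology (worker_id/worker_ids instead of replica ids). A hypothetical client-side construction; the field names come from the diff above, while the 'controller' target value is an assumption and base-class fields (env_vars, entrypoint, etc.) are omitted for brevity:

from sky.server.requests.payloads import (JobsPoolDownloadLogsBody,
                                          JobsPoolLogsBody)

logs_body = JobsPoolLogsBody(pool_name='my-pool',
                             target='controller',  # assumed component value
                             follow=True,
                             tail=100)
download_body = JobsPoolDownloadLogsBody(pool_name='my-pool',
                                         local_dir='~/pool-logs',
                                         targets=None,  # None -> all components
                                         worker_ids=[1, 2],
                                         tail=None)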