skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -3,8 +3,10 @@
  import argparse
  import asyncio
  import base64
+ from concurrent.futures import ThreadPoolExecutor
  import contextlib
  import datetime
+ from enum import IntEnum
  import hashlib
  import json
  import multiprocessing
@@ -14,8 +16,10 @@ import posixpath
  import re
  import resource
  import shutil
+ import struct
  import sys
  import threading
+ import traceback
  from typing import Dict, List, Literal, Optional, Set, Tuple
  import uuid
  import zipfile
@@ -23,6 +27,7 @@ import zipfile
  import aiofiles
  import anyio
  import fastapi
+ from fastapi import responses as fastapi_responses
  from fastapi.middleware import cors
  import starlette.middleware.base
  import uvloop
@@ -41,6 +46,7 @@ from sky.data import storage_utils
  from sky.jobs import utils as managed_job_utils
  from sky.jobs.server import server as jobs_rest
  from sky.metrics import utils as metrics_utils
+ from sky.provision import metadata_utils
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.schemas.api import responses
  from sky.serve.server import server as serve_rest
@@ -58,6 +64,7 @@ from sky.server.auth import oauth2_proxy
  from sky.server.requests import executor
  from sky.server.requests import payloads
  from sky.server.requests import preconditions
+ from sky.server.requests import request_names
  from sky.server.requests import requests as requests_lib
  from sky.skylet import constants
  from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -73,6 +80,7 @@ from sky.utils import dag_utils
  from sky.utils import perf_utils
  from sky.utils import status_lib
  from sky.utils import subprocess_utils
+ from sky.utils import ux_utils
  from sky.utils.db import db_utils
  from sky.volumes.server import server as volumes_rest
  from sky.workspaces import server as workspaces_rest
@@ -159,7 +167,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
  """Middleware to add a request ID to each request."""

  async def dispatch(self, request: fastapi.Request, call_next):
- request_id = str(uuid.uuid4())
+ request_id = requests_lib.get_new_request_id()
  request.state.request_id = request_id
  response = await call_next(request)
  # TODO(syang): remove X-Request-ID when v0.10.0 is released.
@@ -451,11 +459,11 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
  loop.call_at(target, tick)


- def schedule_on_boot_check():
+ async def schedule_on_boot_check_async():
  try:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id='skypilot-server-on-boot-check',
- request_name='check',
+ request_name=request_names.RequestName.CHECK,
  request_body=payloads.CheckBody(),
  func=sky_check.check,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -476,7 +484,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
  if event.should_skip():
  continue
  try:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=event.id,
  request_name=event.name,
  request_body=payloads.RequestBody(),
@@ -491,7 +499,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
  # Lifespan will be executed in each uvicorn worker process, we
  # can safely ignore the error if the task is already scheduled.
  logger.debug(f'Request {event.id} already exists.')
- schedule_on_boot_check()
+ await schedule_on_boot_check_async()
  asyncio.create_task(cleanup_upload_ids())
  if metrics_utils.METRICS_ENABLED:
  # Start monitoring the event loop lag in each server worker
@@ -663,6 +671,25 @@ except Exception: # pylint: disable=broad-except
  pass # no issue, we will warn the user later if its too low


+ @app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
+ def handle_concurrent_worker_exhausted_error(
+ request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
+ del request # request is not used
+ # Print detailed error message to server log
+ logger.error('Concurrent worker exhausted: '
+ f'{common_utils.format_exception(e)}')
+ with ux_utils.enable_traceback():
+ logger.error(f' Traceback: {traceback.format_exc()}')
+ # Return human readable error message to client
+ return fastapi.responses.JSONResponse(
+ status_code=503,
+ content={
+ 'detail':
+ ('The server has exhausted its concurrent worker limit. '
+ 'Please try again or scale the server if the load persists.')
+ })
+
+
  @app.get('/token')
  async def token(request: fastapi.Request,
  local_port: Optional[int] = None) -> fastapi.responses.Response:
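The new exception handler above maps `ConcurrentWorkerExhaustedError` to an HTTP 503 whose JSON body carries a human-readable `detail` message, so callers can back off and retry instead of failing on an opaque error. A minimal client-side sketch of that pattern, assuming a `requests`-based caller; the endpoint path, server URL, and backoff schedule are hypothetical and not part of SkyPilot's client:

```python
import time

import requests


def post_with_backoff(url: str, payload: dict, max_attempts: int = 5):
    """Retry a POST while the API server reports worker exhaustion (503)."""
    for attempt in range(max_attempts):
        resp = requests.post(url, json=payload, timeout=30)
        if resp.status_code != 503:
            resp.raise_for_status()
            return resp
        # The 503 body carries a 'detail' message (see the handler above);
        # wait with exponential backoff before retrying.
        try:
            detail = resp.json().get('detail', 'server busy')
        except ValueError:
            detail = 'server busy'
        wait = 2 ** attempt
        print(f'503 from server ({detail}); retrying in {wait}s')
        time.sleep(wait)
    raise RuntimeError(f'Server still exhausted after {max_attempts} attempts')
```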
@@ -706,9 +733,9 @@ async def token(request: fastapi.Request,
  async def check(request: fastapi.Request,
  check_body: payloads.CheckBody) -> None:
  """Checks enabled clouds."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='check',
+ request_name=request_names.RequestName.CHECK,
  request_body=check_body,
  func=sky_check.check,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -720,9 +747,9 @@ async def enabled_clouds(request: fastapi.Request,
  workspace: Optional[str] = None,
  expand: bool = False) -> None:
  """Gets enabled clouds on the server."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='enabled_clouds',
+ request_name=request_names.RequestName.ENABLED_CLOUDS,
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
  expand=expand),
  func=core.enabled_clouds,
@@ -736,9 +763,10 @@ async def realtime_kubernetes_gpu_availability(
  realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
  ) -> None:
  """Gets real-time Kubernetes GPU availability."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='realtime_kubernetes_gpu_availability',
+ request_name=request_names.RequestName.
+ REALTIME_KUBERNETES_GPU_AVAILABILITY,
  request_body=realtime_gpu_availability_body,
  func=core.realtime_kubernetes_gpu_availability,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -751,9 +779,9 @@ async def kubernetes_node_info(
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
  ) -> None:
  """Gets Kubernetes nodes information and hints."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='kubernetes_node_info',
+ request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
  request_body=kubernetes_node_info_body,
  func=kubernetes_utils.get_kubernetes_node_info,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -763,9 +791,9 @@ async def kubernetes_node_info(
  @app.get('/status_kubernetes')
  async def status_kubernetes(request: fastapi.Request) -> None:
  """Gets Kubernetes status."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='status_kubernetes',
+ request_name=request_names.RequestName.STATUS_KUBERNETES,
  request_body=payloads.RequestBody(),
  func=core.status_kubernetes,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -777,9 +805,9 @@ async def list_accelerators(
  request: fastapi.Request,
  list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
  """Gets list of accelerators from cloud catalog."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='list_accelerators',
+ request_name=request_names.RequestName.LIST_ACCELERATORS,
  request_body=list_accelerator_counts_body,
  func=catalog.list_accelerators,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -792,9 +820,9 @@ async def list_accelerator_counts(
  list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
  ) -> None:
  """Gets list of accelerator counts from cloud catalog."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='list_accelerator_counts',
+ request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
  request_body=list_accelerator_counts_body,
  func=catalog.list_accelerator_counts,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -828,6 +856,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
  # server thread.
  with admin_policy_utils.apply_and_use_config_in_current_request(
  dag,
+ request_name=request_names.AdminPolicyRequestName.VALIDATE,
  request_options=validate_body.get_request_options()) as dag:
  dag.resolve_and_validate_volumes()
  # Skip validating workdir and file_mounts, as those need to be
@@ -849,9 +878,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
  async def optimize(optimize_body: payloads.OptimizeBody,
  request: fastapi.Request) -> None:
  """Optimizes the user's DAG."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='optimize',
+ request_name=request_names.RequestName.OPTIMIZE,
  request_body=optimize_body,
  ignore_return_value=True,
  func=core.optimize,
@@ -1059,9 +1088,9 @@ async def launch(launch_body: payloads.LaunchBody,
  """Launches a cluster or task."""
  request_id = request.state.request_id
  logger.info(f'Launching request: {request_id}')
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id,
- request_name='launch',
+ request_name=request_names.RequestName.CLUSTER_LAUNCH,
  request_body=launch_body,
  func=execution.launch,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1075,9 +1104,9 @@ async def launch(launch_body: payloads.LaunchBody,
  async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
  """Executes a task on an existing cluster."""
  cluster_name = exec_body.cluster_name
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='exec',
+ request_name=request_names.RequestName.CLUSTER_EXEC,
  request_body=exec_body,
  func=execution.exec,
  precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1093,9 +1122,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
  async def stop(request: fastapi.Request,
  stop_body: payloads.StopOrDownBody) -> None:
  """Stops a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='stop',
+ request_name=request_names.RequestName.CLUSTER_STOP,
  request_body=stop_body,
  func=core.stop,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1113,9 +1142,9 @@ async def status(
  raise fastapi.HTTPException(
  status_code=503,
  detail='Server is shutting down, please try again later.')
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='status',
+ request_name=request_names.RequestName.CLUSTER_STATUS,
  request_body=status_body,
  func=core.status,
  schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1128,9 +1157,9 @@ async def status(
  async def endpoints(request: fastapi.Request,
  endpoint_body: payloads.EndpointsBody) -> None:
  """Gets the endpoint for a given cluster and port number (endpoint)."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='endpoints',
+ request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
  request_body=endpoint_body,
  func=core.endpoints,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1142,9 +1171,9 @@ async def endpoints(request: fastapi.Request,
  async def down(request: fastapi.Request,
  down_body: payloads.StopOrDownBody) -> None:
  """Tears down a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='down',
+ request_name=request_names.RequestName.CLUSTER_DOWN,
  request_body=down_body,
  func=core.down,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1156,9 +1185,9 @@ async def down(request: fastapi.Request,
  async def start(request: fastapi.Request,
  start_body: payloads.StartBody) -> None:
  """Restarts a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='start',
+ request_name=request_names.RequestName.CLUSTER_START,
  request_body=start_body,
  func=core.start,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1170,9 +1199,9 @@ async def start(request: fastapi.Request,
  async def autostop(request: fastapi.Request,
  autostop_body: payloads.AutostopBody) -> None:
  """Schedules an autostop/autodown for a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='autostop',
+ request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
  request_body=autostop_body,
  func=core.autostop,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1184,9 +1213,9 @@ async def autostop(request: fastapi.Request,
  async def queue(request: fastapi.Request,
  queue_body: payloads.QueueBody) -> None:
  """Gets the job queue of a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='queue',
+ request_name=request_names.RequestName.CLUSTER_QUEUE,
  request_body=queue_body,
  func=core.queue,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1198,9 +1227,9 @@ async def queue(request: fastapi.Request,
  async def job_status(request: fastapi.Request,
  job_status_body: payloads.JobStatusBody) -> None:
  """Gets the status of a job."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='job_status',
+ request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
  request_body=job_status_body,
  func=core.job_status,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1212,9 +1241,9 @@ async def job_status(request: fastapi.Request,
  async def cancel(request: fastapi.Request,
  cancel_body: payloads.CancelBody) -> None:
  """Cancels jobs on a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='cancel',
+ request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
  request_body=cancel_body,
  func=core.cancel,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1231,9 +1260,10 @@ async def logs(
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
  # launch, to finish, so that a user does not need to manually pull the
  # request status.
- request_task = executor.prepare_request(
+ executor.check_request_thread_executor_available()
+ request_task = await executor.prepare_request_async(
  request_id=request.state.request_id,
- request_name='logs',
+ request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
  request_body=cluster_job_body,
  func=core.tail_logs,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1243,10 +1273,11 @@ async def logs(
  background_tasks.add_task(task.cancel)
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
  # the same approach as /stream.
- return stream_utils.stream_response(
+ return stream_utils.stream_response_for_long_request(
  request_id=request.state.request_id,
  logs_path=request_task.log_path,
  background_tasks=background_tasks,
+ kill_request_on_disconnect=False,
  )


@@ -1261,9 +1292,9 @@ async def download_logs(
  # We should reuse the original request body, so that the env vars, such as
  # user hash, are kept the same.
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='download_logs',
+ request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
  request_body=cluster_jobs_body,
  func=core.download_logs,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1340,38 +1371,65 @@ async def download(download_body: payloads.DownloadBody,

  # TODO(aylei): run it asynchronously after global_user_state support async op
  @app.post('/provision_logs')
- def provision_logs(cluster_body: payloads.ClusterNameBody,
+ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
  follow: bool = True,
  tail: int = 0) -> fastapi.responses.StreamingResponse:
  """Streams the provision.log for the latest launch request of a cluster."""
- # Prefer clusters table first, then cluster_history as fallback.
- log_path_str = global_user_state.get_cluster_provision_log_path(
- cluster_body.cluster_name)
- if not log_path_str:
- log_path_str = global_user_state.get_cluster_history_provision_log_path(
- cluster_body.cluster_name)
- if not log_path_str:
- raise fastapi.HTTPException(
- status_code=404,
- detail=('Provision log path is not recorded for this cluster. '
- 'Please relaunch to generate provisioning logs.'))
+ log_path = None
+ cluster_name = provision_logs_body.cluster_name
+ worker = provision_logs_body.worker
+ # stream head node logs
+ if worker is None:
+ # Prefer clusters table first, then cluster_history as fallback.
+ log_path_str = global_user_state.get_cluster_provision_log_path(
+ cluster_name)
+ if not log_path_str:
+ log_path_str = (
+ global_user_state.get_cluster_history_provision_log_path(
+ cluster_name))
+ if not log_path_str:
+ raise fastapi.HTTPException(
+ status_code=404,
+ detail=('Provision log path is not recorded for this cluster. '
+ 'Please relaunch to generate provisioning logs.'))
+ log_path = pathlib.Path(log_path_str).expanduser().resolve()
+ if not log_path.exists():
+ raise fastapi.HTTPException(
+ status_code=404,
+ detail=f'Provision log path does not exist: {str(log_path)}')

- log_path = pathlib.Path(log_path_str).expanduser().resolve()
- if not log_path.exists():
- raise fastapi.HTTPException(
- status_code=404,
- detail=f'Provision log path does not exist: {str(log_path)}')
+ # stream worker node logs
+ else:
+ handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+ if handle is None:
+ raise fastapi.HTTPException(
+ status_code=404,
+ detail=('Cluster handle is not recorded for this cluster. '
+ 'Please relaunch to generate provisioning logs.'))
+ # instance_ids includes head node
+ instance_ids = handle.instance_ids
+ if instance_ids is None:
+ raise fastapi.HTTPException(
+ status_code=400,
+ detail='Instance IDs are not recorded for this cluster. '
+ 'Please relaunch to generate provisioning logs.')
+ if worker > len(instance_ids) - 1:
+ raise fastapi.HTTPException(
+ status_code=400,
+ detail=f'Worker {worker} is out of range. '
+ f'The cluster has {len(instance_ids)} nodes.')
+ log_path = metadata_utils.get_instance_log_dir(
+ handle.get_cluster_name_on_cloud(), instance_ids[worker])

  # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
  effective_tail = None if tail is None or tail <= 0 else tail

  return fastapi.responses.StreamingResponse(
- content=stream_utils.log_streamer(
- None,
- log_path,
- tail=effective_tail,
- follow=follow,
- cluster_name=cluster_body.cluster_name),
+ content=stream_utils.log_streamer(None,
+ log_path,
+ tail=effective_tail,
+ follow=follow,
+ cluster_name=cluster_name),
  media_type='text/plain',
  headers={
  'Cache-Control': 'no-cache, no-transform',
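The reworked `/provision_logs` endpoint takes a `ProvisionLogsBody` whose optional `worker` field selects which node's provisioning log to stream (omitted or null for the head node; otherwise an index into the cluster's recorded instance IDs) and returns plain text. A hedged sketch of how a caller might consume it; the server address, port, and absence of auth headers are assumptions, not taken from the SkyPilot client:

```python
from typing import Optional

import requests

API_SERVER = 'http://127.0.0.1:46580'  # assumed local API server address


def stream_provision_logs(cluster_name: str,
                          worker: Optional[int] = None,
                          follow: bool = False,
                          tail: int = 0) -> None:
    """Stream provision logs for the head node (worker=None) or a worker."""
    body = {'cluster_name': cluster_name, 'worker': worker}
    with requests.post(f'{API_SERVER}/provision_logs',
                       json=body,
                       params={'follow': follow, 'tail': tail},
                       stream=True) as resp:
        resp.raise_for_status()  # 404/400 responses carry a 'detail' message
        for line in resp.iter_lines(decode_unicode=True):
            print(line)


# Example: tail the last 100 lines of node index 1 (index 0 is the head node,
# per the instance_ids ordering used by the handler above).
# stream_provision_logs('my-cluster', worker=1, tail=100)
```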
@@ -1385,9 +1443,9 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
  async def cost_report(request: fastapi.Request,
  cost_report_body: payloads.CostReportBody) -> None:
  """Gets the cost report of a cluster."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='cost_report',
+ request_name=request_names.RequestName.CLUSTER_COST_REPORT,
  request_body=cost_report_body,
  func=core.cost_report,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1397,9 +1455,9 @@ async def cost_report(request: fastapi.Request,
  @app.get('/storage/ls')
  async def storage_ls(request: fastapi.Request) -> None:
  """Gets the storages."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='storage_ls',
+ request_name=request_names.RequestName.STORAGE_LS,
  request_body=payloads.RequestBody(),
  func=core.storage_ls,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1410,9 +1468,9 @@ async def storage_ls(request: fastapi.Request) -> None:
  async def storage_delete(request: fastapi.Request,
  storage_body: payloads.StorageBody) -> None:
  """Deletes a storage."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='storage_delete',
+ request_name=request_names.RequestName.STORAGE_DELETE,
  request_body=storage_body,
  func=core.storage_delete,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1423,9 +1481,9 @@ async def storage_delete(request: fastapi.Request,
  async def local_up(request: fastapi.Request,
  local_up_body: payloads.LocalUpBody) -> None:
  """Launches a Kubernetes cluster on API server."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='local_up',
+ request_name=request_names.RequestName.LOCAL_UP,
  request_body=local_up_body,
  func=core.local_up,
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1436,19 +1494,36 @@ async def local_up(request: fastapi.Request,
  async def local_down(request: fastapi.Request,
  local_down_body: payloads.LocalDownBody) -> None:
  """Tears down the Kubernetes cluster started by local_up."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='local_down',
+ request_name=request_names.RequestName.LOCAL_DOWN,
  request_body=local_down_body,
  func=core.local_down,
  schedule_type=requests_lib.ScheduleType.LONG,
  )


+ async def get_expanded_request_id(request_id: str) -> str:
+ """Gets the expanded request ID for a given request ID prefix."""
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
+ request_id, fields=['request_id'])
+ if request_tasks is None:
+ raise fastapi.HTTPException(status_code=404,
+ detail=f'Request {request_id!r} not found')
+ if len(request_tasks) > 1:
+ raise fastapi.HTTPException(status_code=400,
+ detail=('Multiple requests found for '
+ f'request ID prefix: {request_id}'))
+ return request_tasks[0].request_id
+
+
  # === API server related APIs ===
- @app.get('/api/get')
+ @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
  async def api_get(request_id: str) -> payloads.RequestPayload:
  """Gets a request with a given request ID prefix."""
+ # Validate request_id prefix matches a single request.
+ request_id = await get_expanded_request_id(request_id)
+
  while True:
  req_status = await requests_lib.get_request_status_async(request_id)
  if req_status is None:
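`get_expanded_request_id` is what lets `/api/get`, `/api/stream`, and `/api/status` accept a request-ID prefix instead of a full ID: no match yields a 404, an ambiguous prefix a 400, and a unique match is expanded to the full ID before the normal lookup. A standalone sketch of the same lookup semantics over an in-memory list; the names here are hypothetical, not the SkyPilot implementation:

```python
from typing import List


def expand_request_id(prefix: str, known_ids: List[str]) -> str:
    """Expand a request-ID prefix to the single full ID it identifies."""
    matches = [rid for rid in known_ids if rid.startswith(prefix)]
    if not matches:
        raise LookupError(f'Request {prefix!r} not found')  # maps to HTTP 404
    if len(matches) > 1:
        # maps to HTTP 400
        raise ValueError(
            f'Multiple requests found for request ID prefix: {prefix}')
    return matches[0]


ids = ['a1b2c3d4-0000', 'a1ff0000-1111', 'b7e1d2c3-2222']
assert expand_request_id('b7', ids) == 'b7e1d2c3-2222'
# expand_request_id('a1', ids) raises ValueError: the prefix is ambiguous.
```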
@@ -1465,6 +1540,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
  # to avoid storming the DB and CPU in the meantime
  await asyncio.sleep(0.1)
  request_task = await requests_lib.get_request_async(request_id)
+ # TODO(aylei): refine this, /api/get will not be retried and this is
+ # meaningless to retry. It is the original request that should be retried.
  if request_task.should_retry:
  raise fastapi.HTTPException(
  status_code=503, detail=f'Request {request_id!r} should be retried')
@@ -1506,13 +1583,18 @@ async def stream(
  clients, console for CLI/API clients), 'plain' (force plain text),
  'html' (force HTML), or 'console' (force console)
  """
+ # We need to save the user-supplied request ID for the response header.
+ user_supplied_request_id = request_id
  if request_id is not None and log_path is not None:
  raise fastapi.HTTPException(
  status_code=400,
  detail='Only one of request_id and log_path can be provided')

+ if request_id is not None:
+ request_id = await get_expanded_request_id(request_id)
+
  if request_id is None and log_path is None:
- request_id = requests_lib.get_latest_request_id()
+ request_id = await requests_lib.get_latest_request_id_async()
  if request_id is None:
  raise fastapi.HTTPException(status_code=404,
  detail='No request found')
@@ -1539,13 +1621,17 @@ async def stream(
  'X-Accel-Buffering': 'no'
  })

+ polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
  # Original plain text streaming logic
  if request_id is not None:
- request_task = await requests_lib.get_request_async(request_id)
+ request_task = await requests_lib.get_request_async(
+ request_id, fields=['request_id', 'schedule_type'])
  if request_task is None:
  print(f'No task with request ID {request_id}')
  raise fastapi.HTTPException(
  status_code=404, detail=f'Request {request_id!r} not found')
+ # req.log_path is derived from request_id,
+ # so it's ok to just grab the request_id in the above query.
  log_path_to_stream = request_task.log_path
  if not log_path_to_stream.exists():
  # The log file might be deleted by the request GC daemon but the
@@ -1553,6 +1639,9 @@ async def stream(
  raise fastapi.HTTPException(
  status_code=404,
  detail=f'Log of request {request_id!r} has been deleted')
+ if request_task.schedule_type == requests_lib.ScheduleType.LONG:
+ polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
+ del request_task
  else:
  assert log_path is not None, (request_id, log_path)
  if log_path == constants.API_SERVER_LOGS:
@@ -1593,14 +1682,17 @@ async def stream(
  'Transfer-Encoding': 'chunked'
  }
  if request_id is not None:
- headers[server_constants.STREAM_REQUEST_HEADER] = request_id
+ headers[server_constants.STREAM_REQUEST_HEADER] = (
+ user_supplied_request_id
+ if user_supplied_request_id else request_id)

  return fastapi.responses.StreamingResponse(
  content=stream_utils.log_streamer(request_id,
  log_path_to_stream,
  plain_logs=format == 'plain',
  tail=tail,
- follow=follow),
+ follow=follow,
+ polling_interval=polling_interval),
  media_type='text/plain',
  headers=headers,
  )
@@ -1610,11 +1702,11 @@ async def stream(
  async def api_cancel(request: fastapi.Request,
  request_cancel_body: payloads.RequestCancelBody) -> None:
  """Cancels requests."""
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='api_cancel',
+ request_name=request_names.RequestName.API_CANCEL,
  request_body=request_cancel_body,
- func=requests_lib.kill_requests,
+ func=requests_lib.kill_requests_with_prefix,
  schedule_type=requests_lib.ScheduleType.SHORT,
  )

@@ -1622,9 +1714,13 @@ async def api_cancel(request: fastapi.Request,
  @app.get('/api/status')
  async def api_status(
  request_ids: Optional[List[str]] = fastapi.Query(
- None, description='Request IDs to get status for.'),
+ None, description='Request ID prefixes to get status for.'),
  all_status: bool = fastapi.Query(
  False, description='Get finished requests as well.'),
+ limit: Optional[int] = fastapi.Query(
+ None, description='Number of requests to show.'),
+ fields: Optional[List[str]] = fastapi.Query(
+ None, description='Fields to get. If None, get all fields.'),
  ) -> List[payloads.RequestPayload]:
  """Gets the list of requests."""
  if request_ids is None:
@@ -1635,15 +1731,22 @@ async def api_status(
  requests_lib.RequestStatus.RUNNING,
  ]
  request_tasks = await requests_lib.get_request_tasks_async(
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
- return [r.readable_encode() for r in request_tasks]
+ req_filter=requests_lib.RequestTaskFilter(
+ status=statuses,
+ limit=limit,
+ fields=fields,
+ sort=True,
+ ))
+ return requests_lib.encode_requests(request_tasks)
  else:
  encoded_request_tasks = []
  for request_id in request_ids:
- request_task = await requests_lib.get_request_async(request_id)
- if request_task is None:
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
+ request_id)
+ if request_tasks is None:
  continue
- encoded_request_tasks.append(request_task.readable_encode())
+ for request_task in request_tasks:
+ encoded_request_tasks.append(request_task.readable_encode())
  return encoded_request_tasks
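`/api/status` gains `limit` and `fields` query parameters (honored on the no-filter path above) and now treats `request_ids` entries as prefixes. A hedged sketch of a caller using the new parameters; the server address and the absence of auth are assumptions:

```python
import requests

API_SERVER = 'http://127.0.0.1:46580'  # assumed local API server address

# List at most 10 requests of any status, returning only two columns.
resp = requests.get(f'{API_SERVER}/api/status',
                    params={
                        'all_status': True,
                        'limit': 10,
                        'fields': ['request_id', 'schedule_type'],
                    })
resp.raise_for_status()
for request_payload in resp.json():
    print(request_payload)
```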


@@ -1703,23 +1806,44 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
  version=sky.__version__,
  version_on_disk=common.get_skypilot_version_on_disk(),
  commit=sky.__commit__,
+ # Whether basic auth on api server is enabled
  basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
  'false').lower() == 'true',
  user=user if user is not None else None,
+ # Whether service account token is enabled
+ service_account_token_enabled=(os.environ.get(
+ constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
+ 'false').lower() == 'true'),
+ # Whether basic auth on ingress is enabled
+ ingress_basic_auth_enabled=os.environ.get(
+ constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
+ 'false').lower() == 'true',
  )


+ class KubernetesSSHMessageType(IntEnum):
+ REGULAR_DATA = 0
+ PINGPONG = 1
+ LATENCY_MEASUREMENT = 2
+
+
  @app.websocket('/kubernetes-pod-ssh-proxy')
- async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
- cluster_name: str) -> None:
+ async def kubernetes_pod_ssh_proxy(
+ websocket: fastapi.WebSocket,
+ cluster_name: str,
+ client_version: Optional[int] = None) -> None:
  """Proxies SSH to the Kubernetes pod with websocket."""
  await websocket.accept()
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

+ timestamps_supported = client_version is not None and client_version > 21
+ logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
+ client_version = {client_version}')
+
  # Run core.status in another thread to avoid blocking the event loop.
- cluster_records = await context_utils.to_thread(core.status,
- cluster_name,
- all_users=True)
+ with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
+ cluster_records = await context_utils.to_thread_with_executor(
+ thread_pool_executor, core.status, cluster_name, all_users=True)
  cluster_record = cluster_records[0]
  if cluster_record['status'] != status_lib.ClusterStatus.UP:
  raise fastapi.HTTPException(
@@ -1770,6 +1894,42 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  async def websocket_to_ssh():
  try:
  async for message in websocket.iter_bytes():
+ if timestamps_supported:
+ type_size = struct.calcsize('!B')
+ message_type = struct.unpack('!B',
+ message[:type_size])[0]
+ if (message_type ==
+ KubernetesSSHMessageType.REGULAR_DATA):
+ # Regular data - strip type byte and forward to SSH
+ message = message[type_size:]
+ elif message_type == KubernetesSSHMessageType.PINGPONG:
+ # PING message - respond with PONG (type 1)
+ ping_id_size = struct.calcsize('!I')
+ if len(message) != type_size + ping_id_size:
+ raise ValueError('Invalid PING message '
+ f'length: {len(message)}')
+ # Return the same PING message, so that the client
+ # can measure the latency.
+ await websocket.send_bytes(message)
+ continue
+ elif (message_type ==
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT):
+ # Latency measurement from client
+ latency_size = struct.calcsize('!Q')
+ if len(message) != type_size + latency_size:
+ raise ValueError(
+ 'Invalid latency measurement '
+ f'message length: {len(message)}')
+ avg_latency_ms = struct.unpack(
+ '!Q',
+ message[type_size:type_size + latency_size])[0]
+ latency_seconds = avg_latency_ms / 1000
+ metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
+ continue
+ else:
+ # Unknown message type.
+ raise ValueError(
+ f'Unknown message type: {message_type}')
  writer.write(message)
  try:
  await writer.drain()
@@ -1800,6 +1960,11 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  nonlocal ssh_failed
  ssh_failed = True
  break
+ if timestamps_supported:
+ # Prepend message type byte (0 = regular data)
+ message_type_bytes = struct.pack(
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
+ data = message_type_bytes + data
  await websocket.send_bytes(data)
  except Exception: # pylint: disable=broad-except
  pass
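When the client advertises `client_version > 21`, both directions of the SSH proxy are framed with a leading one-byte message type (`!B`): type 0 wraps ordinary SSH bytes, type 1 is a PING carrying a 4-byte id (`!I`) that the server echoes back so the client can measure round-trip latency, and type 2 reports an average latency in milliseconds as an 8-byte unsigned integer (`!Q`). A hedged sketch of how a client could build and unwrap these frames; the helper names are hypothetical (the real client-side counterpart presumably lives in sky/templates/websocket_proxy.py, also updated in this release):

```python
import struct

# Message types mirroring KubernetesSSHMessageType in the server code above.
REGULAR_DATA = 0
PINGPONG = 1
LATENCY_MEASUREMENT = 2


def frame_data(payload: bytes) -> bytes:
    """Wrap raw SSH bytes with the regular-data type byte."""
    return struct.pack('!B', REGULAR_DATA) + payload


def frame_ping(ping_id: int) -> bytes:
    """Build a PING frame; the server echoes it back unchanged."""
    return struct.pack('!B', PINGPONG) + struct.pack('!I', ping_id)


def frame_latency_report(avg_latency_ms: int) -> bytes:
    """Report a measured average latency (in ms) back to the server."""
    return struct.pack('!B', LATENCY_MEASUREMENT) + struct.pack('!Q', avg_latency_ms)


def unwrap_data(message: bytes) -> bytes:
    """Strip the type byte from a regular-data frame sent by the server."""
    (message_type,) = struct.unpack('!B', message[:1])
    if message_type != REGULAR_DATA:
        raise ValueError(f'Unexpected message type: {message_type}')
    return message[1:]
```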
@@ -1837,9 +2002,9 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
  async def all_contexts(request: fastapi.Request) -> None:
  """Gets all Kubernetes and SSH node pool contexts."""

- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
- request_name='all_contexts',
+ request_name=request_names.RequestName.ALL_CONTEXTS,
  request_body=payloads.RequestBody(),
  func=core.get_all_contexts,
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1967,6 +2132,19 @@ if __name__ == '__main__':
  logger.error(f'Port {cmd_args.port} is not available, exiting.')
  raise RuntimeError(f'Port {cmd_args.port} is not available')

+ # Maybe touch the signal file on API server startup. Do it again here even
+ # if we already touched it in the sky/server/common.py::_start_api_server.
+ # This is because the sky/server/common.py::_start_api_server function call
+ # is running outside the skypilot API server process tree. The process tree
+ # starts within that function (see the `subprocess.Popen` call in
+ # sky/server/common.py::_start_api_server). When pg is used, the
+ # _start_api_server function will not load the config file from db, which
+ # will ignore the consolidation mode config. Here, inside the process tree,
+ # we already reload the config as a server (with env var _start_api_server),
+ # so we will respect the consolidation mode config.
+ # Refers to #7717 for more details.
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)
+
  # Show the privacy policy if it is not already shown. We place it here so
  # that it is shown only when the API server is started.
  usage_lib.maybe_show_privacy_policy()
@@ -2014,7 +2192,8 @@ if __name__ == '__main__':
  uvicorn_config = uvicorn.Config('sky.server.server:app',
  host=cmd_args.host,
  port=cmd_args.port,
- workers=num_workers)
+ workers=num_workers,
+ ws_per_message_deflate=False)
  skyuvicorn.run(uvicorn_config,
  max_db_connections=config.num_db_connections_per_worker)
  except Exception as exc: # pylint: disable=broad-except