skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +64 -0
  3. sky/backends/backend_utils.py +11 -11
  4. sky/backends/cloud_vm_ray_backend.py +15 -4
  5. sky/client/cli/command.py +39 -10
  6. sky/client/cli/flags.py +4 -2
  7. sky/client/sdk.py +26 -3
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  11. sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
  12. sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/storage.py +2 -2
  46. sky/global_user_state.py +137 -37
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +4 -2
  49. sky/jobs/server/server.py +21 -12
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +248 -144
  52. sky/provision/kubernetes/network.py +9 -6
  53. sky/provision/provisioner.py +8 -0
  54. sky/schemas/api/responses.py +2 -0
  55. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  56. sky/serve/server/server.py +8 -7
  57. sky/server/common.py +10 -15
  58. sky/server/constants.py +1 -1
  59. sky/server/daemons.py +4 -2
  60. sky/server/requests/executor.py +30 -28
  61. sky/server/requests/payloads.py +5 -1
  62. sky/server/requests/preconditions.py +9 -4
  63. sky/server/requests/requests.py +130 -53
  64. sky/server/requests/serializers/encoders.py +3 -3
  65. sky/server/server.py +91 -58
  66. sky/server/stream_utils.py +127 -38
  67. sky/server/uvicorn.py +18 -17
  68. sky/setup_files/alembic.ini +4 -0
  69. sky/skylet/services.py +5 -5
  70. sky/skypilot_config.py +87 -75
  71. sky/ssh_node_pools/server.py +4 -4
  72. sky/users/permission.py +4 -0
  73. sky/utils/asyncio_utils.py +63 -3
  74. sky/utils/db/db_utils.py +11 -3
  75. sky/utils/db/migration_utils.py +7 -3
  76. sky/volumes/server/server.py +3 -3
  77. sky/workspaces/server.py +6 -6
  78. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
  79. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
  80. sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  82. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  83. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  87. sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
  88. /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
  89. /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
  90. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  91. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  92. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/server/server.py CHANGED
@@ -43,6 +43,7 @@ from sky.data import storage_utils
 from sky.jobs import utils as managed_job_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
+from sky.provision import metadata_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
@@ -162,7 +163,7 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to add a request ID to each request."""

     async def dispatch(self, request: fastapi.Request, call_next):
-        request_id = str(uuid.uuid4())
+        request_id = requests_lib.get_new_request_id()
         request.state.request_id = request_id
         response = await call_next(request)
         # TODO(syang): remove X-Request-ID when v0.10.0 is released.
@@ -454,9 +455,9 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
         loop.call_at(target, tick)


-def schedule_on_boot_check():
+async def schedule_on_boot_check_async():
     try:
-        executor.schedule_request(
+        await executor.schedule_request_async(
             request_id='skypilot-server-on-boot-check',
             request_name='check',
             request_body=payloads.CheckBody(),
@@ -479,7 +480,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
         if event.should_skip():
             continue
         try:
-            executor.schedule_request(
+            await executor.schedule_request_async(
                 request_id=event.id,
                 request_name=event.name,
                 request_body=payloads.RequestBody(),
@@ -494,7 +495,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
             # Lifespan will be executed in each uvicorn worker process, we
             # can safely ignore the error if the task is already scheduled.
             logger.debug(f'Request {event.id} already exists.')
-    schedule_on_boot_check()
+    await schedule_on_boot_check_async()
     asyncio.create_task(cleanup_upload_ids())
     if metrics_utils.METRICS_ENABLED:
         # Start monitoring the event loop lag in each server worker
@@ -728,7 +729,7 @@ async def token(request: fastapi.Request,
 async def check(request: fastapi.Request,
                 check_body: payloads.CheckBody) -> None:
     """Checks enabled clouds."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='check',
         request_body=check_body,
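Note: the same mechanical change repeats across nearly every route below: handlers that previously called the blocking executor.schedule_request(...) now await executor.schedule_request_async(...), keeping the scheduling work off the event loop. A minimal, self-contained sketch of that pattern follows; the route and the stand-in scheduler are illustrative only and are not SkyPilot's actual executor or payload classes.

import asyncio
import uuid

import fastapi

app = fastapi.FastAPI()


async def schedule_request_async(request_id: str, request_name: str,
                                 request_body: dict) -> None:
    """Stand-in for an async scheduler: persist the request and enqueue it
    without blocking the event loop (e.g. an async DB write + queue put)."""
    await asyncio.sleep(0)


@app.post('/check')
async def check(request: fastapi.Request, check_body: dict) -> None:
    # Mirrors the pattern in the diff: take the per-request ID and hand the
    # work to the scheduler asynchronously instead of calling a sync function.
    request_id = getattr(request.state, 'request_id', str(uuid.uuid4()))
    await schedule_request_async(request_id=request_id,
                                 request_name='check',
                                 request_body=check_body)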
@@ -742,7 +743,7 @@ async def enabled_clouds(request: fastapi.Request,
                         workspace: Optional[str] = None,
                         expand: bool = False) -> None:
     """Gets enabled clouds on the server."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='enabled_clouds',
         request_body=payloads.EnabledCloudsBody(workspace=workspace,
@@ -758,7 +759,7 @@ async def realtime_kubernetes_gpu_availability(
     realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
 ) -> None:
     """Gets real-time Kubernetes GPU availability."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='realtime_kubernetes_gpu_availability',
         request_body=realtime_gpu_availability_body,
@@ -773,7 +774,7 @@ async def kubernetes_node_info(
     kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
 ) -> None:
     """Gets Kubernetes nodes information and hints."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='kubernetes_node_info',
         request_body=kubernetes_node_info_body,
@@ -785,7 +786,7 @@ async def kubernetes_node_info(
 @app.get('/status_kubernetes')
 async def status_kubernetes(request: fastapi.Request) -> None:
     """Gets Kubernetes status."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='status_kubernetes',
         request_body=payloads.RequestBody(),
@@ -799,7 +800,7 @@ async def list_accelerators(
     request: fastapi.Request,
     list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
     """Gets list of accelerators from cloud catalog."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='list_accelerators',
         request_body=list_accelerator_counts_body,
@@ -814,7 +815,7 @@ async def list_accelerator_counts(
     list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
 ) -> None:
     """Gets list of accelerator counts from cloud catalog."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='list_accelerator_counts',
         request_body=list_accelerator_counts_body,
@@ -871,7 +872,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
 async def optimize(optimize_body: payloads.OptimizeBody,
                    request: fastapi.Request) -> None:
     """Optimizes the user's DAG."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='optimize',
         request_body=optimize_body,
@@ -1081,7 +1082,7 @@ async def launch(launch_body: payloads.LaunchBody,
     """Launches a cluster or task."""
     request_id = request.state.request_id
     logger.info(f'Launching request: {request_id}')
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id,
         request_name='launch',
         request_body=launch_body,
@@ -1097,7 +1098,7 @@ async def launch(launch_body: payloads.LaunchBody,
 async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
     """Executes a task on an existing cluster."""
     cluster_name = exec_body.cluster_name
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='exec',
         request_body=exec_body,
@@ -1115,7 +1116,7 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
 async def stop(request: fastapi.Request,
                stop_body: payloads.StopOrDownBody) -> None:
     """Stops a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='stop',
         request_body=stop_body,
@@ -1135,7 +1136,7 @@ async def status(
         raise fastapi.HTTPException(
             status_code=503,
             detail='Server is shutting down, please try again later.')
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='status',
         request_body=status_body,
@@ -1150,7 +1151,7 @@ async def status(
 async def endpoints(request: fastapi.Request,
                     endpoint_body: payloads.EndpointsBody) -> None:
     """Gets the endpoint for a given cluster and port number (endpoint)."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='endpoints',
         request_body=endpoint_body,
@@ -1164,7 +1165,7 @@ async def endpoints(request: fastapi.Request,
 async def down(request: fastapi.Request,
               down_body: payloads.StopOrDownBody) -> None:
     """Tears down a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='down',
         request_body=down_body,
@@ -1178,7 +1179,7 @@ async def down(request: fastapi.Request,
 async def start(request: fastapi.Request,
                 start_body: payloads.StartBody) -> None:
     """Restarts a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='start',
         request_body=start_body,
@@ -1192,7 +1193,7 @@ async def start(request: fastapi.Request,
 async def autostop(request: fastapi.Request,
                    autostop_body: payloads.AutostopBody) -> None:
     """Schedules an autostop/autodown for a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='autostop',
         request_body=autostop_body,
@@ -1206,7 +1207,7 @@ async def autostop(request: fastapi.Request,
 async def queue(request: fastapi.Request,
                 queue_body: payloads.QueueBody) -> None:
     """Gets the job queue of a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='queue',
         request_body=queue_body,
@@ -1220,7 +1221,7 @@ async def queue(request: fastapi.Request,
 async def job_status(request: fastapi.Request,
                      job_status_body: payloads.JobStatusBody) -> None:
     """Gets the status of a job."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='job_status',
         request_body=job_status_body,
@@ -1234,7 +1235,7 @@ async def job_status(request: fastapi.Request,
 async def cancel(request: fastapi.Request,
                  cancel_body: payloads.CancelBody) -> None:
     """Cancels jobs on a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='cancel',
         request_body=cancel_body,
@@ -1254,7 +1255,7 @@ async def logs(
     # launch, to finish, so that a user does not need to manually pull the
     # request status.
     executor.check_request_thread_executor_available()
-    request_task = executor.prepare_request(
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='logs',
         request_body=cluster_job_body,
@@ -1270,6 +1271,7 @@ async def logs(
         request_id=request.state.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=False,
     )


@@ -1284,7 +1286,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='download_logs',
         request_body=cluster_jobs_body,
@@ -1363,38 +1365,65 @@ async def download(download_body: payloads.DownloadBody,

 # TODO(aylei): run it asynchronously after global_user_state support async op
 @app.post('/provision_logs')
-def provision_logs(cluster_body: payloads.ClusterNameBody,
+def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
                    follow: bool = True,
                    tail: int = 0) -> fastapi.responses.StreamingResponse:
     """Streams the provision.log for the latest launch request of a cluster."""
-    # Prefer clusters table first, then cluster_history as fallback.
-    log_path_str = global_user_state.get_cluster_provision_log_path(
-        cluster_body.cluster_name)
-    if not log_path_str:
-        log_path_str = global_user_state.get_cluster_history_provision_log_path(
-            cluster_body.cluster_name)
-    if not log_path_str:
-        raise fastapi.HTTPException(
-            status_code=404,
-            detail=('Provision log path is not recorded for this cluster. '
-                    'Please relaunch to generate provisioning logs.'))
+    log_path = None
+    cluster_name = provision_logs_body.cluster_name
+    worker = provision_logs_body.worker
+    # stream head node logs
+    if worker is None:
+        # Prefer clusters table first, then cluster_history as fallback.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Provision log path is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        log_path = pathlib.Path(log_path_str).expanduser().resolve()
+        if not log_path.exists():
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=f'Provision log path does not exist: {str(log_path)}')

-    log_path = pathlib.Path(log_path_str).expanduser().resolve()
-    if not log_path.exists():
-        raise fastapi.HTTPException(
-            status_code=404,
-            detail=f'Provision log path does not exist: {str(log_path)}')
+    # stream worker node logs
+    else:
+        handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+        if handle is None:
+            raise fastapi.HTTPException(
+                status_code=404,
+                detail=('Cluster handle is not recorded for this cluster. '
+                        'Please relaunch to generate provisioning logs.'))
+        # instance_ids includes head node
+        instance_ids = handle.instance_ids
+        if instance_ids is None:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail='Instance IDs are not recorded for this cluster. '
+                'Please relaunch to generate provisioning logs.')
+        if worker > len(instance_ids) - 1:
+            raise fastapi.HTTPException(
+                status_code=400,
+                detail=f'Worker {worker} is out of range. '
+                f'The cluster has {len(instance_ids)} nodes.')
+        log_path = metadata_utils.get_instance_log_dir(
+            handle.get_cluster_name_on_cloud(), instance_ids[worker])

     # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
     effective_tail = None if tail is None or tail <= 0 else tail

     return fastapi.responses.StreamingResponse(
-        content=stream_utils.log_streamer(
-            None,
-            log_path,
-            tail=effective_tail,
-            follow=follow,
-            cluster_name=cluster_body.cluster_name),
+        content=stream_utils.log_streamer(None,
+                                          log_path,
+                                          tail=effective_tail,
+                                          follow=follow,
+                                          cluster_name=cluster_name),
         media_type='text/plain',
         headers={
             'Cache-Control': 'no-cache, no-transform',
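The provision_logs change above extends the endpoint so a caller can pick a worker node by index instead of always getting the head node's provision.log. A hypothetical client-side sketch follows; the server address is an assumption (the default local API server), and only the payload fields visible in the diff (cluster_name, worker) are populated.

from typing import Optional

import requests

API_SERVER = 'http://127.0.0.1:46580'  # assumed local SkyPilot API server


def stream_provision_logs(cluster_name: str,
                          worker: Optional[int] = None,
                          follow: bool = False,
                          tail: int = 0) -> None:
    """POST /provision_logs and print the plain-text stream line by line."""
    body = {'cluster_name': cluster_name}
    if worker is not None:
        # Index into the cluster's instance list; per the diff comment the
        # list includes the head node. Omitting it streams the head node log.
        body['worker'] = worker
    with requests.post(f'{API_SERVER}/provision_logs',
                       json=body,
                       params={'follow': follow, 'tail': tail},
                       stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            print(line)


# Example: stream_provision_logs('my-cluster', worker=1)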
@@ -1408,7 +1437,7 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
 async def cost_report(request: fastapi.Request,
                       cost_report_body: payloads.CostReportBody) -> None:
     """Gets the cost report of a cluster."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='cost_report',
         request_body=cost_report_body,
@@ -1420,7 +1449,7 @@ async def cost_report(request: fastapi.Request,
 @app.get('/storage/ls')
 async def storage_ls(request: fastapi.Request) -> None:
     """Gets the storages."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='storage_ls',
         request_body=payloads.RequestBody(),
@@ -1433,7 +1462,7 @@ async def storage_ls(request: fastapi.Request) -> None:
 async def storage_delete(request: fastapi.Request,
                          storage_body: payloads.StorageBody) -> None:
     """Deletes a storage."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='storage_delete',
         request_body=storage_body,
@@ -1446,7 +1475,7 @@ async def storage_delete(request: fastapi.Request,
 async def local_up(request: fastapi.Request,
                    local_up_body: payloads.LocalUpBody) -> None:
     """Launches a Kubernetes cluster on API server."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='local_up',
         request_body=local_up_body,
@@ -1459,7 +1488,7 @@ async def local_up(request: fastapi.Request,
 async def local_down(request: fastapi.Request,
                      local_down_body: payloads.LocalDownBody) -> None:
     """Tears down the Kubernetes cluster started by local_up."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='local_down',
         request_body=local_down_body,
@@ -1537,7 +1566,7 @@ async def stream(
                 detail='Only one of request_id and log_path can be provided')

     if request_id is None and log_path is None:
-        request_id = requests_lib.get_latest_request_id()
+        request_id = await requests_lib.get_latest_request_id_async()
         if request_id is None:
             raise fastapi.HTTPException(status_code=404,
                                         detail='No request found')
@@ -1567,11 +1596,14 @@ async def stream(
     polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
     # Original plain text streaming logic
     if request_id is not None:
-        request_task = await requests_lib.get_request_async(request_id)
+        request_task = await requests_lib.get_request_async(
+            request_id, fields=['request_id', 'schedule_type'])
         if request_task is None:
             print(f'No task with request ID {request_id}')
             raise fastapi.HTTPException(
                 status_code=404, detail=f'Request {request_id!r} not found')
+        # req.log_path is derived from request_id,
+        # so it's ok to just grab the request_id in the above query.
         log_path_to_stream = request_task.log_path
         if not log_path_to_stream.exists():
             # The log file might be deleted by the request GC daemon but the
@@ -1581,6 +1613,7 @@ async def stream(
                 detail=f'Log of request {request_id!r} has been deleted')
         if request_task.schedule_type == requests_lib.ScheduleType.LONG:
             polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
+        del request_task
     else:
         assert log_path is not None, (request_id, log_path)
         if log_path == constants.API_SERVER_LOGS:
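The stream hunks above narrow get_request_async to fetch only the columns the handler actually needs (request_id and schedule_type) rather than the whole request row. A generic sketch of that column-projection idea over sqlite3 is shown below; the table name, column list, and helper are assumptions for illustration, not SkyPilot's actual requests store.

import sqlite3
from typing import Any, Dict, List, Optional

# Whitelist of selectable columns; guards the f-string query below.
ALLOWED_FIELDS = ('request_id', 'name', 'schedule_type', 'status')


def get_request(conn: sqlite3.Connection,
                request_id: str,
                fields: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
    """Fetch a single request row, selecting only the requested columns."""
    cols = [f for f in (fields or ALLOWED_FIELDS) if f in ALLOWED_FIELDS]
    query = f'SELECT {", ".join(cols)} FROM requests WHERE request_id = ?'
    row = conn.execute(query, (request_id,)).fetchone()
    if row is None:
        return None
    return dict(zip(cols, row))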
@@ -1639,7 +1672,7 @@ async def stream(
 async def api_cancel(request: fastapi.Request,
                      request_cancel_body: payloads.RequestCancelBody) -> None:
     """Cancels requests."""
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='api_cancel',
         request_body=request_cancel_body,
@@ -1875,7 +1908,7 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
 async def all_contexts(request: fastapi.Request) -> None:
     """Gets all Kubernetes and SSH node pool contexts."""

-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='all_contexts',
         request_body=payloads.RequestBody(),