skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/server/server.py CHANGED
@@ -3,8 +3,10 @@
3
3
  import argparse
4
4
  import asyncio
5
5
  import base64
6
+ from concurrent.futures import ThreadPoolExecutor
6
7
  import contextlib
7
8
  import datetime
9
+ from enum import IntEnum
8
10
  import hashlib
9
11
  import json
10
12
  import multiprocessing
@@ -14,15 +16,18 @@ import posixpath
14
16
  import re
15
17
  import resource
16
18
  import shutil
19
+ import struct
17
20
  import sys
18
21
  import threading
19
- from typing import Dict, List, Literal, Optional, Set, Tuple
22
+ import traceback
23
+ from typing import Any, Dict, List, Literal, Optional, Set, Tuple
20
24
  import uuid
21
25
  import zipfile
22
26
 
23
27
  import aiofiles
24
28
  import anyio
25
29
  import fastapi
30
+ from fastapi import responses as fastapi_responses
26
31
  from fastapi.middleware import cors
27
32
  import starlette.middleware.base
28
33
  import uvloop
@@ -38,9 +43,12 @@ from sky import global_user_state
38
43
  from sky import models
39
44
  from sky import sky_logging
40
45
  from sky.data import storage_utils
46
+ from sky.jobs import utils as managed_job_utils
41
47
  from sky.jobs.server import server as jobs_rest
42
48
  from sky.metrics import utils as metrics_utils
49
+ from sky.provision import metadata_utils
43
50
  from sky.provision.kubernetes import utils as kubernetes_utils
51
+ from sky.provision.slurm import utils as slurm_utils
44
52
  from sky.schemas.api import responses
45
53
  from sky.serve.server import server as serve_rest
46
54
  from sky.server import common
@@ -48,14 +56,19 @@ from sky.server import config as server_config
48
56
  from sky.server import constants as server_constants
49
57
  from sky.server import daemons
50
58
  from sky.server import metrics
59
+ from sky.server import middleware_utils
60
+ from sky.server import plugins
61
+ from sky.server import server_utils
51
62
  from sky.server import state
52
63
  from sky.server import stream_utils
53
64
  from sky.server import versions
54
65
  from sky.server.auth import authn
66
+ from sky.server.auth import loopback
55
67
  from sky.server.auth import oauth2_proxy
56
68
  from sky.server.requests import executor
57
69
  from sky.server.requests import payloads
58
70
  from sky.server.requests import preconditions
71
+ from sky.server.requests import request_names
59
72
  from sky.server.requests import requests as requests_lib
60
73
  from sky.skylet import constants
61
74
  from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -67,10 +80,13 @@ from sky.utils import common as common_lib
67
80
  from sky.utils import common_utils
68
81
  from sky.utils import context
69
82
  from sky.utils import context_utils
83
+ from sky.utils import controller_utils
70
84
  from sky.utils import dag_utils
85
+ from sky.utils import env_options
71
86
  from sky.utils import perf_utils
72
87
  from sky.utils import status_lib
73
88
  from sky.utils import subprocess_utils
89
+ from sky.utils import ux_utils
74
90
  from sky.utils.db import db_utils
75
91
  from sky.volumes.server import server as volumes_rest
76
92
  from sky.workspaces import server as workspaces_rest
@@ -128,6 +144,7 @@ def _try_set_basic_auth_user(request: fastapi.Request):
128
144
  break
129
145
 
130
146
 
147
+ @middleware_utils.websocket_aware
131
148
  class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
132
149
  """Middleware to handle RBAC."""
133
150
 
@@ -157,11 +174,9 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
157
174
  """Middleware to add a request ID to each request."""
158
175
 
159
176
  async def dispatch(self, request: fastapi.Request, call_next):
160
- request_id = str(uuid.uuid4())
177
+ request_id = requests_lib.get_new_request_id()
161
178
  request.state.request_id = request_id
162
179
  response = await call_next(request)
163
- # TODO(syang): remove X-Request-ID when v0.10.0 is released.
164
- response.headers['X-Request-ID'] = request_id
165
180
  response.headers['X-Skypilot-Request-ID'] = request_id
166
181
  return response
167
182
 
@@ -177,6 +192,7 @@ def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
177
192
  return models.User(id=user_hash, name=user_name)
178
193
 
179
194
 
195
+ @middleware_utils.websocket_aware
180
196
  class InitializeRequestAuthUserMiddleware(
181
197
  starlette.middleware.base.BaseHTTPMiddleware):
182
198
 
@@ -187,10 +203,15 @@ class InitializeRequestAuthUserMiddleware(
187
203
  return await call_next(request)
188
204
 
189
205
 
206
+ @middleware_utils.websocket_aware
190
207
  class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
191
208
  """Middleware to handle HTTP Basic Auth."""
192
209
 
193
210
  async def dispatch(self, request: fastapi.Request, call_next):
211
+ if managed_job_utils.is_consolidation_mode(
212
+ ) and loopback.is_loopback_request(request):
213
+ return await call_next(request)
214
+
194
215
  if request.url.path.startswith('/api/health'):
195
216
  # Try to set the auth user from basic auth
196
217
  _try_set_basic_auth_user(request)
@@ -234,6 +255,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
234
255
  return await call_next(request)
235
256
 
236
257
 
258
+ @middleware_utils.websocket_aware
237
259
  class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
238
260
  """Middleware to handle Bearer Token Auth (Service Accounts)."""
239
261
 
@@ -361,6 +383,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
361
383
  return await call_next(request)
362
384
 
363
385
 
386
+ @middleware_utils.websocket_aware
364
387
  class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
365
388
  """Middleware to handle auth proxy."""
366
389
 
@@ -437,7 +460,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
437
460
  if lag_threshold is not None and lag > lag_threshold:
438
461
  logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
439
462
  f'{lag_threshold} seconds.')
440
- metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
463
+ metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
441
464
  pid=pid).observe(lag)
442
465
  target = now + interval
443
466
  loop.call_at(target, tick)
@@ -445,6 +468,23 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
445
468
  loop.call_at(target, tick)
446
469
 
447
470
 
471
+ async def schedule_on_boot_check_async():
472
+ try:
473
+ await executor.schedule_request_async(
474
+ request_id='skypilot-server-on-boot-check',
475
+ request_name=request_names.RequestName.CHECK,
476
+ request_body=server_utils.build_body_at_server(
477
+ request=None, body_type=payloads.CheckBody),
478
+ func=sky_check.check,
479
+ schedule_type=requests_lib.ScheduleType.SHORT,
480
+ is_skypilot_system=True,
481
+ )
482
+ except exceptions.RequestAlreadyExistsError:
483
+ # Lifespan will be executed in each uvicorn worker process, we
484
+ # can safely ignore the error if the task is already scheduled.
485
+ logger.debug('Request skypilot-server-on-boot-check already exists.')
486
+
487
+
448
488
  @contextlib.asynccontextmanager
449
489
  async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
450
490
  """FastAPI lifespan context manager."""
@@ -454,10 +494,11 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
454
494
  if event.should_skip():
455
495
  continue
456
496
  try:
457
- executor.schedule_request(
497
+ await executor.schedule_request_async(
458
498
  request_id=event.id,
459
499
  request_name=event.name,
460
- request_body=payloads.RequestBody(),
500
+ request_body=server_utils.build_body_at_server(
501
+ request=None, body_type=payloads.RequestBody),
461
502
  func=event.run_event,
462
503
  schedule_type=requests_lib.ScheduleType.SHORT,
463
504
  is_skypilot_system=True,
@@ -469,8 +510,9 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
469
510
  # Lifespan will be executed in each uvicorn worker process, we
470
511
  # can safely ignore the error if the task is already scheduled.
471
512
  logger.debug(f'Request {event.id} already exists.')
513
+ await schedule_on_boot_check_async()
472
514
  asyncio.create_task(cleanup_upload_ids())
473
- if metrics.METRICS_ENABLED:
515
+ if metrics_utils.METRICS_ENABLED:
474
516
  # Start monitoring the event loop lag in each server worker
475
517
  # event loop (process).
476
518
  asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
@@ -518,6 +560,7 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
518
560
  return await call_next(request)
519
561
 
520
562
 
563
+ @middleware_utils.websocket_aware
521
564
  class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
522
565
  """Middleware to control requests when server is shutting down."""
523
566
 
@@ -537,6 +580,7 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
537
580
  return await call_next(request)
538
581
 
539
582
 
583
+ @middleware_utils.websocket_aware
540
584
  class APIVersionMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
541
585
  """Middleware to add API version to the request."""
542
586
 
@@ -579,6 +623,9 @@ app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
579
623
  if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
580
624
  app.add_middleware(metrics.PrometheusMiddleware)
581
625
  app.add_middleware(APIVersionMiddleware)
626
+ # The order of all the authentication-related middleware is important.
627
+ # RBACMiddleware must precede all the auth middleware, so it can access
628
+ # request.state.auth_user.
582
629
  app.add_middleware(RBACMiddleware)
583
630
  app.add_middleware(InternalDashboardPrefixMiddleware)
584
631
  app.add_middleware(GracefulShutdownMiddleware)
@@ -592,12 +639,7 @@ app.add_middleware(
592
639
  allow_credentials=True,
593
640
  allow_methods=['*'],
594
641
  allow_headers=['*'],
595
- # TODO(syang): remove X-Request-ID \when v0.10.0 is released.
596
- expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
597
- # The order of all the authentication-related middleware is important.
598
- # RBACMiddleware must precede all the auth middleware, so it can access
599
- # request.state.auth_user.
600
- app.add_middleware(RBACMiddleware)
642
+ expose_headers=['X-Skypilot-Request-ID'])
601
643
  # Authentication based on oauth2-proxy.
602
644
  app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
603
645
  # AuthProxyMiddleware should precede BasicAuthMiddleware and
@@ -615,6 +657,17 @@ app.add_middleware(BearerTokenMiddleware)
615
657
  # middleware above.
616
658
  app.add_middleware(InitializeRequestAuthUserMiddleware)
617
659
  app.add_middleware(RequestIDMiddleware)
660
+
661
+ # Load plugins after all the middlewares are added, to keep the core
662
+ # middleware stack intact if a plugin adds new middlewares.
663
+ # Note: server.py will be imported twice in server process, once as
664
+ # the top-level entrypoint module and once imported by uvicorn, we only
665
+ # load the plugin when imported by uvicorn for server process.
666
+ # TODO(aylei): move uvicorn app out of the top-level module to avoid
667
+ # duplicate app initialization.
668
+ if __name__ == 'sky.server.server':
669
+ plugins.load_plugins(plugins.ExtensionContext(app=app))
670
+
618
671
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
619
672
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
620
673
  app.include_router(users_rest.router, prefix='/users', tags=['users'])
@@ -625,16 +678,28 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
625
678
  app.include_router(ssh_node_pools_rest.router,
626
679
  prefix='/ssh_node_pools',
627
680
  tags=['ssh_node_pools'])
628
-
629
- # Increase the limit of files we can open to our hard limit. This fixes bugs
630
- # where we can not aquire file locks or open enough logs and the API server
631
- # crashes. On Mac, the hard limit is 9,223,372,036,854,775,807.
632
- # TODO(luca) figure out what to do if we need to open more than 2^63 files.
633
- try:
634
- soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
635
- resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
636
- except Exception: # pylint: disable=broad-except
637
- pass # no issue, we will warn the user later if its too low
681
+ # increase the resource limit for the server
682
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
683
+ resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
684
+
685
+
686
+ @app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
687
+ def handle_concurrent_worker_exhausted_error(
688
+ request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
689
+ del request # request is not used
690
+ # Print detailed error message to server log
691
+ logger.error('Concurrent worker exhausted: '
692
+ f'{common_utils.format_exception(e)}')
693
+ with ux_utils.enable_traceback():
694
+ logger.error(f' Traceback: {traceback.format_exc()}')
695
+ # Return human readable error message to client
696
+ return fastapi.responses.JSONResponse(
697
+ status_code=503,
698
+ content={
699
+ 'detail':
700
+ ('The server has exhausted its concurrent worker limit. '
701
+ 'Please try again or scale the server if the load persists.')
702
+ })
638
703
 
639
704
 
640
705
  @app.get('/token')
@@ -680,9 +745,9 @@ async def token(request: fastapi.Request,
680
745
  async def check(request: fastapi.Request,
681
746
  check_body: payloads.CheckBody) -> None:
682
747
  """Checks enabled clouds."""
683
- executor.schedule_request(
748
+ await executor.schedule_request_async(
684
749
  request_id=request.state.request_id,
685
- request_name='check',
750
+ request_name=request_names.RequestName.CHECK,
686
751
  request_body=check_body,
687
752
  func=sky_check.check,
688
753
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -694,11 +759,14 @@ async def enabled_clouds(request: fastapi.Request,
694
759
  workspace: Optional[str] = None,
695
760
  expand: bool = False) -> None:
696
761
  """Gets enabled clouds on the server."""
697
- executor.schedule_request(
762
+ await executor.schedule_request_async(
698
763
  request_id=request.state.request_id,
699
- request_name='enabled_clouds',
700
- request_body=payloads.EnabledCloudsBody(workspace=workspace,
701
- expand=expand),
764
+ request_name=request_names.RequestName.ENABLED_CLOUDS,
765
+ request_body=server_utils.build_body_at_server(
766
+ request=request,
767
+ body_type=payloads.EnabledCloudsBody,
768
+ workspace=workspace,
769
+ expand=expand),
702
770
  func=core.enabled_clouds,
703
771
  schedule_type=requests_lib.ScheduleType.SHORT,
704
772
  )
@@ -710,9 +778,10 @@ async def realtime_kubernetes_gpu_availability(
710
778
  realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
711
779
  ) -> None:
712
780
  """Gets real-time Kubernetes GPU availability."""
713
- executor.schedule_request(
781
+ await executor.schedule_request_async(
714
782
  request_id=request.state.request_id,
715
- request_name='realtime_kubernetes_gpu_availability',
783
+ request_name=request_names.RequestName.
784
+ REALTIME_KUBERNETES_GPU_AVAILABILITY,
716
785
  request_body=realtime_gpu_availability_body,
717
786
  func=core.realtime_kubernetes_gpu_availability,
718
787
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -725,22 +794,53 @@ async def kubernetes_node_info(
725
794
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
726
795
  ) -> None:
727
796
  """Gets Kubernetes nodes information and hints."""
728
- executor.schedule_request(
797
+ await executor.schedule_request_async(
729
798
  request_id=request.state.request_id,
730
- request_name='kubernetes_node_info',
799
+ request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
731
800
  request_body=kubernetes_node_info_body,
732
801
  func=kubernetes_utils.get_kubernetes_node_info,
733
802
  schedule_type=requests_lib.ScheduleType.SHORT,
734
803
  )
735
804
 
736
805
 
806
+ @app.post('/slurm_gpu_availability')
807
+ async def slurm_gpu_availability(
808
+ request: fastapi.Request,
809
+ slurm_gpu_availability_body: payloads.SlurmGpuAvailabilityRequestBody
810
+ ) -> None:
811
+ """Gets real-time Slurm GPU availability."""
812
+ await executor.schedule_request_async(
813
+ request_id=request.state.request_id,
814
+ request_name=request_names.RequestName.REALTIME_SLURM_GPU_AVAILABILITY,
815
+ request_body=slurm_gpu_availability_body,
816
+ func=core.realtime_slurm_gpu_availability,
817
+ schedule_type=requests_lib.ScheduleType.SHORT,
818
+ )
819
+
820
+
821
+ @app.get('/slurm_node_info')
822
+ async def slurm_node_info(
823
+ request: fastapi.Request,
824
+ slurm_node_info_body: payloads.SlurmNodeInfoRequestBody) -> None:
825
+ """Gets detailed information for each node in the Slurm cluster."""
826
+ await executor.schedule_request_async(
827
+ request_id=request.state.request_id,
828
+ request_name=request_names.RequestName.SLURM_NODE_INFO,
829
+ request_body=slurm_node_info_body,
830
+ func=slurm_utils.slurm_node_info,
831
+ schedule_type=requests_lib.ScheduleType.SHORT,
832
+ )
833
+
834
+
737
835
  @app.get('/status_kubernetes')
738
836
  async def status_kubernetes(request: fastapi.Request) -> None:
739
- """Gets Kubernetes status."""
740
- executor.schedule_request(
837
+ """[Experimental] Get all SkyPilot resources (including from other '
838
+ 'users) in the current Kubernetes context."""
839
+ await executor.schedule_request_async(
741
840
  request_id=request.state.request_id,
742
- request_name='status_kubernetes',
743
- request_body=payloads.RequestBody(),
841
+ request_name=request_names.RequestName.STATUS_KUBERNETES,
842
+ request_body=server_utils.build_body_at_server(
843
+ request=request, body_type=payloads.RequestBody),
744
844
  func=core.status_kubernetes,
745
845
  schedule_type=requests_lib.ScheduleType.SHORT,
746
846
  )
@@ -751,9 +851,9 @@ async def list_accelerators(
751
851
  request: fastapi.Request,
752
852
  list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
753
853
  """Gets list of accelerators from cloud catalog."""
754
- executor.schedule_request(
854
+ await executor.schedule_request_async(
755
855
  request_id=request.state.request_id,
756
- request_name='list_accelerators',
856
+ request_name=request_names.RequestName.LIST_ACCELERATORS,
757
857
  request_body=list_accelerator_counts_body,
758
858
  func=catalog.list_accelerators,
759
859
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -766,9 +866,9 @@ async def list_accelerator_counts(
766
866
  list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
767
867
  ) -> None:
768
868
  """Gets list of accelerator counts from cloud catalog."""
769
- executor.schedule_request(
869
+ await executor.schedule_request_async(
770
870
  request_id=request.state.request_id,
771
- request_name='list_accelerator_counts',
871
+ request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
772
872
  request_body=list_accelerator_counts_body,
773
873
  func=catalog.list_accelerator_counts,
774
874
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -802,6 +902,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
802
902
  # server thread.
803
903
  with admin_policy_utils.apply_and_use_config_in_current_request(
804
904
  dag,
905
+ request_name=request_names.AdminPolicyRequestName.VALIDATE,
805
906
  request_options=validate_body.get_request_options()) as dag:
806
907
  dag.resolve_and_validate_volumes()
807
908
  # Skip validating workdir and file_mounts, as those need to be
@@ -815,6 +916,11 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
815
916
  # thread executor to avoid blocking the uvicorn event loop.
816
917
  await context_utils.to_thread(validate_dag, dag)
817
918
  except Exception as e: # pylint: disable=broad-except
919
+ # Print the exception to the API server log.
920
+ if env_options.Options.SHOW_DEBUG_INFO.get():
921
+ logger.info('/validate exception:', exc_info=True)
922
+ # Set the exception stacktrace for the serialized exception.
923
+ requests_lib.set_exception_stacktrace(e)
818
924
  raise fastapi.HTTPException(
819
925
  status_code=400, detail=exceptions.serialize_exception(e)) from e
820
926
 
@@ -823,9 +929,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
823
929
  async def optimize(optimize_body: payloads.OptimizeBody,
824
930
  request: fastapi.Request) -> None:
825
931
  """Optimizes the user's DAG."""
826
- executor.schedule_request(
932
+ await executor.schedule_request_async(
827
933
  request_id=request.state.request_id,
828
- request_name='optimize',
934
+ request_name=request_names.RequestName.OPTIMIZE,
829
935
  request_body=optimize_body,
830
936
  ignore_return_value=True,
831
937
  func=core.optimize,
@@ -1033,9 +1139,9 @@ async def launch(launch_body: payloads.LaunchBody,
1033
1139
  """Launches a cluster or task."""
1034
1140
  request_id = request.state.request_id
1035
1141
  logger.info(f'Launching request: {request_id}')
1036
- executor.schedule_request(
1142
+ await executor.schedule_request_async(
1037
1143
  request_id,
1038
- request_name='launch',
1144
+ request_name=request_names.RequestName.CLUSTER_LAUNCH,
1039
1145
  request_body=launch_body,
1040
1146
  func=execution.launch,
1041
1147
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1049,9 +1155,9 @@ async def launch(launch_body: payloads.LaunchBody,
1049
1155
  async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1050
1156
  """Executes a task on an existing cluster."""
1051
1157
  cluster_name = exec_body.cluster_name
1052
- executor.schedule_request(
1158
+ await executor.schedule_request_async(
1053
1159
  request_id=request.state.request_id,
1054
- request_name='exec',
1160
+ request_name=request_names.RequestName.CLUSTER_EXEC,
1055
1161
  request_body=exec_body,
1056
1162
  func=execution.exec,
1057
1163
  precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1067,9 +1173,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1067
1173
  async def stop(request: fastapi.Request,
1068
1174
  stop_body: payloads.StopOrDownBody) -> None:
1069
1175
  """Stops a cluster."""
1070
- executor.schedule_request(
1176
+ await executor.schedule_request_async(
1071
1177
  request_id=request.state.request_id,
1072
- request_name='stop',
1178
+ request_name=request_names.RequestName.CLUSTER_STOP,
1073
1179
  request_body=stop_body,
1074
1180
  func=core.stop,
1075
1181
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1087,9 +1193,9 @@ async def status(
1087
1193
  raise fastapi.HTTPException(
1088
1194
  status_code=503,
1089
1195
  detail='Server is shutting down, please try again later.')
1090
- executor.schedule_request(
1196
+ await executor.schedule_request_async(
1091
1197
  request_id=request.state.request_id,
1092
- request_name='status',
1198
+ request_name=request_names.RequestName.CLUSTER_STATUS,
1093
1199
  request_body=status_body,
1094
1200
  func=core.status,
1095
1201
  schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1102,9 +1208,9 @@ async def status(
1102
1208
  async def endpoints(request: fastapi.Request,
1103
1209
  endpoint_body: payloads.EndpointsBody) -> None:
1104
1210
  """Gets the endpoint for a given cluster and port number (endpoint)."""
1105
- executor.schedule_request(
1211
+ await executor.schedule_request_async(
1106
1212
  request_id=request.state.request_id,
1107
- request_name='endpoints',
1213
+ request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
1108
1214
  request_body=endpoint_body,
1109
1215
  func=core.endpoints,
1110
1216
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1116,9 +1222,9 @@ async def endpoints(request: fastapi.Request,
1116
1222
  async def down(request: fastapi.Request,
1117
1223
  down_body: payloads.StopOrDownBody) -> None:
1118
1224
  """Tears down a cluster."""
1119
- executor.schedule_request(
1225
+ await executor.schedule_request_async(
1120
1226
  request_id=request.state.request_id,
1121
- request_name='down',
1227
+ request_name=request_names.RequestName.CLUSTER_DOWN,
1122
1228
  request_body=down_body,
1123
1229
  func=core.down,
1124
1230
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1130,9 +1236,9 @@ async def down(request: fastapi.Request,
1130
1236
  async def start(request: fastapi.Request,
1131
1237
  start_body: payloads.StartBody) -> None:
1132
1238
  """Restarts a cluster."""
1133
- executor.schedule_request(
1239
+ await executor.schedule_request_async(
1134
1240
  request_id=request.state.request_id,
1135
- request_name='start',
1241
+ request_name=request_names.RequestName.CLUSTER_START,
1136
1242
  request_body=start_body,
1137
1243
  func=core.start,
1138
1244
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1144,9 +1250,9 @@ async def start(request: fastapi.Request,
1144
1250
  async def autostop(request: fastapi.Request,
1145
1251
  autostop_body: payloads.AutostopBody) -> None:
1146
1252
  """Schedules an autostop/autodown for a cluster."""
1147
- executor.schedule_request(
1253
+ await executor.schedule_request_async(
1148
1254
  request_id=request.state.request_id,
1149
- request_name='autostop',
1255
+ request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
1150
1256
  request_body=autostop_body,
1151
1257
  func=core.autostop,
1152
1258
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1158,9 +1264,9 @@ async def autostop(request: fastapi.Request,
1158
1264
  async def queue(request: fastapi.Request,
1159
1265
  queue_body: payloads.QueueBody) -> None:
1160
1266
  """Gets the job queue of a cluster."""
1161
- executor.schedule_request(
1267
+ await executor.schedule_request_async(
1162
1268
  request_id=request.state.request_id,
1163
- request_name='queue',
1269
+ request_name=request_names.RequestName.CLUSTER_QUEUE,
1164
1270
  request_body=queue_body,
1165
1271
  func=core.queue,
1166
1272
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1172,9 +1278,9 @@ async def queue(request: fastapi.Request,
1172
1278
  async def job_status(request: fastapi.Request,
1173
1279
  job_status_body: payloads.JobStatusBody) -> None:
1174
1280
  """Gets the status of a job."""
1175
- executor.schedule_request(
1281
+ await executor.schedule_request_async(
1176
1282
  request_id=request.state.request_id,
1177
- request_name='job_status',
1283
+ request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
1178
1284
  request_body=job_status_body,
1179
1285
  func=core.job_status,
1180
1286
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1186,9 +1292,9 @@ async def job_status(request: fastapi.Request,
1186
1292
  async def cancel(request: fastapi.Request,
1187
1293
  cancel_body: payloads.CancelBody) -> None:
1188
1294
  """Cancels jobs on a cluster."""
1189
- executor.schedule_request(
1295
+ await executor.schedule_request_async(
1190
1296
  request_id=request.state.request_id,
1191
- request_name='cancel',
1297
+ request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
1192
1298
  request_body=cancel_body,
1193
1299
  func=core.cancel,
1194
1300
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1205,32 +1311,24 @@ async def logs(
1205
1311
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
1206
1312
  # launch, to finish, so that a user does not need to manually pull the
1207
1313
  # request status.
1208
- request_task = executor.prepare_request(
1314
+ executor.check_request_thread_executor_available()
1315
+ request_task = await executor.prepare_request_async(
1209
1316
  request_id=request.state.request_id,
1210
- request_name='logs',
1317
+ request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
1211
1318
  request_body=cluster_job_body,
1212
1319
  func=core.tail_logs,
1213
1320
  schedule_type=requests_lib.ScheduleType.SHORT,
1321
+ request_cluster_name=cluster_job_body.cluster_name,
1214
1322
  )
1215
- task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1216
-
1217
- async def cancel_task():
1218
- try:
1219
- logger.info('Client disconnected for request: '
1220
- f'{request.state.request_id}')
1221
- task.cancel()
1222
- await task
1223
- except asyncio.CancelledError:
1224
- pass
1225
-
1226
- # Cancel the task after the request is done or client disconnects
1227
- background_tasks.add_task(cancel_task)
1323
+ task = executor.execute_request_in_coroutine(request_task)
1324
+ background_tasks.add_task(task.cancel)
1228
1325
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
1229
1326
  # the same approach as /stream.
1230
- return stream_utils.stream_response(
1327
+ return stream_utils.stream_response_for_long_request(
1231
1328
  request_id=request.state.request_id,
1232
1329
  logs_path=request_task.log_path,
1233
1330
  background_tasks=background_tasks,
1331
+ kill_request_on_disconnect=False,
1234
1332
  )
1235
1333
 
1236
1334
 
@@ -1245,9 +1343,9 @@ async def download_logs(
1245
1343
  # We should reuse the original request body, so that the env vars, such as
1246
1344
  # user hash, are kept the same.
1247
1345
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
1248
- executor.schedule_request(
1346
+ await executor.schedule_request_async(
1249
1347
  request_id=request.state.request_id,
1250
- request_name='download_logs',
1348
+ request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
1251
1349
  request_body=cluster_jobs_body,
1252
1350
  func=core.download_logs,
1253
1351
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1324,27 +1422,55 @@ async def download(download_body: payloads.DownloadBody,
1324
1422
 
1325
1423
  # TODO(aylei): run it asynchronously after global_user_state support async op
1326
1424
  @app.post('/provision_logs')
1327
- def provision_logs(cluster_body: payloads.ClusterNameBody,
1425
+ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
1328
1426
  follow: bool = True,
1329
1427
  tail: int = 0) -> fastapi.responses.StreamingResponse:
1330
1428
  """Streams the provision.log for the latest launch request of a cluster."""
1331
- # Prefer clusters table first, then cluster_history as fallback.
1332
- log_path_str = global_user_state.get_cluster_provision_log_path(
1333
- cluster_body.cluster_name)
1334
- if not log_path_str:
1335
- log_path_str = global_user_state.get_cluster_history_provision_log_path(
1336
- cluster_body.cluster_name)
1337
- if not log_path_str:
1338
- raise fastapi.HTTPException(
1339
- status_code=404,
1340
- detail=('Provision log path is not recorded for this cluster. '
1341
- 'Please relaunch to generate provisioning logs.'))
1429
+ log_path = None
1430
+ cluster_name = provision_logs_body.cluster_name
1431
+ worker = provision_logs_body.worker
1432
+ # stream head node logs
1433
+ if worker is None:
1434
+ # Prefer clusters table first, then cluster_history as fallback.
1435
+ log_path_str = global_user_state.get_cluster_provision_log_path(
1436
+ cluster_name)
1437
+ if not log_path_str:
1438
+ log_path_str = (
1439
+ global_user_state.get_cluster_history_provision_log_path(
1440
+ cluster_name))
1441
+ if not log_path_str:
1442
+ raise fastapi.HTTPException(
1443
+ status_code=404,
1444
+ detail=('Provision log path is not recorded for this cluster. '
1445
+ 'Please relaunch to generate provisioning logs.'))
1446
+ log_path = pathlib.Path(log_path_str).expanduser().resolve()
1447
+ if not log_path.exists():
1448
+ raise fastapi.HTTPException(
1449
+ status_code=404,
1450
+ detail=f'Provision log path does not exist: {str(log_path)}')
1342
1451
 
1343
- log_path = pathlib.Path(log_path_str).expanduser().resolve()
1344
- if not log_path.exists():
1345
- raise fastapi.HTTPException(
1346
- status_code=404,
1347
- detail=f'Provision log path does not exist: {str(log_path)}')
1452
+ # stream worker node logs
1453
+ else:
1454
+ handle = global_user_state.get_handle_from_cluster_name(cluster_name)
1455
+ if handle is None:
1456
+ raise fastapi.HTTPException(
1457
+ status_code=404,
1458
+ detail=('Cluster handle is not recorded for this cluster. '
1459
+ 'Please relaunch to generate provisioning logs.'))
1460
+ # instance_ids includes head node
1461
+ instance_ids = handle.instance_ids
1462
+ if instance_ids is None:
1463
+ raise fastapi.HTTPException(
1464
+ status_code=400,
1465
+ detail='Instance IDs are not recorded for this cluster. '
1466
+ 'Please relaunch to generate provisioning logs.')
1467
+ if worker > len(instance_ids) - 1:
1468
+ raise fastapi.HTTPException(
1469
+ status_code=400,
1470
+ detail=f'Worker {worker} is out of range. '
1471
+ f'The cluster has {len(instance_ids)} nodes.')
1472
+ log_path = metadata_utils.get_instance_log_dir(
1473
+ handle.get_cluster_name_on_cloud(), instance_ids[worker])
1348
1474
 
1349
1475
  # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
1350
1476
  effective_tail = None if tail is None or tail <= 0 else tail
@@ -1353,7 +1479,8 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
1353
1479
  content=stream_utils.log_streamer(None,
1354
1480
  log_path,
1355
1481
  tail=effective_tail,
1356
- follow=follow),
1482
+ follow=follow,
1483
+ cluster_name=cluster_name),
1357
1484
  media_type='text/plain',
1358
1485
  headers={
1359
1486
  'Cache-Control': 'no-cache, no-transform',
@@ -1367,9 +1494,9 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
1367
1494
  async def cost_report(request: fastapi.Request,
1368
1495
  cost_report_body: payloads.CostReportBody) -> None:
1369
1496
  """Gets the cost report of a cluster."""
1370
- executor.schedule_request(
1497
+ await executor.schedule_request_async(
1371
1498
  request_id=request.state.request_id,
1372
- request_name='cost_report',
1499
+ request_name=request_names.RequestName.CLUSTER_COST_REPORT,
1373
1500
  request_body=cost_report_body,
1374
1501
  func=core.cost_report,
1375
1502
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1379,10 +1506,11 @@ async def cost_report(request: fastapi.Request,
1379
1506
  @app.get('/storage/ls')
1380
1507
  async def storage_ls(request: fastapi.Request) -> None:
1381
1508
  """Gets the storages."""
1382
- executor.schedule_request(
1509
+ await executor.schedule_request_async(
1383
1510
  request_id=request.state.request_id,
1384
- request_name='storage_ls',
1385
- request_body=payloads.RequestBody(),
1511
+ request_name=request_names.RequestName.STORAGE_LS,
1512
+ request_body=server_utils.build_body_at_server(
1513
+ request=request, body_type=payloads.RequestBody),
1386
1514
  func=core.storage_ls,
1387
1515
  schedule_type=requests_lib.ScheduleType.SHORT,
1388
1516
  )
@@ -1392,9 +1520,9 @@ async def storage_ls(request: fastapi.Request) -> None:
1392
1520
  async def storage_delete(request: fastapi.Request,
1393
1521
  storage_body: payloads.StorageBody) -> None:
1394
1522
  """Deletes a storage."""
1395
- executor.schedule_request(
1523
+ await executor.schedule_request_async(
1396
1524
  request_id=request.state.request_id,
1397
- request_name='storage_delete',
1525
+ request_name=request_names.RequestName.STORAGE_DELETE,
1398
1526
  request_body=storage_body,
1399
1527
  func=core.storage_delete,
1400
1528
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1405,9 +1533,9 @@ async def storage_delete(request: fastapi.Request,
1405
1533
  async def local_up(request: fastapi.Request,
1406
1534
  local_up_body: payloads.LocalUpBody) -> None:
1407
1535
  """Launches a Kubernetes cluster on API server."""
1408
- executor.schedule_request(
1536
+ await executor.schedule_request_async(
1409
1537
  request_id=request.state.request_id,
1410
- request_name='local_up',
1538
+ request_name=request_names.RequestName.LOCAL_UP,
1411
1539
  request_body=local_up_body,
1412
1540
  func=core.local_up,
1413
1541
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1415,21 +1543,39 @@ async def local_up(request: fastapi.Request,
1415
1543
 
1416
1544
 
1417
1545
  @app.post('/local_down')
1418
- async def local_down(request: fastapi.Request) -> None:
1546
+ async def local_down(request: fastapi.Request,
1547
+ local_down_body: payloads.LocalDownBody) -> None:
1419
1548
  """Tears down the Kubernetes cluster started by local_up."""
1420
- executor.schedule_request(
1549
+ await executor.schedule_request_async(
1421
1550
  request_id=request.state.request_id,
1422
- request_name='local_down',
1423
- request_body=payloads.RequestBody(),
1551
+ request_name=request_names.RequestName.LOCAL_DOWN,
1552
+ request_body=local_down_body,
1424
1553
  func=core.local_down,
1425
1554
  schedule_type=requests_lib.ScheduleType.LONG,
1426
1555
  )
1427
1556
 
1428
1557
 
1558
+ async def get_expanded_request_id(request_id: str) -> str:
1559
+ """Gets the expanded request ID for a given request ID prefix."""
1560
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
1561
+ request_id, fields=['request_id'])
1562
+ if request_tasks is None:
1563
+ raise fastapi.HTTPException(status_code=404,
1564
+ detail=f'Request {request_id!r} not found')
1565
+ if len(request_tasks) > 1:
1566
+ raise fastapi.HTTPException(status_code=400,
1567
+ detail=('Multiple requests found for '
1568
+ f'request ID prefix: {request_id}'))
1569
+ return request_tasks[0].request_id
1570
+
1571
+
1429
1572
  # === API server related APIs ===
1430
- @app.get('/api/get')
1573
+ @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
1431
1574
  async def api_get(request_id: str) -> payloads.RequestPayload:
1432
1575
  """Gets a request with a given request ID prefix."""
1576
+ # Validate request_id prefix matches a single request.
1577
+ request_id = await get_expanded_request_id(request_id)
1578
+
1433
1579
  while True:
1434
1580
  req_status = await requests_lib.get_request_status_async(request_id)
1435
1581
  if req_status is None:
@@ -1446,6 +1592,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1446
1592
  # to avoid storming the DB and CPU in the meantime
1447
1593
  await asyncio.sleep(0.1)
1448
1594
  request_task = await requests_lib.get_request_async(request_id)
1595
+ # TODO(aylei): refine this, /api/get will not be retried and this is
1596
+ # meaningless to retry. It is the original request that should be retried.
1449
1597
  if request_task.should_retry:
1450
1598
  raise fastapi.HTTPException(
1451
1599
  status_code=503, detail=f'Request {request_id!r} should be retried')
@@ -1487,13 +1635,18 @@ async def stream(
1487
1635
  clients, console for CLI/API clients), 'plain' (force plain text),
1488
1636
  'html' (force HTML), or 'console' (force console)
1489
1637
  """
1638
+ # We need to save the user-supplied request ID for the response header.
1639
+ user_supplied_request_id = request_id
1490
1640
  if request_id is not None and log_path is not None:
1491
1641
  raise fastapi.HTTPException(
1492
1642
  status_code=400,
1493
1643
  detail='Only one of request_id and log_path can be provided')
1494
1644
 
1645
+ if request_id is not None:
1646
+ request_id = await get_expanded_request_id(request_id)
1647
+
1495
1648
  if request_id is None and log_path is None:
1496
- request_id = requests_lib.get_latest_request_id()
1649
+ request_id = await requests_lib.get_latest_request_id_async()
1497
1650
  if request_id is None:
1498
1651
  raise fastapi.HTTPException(status_code=404,
1499
1652
  detail='No request found')
@@ -1520,13 +1673,17 @@ async def stream(
1520
1673
  'X-Accel-Buffering': 'no'
1521
1674
  })
1522
1675
 
1676
+ polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
1523
1677
  # Original plain text streaming logic
1524
1678
  if request_id is not None:
1525
- request_task = await requests_lib.get_request_async(request_id)
1679
+ request_task = await requests_lib.get_request_async(
1680
+ request_id, fields=['request_id', 'schedule_type'])
1526
1681
  if request_task is None:
1527
1682
  print(f'No task with request ID {request_id}')
1528
1683
  raise fastapi.HTTPException(
1529
1684
  status_code=404, detail=f'Request {request_id!r} not found')
1685
+ # req.log_path is derived from request_id,
1686
+ # so it's ok to just grab the request_id in the above query.
1530
1687
  log_path_to_stream = request_task.log_path
1531
1688
  if not log_path_to_stream.exists():
1532
1689
  # The log file might be deleted by the request GC daemon but the
@@ -1534,6 +1691,9 @@ async def stream(
1534
1691
  raise fastapi.HTTPException(
1535
1692
  status_code=404,
1536
1693
  detail=f'Log of request {request_id!r} has been deleted')
1694
+ if request_task.schedule_type == requests_lib.ScheduleType.LONG:
1695
+ polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
1696
+ del request_task
1537
1697
  else:
1538
1698
  assert log_path is not None, (request_id, log_path)
1539
1699
  if log_path == constants.API_SERVER_LOGS:
@@ -1567,18 +1727,26 @@ async def stream(
1567
1727
  detail=f'Log path {log_path!r} does not exist')
1568
1728
 
1569
1729
  log_path_to_stream = resolved_log_path
1730
+
1731
+ headers = {
1732
+ 'Cache-Control': 'no-cache, no-transform',
1733
+ 'X-Accel-Buffering': 'no',
1734
+ 'Transfer-Encoding': 'chunked'
1735
+ }
1736
+ if request_id is not None:
1737
+ headers[server_constants.STREAM_REQUEST_HEADER] = (
1738
+ user_supplied_request_id
1739
+ if user_supplied_request_id else request_id)
1740
+
1570
1741
  return fastapi.responses.StreamingResponse(
1571
1742
  content=stream_utils.log_streamer(request_id,
1572
1743
  log_path_to_stream,
1573
1744
  plain_logs=format == 'plain',
1574
1745
  tail=tail,
1575
- follow=follow),
1746
+ follow=follow,
1747
+ polling_interval=polling_interval),
1576
1748
  media_type='text/plain',
1577
- headers={
1578
- 'Cache-Control': 'no-cache, no-transform',
1579
- 'X-Accel-Buffering': 'no',
1580
- 'Transfer-Encoding': 'chunked'
1581
- },
1749
+ headers=headers,
1582
1750
  )
1583
1751
 
1584
1752
 
@@ -1586,11 +1754,11 @@ async def stream(
1586
1754
  async def api_cancel(request: fastapi.Request,
1587
1755
  request_cancel_body: payloads.RequestCancelBody) -> None:
1588
1756
  """Cancels requests."""
1589
- executor.schedule_request(
1757
+ await executor.schedule_request_async(
1590
1758
  request_id=request.state.request_id,
1591
- request_name='api_cancel',
1759
+ request_name=request_names.RequestName.API_CANCEL,
1592
1760
  request_body=request_cancel_body,
1593
- func=requests_lib.kill_requests,
1761
+ func=requests_lib.kill_requests_with_prefix,
1594
1762
  schedule_type=requests_lib.ScheduleType.SHORT,
1595
1763
  )
1596
1764
 
@@ -1598,9 +1766,13 @@ async def api_cancel(request: fastapi.Request,
1598
1766
  @app.get('/api/status')
1599
1767
  async def api_status(
1600
1768
  request_ids: Optional[List[str]] = fastapi.Query(
1601
- None, description='Request IDs to get status for.'),
1769
+ None, description='Request ID prefixes to get status for.'),
1602
1770
  all_status: bool = fastapi.Query(
1603
1771
  False, description='Get finished requests as well.'),
1772
+ limit: Optional[int] = fastapi.Query(
1773
+ None, description='Number of requests to show.'),
1774
+ fields: Optional[List[str]] = fastapi.Query(
1775
+ None, description='Fields to get. If None, get all fields.'),
1604
1776
  ) -> List[payloads.RequestPayload]:
1605
1777
  """Gets the list of requests."""
1606
1778
  if request_ids is None:
@@ -1611,18 +1783,34 @@ async def api_status(
1611
1783
  requests_lib.RequestStatus.RUNNING,
1612
1784
  ]
1613
1785
  request_tasks = await requests_lib.get_request_tasks_async(
1614
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
1615
- return [r.readable_encode() for r in request_tasks]
1786
+ req_filter=requests_lib.RequestTaskFilter(
1787
+ status=statuses,
1788
+ limit=limit,
1789
+ fields=fields,
1790
+ sort=True,
1791
+ ))
1792
+ return requests_lib.encode_requests(request_tasks)
1616
1793
  else:
1617
1794
  encoded_request_tasks = []
1618
1795
  for request_id in request_ids:
1619
- request_task = await requests_lib.get_request_async(request_id)
1620
- if request_task is None:
1796
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
1797
+ request_id)
1798
+ if request_tasks is None:
1621
1799
  continue
1622
- encoded_request_tasks.append(request_task.readable_encode())
1800
+ for request_task in request_tasks:
1801
+ encoded_request_tasks.append(request_task.readable_encode())
1623
1802
  return encoded_request_tasks
1624
1803
 
1625
1804
 
1805
+ @app.get('/api/plugins', response_class=fastapi_responses.ORJSONResponse)
1806
+ async def list_plugins() -> Dict[str, List[Dict[str, Any]]]:
1807
+ """Return metadata about loaded backend plugins."""
1808
+ plugin_info = [{
1809
+ 'js_extension_path': plugin.js_extension_path,
1810
+ } for plugin in plugins.get_plugins()]
1811
+ return {'plugins': plugin_info}
1812
+
1813
+
1626
1814
  @app.get(
1627
1815
  '/api/health',
1628
1816
  # response_model_exclude_unset omits unset fields
@@ -1679,23 +1867,44 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
1679
1867
  version=sky.__version__,
1680
1868
  version_on_disk=common.get_skypilot_version_on_disk(),
1681
1869
  commit=sky.__commit__,
1870
+ # Whether basic auth on api server is enabled
1682
1871
  basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
1683
1872
  'false').lower() == 'true',
1684
1873
  user=user if user is not None else None,
1874
+ # Whether service account token is enabled
1875
+ service_account_token_enabled=(os.environ.get(
1876
+ constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
1877
+ 'false').lower() == 'true'),
1878
+ # Whether basic auth on ingress is enabled
1879
+ ingress_basic_auth_enabled=os.environ.get(
1880
+ constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
1881
+ 'false').lower() == 'true',
1685
1882
  )
1686
1883
 
1687
1884
 
1885
+ class KubernetesSSHMessageType(IntEnum):
1886
+ REGULAR_DATA = 0
1887
+ PINGPONG = 1
1888
+ LATENCY_MEASUREMENT = 2
1889
+
1890
+
1688
1891
  @app.websocket('/kubernetes-pod-ssh-proxy')
1689
- async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1690
- cluster_name: str) -> None:
1892
+ async def kubernetes_pod_ssh_proxy(
1893
+ websocket: fastapi.WebSocket,
1894
+ cluster_name: str,
1895
+ client_version: Optional[int] = None) -> None:
1691
1896
  """Proxies SSH to the Kubernetes pod with websocket."""
1692
1897
  await websocket.accept()
1693
1898
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
1694
1899
 
1900
+ timestamps_supported = client_version is not None and client_version > 21
1901
+ logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
1902
+ client_version = {client_version}')
1903
+
1695
1904
  # Run core.status in another thread to avoid blocking the event loop.
1696
- cluster_records = await context_utils.to_thread(core.status,
1697
- cluster_name,
1698
- all_users=True)
1905
+ with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
1906
+ cluster_records = await context_utils.to_thread_with_executor(
1907
+ thread_pool_executor, core.status, cluster_name, all_users=True)
1699
1908
  cluster_record = cluster_records[0]
1700
1909
  if cluster_record['status'] != status_lib.ClusterStatus.UP:
1701
1910
  raise fastapi.HTTPException(
@@ -1734,17 +1943,70 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1734
1943
  return
1735
1944
 
1736
1945
  logger.info(f'Starting port-forward to local port: {local_port}')
1946
+ conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
1947
+ pid=os.getpid())
1948
+ ssh_failed = False
1949
+ websocket_closed = False
1737
1950
  try:
1951
+ conn_gauge.inc()
1738
1952
  # Connect to the local port
1739
1953
  reader, writer = await asyncio.open_connection('127.0.0.1', local_port)
1740
1954
 
1741
1955
  async def websocket_to_ssh():
1742
1956
  try:
1743
1957
  async for message in websocket.iter_bytes():
1958
+ if timestamps_supported:
1959
+ type_size = struct.calcsize('!B')
1960
+ message_type = struct.unpack('!B',
1961
+ message[:type_size])[0]
1962
+ if (message_type ==
1963
+ KubernetesSSHMessageType.REGULAR_DATA):
1964
+ # Regular data - strip type byte and forward to SSH
1965
+ message = message[type_size:]
1966
+ elif message_type == KubernetesSSHMessageType.PINGPONG:
1967
+ # PING message - respond with PONG (type 1)
1968
+ ping_id_size = struct.calcsize('!I')
1969
+ if len(message) != type_size + ping_id_size:
1970
+ raise ValueError('Invalid PING message '
1971
+ f'length: {len(message)}')
1972
+ # Return the same PING message, so that the client
1973
+ # can measure the latency.
1974
+ await websocket.send_bytes(message)
1975
+ continue
1976
+ elif (message_type ==
1977
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT):
1978
+ # Latency measurement from client
1979
+ latency_size = struct.calcsize('!Q')
1980
+ if len(message) != type_size + latency_size:
1981
+ raise ValueError(
1982
+ 'Invalid latency measurement '
1983
+ f'message length: {len(message)}')
1984
+ avg_latency_ms = struct.unpack(
1985
+ '!Q',
1986
+ message[type_size:type_size + latency_size])[0]
1987
+ latency_seconds = avg_latency_ms / 1000
1988
+ metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
1989
+ continue
1990
+ else:
1991
+ # Unknown message type.
1992
+ raise ValueError(
1993
+ f'Unknown message type: {message_type}')
1744
1994
  writer.write(message)
1745
- await writer.drain()
1995
+ try:
1996
+ await writer.drain()
1997
+ except Exception as e: # pylint: disable=broad-except
1998
+ # Typically we will not reach here, if the ssh to pod
1999
+ # is disconnected, ssh_to_websocket will exit first.
2000
+ # But just in case.
2001
+ logger.error('Failed to write to pod through '
2002
+ f'port-forward connection: {e}')
2003
+ nonlocal ssh_failed
2004
+ ssh_failed = True
2005
+ break
1746
2006
  except fastapi.WebSocketDisconnect:
1747
2007
  pass
2008
+ nonlocal websocket_closed
2009
+ websocket_closed = True
1748
2010
  writer.close()
1749
2011
 
1750
2012
  async def ssh_to_websocket():
@@ -1752,62 +2014,65 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1752
2014
  while True:
1753
2015
  data = await reader.read(1024)
1754
2016
  if not data:
2017
+ if not websocket_closed:
2018
+ logger.warning('SSH connection to pod is '
2019
+ 'disconnected before websocket '
2020
+ 'connection is closed')
2021
+ nonlocal ssh_failed
2022
+ ssh_failed = True
1755
2023
  break
2024
+ if timestamps_supported:
2025
+ # Prepend message type byte (0 = regular data)
2026
+ message_type_bytes = struct.pack(
2027
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
2028
+ data = message_type_bytes + data
1756
2029
  await websocket.send_bytes(data)
1757
2030
  except Exception: # pylint: disable=broad-except
1758
2031
  pass
1759
- await websocket.close()
2032
+ try:
2033
+ await websocket.close()
2034
+ except Exception: # pylint: disable=broad-except
2035
+ # The websocket might has been closed by the client.
2036
+ pass
1760
2037
 
1761
2038
  await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
1762
2039
  finally:
1763
- proc.terminate()
2040
+ conn_gauge.dec()
2041
+ reason = ''
2042
+ try:
2043
+ logger.info('Terminating kubectl port-forward process')
2044
+ proc.terminate()
2045
+ except ProcessLookupError:
2046
+ stdout = await proc.stdout.read()
2047
+ logger.error('kubectl port-forward was terminated before the '
2048
+ 'ssh websocket connection was closed. Remaining '
2049
+ f'output: {str(stdout)}')
2050
+ reason = 'KubectlPortForwardExit'
2051
+ metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
2052
+ pid=os.getpid(), reason='KubectlPortForwardExit').inc()
2053
+ else:
2054
+ if ssh_failed:
2055
+ reason = 'SSHToPodDisconnected'
2056
+ else:
2057
+ reason = 'ClientClosed'
2058
+ metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
2059
+ pid=os.getpid(), reason=reason).inc()
1764
2060
 
1765
2061
 
1766
2062
  @app.get('/all_contexts')
1767
2063
  async def all_contexts(request: fastapi.Request) -> None:
1768
2064
  """Gets all Kubernetes and SSH node pool contexts."""
1769
2065
 
1770
- executor.schedule_request(
2066
+ await executor.schedule_request_async(
1771
2067
  request_id=request.state.request_id,
1772
- request_name='all_contexts',
1773
- request_body=payloads.RequestBody(),
2068
+ request_name=request_names.RequestName.ALL_CONTEXTS,
2069
+ request_body=server_utils.build_body_at_server(
2070
+ request=request, body_type=payloads.RequestBody),
1774
2071
  func=core.get_all_contexts,
1775
2072
  schedule_type=requests_lib.ScheduleType.SHORT,
1776
2073
  )
1777
2074
 
1778
2075
 
1779
- @app.get('/gpu-metrics')
1780
- async def gpu_metrics() -> fastapi.Response:
1781
- """Gets the GPU metrics from multiple external k8s clusters"""
1782
- contexts = core.get_all_contexts()
1783
- all_metrics = []
1784
- successful_contexts = 0
1785
-
1786
- tasks = [
1787
- asyncio.create_task(metrics_utils.get_metrics_for_context(context))
1788
- for context in contexts
1789
- if context != 'in-cluster'
1790
- ]
1791
-
1792
- results = await asyncio.gather(*tasks, return_exceptions=True)
1793
-
1794
- for i, result in enumerate(results):
1795
- if isinstance(result, Exception):
1796
- logger.error(
1797
- f'Failed to get metrics for context {contexts[i]}: {result}')
1798
- else:
1799
- metrics_text = result
1800
- all_metrics.append(metrics_text)
1801
- successful_contexts += 1
1802
-
1803
- combined_metrics = '\n\n'.join(all_metrics)
1804
-
1805
- # Return as plain text for Prometheus compatibility
1806
- return fastapi.Response(
1807
- content=combined_metrics,
1808
- media_type='text/plain; version=0.0.4; charset=utf-8')
1809
-
1810
-
1811
2076
  # === Internal APIs ===
1812
2077
  @app.get('/api/completion/cluster_name')
1813
2078
  async def complete_cluster_name(incomplete: str,) -> List[str]:
@@ -1852,6 +2117,14 @@ async def serve_dashboard(full_path: str):
1852
2117
  if os.path.isfile(file_path):
1853
2118
  return fastapi.responses.FileResponse(file_path)
1854
2119
 
2120
+ # Serve plugin catch-all page for any /plugins/* paths so client-side
2121
+ # routing can bootstrap correctly.
2122
+ if full_path == 'plugins' or full_path.startswith('plugins/'):
2123
+ plugin_catchall = os.path.join(server_constants.DASHBOARD_DIR,
2124
+ 'plugins', '[...slug].html')
2125
+ if os.path.isfile(plugin_catchall):
2126
+ return fastapi.responses.FileResponse(plugin_catchall)
2127
+
1855
2128
  # Serve index.html for client-side routing
1856
2129
  # e.g. /clusters, /jobs
1857
2130
  index_path = os.path.join(server_constants.DASHBOARD_DIR, 'index.html')
@@ -1905,6 +2178,7 @@ if __name__ == '__main__':
1905
2178
 
1906
2179
  from sky.server import uvicorn as skyuvicorn
1907
2180
 
2181
+ logger.info('Initializing SkyPilot API server')
1908
2182
  skyuvicorn.add_timestamp_prefix_for_server_logs()
1909
2183
 
1910
2184
  parser = argparse.ArgumentParser()
@@ -1916,22 +2190,63 @@ if __name__ == '__main__':
1916
2190
  parser.add_argument('--metrics-port', default=9090, type=int)
1917
2191
  cmd_args = parser.parse_args()
1918
2192
  if cmd_args.port == cmd_args.metrics_port:
2193
+ logger.error('port and metrics-port cannot be the same, exiting.')
1919
2194
  raise ValueError('port and metrics-port cannot be the same')
1920
2195
 
2196
+ # Fail fast if the port is not available to avoid corrupt the state
2197
+ # of potential running server instance.
2198
+ # We might reach here because the running server is currently not
2199
+ # responding, thus the healthz check fails and `sky api start` think
2200
+ # we should start a new server instance.
2201
+ if not common_utils.is_port_available(cmd_args.port):
2202
+ logger.error(f'Port {cmd_args.port} is not available, exiting.')
2203
+ raise RuntimeError(f'Port {cmd_args.port} is not available')
2204
+
2205
+ # Maybe touch the signal file on API server startup. Do it again here even
2206
+ # if we already touched it in the sky/server/common.py::_start_api_server.
2207
+ # This is because the sky/server/common.py::_start_api_server function call
2208
+ # is running outside the skypilot API server process tree. The process tree
2209
+ # starts within that function (see the `subprocess.Popen` call in
2210
+ # sky/server/common.py::_start_api_server). When pg is used, the
2211
+ # _start_api_server function will not load the config file from db, which
2212
+ # will ignore the consolidation mode config. Here, inside the process tree,
2213
+ # we already reload the config as a server (with env var _start_api_server),
2214
+ # so we will respect the consolidation mode config.
2215
+ # Refers to #7717 for more details.
2216
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)
2217
+
1921
2218
  # Show the privacy policy if it is not already shown. We place it here so
1922
2219
  # that it is shown only when the API server is started.
1923
2220
  usage_lib.maybe_show_privacy_policy()
1924
2221
 
1925
2222
  # Initialize global user state db
1926
2223
  db_utils.set_max_connections(1)
2224
+ logger.info('Initializing database engine')
1927
2225
  global_user_state.initialize_and_get_db()
2226
+ logger.info('Database engine initialized')
1928
2227
  # Initialize request db
1929
2228
  requests_lib.reset_db_and_logs()
1930
2229
  # Restore the server user hash
2230
+ logger.info('Initializing server user hash')
1931
2231
  _init_or_restore_server_user_hash()
2232
+
1932
2233
  max_db_connections = global_user_state.get_max_db_connections()
1933
- config = server_config.compute_server_config(cmd_args.deploy,
1934
- max_db_connections)
2234
+ logger.info(f'Max db connections: {max_db_connections}')
2235
+
2236
+ # Reserve memory for jobs and serve/pool controller in consolidation mode.
2237
+ reserved_memory_mb = (
2238
+ controller_utils.compute_memory_reserved_for_controllers(
2239
+ reserve_for_controllers=os.environ.get(
2240
+ constants.OVERRIDE_CONSOLIDATION_MODE) is not None,
2241
+ # For jobs controller, we need to reserve for both jobs and
2242
+ # pool controller.
2243
+ reserve_extra_for_pool=not os.environ.get(
2244
+ constants.IS_SKYPILOT_SERVE_CONTROLLER)))
2245
+
2246
+ config = server_config.compute_server_config(
2247
+ cmd_args.deploy,
2248
+ max_db_connections,
2249
+ reserved_memory_mb=reserved_memory_mb)
1935
2250
 
1936
2251
  num_workers = config.num_server_workers
1937
2252
 
@@ -1960,7 +2275,8 @@ if __name__ == '__main__':
1960
2275
  uvicorn_config = uvicorn.Config('sky.server.server:app',
1961
2276
  host=cmd_args.host,
1962
2277
  port=cmd_args.port,
1963
- workers=num_workers)
2278
+ workers=num_workers,
2279
+ ws_per_message_deflate=False)
1964
2280
  skyuvicorn.run(uvicorn_config,
1965
2281
  max_db_connections=config.num_db_connections_per_worker)
1966
2282
  except Exception as exc: # pylint: disable=broad-except
@@ -1972,6 +2288,8 @@ if __name__ == '__main__':
1972
2288
 
1973
2289
  for gt in global_tasks:
1974
2290
  gt.cancel()
2291
+ for plugin in plugins.get_plugins():
2292
+ plugin.shutdown()
1975
2293
  subprocess_utils.run_in_parallel(lambda worker: worker.cancel(),
1976
2294
  workers,
1977
2295
  num_threads=len(workers))