skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/server.py CHANGED
@@ -3,8 +3,10 @@
3
3
  import argparse
4
4
  import asyncio
5
5
  import base64
6
+ from concurrent.futures import ThreadPoolExecutor
6
7
  import contextlib
7
8
  import datetime
9
+ from enum import IntEnum
8
10
  import hashlib
9
11
  import json
10
12
  import multiprocessing
@@ -14,8 +16,10 @@ import posixpath
14
16
  import re
15
17
  import resource
16
18
  import shutil
19
+ import struct
17
20
  import sys
18
21
  import threading
22
+ import traceback
19
23
  from typing import Dict, List, Literal, Optional, Set, Tuple
20
24
  import uuid
21
25
  import zipfile
@@ -23,6 +27,7 @@ import zipfile
23
27
  import aiofiles
24
28
  import anyio
25
29
  import fastapi
30
+ from fastapi import responses as fastapi_responses
26
31
  from fastapi.middleware import cors
27
32
  import starlette.middleware.base
28
33
  import uvloop
@@ -38,8 +43,10 @@ from sky import global_user_state
38
43
  from sky import models
39
44
  from sky import sky_logging
40
45
  from sky.data import storage_utils
46
+ from sky.jobs import utils as managed_job_utils
41
47
  from sky.jobs.server import server as jobs_rest
42
48
  from sky.metrics import utils as metrics_utils
49
+ from sky.provision import metadata_utils
43
50
  from sky.provision.kubernetes import utils as kubernetes_utils
44
51
  from sky.schemas.api import responses
45
52
  from sky.serve.server import server as serve_rest
@@ -48,14 +55,17 @@ from sky.server import config as server_config
48
55
  from sky.server import constants as server_constants
49
56
  from sky.server import daemons
50
57
  from sky.server import metrics
58
+ from sky.server import middleware_utils
51
59
  from sky.server import state
52
60
  from sky.server import stream_utils
53
61
  from sky.server import versions
54
62
  from sky.server.auth import authn
63
+ from sky.server.auth import loopback
55
64
  from sky.server.auth import oauth2_proxy
56
65
  from sky.server.requests import executor
57
66
  from sky.server.requests import payloads
58
67
  from sky.server.requests import preconditions
68
+ from sky.server.requests import request_names
59
69
  from sky.server.requests import requests as requests_lib
60
70
  from sky.skylet import constants
61
71
  from sky.ssh_node_pools import server as ssh_node_pools_rest
@@ -67,10 +77,13 @@ from sky.utils import common as common_lib
67
77
  from sky.utils import common_utils
68
78
  from sky.utils import context
69
79
  from sky.utils import context_utils
80
+ from sky.utils import controller_utils
70
81
  from sky.utils import dag_utils
82
+ from sky.utils import env_options
71
83
  from sky.utils import perf_utils
72
84
  from sky.utils import status_lib
73
85
  from sky.utils import subprocess_utils
86
+ from sky.utils import ux_utils
74
87
  from sky.utils.db import db_utils
75
88
  from sky.volumes.server import server as volumes_rest
76
89
  from sky.workspaces import server as workspaces_rest
@@ -128,6 +141,7 @@ def _try_set_basic_auth_user(request: fastapi.Request):
128
141
  break
129
142
 
130
143
 
144
+ @middleware_utils.websocket_aware
131
145
  class RBACMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
132
146
  """Middleware to handle RBAC."""
133
147
 
@@ -157,11 +171,9 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
157
171
  """Middleware to add a request ID to each request."""
158
172
 
159
173
  async def dispatch(self, request: fastapi.Request, call_next):
160
- request_id = str(uuid.uuid4())
174
+ request_id = requests_lib.get_new_request_id()
161
175
  request.state.request_id = request_id
162
176
  response = await call_next(request)
163
- # TODO(syang): remove X-Request-ID when v0.10.0 is released.
164
- response.headers['X-Request-ID'] = request_id
165
177
  response.headers['X-Skypilot-Request-ID'] = request_id
166
178
  return response
167
179
 
@@ -177,6 +189,7 @@ def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
177
189
  return models.User(id=user_hash, name=user_name)
178
190
 
179
191
 
192
+ @middleware_utils.websocket_aware
180
193
  class InitializeRequestAuthUserMiddleware(
181
194
  starlette.middleware.base.BaseHTTPMiddleware):
182
195
 
@@ -187,10 +200,15 @@ class InitializeRequestAuthUserMiddleware(
187
200
  return await call_next(request)
188
201
 
189
202
 
203
+ @middleware_utils.websocket_aware
190
204
  class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
191
205
  """Middleware to handle HTTP Basic Auth."""
192
206
 
193
207
  async def dispatch(self, request: fastapi.Request, call_next):
208
+ if managed_job_utils.is_consolidation_mode(
209
+ ) and loopback.is_loopback_request(request):
210
+ return await call_next(request)
211
+
194
212
  if request.url.path.startswith('/api/health'):
195
213
  # Try to set the auth user from basic auth
196
214
  _try_set_basic_auth_user(request)
@@ -234,6 +252,7 @@ class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
234
252
  return await call_next(request)
235
253
 
236
254
 
255
+ @middleware_utils.websocket_aware
237
256
  class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
238
257
  """Middleware to handle Bearer Token Auth (Service Accounts)."""
239
258
 
@@ -361,6 +380,7 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
361
380
  return await call_next(request)
362
381
 
363
382
 
383
+ @middleware_utils.websocket_aware
364
384
  class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
365
385
  """Middleware to handle auth proxy."""
366
386
 
@@ -437,7 +457,7 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
437
457
  if lag_threshold is not None and lag > lag_threshold:
438
458
  logger.warning(f'Event loop lag {lag} seconds exceeds threshold '
439
459
  f'{lag_threshold} seconds.')
440
- metrics.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
460
+ metrics_utils.SKY_APISERVER_EVENT_LOOP_LAG_SECONDS.labels(
441
461
  pid=pid).observe(lag)
442
462
  target = now + interval
443
463
  loop.call_at(target, tick)
@@ -445,6 +465,22 @@ async def loop_lag_monitor(loop: asyncio.AbstractEventLoop,
445
465
  loop.call_at(target, tick)
446
466
 
447
467
 
468
+ async def schedule_on_boot_check_async():
469
+ try:
470
+ await executor.schedule_request_async(
471
+ request_id='skypilot-server-on-boot-check',
472
+ request_name=request_names.RequestName.CHECK,
473
+ request_body=payloads.CheckBody(),
474
+ func=sky_check.check,
475
+ schedule_type=requests_lib.ScheduleType.SHORT,
476
+ is_skypilot_system=True,
477
+ )
478
+ except exceptions.RequestAlreadyExistsError:
479
+ # Lifespan will be executed in each uvicorn worker process, we
480
+ # can safely ignore the error if the task is already scheduled.
481
+ logger.debug('Request skypilot-server-on-boot-check already exists.')
482
+
483
+
448
484
  @contextlib.asynccontextmanager
449
485
  async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-name
450
486
  """FastAPI lifespan context manager."""
@@ -454,7 +490,7 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
454
490
  if event.should_skip():
455
491
  continue
456
492
  try:
457
- executor.schedule_request(
493
+ await executor.schedule_request_async(
458
494
  request_id=event.id,
459
495
  request_name=event.name,
460
496
  request_body=payloads.RequestBody(),
@@ -469,8 +505,9 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
469
505
  # Lifespan will be executed in each uvicorn worker process, we
470
506
  # can safely ignore the error if the task is already scheduled.
471
507
  logger.debug(f'Request {event.id} already exists.')
508
+ await schedule_on_boot_check_async()
472
509
  asyncio.create_task(cleanup_upload_ids())
473
- if metrics.METRICS_ENABLED:
510
+ if metrics_utils.METRICS_ENABLED:
474
511
  # Start monitoring the event loop lag in each server worker
475
512
  # event loop (process).
476
513
  asyncio.create_task(loop_lag_monitor(asyncio.get_event_loop()))
@@ -518,6 +555,7 @@ class PathCleanMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
518
555
  return await call_next(request)
519
556
 
520
557
 
558
+ @middleware_utils.websocket_aware
521
559
  class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
522
560
  """Middleware to control requests when server is shutting down."""
523
561
 
@@ -537,6 +575,7 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
537
575
  return await call_next(request)
538
576
 
539
577
 
578
+ @middleware_utils.websocket_aware
540
579
  class APIVersionMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
541
580
  """Middleware to add API version to the request."""
542
581
 
@@ -579,6 +618,9 @@ app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
579
618
  if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
580
619
  app.add_middleware(metrics.PrometheusMiddleware)
581
620
  app.add_middleware(APIVersionMiddleware)
621
+ # The order of all the authentication-related middleware is important.
622
+ # RBACMiddleware must precede all the auth middleware, so it can access
623
+ # request.state.auth_user.
582
624
  app.add_middleware(RBACMiddleware)
583
625
  app.add_middleware(InternalDashboardPrefixMiddleware)
584
626
  app.add_middleware(GracefulShutdownMiddleware)
@@ -592,12 +634,7 @@ app.add_middleware(
592
634
  allow_credentials=True,
593
635
  allow_methods=['*'],
594
636
  allow_headers=['*'],
595
- # TODO(syang): remove X-Request-ID \when v0.10.0 is released.
596
- expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
597
- # The order of all the authentication-related middleware is important.
598
- # RBACMiddleware must precede all the auth middleware, so it can access
599
- # request.state.auth_user.
600
- app.add_middleware(RBACMiddleware)
637
+ expose_headers=['X-Skypilot-Request-ID'])
601
638
  # Authentication based on oauth2-proxy.
602
639
  app.add_middleware(oauth2_proxy.OAuth2ProxyMiddleware)
603
640
  # AuthProxyMiddleware should precede BasicAuthMiddleware and
@@ -625,16 +662,28 @@ app.include_router(volumes_rest.router, prefix='/volumes', tags=['volumes'])
625
662
  app.include_router(ssh_node_pools_rest.router,
626
663
  prefix='/ssh_node_pools',
627
664
  tags=['ssh_node_pools'])
628
-
629
- # Increase the limit of files we can open to our hard limit. This fixes bugs
630
- # where we can not acquire file locks or open enough logs and the API server
631
- # crashes. On Mac, the hard limit is 9,223,372,036,854,775,807.
632
- # TODO(luca) figure out what to do if we need to open more than 2^63 files.
633
- try:
634
- soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
635
- resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
636
- except Exception: # pylint: disable=broad-except
637
- pass # no issue, we will warn the user later if it's too low
665
+ # increase the resource limit for the server
666
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
667
+ resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
668
+
669
+
670
+ @app.exception_handler(exceptions.ConcurrentWorkerExhaustedError)
671
+ def handle_concurrent_worker_exhausted_error(
672
+ request: fastapi.Request, e: exceptions.ConcurrentWorkerExhaustedError):
673
+ del request # request is not used
674
+ # Print detailed error message to server log
675
+ logger.error('Concurrent worker exhausted: '
676
+ f'{common_utils.format_exception(e)}')
677
+ with ux_utils.enable_traceback():
678
+ logger.error(f' Traceback: {traceback.format_exc()}')
679
+ # Return human readable error message to client
680
+ return fastapi.responses.JSONResponse(
681
+ status_code=503,
682
+ content={
683
+ 'detail':
684
+ ('The server has exhausted its concurrent worker limit. '
685
+ 'Please try again or scale the server if the load persists.')
686
+ })
638
687
 
639
688
 
640
689
  @app.get('/token')
@@ -680,9 +729,9 @@ async def token(request: fastapi.Request,
680
729
  async def check(request: fastapi.Request,
681
730
  check_body: payloads.CheckBody) -> None:
682
731
  """Checks enabled clouds."""
683
- executor.schedule_request(
732
+ await executor.schedule_request_async(
684
733
  request_id=request.state.request_id,
685
- request_name='check',
734
+ request_name=request_names.RequestName.CHECK,
686
735
  request_body=check_body,
687
736
  func=sky_check.check,
688
737
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -694,9 +743,9 @@ async def enabled_clouds(request: fastapi.Request,
694
743
  workspace: Optional[str] = None,
695
744
  expand: bool = False) -> None:
696
745
  """Gets enabled clouds on the server."""
697
- executor.schedule_request(
746
+ await executor.schedule_request_async(
698
747
  request_id=request.state.request_id,
699
- request_name='enabled_clouds',
748
+ request_name=request_names.RequestName.ENABLED_CLOUDS,
700
749
  request_body=payloads.EnabledCloudsBody(workspace=workspace,
701
750
  expand=expand),
702
751
  func=core.enabled_clouds,
@@ -710,9 +759,10 @@ async def realtime_kubernetes_gpu_availability(
710
759
  realtime_gpu_availability_body: payloads.RealtimeGpuAvailabilityRequestBody
711
760
  ) -> None:
712
761
  """Gets real-time Kubernetes GPU availability."""
713
- executor.schedule_request(
762
+ await executor.schedule_request_async(
714
763
  request_id=request.state.request_id,
715
- request_name='realtime_kubernetes_gpu_availability',
764
+ request_name=request_names.RequestName.
765
+ REALTIME_KUBERNETES_GPU_AVAILABILITY,
716
766
  request_body=realtime_gpu_availability_body,
717
767
  func=core.realtime_kubernetes_gpu_availability,
718
768
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -725,9 +775,9 @@ async def kubernetes_node_info(
725
775
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
726
776
  ) -> None:
727
777
  """Gets Kubernetes nodes information and hints."""
728
- executor.schedule_request(
778
+ await executor.schedule_request_async(
729
779
  request_id=request.state.request_id,
730
- request_name='kubernetes_node_info',
780
+ request_name=request_names.RequestName.KUBERNETES_NODE_INFO,
731
781
  request_body=kubernetes_node_info_body,
732
782
  func=kubernetes_utils.get_kubernetes_node_info,
733
783
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -736,10 +786,11 @@ async def kubernetes_node_info(
736
786
 
737
787
  @app.get('/status_kubernetes')
738
788
  async def status_kubernetes(request: fastapi.Request) -> None:
739
- """Gets Kubernetes status."""
740
- executor.schedule_request(
789
+ """[Experimental] Get all SkyPilot resources (including from other
790
+ users) in the current Kubernetes context."""
791
+ await executor.schedule_request_async(
741
792
  request_id=request.state.request_id,
742
- request_name='status_kubernetes',
793
+ request_name=request_names.RequestName.STATUS_KUBERNETES,
743
794
  request_body=payloads.RequestBody(),
744
795
  func=core.status_kubernetes,
745
796
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -751,9 +802,9 @@ async def list_accelerators(
751
802
  request: fastapi.Request,
752
803
  list_accelerator_counts_body: payloads.ListAcceleratorsBody) -> None:
753
804
  """Gets list of accelerators from cloud catalog."""
754
- executor.schedule_request(
805
+ await executor.schedule_request_async(
755
806
  request_id=request.state.request_id,
756
- request_name='list_accelerators',
807
+ request_name=request_names.RequestName.LIST_ACCELERATORS,
757
808
  request_body=list_accelerator_counts_body,
758
809
  func=catalog.list_accelerators,
759
810
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -766,9 +817,9 @@ async def list_accelerator_counts(
766
817
  list_accelerator_counts_body: payloads.ListAcceleratorCountsBody
767
818
  ) -> None:
768
819
  """Gets list of accelerator counts from cloud catalog."""
769
- executor.schedule_request(
820
+ await executor.schedule_request_async(
770
821
  request_id=request.state.request_id,
771
- request_name='list_accelerator_counts',
822
+ request_name=request_names.RequestName.LIST_ACCELERATOR_COUNTS,
772
823
  request_body=list_accelerator_counts_body,
773
824
  func=catalog.list_accelerator_counts,
774
825
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -802,6 +853,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
802
853
  # server thread.
803
854
  with admin_policy_utils.apply_and_use_config_in_current_request(
804
855
  dag,
856
+ request_name=request_names.AdminPolicyRequestName.VALIDATE,
805
857
  request_options=validate_body.get_request_options()) as dag:
806
858
  dag.resolve_and_validate_volumes()
807
859
  # Skip validating workdir and file_mounts, as those need to be
@@ -815,6 +867,11 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
815
867
  # thread executor to avoid blocking the uvicorn event loop.
816
868
  await context_utils.to_thread(validate_dag, dag)
817
869
  except Exception as e: # pylint: disable=broad-except
870
+ # Print the exception to the API server log.
871
+ if env_options.Options.SHOW_DEBUG_INFO.get():
872
+ logger.info('/validate exception:', exc_info=True)
873
+ # Set the exception stacktrace for the serialized exception.
874
+ requests_lib.set_exception_stacktrace(e)
818
875
  raise fastapi.HTTPException(
819
876
  status_code=400, detail=exceptions.serialize_exception(e)) from e
820
877
 
@@ -823,9 +880,9 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
823
880
  async def optimize(optimize_body: payloads.OptimizeBody,
824
881
  request: fastapi.Request) -> None:
825
882
  """Optimizes the user's DAG."""
826
- executor.schedule_request(
883
+ await executor.schedule_request_async(
827
884
  request_id=request.state.request_id,
828
- request_name='optimize',
885
+ request_name=request_names.RequestName.OPTIMIZE,
829
886
  request_body=optimize_body,
830
887
  ignore_return_value=True,
831
888
  func=core.optimize,
@@ -1033,9 +1090,9 @@ async def launch(launch_body: payloads.LaunchBody,
1033
1090
  """Launches a cluster or task."""
1034
1091
  request_id = request.state.request_id
1035
1092
  logger.info(f'Launching request: {request_id}')
1036
- executor.schedule_request(
1093
+ await executor.schedule_request_async(
1037
1094
  request_id,
1038
- request_name='launch',
1095
+ request_name=request_names.RequestName.CLUSTER_LAUNCH,
1039
1096
  request_body=launch_body,
1040
1097
  func=execution.launch,
1041
1098
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1049,9 +1106,9 @@ async def launch(launch_body: payloads.LaunchBody,
1049
1106
  async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1050
1107
  """Executes a task on an existing cluster."""
1051
1108
  cluster_name = exec_body.cluster_name
1052
- executor.schedule_request(
1109
+ await executor.schedule_request_async(
1053
1110
  request_id=request.state.request_id,
1054
- request_name='exec',
1111
+ request_name=request_names.RequestName.CLUSTER_EXEC,
1055
1112
  request_body=exec_body,
1056
1113
  func=execution.exec,
1057
1114
  precondition=preconditions.ClusterStartCompletePrecondition(
@@ -1067,9 +1124,9 @@ async def exec(request: fastapi.Request, exec_body: payloads.ExecBody) -> None:
1067
1124
  async def stop(request: fastapi.Request,
1068
1125
  stop_body: payloads.StopOrDownBody) -> None:
1069
1126
  """Stops a cluster."""
1070
- executor.schedule_request(
1127
+ await executor.schedule_request_async(
1071
1128
  request_id=request.state.request_id,
1072
- request_name='stop',
1129
+ request_name=request_names.RequestName.CLUSTER_STOP,
1073
1130
  request_body=stop_body,
1074
1131
  func=core.stop,
1075
1132
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1087,9 +1144,9 @@ async def status(
1087
1144
  raise fastapi.HTTPException(
1088
1145
  status_code=503,
1089
1146
  detail='Server is shutting down, please try again later.')
1090
- executor.schedule_request(
1147
+ await executor.schedule_request_async(
1091
1148
  request_id=request.state.request_id,
1092
- request_name='status',
1149
+ request_name=request_names.RequestName.CLUSTER_STATUS,
1093
1150
  request_body=status_body,
1094
1151
  func=core.status,
1095
1152
  schedule_type=(requests_lib.ScheduleType.LONG if
@@ -1102,9 +1159,9 @@ async def status(
1102
1159
  async def endpoints(request: fastapi.Request,
1103
1160
  endpoint_body: payloads.EndpointsBody) -> None:
1104
1161
  """Gets the endpoint for a given cluster and port number (endpoint)."""
1105
- executor.schedule_request(
1162
+ await executor.schedule_request_async(
1106
1163
  request_id=request.state.request_id,
1107
- request_name='endpoints',
1164
+ request_name=request_names.RequestName.CLUSTER_ENDPOINTS,
1108
1165
  request_body=endpoint_body,
1109
1166
  func=core.endpoints,
1110
1167
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1116,9 +1173,9 @@ async def endpoints(request: fastapi.Request,
1116
1173
  async def down(request: fastapi.Request,
1117
1174
  down_body: payloads.StopOrDownBody) -> None:
1118
1175
  """Tears down a cluster."""
1119
- executor.schedule_request(
1176
+ await executor.schedule_request_async(
1120
1177
  request_id=request.state.request_id,
1121
- request_name='down',
1178
+ request_name=request_names.RequestName.CLUSTER_DOWN,
1122
1179
  request_body=down_body,
1123
1180
  func=core.down,
1124
1181
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1130,9 +1187,9 @@ async def down(request: fastapi.Request,
1130
1187
  async def start(request: fastapi.Request,
1131
1188
  start_body: payloads.StartBody) -> None:
1132
1189
  """Restarts a cluster."""
1133
- executor.schedule_request(
1190
+ await executor.schedule_request_async(
1134
1191
  request_id=request.state.request_id,
1135
- request_name='start',
1192
+ request_name=request_names.RequestName.CLUSTER_START,
1136
1193
  request_body=start_body,
1137
1194
  func=core.start,
1138
1195
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1144,9 +1201,9 @@ async def start(request: fastapi.Request,
1144
1201
  async def autostop(request: fastapi.Request,
1145
1202
  autostop_body: payloads.AutostopBody) -> None:
1146
1203
  """Schedules an autostop/autodown for a cluster."""
1147
- executor.schedule_request(
1204
+ await executor.schedule_request_async(
1148
1205
  request_id=request.state.request_id,
1149
- request_name='autostop',
1206
+ request_name=request_names.RequestName.CLUSTER_AUTOSTOP,
1150
1207
  request_body=autostop_body,
1151
1208
  func=core.autostop,
1152
1209
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1158,9 +1215,9 @@ async def autostop(request: fastapi.Request,
1158
1215
  async def queue(request: fastapi.Request,
1159
1216
  queue_body: payloads.QueueBody) -> None:
1160
1217
  """Gets the job queue of a cluster."""
1161
- executor.schedule_request(
1218
+ await executor.schedule_request_async(
1162
1219
  request_id=request.state.request_id,
1163
- request_name='queue',
1220
+ request_name=request_names.RequestName.CLUSTER_QUEUE,
1164
1221
  request_body=queue_body,
1165
1222
  func=core.queue,
1166
1223
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1172,9 +1229,9 @@ async def queue(request: fastapi.Request,
1172
1229
  async def job_status(request: fastapi.Request,
1173
1230
  job_status_body: payloads.JobStatusBody) -> None:
1174
1231
  """Gets the status of a job."""
1175
- executor.schedule_request(
1232
+ await executor.schedule_request_async(
1176
1233
  request_id=request.state.request_id,
1177
- request_name='job_status',
1234
+ request_name=request_names.RequestName.CLUSTER_JOB_STATUS,
1178
1235
  request_body=job_status_body,
1179
1236
  func=core.job_status,
1180
1237
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1186,9 +1243,9 @@ async def job_status(request: fastapi.Request,
1186
1243
  async def cancel(request: fastapi.Request,
1187
1244
  cancel_body: payloads.CancelBody) -> None:
1188
1245
  """Cancels jobs on a cluster."""
1189
- executor.schedule_request(
1246
+ await executor.schedule_request_async(
1190
1247
  request_id=request.state.request_id,
1191
- request_name='cancel',
1248
+ request_name=request_names.RequestName.CLUSTER_JOB_CANCEL,
1192
1249
  request_body=cancel_body,
1193
1250
  func=core.cancel,
1194
1251
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1205,32 +1262,24 @@ async def logs(
1205
1262
  # TODO(zhwu): This should wait for the request on the cluster, e.g., async
1206
1263
  # launch, to finish, so that a user does not need to manually pull the
1207
1264
  # request status.
1208
- request_task = executor.prepare_request(
1265
+ executor.check_request_thread_executor_available()
1266
+ request_task = await executor.prepare_request_async(
1209
1267
  request_id=request.state.request_id,
1210
- request_name='logs',
1268
+ request_name=request_names.RequestName.CLUSTER_JOB_LOGS,
1211
1269
  request_body=cluster_job_body,
1212
1270
  func=core.tail_logs,
1213
1271
  schedule_type=requests_lib.ScheduleType.SHORT,
1272
+ request_cluster_name=cluster_job_body.cluster_name,
1214
1273
  )
1215
- task = asyncio.create_task(executor.execute_request_coroutine(request_task))
1216
-
1217
- async def cancel_task():
1218
- try:
1219
- logger.info('Client disconnected for request: '
1220
- f'{request.state.request_id}')
1221
- task.cancel()
1222
- await task
1223
- except asyncio.CancelledError:
1224
- pass
1225
-
1226
- # Cancel the task after the request is done or client disconnects
1227
- background_tasks.add_task(cancel_task)
1274
+ task = executor.execute_request_in_coroutine(request_task)
1275
+ background_tasks.add_task(task.cancel)
1228
1276
  # TODO(zhwu): This makes viewing logs in browser impossible. We should adopt
1229
1277
  # the same approach as /stream.
1230
- return stream_utils.stream_response(
1278
+ return stream_utils.stream_response_for_long_request(
1231
1279
  request_id=request.state.request_id,
1232
1280
  logs_path=request_task.log_path,
1233
1281
  background_tasks=background_tasks,
1282
+ kill_request_on_disconnect=False,
1234
1283
  )
1235
1284
 
1236
1285
 
@@ -1245,9 +1294,9 @@ async def download_logs(
1245
1294
  # We should reuse the original request body, so that the env vars, such as
1246
1295
  # user hash, are kept the same.
1247
1296
  cluster_jobs_body.local_dir = str(logs_dir_on_api_server)
1248
- executor.schedule_request(
1297
+ await executor.schedule_request_async(
1249
1298
  request_id=request.state.request_id,
1250
- request_name='download_logs',
1299
+ request_name=request_names.RequestName.CLUSTER_JOB_DOWNLOAD_LOGS,
1251
1300
  request_body=cluster_jobs_body,
1252
1301
  func=core.download_logs,
1253
1302
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1324,27 +1373,55 @@ async def download(download_body: payloads.DownloadBody,
1324
1373
 
1325
1374
  # TODO(aylei): run it asynchronously after global_user_state support async op
1326
1375
  @app.post('/provision_logs')
1327
- def provision_logs(cluster_body: payloads.ClusterNameBody,
1376
+ def provision_logs(provision_logs_body: payloads.ProvisionLogsBody,
1328
1377
  follow: bool = True,
1329
1378
  tail: int = 0) -> fastapi.responses.StreamingResponse:
1330
1379
  """Streams the provision.log for the latest launch request of a cluster."""
1331
- # Prefer clusters table first, then cluster_history as fallback.
1332
- log_path_str = global_user_state.get_cluster_provision_log_path(
1333
- cluster_body.cluster_name)
1334
- if not log_path_str:
1335
- log_path_str = global_user_state.get_cluster_history_provision_log_path(
1336
- cluster_body.cluster_name)
1337
- if not log_path_str:
1338
- raise fastapi.HTTPException(
1339
- status_code=404,
1340
- detail=('Provision log path is not recorded for this cluster. '
1341
- 'Please relaunch to generate provisioning logs.'))
1380
+ log_path = None
1381
+ cluster_name = provision_logs_body.cluster_name
1382
+ worker = provision_logs_body.worker
1383
+ # stream head node logs
1384
+ if worker is None:
1385
+ # Prefer clusters table first, then cluster_history as fallback.
1386
+ log_path_str = global_user_state.get_cluster_provision_log_path(
1387
+ cluster_name)
1388
+ if not log_path_str:
1389
+ log_path_str = (
1390
+ global_user_state.get_cluster_history_provision_log_path(
1391
+ cluster_name))
1392
+ if not log_path_str:
1393
+ raise fastapi.HTTPException(
1394
+ status_code=404,
1395
+ detail=('Provision log path is not recorded for this cluster. '
1396
+ 'Please relaunch to generate provisioning logs.'))
1397
+ log_path = pathlib.Path(log_path_str).expanduser().resolve()
1398
+ if not log_path.exists():
1399
+ raise fastapi.HTTPException(
1400
+ status_code=404,
1401
+ detail=f'Provision log path does not exist: {str(log_path)}')
1342
1402
 
1343
- log_path = pathlib.Path(log_path_str).expanduser().resolve()
1344
- if not log_path.exists():
1345
- raise fastapi.HTTPException(
1346
- status_code=404,
1347
- detail=f'Provision log path does not exist: {str(log_path)}')
1403
+ # stream worker node logs
1404
+ else:
1405
+ handle = global_user_state.get_handle_from_cluster_name(cluster_name)
1406
+ if handle is None:
1407
+ raise fastapi.HTTPException(
1408
+ status_code=404,
1409
+ detail=('Cluster handle is not recorded for this cluster. '
1410
+ 'Please relaunch to generate provisioning logs.'))
1411
+ # instance_ids includes head node
1412
+ instance_ids = handle.instance_ids
1413
+ if instance_ids is None:
1414
+ raise fastapi.HTTPException(
1415
+ status_code=400,
1416
+ detail='Instance IDs are not recorded for this cluster. '
1417
+ 'Please relaunch to generate provisioning logs.')
1418
+ if worker > len(instance_ids) - 1:
1419
+ raise fastapi.HTTPException(
1420
+ status_code=400,
1421
+ detail=f'Worker {worker} is out of range. '
1422
+ f'The cluster has {len(instance_ids)} nodes.')
1423
+ log_path = metadata_utils.get_instance_log_dir(
1424
+ handle.get_cluster_name_on_cloud(), instance_ids[worker])
1348
1425
 
1349
1426
  # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
1350
1427
  effective_tail = None if tail is None or tail <= 0 else tail
@@ -1353,7 +1430,8 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
1353
1430
  content=stream_utils.log_streamer(None,
1354
1431
  log_path,
1355
1432
  tail=effective_tail,
1356
- follow=follow),
1433
+ follow=follow,
1434
+ cluster_name=cluster_name),
1357
1435
  media_type='text/plain',
1358
1436
  headers={
1359
1437
  'Cache-Control': 'no-cache, no-transform',
@@ -1367,9 +1445,9 @@ def provision_logs(cluster_body: payloads.ClusterNameBody,
1367
1445
  async def cost_report(request: fastapi.Request,
1368
1446
  cost_report_body: payloads.CostReportBody) -> None:
1369
1447
  """Gets the cost report of a cluster."""
1370
- executor.schedule_request(
1448
+ await executor.schedule_request_async(
1371
1449
  request_id=request.state.request_id,
1372
- request_name='cost_report',
1450
+ request_name=request_names.RequestName.CLUSTER_COST_REPORT,
1373
1451
  request_body=cost_report_body,
1374
1452
  func=core.cost_report,
1375
1453
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1379,9 +1457,9 @@ async def cost_report(request: fastapi.Request,
1379
1457
  @app.get('/storage/ls')
1380
1458
  async def storage_ls(request: fastapi.Request) -> None:
1381
1459
  """Gets the storages."""
1382
- executor.schedule_request(
1460
+ await executor.schedule_request_async(
1383
1461
  request_id=request.state.request_id,
1384
- request_name='storage_ls',
1462
+ request_name=request_names.RequestName.STORAGE_LS,
1385
1463
  request_body=payloads.RequestBody(),
1386
1464
  func=core.storage_ls,
1387
1465
  schedule_type=requests_lib.ScheduleType.SHORT,
@@ -1392,9 +1470,9 @@ async def storage_ls(request: fastapi.Request) -> None:
1392
1470
  async def storage_delete(request: fastapi.Request,
1393
1471
  storage_body: payloads.StorageBody) -> None:
1394
1472
  """Deletes a storage."""
1395
- executor.schedule_request(
1473
+ await executor.schedule_request_async(
1396
1474
  request_id=request.state.request_id,
1397
- request_name='storage_delete',
1475
+ request_name=request_names.RequestName.STORAGE_DELETE,
1398
1476
  request_body=storage_body,
1399
1477
  func=core.storage_delete,
1400
1478
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1405,9 +1483,9 @@ async def storage_delete(request: fastapi.Request,
1405
1483
  async def local_up(request: fastapi.Request,
1406
1484
  local_up_body: payloads.LocalUpBody) -> None:
1407
1485
  """Launches a Kubernetes cluster on API server."""
1408
- executor.schedule_request(
1486
+ await executor.schedule_request_async(
1409
1487
  request_id=request.state.request_id,
1410
- request_name='local_up',
1488
+ request_name=request_names.RequestName.LOCAL_UP,
1411
1489
  request_body=local_up_body,
1412
1490
  func=core.local_up,
1413
1491
  schedule_type=requests_lib.ScheduleType.LONG,
@@ -1415,21 +1493,39 @@ async def local_up(request: fastapi.Request,
1415
1493
 
1416
1494
 
1417
1495
  @app.post('/local_down')
1418
- async def local_down(request: fastapi.Request) -> None:
1496
+ async def local_down(request: fastapi.Request,
1497
+ local_down_body: payloads.LocalDownBody) -> None:
1419
1498
  """Tears down the Kubernetes cluster started by local_up."""
1420
- executor.schedule_request(
1499
+ await executor.schedule_request_async(
1421
1500
  request_id=request.state.request_id,
1422
- request_name='local_down',
1423
- request_body=payloads.RequestBody(),
1501
+ request_name=request_names.RequestName.LOCAL_DOWN,
1502
+ request_body=local_down_body,
1424
1503
  func=core.local_down,
1425
1504
  schedule_type=requests_lib.ScheduleType.LONG,
1426
1505
  )
1427
1506
 
1428
1507
 
1508
+ async def get_expanded_request_id(request_id: str) -> str:
1509
+ """Gets the expanded request ID for a given request ID prefix."""
1510
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
1511
+ request_id, fields=['request_id'])
1512
+ if request_tasks is None:
1513
+ raise fastapi.HTTPException(status_code=404,
1514
+ detail=f'Request {request_id!r} not found')
1515
+ if len(request_tasks) > 1:
1516
+ raise fastapi.HTTPException(status_code=400,
1517
+ detail=('Multiple requests found for '
1518
+ f'request ID prefix: {request_id}'))
1519
+ return request_tasks[0].request_id
1520
+
1521
+
1429
1522
  # === API server related APIs ===
1430
- @app.get('/api/get')
1523
+ @app.get('/api/get', response_class=fastapi_responses.ORJSONResponse)
1431
1524
  async def api_get(request_id: str) -> payloads.RequestPayload:
1432
1525
  """Gets a request with a given request ID prefix."""
1526
+ # Validate request_id prefix matches a single request.
1527
+ request_id = await get_expanded_request_id(request_id)
1528
+
1433
1529
  while True:
1434
1530
  req_status = await requests_lib.get_request_status_async(request_id)
1435
1531
  if req_status is None:
@@ -1446,6 +1542,8 @@ async def api_get(request_id: str) -> payloads.RequestPayload:
1446
1542
  # to avoid storming the DB and CPU in the meantime
1447
1543
  await asyncio.sleep(0.1)
1448
1544
  request_task = await requests_lib.get_request_async(request_id)
1545
+ # TODO(aylei): refine this, /api/get will not be retried and this is
1546
+ # meaningless to retry. It is the original request that should be retried.
1449
1547
  if request_task.should_retry:
1450
1548
  raise fastapi.HTTPException(
1451
1549
  status_code=503, detail=f'Request {request_id!r} should be retried')
@@ -1487,13 +1585,18 @@ async def stream(
1487
1585
  clients, console for CLI/API clients), 'plain' (force plain text),
1488
1586
  'html' (force HTML), or 'console' (force console)
1489
1587
  """
1588
+ # We need to save the user-supplied request ID for the response header.
1589
+ user_supplied_request_id = request_id
1490
1590
  if request_id is not None and log_path is not None:
1491
1591
  raise fastapi.HTTPException(
1492
1592
  status_code=400,
1493
1593
  detail='Only one of request_id and log_path can be provided')
1494
1594
 
1595
+ if request_id is not None:
1596
+ request_id = await get_expanded_request_id(request_id)
1597
+
1495
1598
  if request_id is None and log_path is None:
1496
- request_id = requests_lib.get_latest_request_id()
1599
+ request_id = await requests_lib.get_latest_request_id_async()
1497
1600
  if request_id is None:
1498
1601
  raise fastapi.HTTPException(status_code=404,
1499
1602
  detail='No request found')
@@ -1520,13 +1623,17 @@ async def stream(
1520
1623
  'X-Accel-Buffering': 'no'
1521
1624
  })
1522
1625
 
1626
+ polling_interval = stream_utils.DEFAULT_POLL_INTERVAL
1523
1627
  # Original plain text streaming logic
1524
1628
  if request_id is not None:
1525
- request_task = await requests_lib.get_request_async(request_id)
1629
+ request_task = await requests_lib.get_request_async(
1630
+ request_id, fields=['request_id', 'schedule_type'])
1526
1631
  if request_task is None:
1527
1632
  print(f'No task with request ID {request_id}')
1528
1633
  raise fastapi.HTTPException(
1529
1634
  status_code=404, detail=f'Request {request_id!r} not found')
1635
+ # req.log_path is derived from request_id,
1636
+ # so it's ok to just grab the request_id in the above query.
1530
1637
  log_path_to_stream = request_task.log_path
1531
1638
  if not log_path_to_stream.exists():
1532
1639
  # The log file might be deleted by the request GC daemon but the
@@ -1534,6 +1641,9 @@ async def stream(
1534
1641
  raise fastapi.HTTPException(
1535
1642
  status_code=404,
1536
1643
  detail=f'Log of request {request_id!r} has been deleted')
1644
+ if request_task.schedule_type == requests_lib.ScheduleType.LONG:
1645
+ polling_interval = stream_utils.LONG_REQUEST_POLL_INTERVAL
1646
+ del request_task
1537
1647
  else:
1538
1648
  assert log_path is not None, (request_id, log_path)
1539
1649
  if log_path == constants.API_SERVER_LOGS:
@@ -1567,18 +1677,26 @@ async def stream(
1567
1677
  detail=f'Log path {log_path!r} does not exist')
1568
1678
 
1569
1679
  log_path_to_stream = resolved_log_path
1680
+
1681
+ headers = {
1682
+ 'Cache-Control': 'no-cache, no-transform',
1683
+ 'X-Accel-Buffering': 'no',
1684
+ 'Transfer-Encoding': 'chunked'
1685
+ }
1686
+ if request_id is not None:
1687
+ headers[server_constants.STREAM_REQUEST_HEADER] = (
1688
+ user_supplied_request_id
1689
+ if user_supplied_request_id else request_id)
1690
+
1570
1691
  return fastapi.responses.StreamingResponse(
1571
1692
  content=stream_utils.log_streamer(request_id,
1572
1693
  log_path_to_stream,
1573
1694
  plain_logs=format == 'plain',
1574
1695
  tail=tail,
1575
- follow=follow),
1696
+ follow=follow,
1697
+ polling_interval=polling_interval),
1576
1698
  media_type='text/plain',
1577
- headers={
1578
- 'Cache-Control': 'no-cache, no-transform',
1579
- 'X-Accel-Buffering': 'no',
1580
- 'Transfer-Encoding': 'chunked'
1581
- },
1699
+ headers=headers,
1582
1700
  )
1583
1701
 
1584
1702
 
@@ -1586,11 +1704,11 @@ async def stream(
1586
1704
  async def api_cancel(request: fastapi.Request,
1587
1705
  request_cancel_body: payloads.RequestCancelBody) -> None:
1588
1706
  """Cancels requests."""
1589
- executor.schedule_request(
1707
+ await executor.schedule_request_async(
1590
1708
  request_id=request.state.request_id,
1591
- request_name='api_cancel',
1709
+ request_name=request_names.RequestName.API_CANCEL,
1592
1710
  request_body=request_cancel_body,
1593
- func=requests_lib.kill_requests,
1711
+ func=requests_lib.kill_requests_with_prefix,
1594
1712
  schedule_type=requests_lib.ScheduleType.SHORT,
1595
1713
  )
1596
1714
 
@@ -1598,9 +1716,13 @@ async def api_cancel(request: fastapi.Request,
1598
1716
  @app.get('/api/status')
1599
1717
  async def api_status(
1600
1718
  request_ids: Optional[List[str]] = fastapi.Query(
1601
- None, description='Request IDs to get status for.'),
1719
+ None, description='Request ID prefixes to get status for.'),
1602
1720
  all_status: bool = fastapi.Query(
1603
1721
  False, description='Get finished requests as well.'),
1722
+ limit: Optional[int] = fastapi.Query(
1723
+ None, description='Number of requests to show.'),
1724
+ fields: Optional[List[str]] = fastapi.Query(
1725
+ None, description='Fields to get. If None, get all fields.'),
1604
1726
  ) -> List[payloads.RequestPayload]:
1605
1727
  """Gets the list of requests."""
1606
1728
  if request_ids is None:
@@ -1611,15 +1733,22 @@ async def api_status(
1611
1733
  requests_lib.RequestStatus.RUNNING,
1612
1734
  ]
1613
1735
  request_tasks = await requests_lib.get_request_tasks_async(
1614
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
1615
- return [r.readable_encode() for r in request_tasks]
1736
+ req_filter=requests_lib.RequestTaskFilter(
1737
+ status=statuses,
1738
+ limit=limit,
1739
+ fields=fields,
1740
+ sort=True,
1741
+ ))
1742
+ return requests_lib.encode_requests(request_tasks)
1616
1743
  else:
1617
1744
  encoded_request_tasks = []
1618
1745
  for request_id in request_ids:
1619
- request_task = await requests_lib.get_request_async(request_id)
1620
- if request_task is None:
1746
+ request_tasks = await requests_lib.get_requests_async_with_prefix(
1747
+ request_id)
1748
+ if request_tasks is None:
1621
1749
  continue
1622
- encoded_request_tasks.append(request_task.readable_encode())
1750
+ for request_task in request_tasks:
1751
+ encoded_request_tasks.append(request_task.readable_encode())
1623
1752
  return encoded_request_tasks
1624
1753
 
1625
1754
 
@@ -1679,23 +1808,44 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
1679
1808
  version=sky.__version__,
1680
1809
  version_on_disk=common.get_skypilot_version_on_disk(),
1681
1810
  commit=sky.__commit__,
1811
+ # Whether basic auth on api server is enabled
1682
1812
  basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
1683
1813
  'false').lower() == 'true',
1684
1814
  user=user if user is not None else None,
1815
+ # Whether service account token is enabled
1816
+ service_account_token_enabled=(os.environ.get(
1817
+ constants.ENV_VAR_ENABLE_SERVICE_ACCOUNTS,
1818
+ 'false').lower() == 'true'),
1819
+ # Whether basic auth on ingress is enabled
1820
+ ingress_basic_auth_enabled=os.environ.get(
1821
+ constants.SKYPILOT_INGRESS_BASIC_AUTH_ENABLED,
1822
+ 'false').lower() == 'true',
1685
1823
  )
1686
1824
 
1687
1825
 
1826
+ class KubernetesSSHMessageType(IntEnum):
1827
+ REGULAR_DATA = 0
1828
+ PINGPONG = 1
1829
+ LATENCY_MEASUREMENT = 2
1830
+
1831
+
1688
1832
  @app.websocket('/kubernetes-pod-ssh-proxy')
1689
- async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1690
- cluster_name: str) -> None:
1833
+ async def kubernetes_pod_ssh_proxy(
1834
+ websocket: fastapi.WebSocket,
1835
+ cluster_name: str,
1836
+ client_version: Optional[int] = None) -> None:
1691
1837
  """Proxies SSH to the Kubernetes pod with websocket."""
1692
1838
  await websocket.accept()
1693
1839
  logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')
1694
1840
 
1841
+ timestamps_supported = client_version is not None and client_version > 21
1842
+ logger.info(f'Websocket timestamps supported: {timestamps_supported}, \
1843
+ client_version = {client_version}')
1844
+
1695
1845
  # Run core.status in another thread to avoid blocking the event loop.
1696
- cluster_records = await context_utils.to_thread(core.status,
1697
- cluster_name,
1698
- all_users=True)
1846
+ with ThreadPoolExecutor(max_workers=1) as thread_pool_executor:
1847
+ cluster_records = await context_utils.to_thread_with_executor(
1848
+ thread_pool_executor, core.status, cluster_name, all_users=True)
1699
1849
  cluster_record = cluster_records[0]
1700
1850
  if cluster_record['status'] != status_lib.ClusterStatus.UP:
1701
1851
  raise fastapi.HTTPException(
@@ -1734,17 +1884,70 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1734
1884
  return
1735
1885
 
1736
1886
  logger.info(f'Starting port-forward to local port: {local_port}')
1887
+ conn_gauge = metrics_utils.SKY_APISERVER_WEBSOCKET_CONNECTIONS.labels(
1888
+ pid=os.getpid())
1889
+ ssh_failed = False
1890
+ websocket_closed = False
1737
1891
  try:
1892
+ conn_gauge.inc()
1738
1893
  # Connect to the local port
1739
1894
  reader, writer = await asyncio.open_connection('127.0.0.1', local_port)
1740
1895
 
1741
1896
  async def websocket_to_ssh():
1742
1897
  try:
1743
1898
  async for message in websocket.iter_bytes():
1899
+ if timestamps_supported:
1900
+ type_size = struct.calcsize('!B')
1901
+ message_type = struct.unpack('!B',
1902
+ message[:type_size])[0]
1903
+ if (message_type ==
1904
+ KubernetesSSHMessageType.REGULAR_DATA):
1905
+ # Regular data - strip type byte and forward to SSH
1906
+ message = message[type_size:]
1907
+ elif message_type == KubernetesSSHMessageType.PINGPONG:
1908
+ # PING message - respond with PONG (type 1)
1909
+ ping_id_size = struct.calcsize('!I')
1910
+ if len(message) != type_size + ping_id_size:
1911
+ raise ValueError('Invalid PING message '
1912
+ f'length: {len(message)}')
1913
+ # Return the same PING message, so that the client
1914
+ # can measure the latency.
1915
+ await websocket.send_bytes(message)
1916
+ continue
1917
+ elif (message_type ==
1918
+ KubernetesSSHMessageType.LATENCY_MEASUREMENT):
1919
+ # Latency measurement from client
1920
+ latency_size = struct.calcsize('!Q')
1921
+ if len(message) != type_size + latency_size:
1922
+ raise ValueError(
1923
+ 'Invalid latency measurement '
1924
+ f'message length: {len(message)}')
1925
+ avg_latency_ms = struct.unpack(
1926
+ '!Q',
1927
+ message[type_size:type_size + latency_size])[0]
1928
+ latency_seconds = avg_latency_ms / 1000
1929
+ metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(pid=os.getpid()).observe(latency_seconds) # pylint: disable=line-too-long
1930
+ continue
1931
+ else:
1932
+ # Unknown message type.
1933
+ raise ValueError(
1934
+ f'Unknown message type: {message_type}')
1744
1935
  writer.write(message)
1745
- await writer.drain()
1936
+ try:
1937
+ await writer.drain()
1938
+ except Exception as e: # pylint: disable=broad-except
1939
+ # Typically we will not reach here, if the ssh to pod
1940
+ # is disconnected, ssh_to_websocket will exit first.
1941
+ # But just in case.
1942
+ logger.error('Failed to write to pod through '
1943
+ f'port-forward connection: {e}')
1944
+ nonlocal ssh_failed
1945
+ ssh_failed = True
1946
+ break
1746
1947
  except fastapi.WebSocketDisconnect:
1747
1948
  pass
1949
+ nonlocal websocket_closed
1950
+ websocket_closed = True
1748
1951
  writer.close()
1749
1952
 
1750
1953
  async def ssh_to_websocket():
@@ -1752,62 +1955,64 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
1752
1955
  while True:
1753
1956
  data = await reader.read(1024)
1754
1957
  if not data:
1958
+ if not websocket_closed:
1959
+ logger.warning('SSH connection to pod is '
1960
+ 'disconnected before websocket '
1961
+ 'connection is closed')
1962
+ nonlocal ssh_failed
1963
+ ssh_failed = True
1755
1964
  break
1965
+ if timestamps_supported:
1966
+ # Prepend message type byte (0 = regular data)
1967
+ message_type_bytes = struct.pack(
1968
+ '!B', KubernetesSSHMessageType.REGULAR_DATA.value)
1969
+ data = message_type_bytes + data
1756
1970
  await websocket.send_bytes(data)
1757
1971
  except Exception: # pylint: disable=broad-except
1758
1972
  pass
1759
- await websocket.close()
1973
+ try:
1974
+ await websocket.close()
1975
+ except Exception: # pylint: disable=broad-except
1976
+ # The websocket might have been closed by the client.
1977
+ pass
1760
1978
 
1761
1979
  await asyncio.gather(websocket_to_ssh(), ssh_to_websocket())
1762
1980
  finally:
1763
- proc.terminate()
1981
+ conn_gauge.dec()
1982
+ reason = ''
1983
+ try:
1984
+ logger.info('Terminating kubectl port-forward process')
1985
+ proc.terminate()
1986
+ except ProcessLookupError:
1987
+ stdout = await proc.stdout.read()
1988
+ logger.error('kubectl port-forward was terminated before the '
1989
+ 'ssh websocket connection was closed. Remaining '
1990
+ f'output: {str(stdout)}')
1991
+ reason = 'KubectlPortForwardExit'
1992
+ metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
1993
+ pid=os.getpid(), reason='KubectlPortForwardExit').inc()
1994
+ else:
1995
+ if ssh_failed:
1996
+ reason = 'SSHToPodDisconnected'
1997
+ else:
1998
+ reason = 'ClientClosed'
1999
+ metrics_utils.SKY_APISERVER_WEBSOCKET_CLOSED_TOTAL.labels(
2000
+ pid=os.getpid(), reason=reason).inc()
1764
2001
 
1765
2002
 
1766
2003
  @app.get('/all_contexts')
1767
2004
  async def all_contexts(request: fastapi.Request) -> None:
1768
2005
  """Gets all Kubernetes and SSH node pool contexts."""
1769
2006
 
1770
- executor.schedule_request(
2007
+ await executor.schedule_request_async(
1771
2008
  request_id=request.state.request_id,
1772
- request_name='all_contexts',
2009
+ request_name=request_names.RequestName.ALL_CONTEXTS,
1773
2010
  request_body=payloads.RequestBody(),
1774
2011
  func=core.get_all_contexts,
1775
2012
  schedule_type=requests_lib.ScheduleType.SHORT,
1776
2013
  )
1777
2014
 
1778
2015
 
1779
- @app.get('/gpu-metrics')
1780
- async def gpu_metrics() -> fastapi.Response:
1781
- """Gets the GPU metrics from multiple external k8s clusters"""
1782
- contexts = core.get_all_contexts()
1783
- all_metrics = []
1784
- successful_contexts = 0
1785
-
1786
- tasks = [
1787
- asyncio.create_task(metrics_utils.get_metrics_for_context(context))
1788
- for context in contexts
1789
- if context != 'in-cluster'
1790
- ]
1791
-
1792
- results = await asyncio.gather(*tasks, return_exceptions=True)
1793
-
1794
- for i, result in enumerate(results):
1795
- if isinstance(result, Exception):
1796
- logger.error(
1797
- f'Failed to get metrics for context {contexts[i]}: {result}')
1798
- else:
1799
- metrics_text = result
1800
- all_metrics.append(metrics_text)
1801
- successful_contexts += 1
1802
-
1803
- combined_metrics = '\n\n'.join(all_metrics)
1804
-
1805
- # Return as plain text for Prometheus compatibility
1806
- return fastapi.Response(
1807
- content=combined_metrics,
1808
- media_type='text/plain; version=0.0.4; charset=utf-8')
1809
-
1810
-
1811
2016
  # === Internal APIs ===
1812
2017
  @app.get('/api/completion/cluster_name')
1813
2018
  async def complete_cluster_name(incomplete: str,) -> List[str]:
@@ -1905,6 +2110,7 @@ if __name__ == '__main__':
1905
2110
 
1906
2111
  from sky.server import uvicorn as skyuvicorn
1907
2112
 
2113
+ logger.info('Initializing SkyPilot API server')
1908
2114
  skyuvicorn.add_timestamp_prefix_for_server_logs()
1909
2115
 
1910
2116
  parser = argparse.ArgumentParser()
@@ -1916,22 +2122,63 @@ if __name__ == '__main__':
1916
2122
  parser.add_argument('--metrics-port', default=9090, type=int)
1917
2123
  cmd_args = parser.parse_args()
1918
2124
  if cmd_args.port == cmd_args.metrics_port:
2125
+ logger.error('port and metrics-port cannot be the same, exiting.')
1919
2126
  raise ValueError('port and metrics-port cannot be the same')
1920
2127
 
2128
+ # Fail fast if the port is not available to avoid corrupting the state
2129
+ # of potential running server instance.
2130
+ # We might reach here because the running server is currently not
2131
+ # responding, thus the healthz check fails and `sky api start` thinks
2132
+ # we should start a new server instance.
2133
+ if not common_utils.is_port_available(cmd_args.port):
2134
+ logger.error(f'Port {cmd_args.port} is not available, exiting.')
2135
+ raise RuntimeError(f'Port {cmd_args.port} is not available')
2136
+
2137
+ # Maybe touch the signal file on API server startup. Do it again here even
2138
+ # if we already touched it in the sky/server/common.py::_start_api_server.
2139
+ # This is because the sky/server/common.py::_start_api_server function call
2140
+ # is running outside the skypilot API server process tree. The process tree
2141
+ # starts within that function (see the `subprocess.Popen` call in
2142
+ # sky/server/common.py::_start_api_server). When pg is used, the
2143
+ # _start_api_server function will not load the config file from db, which
2144
+ # will ignore the consolidation mode config. Here, inside the process tree,
2145
+ # we already reload the config as a server (with env var _start_api_server),
2146
+ # so we will respect the consolidation mode config.
2147
+ # Refer to #7717 for more details.
2148
+ managed_job_utils.is_consolidation_mode(on_api_restart=True)
2149
+
1921
2150
  # Show the privacy policy if it is not already shown. We place it here so
1922
2151
  # that it is shown only when the API server is started.
1923
2152
  usage_lib.maybe_show_privacy_policy()
1924
2153
 
1925
2154
  # Initialize global user state db
1926
2155
  db_utils.set_max_connections(1)
2156
+ logger.info('Initializing database engine')
1927
2157
  global_user_state.initialize_and_get_db()
2158
+ logger.info('Database engine initialized')
1928
2159
  # Initialize request db
1929
2160
  requests_lib.reset_db_and_logs()
1930
2161
  # Restore the server user hash
2162
+ logger.info('Initializing server user hash')
1931
2163
  _init_or_restore_server_user_hash()
2164
+
1932
2165
  max_db_connections = global_user_state.get_max_db_connections()
1933
- config = server_config.compute_server_config(cmd_args.deploy,
1934
- max_db_connections)
2166
+ logger.info(f'Max db connections: {max_db_connections}')
2167
+
2168
+ # Reserve memory for jobs and serve/pool controller in consolidation mode.
2169
+ reserved_memory_mb = (
2170
+ controller_utils.compute_memory_reserved_for_controllers(
2171
+ reserve_for_controllers=os.environ.get(
2172
+ constants.OVERRIDE_CONSOLIDATION_MODE) is not None,
2173
+ # For jobs controller, we need to reserve for both jobs and
2174
+ # pool controller.
2175
+ reserve_extra_for_pool=not os.environ.get(
2176
+ constants.IS_SKYPILOT_SERVE_CONTROLLER)))
2177
+
2178
+ config = server_config.compute_server_config(
2179
+ cmd_args.deploy,
2180
+ max_db_connections,
2181
+ reserved_memory_mb=reserved_memory_mb)
1935
2182
 
1936
2183
  num_workers = config.num_server_workers
1937
2184
 
@@ -1960,7 +2207,8 @@ if __name__ == '__main__':
1960
2207
  uvicorn_config = uvicorn.Config('sky.server.server:app',
1961
2208
  host=cmd_args.host,
1962
2209
  port=cmd_args.port,
1963
- workers=num_workers)
2210
+ workers=num_workers,
2211
+ ws_per_message_deflate=False)
1964
2212
  skyuvicorn.run(uvicorn_config,
1965
2213
  max_db_connections=config.num_db_connections_per_worker)
1966
2214
  except Exception as exc: # pylint: disable=broad-except