skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -3,42 +3,54 @@
3
3
  import dataclasses
4
4
  import enum
5
5
  import functools
6
+ from http.cookiejar import CookieJar
6
7
  from http.cookiejar import MozillaCookieJar
7
- import json
8
8
  import os
9
9
  import pathlib
10
10
  import re
11
+ import shutil
11
12
  import subprocess
12
13
  import sys
14
+ import tempfile
15
+ import threading
13
16
  import time
14
17
  import typing
15
- from typing import Any, Dict, Optional
16
- from urllib import parse
18
+ from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
19
+ Tuple, TypeVar, Union)
17
20
  import uuid
18
21
 
22
+ import cachetools
19
23
  import colorama
20
24
  import filelock
25
+ from passlib import context as passlib_context
26
+ from typing_extensions import ParamSpec
21
27
 
22
- import sky
23
28
  from sky import exceptions
24
29
  from sky import sky_logging
25
30
  from sky import skypilot_config
26
31
  from sky.adaptors import common as adaptors_common
32
+ from sky.client import service_account_auth
27
33
  from sky.data import data_utils
28
34
  from sky.server import constants as server_constants
35
+ from sky.server import rest
36
+ from sky.server import versions
29
37
  from sky.skylet import constants
30
38
  from sky.usage import usage_lib
31
39
  from sky.utils import annotations
32
40
  from sky.utils import common_utils
33
41
  from sky.utils import rich_utils
34
42
  from sky.utils import ux_utils
43
+ from sky.utils import yaml_utils
35
44
 
36
45
  if typing.TYPE_CHECKING:
46
+ import aiohttp
37
47
  import pydantic
38
48
  import requests
39
49
 
40
50
  from sky import dag as dag_lib
51
+ from sky import models
41
52
  else:
53
+ aiohttp = adaptors_common.LazyImport('aiohttp')
42
54
  pydantic = adaptors_common.LazyImport('pydantic')
43
55
  requests = adaptors_common.LazyImport('requests')
44
56
 
@@ -50,7 +62,7 @@ AVAILABLE_LOCAL_API_SERVER_URLS = [
50
62
 
51
63
  API_SERVER_CMD = '-m sky.server.server'
52
64
  # The client dir on the API server for storing user-specific data, such as file
53
- # mounts, logs, etc. This dir is empheral and will be cleaned up when the API
65
+ # mounts, logs, etc. This dir is ephemeral and will be cleaned up when the API
54
66
  # server is restarted.
55
67
  API_SERVER_CLIENT_DIR = pathlib.Path('~/.sky/api_server/clients')
56
68
  RETRY_COUNT_ON_TIMEOUT = 3
@@ -60,34 +72,11 @@ RETRY_COUNT_ON_TIMEOUT = 3
60
72
  # (e.g. in high contention env) and we will exit eagerly if server exit.
61
73
  WAIT_APISERVER_START_TIMEOUT_SEC = 60
62
74
 
63
- _VERSION_INFO = (
64
- f'{colorama.Style.RESET_ALL}'
65
- f'{colorama.Style.DIM}'
66
- 'client version: v{client_version} (API version: v{client_api_version})\n'
67
- 'server version: v{server_version} (API version: v{server_api_version})'
68
- f'{colorama.Style.RESET_ALL}')
69
75
  _LOCAL_API_SERVER_RESTART_HINT = (
70
- f'{colorama.Fore.YELLOW}Please restart the SkyPilot API server with:\n'
76
+ f'{colorama.Fore.YELLOW}The local SkyPilot API server is not compatible '
77
+ 'with the client. Please restart the API server with:\n'
71
78
  f'{colorama.Style.BRIGHT}sky api stop; sky api start'
72
79
  f'{colorama.Style.RESET_ALL}')
73
- _LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
74
- f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
75
- '{version_info}\n'
76
- f'{_LOCAL_API_SERVER_RESTART_HINT}'
77
- f'{colorama.Style.RESET_ALL}')
78
- _CLIENT_TOO_OLD_WARNING = (
79
- f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
80
- '{version_info}\n'
81
- f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
82
- '{command}'
83
- f'{colorama.Style.RESET_ALL}')
84
- _REMOTE_SERVER_TOO_OLD_WARNING = (
85
- f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
86
- '{version_info}\n'
87
- f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
88
- 'remote API server or downgrade your local client with:\n'
89
- '{command}\n'
90
- f'{colorama.Style.RESET_ALL}')
91
80
  _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
92
81
  f'{colorama.Fore.YELLOW}SkyPilot API server version does not match the '
93
82
  'installation on disk:\n'
@@ -99,23 +88,32 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
99
88
  f'{colorama.Fore.YELLOW}This can happen if you upgraded SkyPilot without '
100
89
  'restarting the API server.'
101
90
  f'{colorama.Style.RESET_ALL}')
102
- # Parse local API version eargly to catch version format errors.
103
- _LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
104
- # SkyPilot dev version.
105
- _DEV_VERSION = '1.0.0-dev0'
106
91
 
107
- RequestId = str
92
+ T = TypeVar('T')
93
+ P = ParamSpec('P')
94
+
95
+
96
+ class RequestId(str, Generic[T]):
97
+ pass
98
+
99
+
108
100
  ApiVersion = Optional[str]
109
101
 
110
102
  logger = sky_logging.init_logger(__name__)
111
103
 
112
104
  hinted_for_server_install_version_mismatch = False
113
105
 
106
+ crypt_ctx = passlib_context.CryptContext([
107
+ 'bcrypt', 'sha256_crypt', 'sha512_crypt', 'des_crypt', 'apr_md5_crypt',
108
+ 'ldap_sha1'
109
+ ])
110
+
114
111
 
115
112
class ApiServerStatus(enum.Enum):
    """Status values used to describe the API server's health.

    The string values matter: the ``status`` field of a successful health
    response is converted with ``ApiServerStatus(value)``.
    """
    HEALTHY = 'healthy'
    # Used when the health check times out, cannot connect, or the response
    # cannot be parsed / lacks version information.
    UNHEALTHY = 'unhealthy'
    # Used when client and server are not compatible; this value is also
    # matched against the ``error`` field of HTTP 400 responses.
    VERSION_MISMATCH = 'version_mismatch'
    # Used for HTTP 401 responses and when the health check appears to have
    # been redirected to a login / OAuth page.
    NEEDS_AUTH = 'needs_auth'
119
117
 
120
118
 
121
119
  @dataclasses.dataclass
@@ -125,20 +123,209 @@ class ApiServerInfo:
125
123
  version: Optional[str] = None
126
124
  version_on_disk: Optional[str] = None
127
125
  commit: Optional[str] = None
126
+ user: Optional[Dict[str, Any]] = None
127
+ basic_auth_enabled: bool = False
128
+ error: Optional[str] = None
129
+
130
+
131
def get_api_cookie_jar_path() -> pathlib.Path:
    """Return the resolved path of the API cookie jar file.

    The location is read from the environment variable named by
    ``server_constants.API_COOKIE_FILE_ENV_VAR``; when that variable is not
    set, ``server_constants.API_COOKIE_FILE_DEFAULT_LOCATION`` is used. The
    value has ``~`` expanded and is then resolved.
    """
    configured_location = os.environ.get(
        server_constants.API_COOKIE_FILE_ENV_VAR,
        server_constants.API_COOKIE_FILE_DEFAULT_LOCATION)
    cookie_file = pathlib.Path(configured_location)
    return cookie_file.expanduser().resolve()
128
137
 
129
138
 
130
139
def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
    """Build the cookie jar the client sends to the API server.

    Returns:
        A fresh ``RequestsCookieJar``. If the cookie file reported by
        ``get_api_cookie_jar_path()`` exists, its cookies are loaded (in
        Mozilla cookie-file format) and copied into the jar; otherwise the
        jar is returned empty.
    """
    jar = requests.cookies.RequestsCookieJar()
    cookie_path = get_api_cookie_jar_path()
    if not cookie_path.exists():
        return jar
    persisted = MozillaCookieJar(cookie_path)
    persisted.load()
    jar.update(persisted)
    return jar
140
148
 
141
149
 
150
def set_api_cookie_jar(cookie_jar: CookieJar,
                       create_if_not_exists: bool = True) -> None:
    """Merge ``cookie_jar`` into the on-disk API cookie jar file.

    Cookies already stored in the file are loaded first, the given cookies
    are set on top of them, and the result is saved in Mozilla cookie-file
    format.

    Args:
        cookie_jar: Cookies to persist. Nothing happens when it is empty.
        create_if_not_exists: When False and the cookie file does not exist
            yet, nothing is written.
    """
    if len(cookie_jar) == 0:
        return
    cookie_path = get_api_cookie_jar_path()
    if not cookie_path.exists() and not create_if_not_exists:
        # The file doesn't exist and we were asked not to create it.
        return
    cookie_path.parent.mkdir(parents=True, exist_ok=True)

    # Writing directly to the cookie jar path can race with other processes
    # that are reading it, making the file look malformed. Instead, write to
    # a temporary file and then move it to the final location. The temporary
    # name is generated (not hardcoded) so concurrent writers do not clobber
    # each other's temporary file.
    with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
                                     delete=False) as tmp_file:
        tmp_cookie_path = tmp_file.name
    try:
        file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
        if cookie_path.exists():
            file_cookie_jar.load(str(cookie_path))

        for cookie in cookie_jar:
            file_cookie_jar.set_cookie(cookie)
        file_cookie_jar.save()

        # Move the temporary file to the final location.
        os.replace(tmp_cookie_path, cookie_path)
    finally:
        # The temporary file was created with delete=False. After a
        # successful replace its name no longer exists; if anything above
        # failed, remove the leftover so failed updates do not accumulate
        # stray files next to the cookie jar.
        try:
            os.unlink(tmp_cookie_path)
        except OSError:
            # Already moved into place, or best-effort cleanup failed; do
            # not mask the original outcome.
            pass
180
+
181
+
182
def get_cookies_from_response(
        response: 'requests.Response') -> requests.cookies.RequestsCookieJar:
    """Collect the cookies of an API server response, including redirects.

    Starts from ``response.cookies`` and adds cookies set by the earlier
    responses in ``response.history`` whose domain occurs in the configured
    server URL.

    NOTE(review): the domain test is a plain substring check against the
    server URL, and ``response.cookies`` itself is updated in place.

    Returns:
        The (mutated) ``response.cookies`` jar.
    """
    server_url = get_server_url()
    collected = response.cookies
    history_cookies = (cookie for earlier in response.history
                       for cookie in earlier.cookies)
    for cookie in history_cookies:
        if cookie.domain in server_url:
            collected.set_cookie(cookie)
    return collected
192
+
193
+
194
def _prepare_authenticated_request_params(
        path: str,
        server_url: Optional[str] = None,
        **kwargs) -> Tuple[str, Dict[str, Any]]:
    """Build the URL and keyword arguments for an authenticated request.

    Shared by the sync and async request helpers.

    Args:
        path: API path, with or without a leading ``/``.
        server_url: Base URL; the configured server URL is used when None.
        **kwargs: Request keyword arguments supplied by the caller.

    Returns:
        Tuple of (url, updated_kwargs). ``updated_kwargs['headers']`` holds
        the service account headers overlaid with any caller headers, and
        ``updated_kwargs['cookies']`` is filled from the API cookie jar when
        there is no ``Authorization`` header and the caller passed no
        cookies.
    """
    if server_url is None:
        server_url = get_server_url()

    # Service account headers first; caller-provided headers win on clash.
    merged_headers = service_account_auth.get_service_account_headers()
    if 'headers' in kwargs:
        merged_headers.update(kwargs['headers'])
    kwargs['headers'] = merged_headers

    # The URL is the same regardless of authentication type; an OAuth2 proxy
    # handles authentication based on headers.
    separator = '' if path.startswith('/') else '/'
    url = f'{server_url}{separator}{path}'

    # Fall back to cookie authentication if no Bearer token is present.
    if not merged_headers.get('Authorization') and 'cookies' not in kwargs:
        kwargs['cookies'] = get_api_cookie_jar()

    return url, kwargs
224
+
225
+
226
def _convert_requests_cookies_to_aiohttp(
        cookie_jar: requests.cookies.RequestsCookieJar) -> Dict[str, str]:
    """Flatten a requests cookie jar into a ``{name: value}`` dict.

    Cookies are visited in iteration order, so when several cookies share a
    name the last one wins.
    """
    return {  # type: ignore
        cookie.name: cookie.value for cookie in cookie_jar
    }
233
+
234
+
235
def make_authenticated_request(method: str,
                               path: str,
                               server_url: Optional[str] = None,
                               retry: bool = True,
                               **kwargs) -> 'requests.Response':
    """Send an authenticated HTTP request to the API server.

    Authentication (service account token or cookies) is attached by
    ``_prepare_authenticated_request_params`` based on what is available.

    Args:
        method: HTTP method (GET, POST, etc.)
        path: API path (e.g., '/api/v1/status')
        server_url: Server URL, defaults to the configured server
        retry: Whether to go through the retrying request helper. Without
            retry only 'GET' is accepted (enforced with an assert).
        **kwargs: Additional arguments to pass to requests

    Returns:
        requests.Response object
    """
    url, request_kwargs = _prepare_authenticated_request_params(
        path, server_url, **kwargs)

    if not retry:
        assert method == 'GET', 'Only GET requests can be done without retry'
        return rest.request_without_retry(method, url, **request_kwargs)
    return rest.request(method, url, **request_kwargs)
264
+
265
+
266
async def make_authenticated_request_async(
        session: 'aiohttp.ClientSession',
        method: str,
        path: str,
        server_url: Optional[str] = None,
        retry: bool = True,
        **kwargs) -> 'aiohttp.ClientResponse':
    """Send an authenticated async HTTP request to the API server via aiohttp.

    Authentication (service account token or cookies) is attached by
    ``_prepare_authenticated_request_params`` based on what is available.

    Example usage:
        async with aiohttp.ClientSession() as session:
            response = await make_authenticated_request_async(
                session, 'GET', '/api/v1/status')
            data = await response.json()

    Args:
        session: aiohttp ClientSession to use for the request
        method: HTTP method (GET, POST, etc.)
        path: API path (e.g., '/api/v1/status')
        server_url: Server URL, defaults to the configured server
        retry: Whether to go through the retrying request helper. Without
            retry only 'GET' is accepted (enforced with an assert).
        **kwargs: Additional arguments to pass to aiohttp

    Returns:
        aiohttp.ClientResponse object

    Raises:
        aiohttp.ClientError: For HTTP-related errors
        exceptions.ServerTemporarilyUnavailableError: When server returns 503
        exceptions.RequestInterruptedError: When request is interrupted
    """
    url, request_kwargs = _prepare_authenticated_request_params(
        path, server_url, **kwargs)

    # aiohttp takes cookies as a plain mapping rather than a requests jar.
    jar = request_kwargs.get('cookies')
    if 'cookies' in request_kwargs and isinstance(
            jar, requests.cookies.RequestsCookieJar):
        request_kwargs['cookies'] = _convert_requests_cookies_to_aiohttp(jar)

    # aiohttp needs string query values: booleans become 'true'/'false',
    # None values are dropped, everything else goes through str().
    if 'params' in request_kwargs and request_kwargs['params'] is not None:
        request_kwargs['params'] = {
            key: (str(value).lower()
                  if isinstance(value, bool) else str(value))
            for key, value in request_kwargs['params'].items()
            if value is not None
        }

    if not retry:
        assert method == 'GET', 'Only GET requests can be done without retry'
        return await rest.request_without_retry_async(session, method, url,
                                                      **request_kwargs)
    return await rest.request_async(session, method, url, **request_kwargs)
327
+
328
+
142
329
  @annotations.lru_cache(scope='global')
143
330
  def get_server_url(host: Optional[str] = None) -> str:
144
331
  endpoint = DEFAULT_SERVER_URL
@@ -152,27 +339,42 @@ def get_server_url(host: Optional[str] = None) -> str:
152
339
 
153
340
 
154
341
@annotations.lru_cache(scope='global')
def get_dashboard_url(server_url: str,
                      starting_page: Optional[str] = None) -> str:
    """Return the dashboard URL for ``server_url``.

    Trailing slashes are stripped from ``server_url`` and ``/dashboard`` is
    appended; a non-empty ``starting_page`` is appended as a further path
    segment. The server URL is otherwise used verbatim.
    """
    segments = [server_url.rstrip('/'), 'dashboard']
    if starting_page:
        segments.append(starting_page)
    return '/'.join(segments)
169
349
 
170
350
 
171
351
@annotations.lru_cache(scope='global')
def is_api_server_local(endpoint: Optional[str] = None):
    """Return whether ``endpoint`` is one of the local API server URLs.

    When ``endpoint`` is None the configured server URL is checked instead.
    """
    if endpoint is None:
        endpoint = get_server_url()
    return endpoint in AVAILABLE_LOCAL_API_SERVER_URLS
174
355
 
175
356
 
357
def _handle_non_200_server_status(
        response: 'requests.Response') -> ApiServerInfo:
    """Map a non-200 health check response to an ``ApiServerInfo``.

    Args:
        response: The health check response whose status code is not 200.

    Returns:
        ``NEEDS_AUTH`` for HTTP 401; ``VERSION_MISMATCH`` (with the server's
        ``message`` as the error text) for an HTTP 400 whose JSON body has
        ``error`` equal to the version-mismatch value; ``UNHEALTHY`` for
        everything else.
    """
    if response.status_code == 401:
        return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
    if response.status_code == 400:
        # Check if a version mismatch error is returned.
        try:
            body = response.json()
        except requests.JSONDecodeError:
            body = None
        # Valid JSON is not necessarily an object (it may be a list, string,
        # number, ...). Only look up keys on a dict so that such a body is
        # reported as unhealthy instead of raising AttributeError here.
        if (isinstance(body, dict) and body.get('error', '')
                == ApiServerStatus.VERSION_MISMATCH.value):
            return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
                                 error=body.get('message', ''))
    return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
372
+
373
+
374
+ @cachetools.cached(cache=cachetools.TTLCache(maxsize=10,
375
+ ttl=5.0,
376
+ timer=time.time),
377
+ lock=threading.RLock())
176
378
  def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
177
379
  """Retrieve the status of the API server.
178
380
 
@@ -193,35 +395,10 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
193
395
  server_url = endpoint if endpoint is not None else get_server_url()
194
396
  while time_out_try_count <= RETRY_COUNT_ON_TIMEOUT:
195
397
  try:
196
- response = requests.get(f'{server_url}/api/health',
197
- timeout=2.5,
198
- cookies=get_api_cookie_jar())
199
- if response.status_code == 200:
200
- try:
201
- result = response.json()
202
- api_version = result.get('api_version')
203
- version = result.get('version')
204
- version_on_disk = result.get('version_on_disk')
205
- commit = result.get('commit')
206
- server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
207
- api_version=api_version,
208
- version=version,
209
- version_on_disk=version_on_disk,
210
- commit=commit)
211
- if api_version is None or version is None or commit is None:
212
- logger.warning(f'API server response missing '
213
- f'version info. {server_url} may '
214
- f'not be running SkyPilot API server.')
215
- server_info.status = ApiServerStatus.UNHEALTHY
216
- elif api_version != server_constants.API_VERSION:
217
- server_info.status = ApiServerStatus.VERSION_MISMATCH
218
- return server_info
219
- except (json.JSONDecodeError, AttributeError) as e:
220
- logger.warning('Failed to parse API server response: '
221
- f'{str(e)}')
222
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
223
- else:
224
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
398
+ response = make_authenticated_request('GET',
399
+ '/api/health',
400
+ server_url=server_url,
401
+ timeout=2.5)
225
402
  except requests.exceptions.Timeout:
226
403
  if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
227
404
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
@@ -230,20 +407,90 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
230
407
  except requests.exceptions.ConnectionError:
231
408
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
232
409
 
410
+ logger.debug(f'Health check status: {response.status_code}')
411
+
412
+ if response.status_code != 200:
413
+ return _handle_non_200_server_status(response)
414
+
415
+ # The response is 200, so we can parse the response.
416
+ try:
417
+ result = response.json()
418
+ server_status = result.get('status')
419
+ api_version = result.get('api_version')
420
+ version = result.get('version')
421
+ version_on_disk = result.get('version_on_disk')
422
+ commit = result.get('commit')
423
+ user = result.get('user')
424
+ basic_auth_enabled = result.get('basic_auth_enabled')
425
+ server_info = ApiServerInfo(status=ApiServerStatus(server_status),
426
+ api_version=api_version,
427
+ version=version,
428
+ version_on_disk=version_on_disk,
429
+ commit=commit,
430
+ user=user,
431
+ basic_auth_enabled=basic_auth_enabled)
432
+ if api_version is None or version is None or commit is None:
433
+ logger.warning(f'API server response missing '
434
+ f'version info. {server_url} may '
435
+ f'not be running SkyPilot API server.')
436
+ server_info.status = ApiServerStatus.UNHEALTHY
437
+ version_info = versions.check_compatibility_at_client(
438
+ response.headers)
439
+ if version_info is None:
440
+ # Backward compatibility for server prior to v0.11.0 which
441
+ # does not check compatibility at server side.
442
+ # TODO(aylei): remove this after v0.13.0 is released.
443
+ return ApiServerInfo(
444
+ status=ApiServerStatus.VERSION_MISMATCH,
445
+ error=versions.SERVER_TOO_OLD_ERROR.format(
446
+ remote_version=version,
447
+ local_version=versions.get_local_readable_version(),
448
+ min_version=server_constants.MIN_COMPATIBLE_VERSION,
449
+ command=versions.install_version_command(
450
+ version, commit)))
451
+ if version_info.error is not None:
452
+ return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
453
+ error=version_info.error)
454
+
455
+ cookies = get_cookies_from_response(response)
456
+ # Save or refresh the cookie jar in case of session affinity and
457
+ # OAuth.
458
+ set_api_cookie_jar(cookies, create_if_not_exists=True)
459
+ return server_info
460
+ except (requests.JSONDecodeError, AttributeError) as e:
461
+ # Try to check if we got redirected to a login page.
462
+ for prev_response in response.history:
463
+ logger.debug(f'Previous response: {prev_response.url}')
464
+ # Heuristic: check if the url looks like a login page or
465
+ # oauth flow.
466
+ if any(key in prev_response.url for key in ['login', 'oauth2']):
467
+ logger.debug(f'URL {prev_response.url} looks like '
468
+ 'a login page or oauth flow, so try to '
469
+ 'get the cookie.')
470
+ return ApiServerInfo(status=ApiServerStatus.NEEDS_AUTH)
471
+ logger.warning('Failed to parse API server response: '
472
+ f'{str(e)}')
473
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
474
+
233
475
  return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
234
476
 
235
477
 
236
478
def handle_request_error(response: 'requests.Response') -> None:
    """Raise unless ``response`` is a plain HTTP 200.

    Raises:
        requests.HTTPError: Propagated unchanged from
            ``response.raise_for_status()`` for status codes >= 400.
        RuntimeError: For any other non-200 status code, e.g. a redirect,
            which is not expected to be handled here.
    """
    # Keep the original HTTPError if the response code >= 400.
    response.raise_for_status()

    if response.status_code == 200:
        return

    # Any other status code is not expected either.
    with ux_utils.print_exception_no_traceback():
        raise RuntimeError(
            'Failed to process response from SkyPilot API server at '
            f'{response.url}. '
            f'Response: {response.status_code} '
            f'{response.text}')
244
491
 
245
492
 
246
- def get_request_id(response: 'requests.Response') -> RequestId:
493
+ def get_request_id(response: 'requests.Response') -> RequestId[T]:
247
494
  handle_request_error(response)
248
495
  request_id = response.headers.get('X-Skypilot-Request-ID')
249
496
  if request_id is None:
@@ -254,16 +501,33 @@ def get_request_id(response: 'requests.Response') -> RequestId:
254
501
  'Failed to get request ID from SkyPilot API server at '
255
502
  f'{get_server_url()}. Response: {response.status_code} '
256
503
  f'{response.text}')
257
- return request_id
504
+ return RequestId[T](request_id)
505
+
506
+
507
def get_stream_request_id(
        response: 'requests.Response') -> Optional[RequestId[T]]:
    """Read the request ID from a stream response, if one was sent.

    Same idea as ``get_request_id``, but meant for `sdk.stream_and_get`:
    `/api/stream` may choose the latest request ID itself, so the ID is read
    back from the ``server_constants.STREAM_REQUEST_HEADER`` response header.

    Returns:
        The request ID from that header, or None when the header is absent.
    """
    handle_request_error(response)
    header_value = response.headers.get(
        server_constants.STREAM_REQUEST_HEADER)
    return None if header_value is None else RequestId[T](header_value)
258
518
 
259
519
 
260
520
  def _start_api_server(deploy: bool = False,
261
521
  host: str = '127.0.0.1',
262
- foreground: bool = False):
522
+ foreground: bool = False,
523
+ metrics: bool = False,
524
+ metrics_port: Optional[int] = None,
525
+ enable_basic_auth: bool = False):
263
526
  """Starts a SkyPilot API server locally."""
264
527
  server_url = get_server_url(host)
265
528
  assert server_url in AVAILABLE_LOCAL_API_SERVER_URLS, (
266
529
  f'server url {server_url} is not a local url')
530
+
267
531
  with rich_utils.client_status('Starting SkyPilot API server, '
268
532
  f'view logs at {constants.API_SERVER_LOGS}'):
269
533
  logger.info(f'{colorama.Style.DIM}Failed to connect to '
@@ -275,40 +539,71 @@ def _start_api_server(deploy: bool = False,
275
539
  'is not a local URL')
276
540
 
277
541
  # Check available memory before starting the server.
278
- avail_mem_size_gb: float = common_utils.get_mem_size_gb()
279
- if avail_mem_size_gb <= server_constants.MIN_AVAIL_MEM_GB:
280
- logger.warning(
281
- f'{colorama.Fore.YELLOW}Your SkyPilot API server machine only '
282
- f'has {avail_mem_size_gb:.1f}GB memory available. '
283
- f'At least {server_constants.MIN_AVAIL_MEM_GB}GB is '
284
- 'recommended to support higher load with better performance.'
285
- f'{colorama.Style.RESET_ALL}')
542
+ # Skip this warning if postgres is used, as:
543
+ # 1) that's almost certainly a remote API server;
544
+ # 2) the actual consolidation mode config is stashed in the database,
545
+ # and the value of `job_utils.is_consolidation_mode` will not be
546
+ # the actual value in the db, but only None as in this case, the
547
+ # whole YAML config is really just `db: <URI>`.
548
+ if skypilot_config.get_nested(('db',), None) is None:
549
+ avail_mem_size_gb: float = common_utils.get_mem_size_gb()
550
+ # pylint: disable=import-outside-toplevel
551
+ import sky.jobs.utils as job_utils
552
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
553
+ if job_utils.is_consolidation_mode(
554
+ on_api_restart=True) else
555
+ server_constants.MIN_AVAIL_MEM_GB)
556
+ if avail_mem_size_gb <= max_memory:
557
+ logger.warning(
558
+ f'{colorama.Fore.YELLOW}Your SkyPilot API server machine '
559
+ f'only has {avail_mem_size_gb:.1f}GB memory available. '
560
+ f'At least {max_memory}GB is recommended to support higher '
561
+ 'load with better performance.'
562
+ f'{colorama.Style.RESET_ALL}')
286
563
 
287
564
  args = [sys.executable, *API_SERVER_CMD.split()]
288
565
  if deploy:
289
566
  args += ['--deploy']
290
567
  if host is not None:
291
568
  args += [f'--host={host}']
569
+ if metrics_port is not None:
570
+ args += [f'--metrics-port={metrics_port}']
292
571
 
293
572
  if foreground:
294
573
  # Replaces the current process with the API server
295
574
  os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
575
+ _set_metrics_env_var(os.environ, metrics, deploy)
576
+ if enable_basic_auth:
577
+ os.environ[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
296
578
  os.execvp(args[0], args)
297
579
 
298
580
  log_path = os.path.expanduser(constants.API_SERVER_LOGS)
299
581
  os.makedirs(os.path.dirname(log_path), exist_ok=True)
300
- cmd = f'{" ".join(args)} > {log_path} 2>&1 < /dev/null'
301
582
 
583
+ # For spawn mode, copy the environ to avoid polluting the SDK process.
584
+ server_env = os.environ.copy()
585
+ server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
302
586
  # Start the API server process in the background and don't wait for it.
303
587
  # If this is called from a CLI invocation, we need
304
588
  # start_new_session=True so that SIGINT on the CLI will not also kill
305
589
  # the API server.
306
- server_env = os.environ.copy()
307
- server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
308
- proc = subprocess.Popen(cmd,
309
- shell=True,
310
- start_new_session=True,
311
- env=server_env)
590
+ if enable_basic_auth:
591
+ server_env[constants.ENV_VAR_ENABLE_BASIC_AUTH] = 'true'
592
+ _set_metrics_env_var(server_env, metrics, deploy)
593
+ with open(log_path, 'w', encoding='utf-8') as log_file:
594
+ # Because the log file is opened using a with statement, it may seem
595
+ # that the file will be closed when the with statement is exited
596
+ # causing the child process to be unable to write to the log file.
597
+ # However, Popen makes the file descriptor inheritable which means
598
+ # the child process will inherit its own copy of the fd,
599
+ # independent of the parent's fd table, which enables the child
600
+ # process to continue writing to the log file.
601
+ proc = subprocess.Popen(args,
602
+ stdout=log_file,
603
+ stderr=subprocess.STDOUT,
604
+ stdin=subprocess.DEVNULL,
605
+ start_new_session=True,
606
+ env=server_env)
312
607
 
313
608
  start_time = time.time()
314
609
  while True:
@@ -319,6 +614,8 @@ def _start_api_server(deploy: bool = False,
319
614
  'SkyPilot API server process exited unexpectedly.\n'
320
615
  f'View logs at: {constants.API_SERVER_LOGS}')
321
616
  try:
617
+ # Clear the cache to ensure fresh checks during startup
618
+ get_api_server_status.cache_clear() # type: ignore
322
619
  check_server_healthy()
323
620
  except exceptions.APIVersionMismatchError:
324
621
  raise
@@ -337,7 +634,7 @@ def _start_api_server(deploy: bool = False,
337
634
  server_url = get_server_url(host)
338
635
  dashboard_msg = ''
339
636
  api_server_info = get_api_server_status(server_url)
340
- if api_server_info.version == _DEV_VERSION:
637
+ if api_server_info.version == versions.DEV_VERSION:
341
638
  dashboard_msg += (
342
639
  f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
343
640
  f'{colorama.Fore.YELLOW}')
@@ -350,17 +647,40 @@ def _start_api_server(deploy: bool = False,
350
647
  dashboard_msg += (
351
648
  'Dashboard may be stale when installed from source, '
352
649
  'to rebuild: npm --prefix sky/dashboard install '
353
- '&& npm --prefix sky/dashboard run build\n')
354
- dashboard_msg += (
355
- f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
356
- f'Dashboard: {get_dashboard_url(server_url)}')
357
- dashboard_msg += f'{colorama.Style.RESET_ALL}'
650
+ '&& npm --prefix sky/dashboard run build')
358
651
  logger.info(
359
652
  ux_utils.finishing_message(
360
653
  f'SkyPilot API server started. {dashboard_msg}'))
361
654
 
362
655
 
363
- def check_server_healthy(endpoint: Optional[str] = None,) -> None:
656
+ def _set_metrics_env_var(env: Union[Dict[str, str], os._Environ], metrics: bool,
657
+ deploy: bool):
658
+ """Sets the metrics environment variables.
659
+
660
+ Args:
661
+ env: The environment variables to set.
662
+ metrics: Whether to enable metrics.
663
+ deploy: Whether the server is running in deploy mode, which means
664
+ multiple processes might be running.
665
+ """
666
+ del deploy
667
+ if metrics or os.getenv(constants.ENV_VAR_SERVER_METRICS_ENABLED) == 'true':
668
+ env[constants.ENV_VAR_SERVER_METRICS_ENABLED] = 'true'
669
+ # Always set the metrics dir since we need to collect metrics from
670
+ # subprocesses like the executor.
671
+ metrics_dir = os.path.join(tempfile.gettempdir(), 'metrics')
672
+ shutil.rmtree(metrics_dir, ignore_errors=True)
673
+ os.makedirs(metrics_dir, exist_ok=True)
674
+ # Refer to https://prometheus.github.io/client_python/multiprocess/
675
+ env['PROMETHEUS_MULTIPROC_DIR'] = metrics_dir
676
+
677
+
678
+ def check_server_healthy(
679
+ endpoint: Optional[str] = None
680
+ ) -> Tuple[Literal[
681
+ # Use an incomplete list of Literals here to enforce raising for other
682
+ # enum values.
683
+ ApiServerStatus.HEALTHY, ApiServerStatus.NEEDS_AUTH], ApiServerInfo]:
364
684
  """Check if the API server is healthy.
365
685
 
366
686
  Args:
@@ -370,38 +690,21 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
370
690
  Raises:
371
691
  RuntimeError: If the server is not healthy or the client version does
372
692
  not match the server version.
693
+
694
+ Returns:
695
+ A tuple of (ApiServerStatus, ApiServerInfo), unless the server is
696
+ unhealthy or the client version does not match the server version,
697
+ in which case an exception is raised.
373
698
  """
374
699
  endpoint = endpoint if endpoint is not None else get_server_url()
375
700
  api_server_info = get_api_server_status(endpoint)
376
701
  api_server_status = api_server_info.status
377
702
  if api_server_status == ApiServerStatus.VERSION_MISMATCH:
378
- sv = api_server_info.api_version
379
- assert sv is not None, 'Server API version is None'
380
- try:
381
- server_is_older = int(sv) < _LOCAL_API_VERSION
382
- except ValueError:
383
- # Raised when the server version using an unknown scheme.
384
- # Version compatibility checking is expected to handle all legacy
385
- # cases so we safely assume the server is newer when the version
386
- # scheme is unknown.
387
- logger.debug('API server version using unknown scheme: %s', sv)
388
- server_is_older = False
389
- version_info = _get_version_info_hint(api_server_info)
390
- if is_api_server_local():
703
+ msg = api_server_info.error
704
+ if is_api_server_local(endpoint):
391
705
  # For local server, just hint user to restart the server to get
392
706
  # a consistent version.
393
- msg = _LOCAL_SERVER_VERSION_MISMATCH_WARNING.format(
394
- version_info=version_info)
395
- else:
396
- assert api_server_info.version is not None, 'Server version is None'
397
- if server_is_older:
398
- msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
399
- version_info=version_info,
400
- command=_install_server_version_command(api_server_info))
401
- else:
402
- msg = _CLIENT_TOO_OLD_WARNING.format(
403
- version_info=version_info,
404
- command=_install_server_version_command(api_server_info))
707
+ msg = _LOCAL_API_SERVER_RESTART_HINT
405
708
  with ux_utils.print_exception_no_traceback():
406
709
  raise exceptions.APIVersionMismatchError(msg)
407
710
  elif api_server_status == ApiServerStatus.UNHEALTHY:
@@ -432,36 +735,7 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
432
735
 
433
736
  hinted_for_server_install_version_mismatch = True
434
737
 
435
-
436
- def _get_version_info_hint(server_info: ApiServerInfo) -> str:
437
- assert server_info.version is not None, 'Server version is None'
438
- # version_on_disk may be None if the server is older
439
- assert server_info.commit is not None, 'Server commit is None'
440
- sv = server_info.version
441
- cv = sky.__version__
442
- if server_info.version == _DEV_VERSION:
443
- sv = f'{sv} with commit {server_info.commit}'
444
- if cv == _DEV_VERSION:
445
- cv = f'{cv} with commit {sky.__commit__}'
446
- return _VERSION_INFO.format(client_version=cv,
447
- server_version=sv,
448
- client_api_version=server_constants.API_VERSION,
449
- server_api_version=server_info.api_version)
450
-
451
-
452
- def _install_server_version_command(server_info: ApiServerInfo) -> str:
453
- assert server_info.version is not None, 'Server version is None'
454
- assert server_info.commit is not None, 'Server commit is None'
455
- if server_info.version == _DEV_VERSION:
456
- # Dev build without valid version.
457
- return ('pip install git+https://github.com/skypilot-org/skypilot@'
458
- f'{server_info.commit}')
459
- elif 'dev' in server_info.version:
460
- # Nightly version.
461
- return f'pip install -U "skypilot-nightly=={server_info.version}"'
462
- else:
463
- # Stable version.
464
- return f'pip install -U "skypilot=={server_info.version}"'
738
+ return api_server_status, api_server_info
465
739
 
466
740
 
467
741
  # Keep in sync with sky/setup_files/setup.py find_version()
@@ -481,9 +755,17 @@ def get_skypilot_version_on_disk() -> str:
481
755
 
482
756
  def check_server_healthy_or_start_fn(deploy: bool = False,
483
757
  host: str = '127.0.0.1',
484
- foreground: bool = False):
758
+ foreground: bool = False,
759
+ metrics: bool = False,
760
+ metrics_port: Optional[int] = None,
761
+ enable_basic_auth: bool = False):
762
+ api_server_status = None
485
763
  try:
486
- check_server_healthy()
764
+ api_server_status, _ = check_server_healthy()
765
+ if api_server_status == ApiServerStatus.NEEDS_AUTH:
766
+ endpoint = get_server_url()
767
+ with ux_utils.print_exception_no_traceback():
768
+ raise exceptions.ApiServerAuthenticationError(endpoint)
487
769
  except exceptions.ApiServerConnectionError as exc:
488
770
  endpoint = get_server_url()
489
771
  if not is_api_server_local():
@@ -495,19 +777,21 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
495
777
  os.path.expanduser(constants.API_SERVER_CREATION_LOCK_PATH)):
496
778
  # Check again if server is already running. Other processes may
497
779
  # have started the server while we were waiting for the lock.
780
+ get_api_server_status.cache_clear() # type: ignore[attr-defined]
498
781
  api_server_info = get_api_server_status(endpoint)
499
782
  if api_server_info.status == ApiServerStatus.UNHEALTHY:
500
- _start_api_server(deploy, host, foreground)
783
+ _start_api_server(deploy, host, foreground, metrics,
784
+ metrics_port, enable_basic_auth)
501
785
 
502
786
 
503
def check_server_healthy_or_start(func: Callable[P, T]) -> Callable[P, T]:
    """Decorator: run the API server health check / start before ``func``.

    The returned wrapper accepts two extra keyword arguments, ``deploy`` and
    ``host``. They are passed to ``check_server_healthy_or_start_fn`` and are
    not forwarded to ``func``; all other arguments are passed through.
    """

    @functools.wraps(func)
    def _ensure_server_then_call(*args,
                                 deploy: bool = False,
                                 host: str = '127.0.0.1',
                                 **kwargs):
        check_server_healthy_or_start_fn(deploy, host)
        return func(*args, **kwargs)

    # The cast only informs the type checker; at runtime it returns the
    # wrapper unchanged.
    return cast('Callable[P, T]', _ensure_server_then_call)
511
795
 
512
796
 
513
797
  def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
@@ -551,20 +835,21 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
551
835
  return str(client_file_mounts_dir /
552
836
  file_mounts_mapping[original_path].lstrip('/'))
553
837
 
554
- task_configs = common_utils.read_yaml_all(str(client_task_path))
838
+ task_configs = yaml_utils.read_yaml_all(str(client_task_path))
555
839
  for task_config in task_configs:
556
840
  if task_config is None:
557
841
  continue
558
- file_mounts_mapping = task_config.get('file_mounts_mapping', {})
842
+ file_mounts_mapping = task_config.pop('file_mounts_mapping', {})
559
843
  if not file_mounts_mapping:
560
844
  # We did not mount any files to new paths on the remote server
561
845
  # so no need to resolve filepaths.
562
846
  continue
563
847
  if 'workdir' in task_config:
564
848
  workdir = task_config['workdir']
565
- task_config['workdir'] = str(
566
- client_file_mounts_dir /
567
- file_mounts_mapping[workdir].lstrip('/'))
849
+ if isinstance(workdir, str):
850
+ task_config['workdir'] = str(
851
+ client_file_mounts_dir /
852
+ file_mounts_mapping[workdir].lstrip('/'))
568
853
  if workdir_only:
569
854
  continue
570
855
  if 'file_mounts' in task_config:
@@ -603,7 +888,7 @@ def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
603
888
  # We can switch to using string, but this is to make it easier to debug, by
604
889
  # persisting the translated task yaml file.
605
890
  translated_client_task_path = client_dir / f'{task_id}_translated.yaml'
606
- common_utils.dump_yaml(str(translated_client_task_path), task_configs)
891
+ yaml_utils.dump_yaml(str(translated_client_task_path), task_configs)
607
892
 
608
893
  dag = dag_utils.load_chain_dag_from_yaml(str(translated_client_task_path))
609
894
  return dag
@@ -624,25 +909,33 @@ def request_body_to_params(body: 'pydantic.BaseModel') -> Dict[str, Any]:
624
909
 
625
910
  def reload_for_new_request(client_entrypoint: Optional[str],
626
911
  client_command: Optional[str],
627
- using_remote_api_server: bool):
628
- """Reload modules, global variables, and usage message for a new request."""
912
+ using_remote_api_server: bool, user: 'models.User',
913
+ request_id: str) -> None:
914
+ """Reload modules, global variables, and usage message for a new request.
915
+
916
+ Must be called within the request's context.
917
+ """
629
918
  # This should be called first to make sure the logger is up-to-date.
630
919
  sky_logging.reload_logger()
631
920
 
632
921
  # Reload the skypilot config to make sure the latest config is used.
633
- skypilot_config.safe_reload_config()
922
+ # We don't need to grab the lock here because this function is only
923
+ # run once we are inside the request's context, so there shouldn't
924
+ # be any race conditions when reloading the config.
925
+ skypilot_config.reload_config()
634
926
 
635
927
  # Reset the client entrypoint and command for the usage message.
636
- common_utils.set_client_status(
928
+ common_utils.set_request_context(
637
929
  client_entrypoint=client_entrypoint,
638
930
  client_command=client_command,
639
931
  using_remote_api_server=using_remote_api_server,
932
+ user=user,
933
+ request_id=request_id,
640
934
  )
641
935
 
642
936
  # Clear cache should be called before reload_logger and usage reset,
643
937
  # otherwise, the latest env var will not be used.
644
- for func in annotations.FUNCTIONS_NEED_RELOAD_CACHE:
645
- func.cache_clear()
938
+ annotations.clear_request_level_cache()
646
939
 
647
940
  # We need to reset usage message, so that the message is up-to-date with the
648
941
  # latest information in the context, e.g. client entrypoint and run id.
@@ -660,6 +953,7 @@ def clear_local_api_server_database() -> None:
660
953
  db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
661
954
  for extension in ['', '-shm', '-wal']:
662
955
  try:
956
+ logger.debug(f'Removing database file {db_path}{extension}')
663
957
  os.remove(f'{db_path}{extension}')
664
958
  except FileNotFoundError:
665
959
  logger.debug(f'Database file {db_path}{extension} not found.')