skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,7 @@
2
2
  import base64
3
3
  import pickle
4
4
  import typing
5
- from typing import Any, Dict, List, Optional, Tuple
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  from sky import jobs as managed_jobs
8
8
  from sky import models
@@ -56,10 +56,10 @@ def decode_status(
56
56
  clusters = return_value
57
57
  response = []
58
58
  for cluster in clusters:
59
- cluster['handle'] = decode_and_unpickle(cluster['handle'])
59
+ # handle may not always be present in the response.
60
+ if 'handle' in cluster and cluster['handle'] is not None:
61
+ cluster['handle'] = decode_and_unpickle(cluster['handle'])
60
62
  cluster['status'] = status_lib.ClusterStatus(cluster['status'])
61
- cluster['storage_mounts_metadata'] = decode_and_unpickle(
62
- cluster['storage_mounts_metadata'])
63
63
  if 'is_managed' not in cluster:
64
64
  cluster['is_managed'] = False
65
65
  response.append(responses.StatusResponse.model_validate(cluster))
@@ -72,7 +72,7 @@ def decode_status_kubernetes(
72
72
  List[Dict[str, Any]], Optional[str]]
73
73
  ) -> Tuple[List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
74
74
  List[kubernetes_utils.KubernetesSkyPilotClusterInfoPayload],
75
- List[Dict[str, Any]], Optional[str]]:
75
+ List[responses.ManagedJobRecord], Optional[str]]:
76
76
  (encoded_all_clusters, encoded_unmanaged_clusters, all_jobs,
77
77
  context) = return_value
78
78
  all_clusters = []
@@ -85,6 +85,7 @@ def decode_status_kubernetes(
85
85
  cluster['status'] = status_lib.ClusterStatus(cluster['status'])
86
86
  unmanaged_clusters.append(
87
87
  kubernetes_utils.KubernetesSkyPilotClusterInfoPayload(**cluster))
88
+ all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
88
89
  return all_clusters, unmanaged_clusters, all_jobs, context
89
90
 
90
91
 
@@ -101,29 +102,49 @@ def decode_start(return_value: str) -> 'backends.CloudVmRayResourceHandle':
101
102
 
102
103
 
103
104
  @register_decoders('queue')
104
- def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
105
+ def decode_queue(return_value: List[dict],) -> List[responses.ClusterJobRecord]:
105
106
  jobs = return_value
106
107
  for job in jobs:
107
108
  job['status'] = job_lib.JobStatus(job['status'])
108
- return jobs
109
+ return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
109
110
 
110
111
 
111
112
  @register_decoders('jobs.queue')
112
- def decode_jobs_queue(return_value):
113
+ def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
114
+ # To keep backward compatibility with v0.10.2
115
+ return decode_jobs_queue_v2(return_value)
116
+
117
+
118
+ @register_decoders('jobs.queue_v2')
119
+ def decode_jobs_queue_v2(
120
+ return_value
121
+ ) -> Union[Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int],
122
+ List[responses.ManagedJobRecord]]:
113
123
  """Decode jobs queue response.
114
124
 
115
- Supports legacy list, or a dict {jobs, total}.
116
- - Returns list[job]
125
+ Supports legacy list, or a dict {jobs, total, total_no_filter,
126
+ status_counts}.
127
+
128
+ - Returns either list[job] or tuple(list[job], total, status_counts,
129
+ total_no_filter)
117
130
  """
118
- # Case 1: dict shape {jobs, total}
119
- if isinstance(return_value, dict) and 'jobs' in return_value:
131
+ # Case 1: dict shape {jobs, total, total_no_filter, status_counts}
132
+ if isinstance(return_value, dict):
120
133
  jobs = return_value.get('jobs', [])
134
+ total = return_value.get('total', len(jobs))
135
+ total_no_filter = return_value.get('total_no_filter', total)
136
+ status_counts = return_value.get('status_counts', {})
137
+ for job in jobs:
138
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
139
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
140
+ return jobs, total, status_counts, total_no_filter
121
141
  else:
122
142
  # Case 2: legacy list
123
143
  jobs = return_value
124
- for job in jobs:
125
- job['status'] = managed_jobs.ManagedJobStatus(job['status'])
126
- return jobs
144
+ for job in jobs:
145
+ job['status'] = managed_jobs.ManagedJobStatus(job['status'])
146
+ jobs = [responses.ManagedJobRecord(**job) for job in jobs]
147
+ return jobs
127
148
 
128
149
 
129
150
  def _decode_serve_status(
@@ -175,14 +196,24 @@ def decode_list_accelerators(
175
196
 
176
197
  @register_decoders('storage_ls')
177
198
  def decode_storage_ls(
178
- return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
199
+ return_value: List[Dict[str, Any]]) -> List[responses.StorageRecord]:
179
200
  for storage_info in return_value:
180
201
  storage_info['status'] = status_lib.StorageStatus(
181
202
  storage_info['status'])
182
203
  storage_info['store'] = [
183
204
  storage.StoreType(store) for store in storage_info['store']
184
205
  ]
185
- return return_value
206
+ return [
207
+ responses.StorageRecord(**storage_info) for storage_info in return_value
208
+ ]
209
+
210
+
211
+ @register_decoders('volume_list')
212
+ def decode_volume_list(
213
+ return_value: List[Dict[str, Any]]) -> List[responses.VolumeRecord]:
214
+ return [
215
+ responses.VolumeRecord(**volume_info) for volume_info in return_value
216
+ ]
186
217
 
187
218
 
188
219
  @register_decoders('job_status')
@@ -6,8 +6,10 @@ import base64
6
6
  import dataclasses
7
7
  import pickle
8
8
  import typing
9
- from typing import Any, Dict, List, Optional, Tuple
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
10
 
11
+ from sky import models
12
+ from sky.catalog import common
11
13
  from sky.schemas.api import responses
12
14
  from sky.server import constants as server_constants
13
15
  from sky.utils import serialize_utils
@@ -15,7 +17,6 @@ from sky.utils import serialize_utils
15
17
  if typing.TYPE_CHECKING:
16
18
  from sky import backends
17
19
  from sky import clouds
18
- from sky import models
19
20
  from sky.provision.kubernetes import utils as kubernetes_utils
20
21
 
21
22
  handlers: Dict[str, Any] = {}
@@ -60,13 +61,23 @@ def encode_status(
60
61
  clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
61
62
  response = []
62
63
  for cluster in clusters:
63
- response_cluster = cluster.model_dump()
64
+ response_cluster = cluster.model_dump(exclude_none=True)
65
+ # These default setting is needed because last_use and status_updated_at
66
+ # used to be not optional.
67
+ # TODO(syang): remove this after v0.12.0
68
+ if 'last_use' not in response_cluster:
69
+ response_cluster['last_use'] = ''
70
+ if 'status_updated_at' not in response_cluster:
71
+ response_cluster['status_updated_at'] = 0
64
72
  response_cluster['status'] = cluster['status'].value
65
73
  handle = serialize_utils.prepare_handle_for_backwards_compatibility(
66
74
  cluster['handle'])
67
75
  response_cluster['handle'] = pickle_and_encode(handle)
76
+ # TODO (syang) We still need to return this field for backwards
77
+ # compatibility.
78
+ # Remove this field at or after v0.12.0
68
79
  response_cluster['storage_mounts_metadata'] = pickle_and_encode(
69
- response_cluster['storage_mounts_metadata'])
80
+ None) # Always returns None.
70
81
  response.append(response_cluster)
71
82
  return response
72
83
 
@@ -92,10 +103,14 @@ def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
92
103
 
93
104
 
94
105
  @register_encoder('queue')
95
- def encode_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
106
+ def encode_queue(
107
+ jobs: List[responses.ClusterJobRecord],) -> List[Dict[str, Any]]:
108
+ response = []
96
109
  for job in jobs:
97
- job['status'] = job['status'].value
98
- return jobs
110
+ response_job = job.model_dump()
111
+ response_job['status'] = job['status'].value
112
+ response.append(response_job)
113
+ return response
99
114
 
100
115
 
101
116
  @register_encoder('status_kubernetes')
@@ -103,7 +118,7 @@ def encode_status_kubernetes(
103
118
  return_value: Tuple[
104
119
  List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
105
120
  List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
106
- List[Dict[str, Any]], Optional[str]]
121
+ List[responses.ManagedJobRecord], Optional[str]]
107
122
  ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]],
108
123
  Optional[str]]:
109
124
  all_clusters, unmanaged_clusters, all_jobs, context = return_value
@@ -117,13 +132,22 @@ def encode_status_kubernetes(
117
132
  encoded_cluster = dataclasses.asdict(cluster)
118
133
  encoded_cluster['status'] = encoded_cluster['status'].value
119
134
  encoded_unmanaged_clusters.append(encoded_cluster)
135
+ all_jobs = [job.model_dump(by_alias=True) for job in all_jobs]
120
136
  return encoded_all_clusters, encoded_unmanaged_clusters, all_jobs, context
121
137
 
122
138
 
123
139
  @register_encoder('jobs.queue')
124
- def encode_jobs_queue(jobs_or_tuple):
140
+ def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
141
+ for job in jobs:
142
+ job['status'] = job['status'].value
143
+ return jobs
144
+
145
+
146
+ @register_encoder('jobs.queue_v2')
147
+ def encode_jobs_queue_v2(
148
+ jobs_or_tuple) -> Union[List[Dict[str, Any]], Dict[str, Any]]:
125
149
  # Support returning either a plain jobs list or a (jobs, total) tuple
126
- status_counts = {}
150
+ status_counts: Dict[str, int] = {}
127
151
  if isinstance(jobs_or_tuple, tuple):
128
152
  if len(jobs_or_tuple) == 2:
129
153
  jobs, total = jobs_or_tuple
@@ -135,12 +159,13 @@ def encode_jobs_queue(jobs_or_tuple):
135
159
  else:
136
160
  jobs = jobs_or_tuple
137
161
  total = None
138
- for job in jobs:
162
+ jobs_dict = [job.model_dump(by_alias=True) for job in jobs]
163
+ for job in jobs_dict:
139
164
  job['status'] = job['status'].value
140
165
  if total is None:
141
- return jobs
166
+ return jobs_dict
142
167
  return {
143
- 'jobs': jobs,
168
+ 'jobs': jobs_dict,
144
169
  'total': total,
145
170
  'total_no_filter': total_no_filter,
146
171
  'status_counts': status_counts
@@ -177,8 +202,9 @@ def encode_cost_report(
177
202
  for cluster_report in cost_report:
178
203
  if cluster_report['status'] is not None:
179
204
  cluster_report['status'] = cluster_report['status'].value
180
- cluster_report['resources'] = pickle_and_encode(
181
- cluster_report['resources'])
205
+ if 'resources' in cluster_report:
206
+ cluster_report['resources'] = pickle_and_encode(
207
+ cluster_report['resources'])
182
208
  return cost_report
183
209
 
184
210
 
@@ -190,19 +216,26 @@ def encode_enabled_clouds(clouds: List['clouds.Cloud']) -> List[str]:
190
216
 
191
217
  @register_encoder('storage_ls')
192
218
  def encode_storage_ls(
193
- return_value: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
194
- for storage_info in return_value:
219
+ return_value: List[responses.StorageRecord]) -> List[Dict[str, Any]]:
220
+ response_list = [storage_info.model_dump() for storage_info in return_value]
221
+ for storage_info in response_list:
195
222
  storage_info['status'] = storage_info['status'].value
196
223
  storage_info['store'] = [store.value for store in storage_info['store']]
197
- return return_value
224
+ return response_list
225
+
226
+
227
+ @register_encoder('volume_list')
228
+ def encode_volume_list(
229
+ return_value: List[responses.VolumeRecord]) -> List[Dict[str, Any]]:
230
+ return [volume_info.model_dump() for volume_info in return_value]
198
231
 
199
232
 
200
233
  @register_encoder('job_status')
201
- def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
234
+ def encode_job_status(return_value: Dict[int, Any]) -> Dict[str, str]:
202
235
  for job_id in return_value.keys():
203
236
  if return_value[job_id] is not None:
204
237
  return_value[job_id] = return_value[job_id].value
205
- return return_value
238
+ return {str(k): v for k, v in return_value.items()}
206
239
 
207
240
 
208
241
  @register_encoder('kubernetes_node_info')
@@ -214,3 +247,35 @@ def encode_kubernetes_node_info(
214
247
  @register_encoder('endpoints')
215
248
  def encode_endpoints(return_value: Dict[int, str]) -> Dict[str, str]:
216
249
  return {str(k): v for k, v in return_value.items()}
250
+
251
+
252
+ @register_encoder('realtime_kubernetes_gpu_availability')
253
+ def encode_realtime_gpu_availability(
254
+ return_value: List[Tuple[str,
255
+ List[Any]]]) -> List[Tuple[str, List[List[Any]]]]:
256
+ # Convert RealtimeGpuAvailability namedtuples to lists
257
+ # for JSON serialization.
258
+ encoded = []
259
+ for context, gpu_list in return_value:
260
+ converted_gpu_list = []
261
+ for gpu in gpu_list:
262
+ assert isinstance(gpu, models.RealtimeGpuAvailability), (
263
+ f'Expected RealtimeGpuAvailability, got {type(gpu)}')
264
+ converted_gpu_list.append(list(gpu))
265
+ encoded.append((context, converted_gpu_list))
266
+ return encoded
267
+
268
+
269
+ @register_encoder('list_accelerators')
270
+ def encode_list_accelerators(
271
+ return_value: Dict[str, List[Any]]) -> Dict[str, Any]:
272
+ encoded: Dict[str, Any] = {}
273
+ for accelerator_name, instances in return_value.items():
274
+ # Convert InstanceTypeInfo namedtuples to lists for JSON serialization.
275
+ converted_instances: List[Any] = []
276
+ for instance in instances:
277
+ assert isinstance(instance, common.InstanceTypeInfo), (
278
+ f'Expected InstanceTypeInfo, got {type(instance)}')
279
+ converted_instances.append(list(instance))
280
+ encoded[accelerator_name] = converted_instances
281
+ return encoded
@@ -0,0 +1,117 @@
1
+ """Request execution threads management."""
2
+
3
+ import concurrent.futures
4
+ import sys
5
+ import threading
6
+ from typing import Callable, Set, TypeVar
7
+
8
+ from sky import exceptions
9
+ from sky import sky_logging
10
+ from sky.utils import atomic
11
+
12
+ # pylint: disable=ungrouped-imports
13
+ if sys.version_info >= (3, 10):
14
+ from typing import ParamSpec
15
+ else:
16
+ from typing_extensions import ParamSpec
17
+
18
+ _P = ParamSpec('_P')
19
+ _T = TypeVar('_T')
20
+
21
+ logger = sky_logging.init_logger(__name__)
22
+
23
+
24
+ class OnDemandThreadExecutor(concurrent.futures.Executor):
25
+ """An executor that creates a new thread for each task and destroys it
26
+ after the task is completed.
27
+
28
+ Note(dev):
29
+ We raise an error instead of queuing the request if the limit is reached, so
30
+ that:
31
+ 1. the request might be handled by other processes that have idle workers
32
+ upon retry;
33
+ 2. if not, then users can be clearly hinted that they need to scale the API
34
+ server to support higher concurrency.
35
+ So this executor is only suitable for carefully selected cases where the
36
+ error can be properly handled by caller. To make this executor general, we
37
+ need to support configuring the queuing behavior (exception or queueing).
38
+ """
39
+
40
+ def __init__(self, name: str, max_workers: int):
41
+ self.name: str = name
42
+ self.max_workers: int = max_workers
43
+ self.running: atomic.AtomicInt = atomic.AtomicInt(0)
44
+ self._shutdown: bool = False
45
+ self._shutdown_lock: threading.Lock = threading.Lock()
46
+ self._threads: Set[threading.Thread] = set()
47
+ self._threads_lock: threading.Lock = threading.Lock()
48
+
49
+ def _cleanup_thread(self, thread: threading.Thread):
50
+ with self._threads_lock:
51
+ self._threads.discard(thread)
52
+
53
+ def _task_wrapper(self, fn: Callable, fut: concurrent.futures.Future, /,
54
+ *args, **kwargs):
55
+ try:
56
+ result = fn(*args, **kwargs)
57
+ fut.set_result(result)
58
+ except Exception as e: # pylint: disable=broad-except
59
+ logger.debug(f'Executor [{self.name}] error executing {fn}: {e}')
60
+ fut.set_exception(e)
61
+ finally:
62
+ self.running.decrement()
63
+ self._cleanup_thread(threading.current_thread())
64
+
65
+ def check_available(self, borrow: bool = False) -> int:
66
+ """Check if there are available workers.
67
+
68
+ Args:
69
+ borrow: If True, the caller borrows a worker from the executor.
70
+ The caller is responsible for returning the worker to the
71
+ executor after the task is completed.
72
+ """
73
+ count = self.running.increment()
74
+ if count > self.max_workers:
75
+ self.running.decrement()
76
+ raise exceptions.ConcurrentWorkerExhaustedError(
77
+ f'Maximum concurrent workers {self.max_workers} of threads '
78
+ f'executor [{self.name}] reached')
79
+ if not borrow:
80
+ self.running.decrement()
81
+ return count
82
+
83
+ def submit(self, fn: Callable[_P, _T], *args: _P.args,
84
+ **kwargs: _P.kwargs) -> 'concurrent.futures.Future[_T]':
85
+ with self._shutdown_lock:
86
+ if self._shutdown:
87
+ raise RuntimeError(
88
+ 'Cannot submit task after executor is shutdown')
89
+ count = self.check_available(borrow=True)
90
+ fut: concurrent.futures.Future = concurrent.futures.Future()
91
+ # Name is assigned for debugging purpose, duplication is fine
92
+ thread = threading.Thread(target=self._task_wrapper,
93
+ name=f'{self.name}-{count}',
94
+ args=(fn, fut, *args),
95
+ kwargs=kwargs,
96
+ daemon=True)
97
+ with self._threads_lock:
98
+ self._threads.add(thread)
99
+ try:
100
+ thread.start()
101
+ except Exception as e:
102
+ self.running.decrement()
103
+ self._cleanup_thread(thread)
104
+ fut.set_exception(e)
105
+ raise
106
+ assert thread.ident is not None, 'Thread should be started'
107
+ return fut
108
+
109
+ def shutdown(self, wait=True):
110
+ with self._shutdown_lock:
111
+ self._shutdown = True
112
+ if not wait:
113
+ return
114
+ with self._threads_lock:
115
+ threads = list(self._threads)
116
+ for t in threads:
117
+ t.join()
sky/server/rest.py CHANGED
@@ -4,11 +4,14 @@ import asyncio
4
4
  import contextlib
5
5
  import contextvars
6
6
  import functools
7
+ import html
8
+ import re
7
9
  import time
8
10
  import typing
9
11
  from typing import Any, Callable, cast, Optional, TypeVar
10
12
 
11
13
  import colorama
14
+ import urllib3.exceptions
12
15
 
13
16
  from sky import exceptions
14
17
  from sky import sky_logging
@@ -31,7 +34,15 @@ else:
31
34
 
32
35
  F = TypeVar('F', bound=Callable[..., Any])
33
36
 
34
- _RETRY_CONTEXT = contextvars.ContextVar('retry_context', default=None)
37
+
38
+ class RetryContext:
39
+
40
+ def __init__(self):
41
+ self.line_processed = 0
42
+
43
+
44
+ _RETRY_CONTEXT: contextvars.ContextVar[Optional[RetryContext]] = (
45
+ contextvars.ContextVar('retry_context', default=None))
35
46
 
36
47
  _session = requests.Session()
37
48
  # Tune connection pool size, otherwise the default max is just 10.
@@ -53,13 +64,11 @@ _session.headers[constants.VERSION_HEADER] = (
53
64
  _transient_errors = [
54
65
  requests.exceptions.RequestException,
55
66
  ConnectionError,
67
+ urllib3.exceptions.HTTPError,
56
68
  ]
57
69
 
58
-
59
- class RetryContext:
60
-
61
- def __init__(self):
62
- self.line_processed = 0
70
+ _HTML_TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>',
71
+ re.IGNORECASE | re.DOTALL)
63
72
 
64
73
 
65
74
  @contextlib.contextmanager
@@ -176,14 +185,16 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
176
185
  Notes(dev):
177
186
  """
178
187
 
188
+ def _readable_error_msg(message: str) -> str:
189
+ return (f'{colorama.Fore.YELLOW}API server is temporarily '
190
+ f'unavailable: {message}.\nRetrying...'
191
+ f'{colorama.Style.RESET_ALL}')
192
+
179
193
  def decorator(func: F) -> F:
180
194
 
181
195
  @functools.wraps(func)
182
196
  def wrapper(*args, **kwargs) -> Any:
183
- msg = (
184
- f'{colorama.Fore.YELLOW}API server is temporarily unavailable: '
185
- 'upgrade in progress. Waiting to resume...'
186
- f'{colorama.Style.RESET_ALL}')
197
+
187
198
  backoff = common_utils.Backoff(
188
199
  initial_backoff=initial_backoff,
189
200
  max_backoff_factor=max_backoff_factor)
@@ -201,7 +212,8 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
201
212
  # stop the status spinner before retrying func() to
202
213
  # avoid the status spinner get stuck if the func() runs
203
214
  # for a long time without update status, e.g. sky logs.
204
- with rich_utils.client_status(msg):
215
+ with rich_utils.client_status(
216
+ _readable_error_msg(e.message)):
205
217
  if time.time() - start_time > max_wait_seconds:
206
218
  # pylint: disable=line-too-long
207
219
  raise exceptions.ServerTemporarilyUnavailableError(
@@ -222,14 +234,98 @@ def _retry_on_server_unavailable(max_wait_seconds: int = 600,
222
234
 
223
235
 
224
236
  def handle_server_unavailable(response: 'requests.Response') -> None:
225
- if response.status_code == 503:
226
- # TODO(aylei): Hacky, depends on how nginx controller handles backends
227
- # with no ready endpoints. Should use self-defined status code or header
228
- # to distinguish retryable server error from general 503 errors.
229
- with ux_utils.print_exception_no_traceback():
230
- raise exceptions.ServerTemporarilyUnavailableError(
231
- 'SkyPilot API server is temporarily unavailable. '
232
- 'Please try again later.')
237
+ """Handle 503 (Service Unavailable) error
238
+
239
+ The client gets a 503 error in the following cases:
240
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
241
+ request, e.g. when there is a rolling update.
242
+ 2. The skypilot API server has a temporary resource issue, e.g. when the
243
+ concurrency of the handling process is exhausted.
244
+
245
+ We expect the caller (CLI or SDK) to retry in these cases and show a clear
246
+ wait message so that the user can decide whether to keep waiting or abort
247
+ the request.
248
+ """
249
+ if response.status_code != 503:
250
+ return
251
+
252
+ # error_msg = 'SkyPilot API server is temporarily unavailable. '
253
+ error_msg = ''
254
+ try:
255
+ response_data = response.json()
256
+ if 'detail' in response_data:
257
+ error_msg = response_data['detail']
258
+ except Exception: # pylint: disable=broad-except
259
+ error_msg = handle_response_text(response)
260
+
261
+ with ux_utils.print_exception_no_traceback():
262
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
263
+
264
+
265
+ def handle_response_text(response: 'requests.Response') -> str:
266
+ """Handle the plaintext response to get the error message
267
+
268
+ There is special handling for HTML content, which might be returned
269
+ by the reverse proxy to make the error message more user-friendly.
270
+ """
271
+ error_msg = ''
272
+ if isinstance(response, str):
273
+ text, headers = response, {}
274
+ else:
275
+ text = getattr(response, 'text', '')
276
+ headers = getattr(response, 'headers', {}) or {}
277
+ if not isinstance(text, str):
278
+ text = str(text) if text is not None else ''
279
+ if not text:
280
+ return ''
281
+ content_type = headers.get('Content-Type', '')
282
+ is_html = isinstance(content_type, str) and 'html' in (content_type.lower())
283
+ if not is_html:
284
+ stripped = text.lstrip()
285
+ is_html = stripped.startswith('<') and '<title' in stripped.lower()
286
+ if is_html:
287
+ match = _HTML_TITLE_RE.search(text)
288
+ if match:
289
+ title = html.unescape(match.group(1)).strip()
290
+ if title:
291
+ error_msg = title
292
+ if not error_msg:
293
+ error_msg = text
294
+ return error_msg
295
+
296
+
297
+ async def handle_server_unavailable_async(
298
+ response: 'aiohttp.ClientResponse') -> None:
299
+ """Async version: Handle 503 (Service Unavailable) error
300
+
301
+ The client gets a 503 error in the following cases:
302
+ 1. The reverse proxy cannot find any ready backend endpoints to serve the
303
+ request, e.g. when there is a rolling update.
304
+ 2. The skypilot API server has a temporary resource issue, e.g. when the
305
+ concurrency of the handling process is exhausted.
306
+
307
+ We expect the caller (CLI or SDK) to retry in these cases and show a clear
307
+ wait message so that the user can decide whether to keep waiting or abort
308
+ the request.
310
+ """
311
+ if response.status != 503:
312
+ return
313
+
314
+ error_msg = ''
315
+ try:
316
+ response_data = await response.json()
317
+ if 'detail' in response_data:
318
+ error_msg = response_data['detail']
319
+ except Exception: # pylint: disable=broad-except
320
+ try:
321
+ text = await response.text()
322
+ if text:
323
+ error_msg = text
324
+ except Exception: # pylint: disable=broad-except
325
+ pass
326
+
327
+ with ux_utils.print_exception_no_traceback():
328
+ raise exceptions.ServerTemporarilyUnavailableError(error_msg)
233
329
 
234
330
 
235
331
  @_retry_on_server_unavailable()
@@ -308,11 +404,7 @@ async def request_without_retry_async(session: 'aiohttp.ClientSession',
308
404
  response = await session.request(method, url, **kwargs)
309
405
 
310
406
  # Handle server unavailability (503 status) - same as sync version
311
- if response.status == 503:
312
- with ux_utils.print_exception_no_traceback():
313
- raise exceptions.ServerTemporarilyUnavailableError(
314
- 'SkyPilot API server is temporarily unavailable. '
315
- 'Please try again later.')
407
+ await handle_server_unavailable_async(response)
316
408
 
317
409
  # Set remote API version and version from headers - same as sync version
318
410
  remote_api_version = response.headers.get(constants.API_VERSION_HEADER)