skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/rest.py ADDED
@@ -0,0 +1,455 @@
1
+ """REST API client of SkyPilot API server"""
2
+
3
+ import asyncio
4
+ import contextlib
5
+ import contextvars
6
+ import functools
7
+ import html
8
+ import re
9
+ import time
10
+ import typing
11
+ from typing import Any, Callable, cast, Optional, TypeVar
12
+
13
+ import colorama
14
+ import urllib3.exceptions
15
+
16
+ from sky import exceptions
17
+ from sky import sky_logging
18
+ from sky.adaptors import common as adaptors_common
19
+ from sky.server import constants
20
+ from sky.server import versions
21
+ from sky.utils import common_utils
22
+ from sky.utils import rich_utils
23
+ from sky.utils import ux_utils
24
+
25
+ logger = sky_logging.init_logger(__name__)
26
+
27
+ if typing.TYPE_CHECKING:
28
+ import aiohttp
29
+ import requests
30
+
31
+ else:
32
+ aiohttp = adaptors_common.LazyImport('aiohttp')
33
+ requests = adaptors_common.LazyImport('requests')
34
+
35
+ F = TypeVar('F', bound=Callable[..., Any])
36
+
37
+
38
class RetryContext:
    """Mutable state shared between a retry wrapper and the wrapped call.

    Attributes:
        line_processed: Progress counter, starting at 0. The retry wrapper in
            this module compares it across attempts to tell whether the
            wrapped call made progress before failing.
    """

    def __init__(self) -> None:
        self.line_processed: int = 0
42
+
43
+
44
# Context variable holding the RetryContext installed by the retry wrapper
# that is currently executing; None when no retry wrapper is active.
_RETRY_CONTEXT: contextvars.ContextVar[Optional[RetryContext]] = (
    contextvars.ContextVar(
        'retry_context',
        default=None,
    ))
46
+
47
# Module-wide HTTP session used for the synchronous requests in this module.
_session = requests.Session()
# Tune connection pool size, otherwise the default max is just 10.
# Adapter-level retries are disabled: we handle retries by ourselves in SDK.
adapter = requests.adapters.HTTPAdapter(pool_connections=50,
                                        pool_maxsize=200,
                                        max_retries=0)
_session.mount('http://', adapter)
_session.mount('https://', adapter)

# Every request sent through the session advertises the client's API version
# and readable version.
_session.headers.update({
    constants.API_VERSION_HEADER: str(constants.API_VERSION),
    constants.VERSION_HEADER: versions.get_local_readable_version(),
})
61
+
62
# Exception types that might indicate a transient failure, i.e. one that can
# be addressed by simply retrying the call.
_transient_errors = [
    requests.exceptions.RequestException,
    ConnectionError,
    urllib3.exceptions.HTTPError,
]
69
+
70
# Captures the inner text of the first <title> element in an HTML document.
# Matching is case-insensitive and '.' also matches newlines.
_HTML_TITLE_RE = re.compile(
    r'<title[^>]*>(.*?)</title>',
    flags=re.IGNORECASE | re.DOTALL,
)
72
+
73
+
74
@contextlib.contextmanager
def _retry_in_context():
    """Install a fresh RetryContext for the duration of the ``with`` block.

    Yields:
        The newly created RetryContext. The previous value of the context
        variable is restored on exit, even if the body raises.
    """
    fresh_context = RetryContext()
    reset_token = _RETRY_CONTEXT.set(fresh_context)
    try:
        yield fresh_context
    finally:
        _RETRY_CONTEXT.reset(reset_token)
82
+
83
+
84
def get_retry_context() -> Optional[RetryContext]:
    """Return the RetryContext of the active retry wrapper, if any.

    Returns:
        The current value of the retry context variable, which is None when
        no retry context has been installed.
    """
    current_context = _RETRY_CONTEXT.get()
    return current_context
86
+
87
+
88
def retry_transient_errors(max_retries: int = 3,
                           initial_backoff=1,
                           max_backoff_factor=5):
    """Decorator that retries a function when a transient error is caught.

    This decorator is mainly used to decorate idempotent SDK functions to make
    them more robust to transient errors.

    The wrapped function runs inside a fresh RetryContext. If it advances
    ``line_processed`` on that context before failing, the failure counts as
    progress: the backoff and the consecutive-failure counter are reset and
    the next attempt starts without sleeping.

    Args:
        max_retries: Maximum number of consecutive attempts that fail without
            making progress before giving up. The function is always
            attempted at least once.
        initial_backoff: Initial backoff time in seconds
        max_backoff_factor: Maximum backoff factor for exponential backoff

    Raises:
        The last exception raised by the wrapped function, once it is
        classified as permanent or the retries are exhausted.
    """

    def is_transient_error(e: Exception) -> bool:
        if isinstance(e, requests.exceptions.HTTPError):
            response = getattr(e, 'response', None)
            if response is not None:
                # Only server error is considered as transient.
                return response.status_code >= 500
            # No response attached, so the status code is unknown; fall
            # through to the generic type check instead of crashing.
        return isinstance(e, tuple(_transient_errors))

    def decorator(func: F) -> F:

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
            consecutive_failed_count = 0

            with _retry_in_context() as context:
                previous_line_processed = context.line_processed  # should be 0

                def _handle_exception():
                    # If the function made progress on a retry,
                    # clears the backoff and resets the failed retry count.
                    # Otherwise, increments the failed retry count.
                    nonlocal backoff
                    nonlocal consecutive_failed_count
                    nonlocal previous_line_processed
                    if context.line_processed > previous_line_processed:
                        backoff = common_utils.Backoff(initial_backoff,
                                                       max_backoff_factor)
                        previous_line_processed = context.line_processed
                        consecutive_failed_count = 0
                    else:
                        consecutive_failed_count += 1

                # Every exit from this loop is an explicit return or raise,
                # so the wrapper can never fall through and return None.
                while True:
                    try:
                        return func(*args, **kwargs)
                    # Occurs when the server proactively interrupts the request
                    # during rolling update, we can retry immediately on the
                    # new replica.
                    except exceptions.RequestInterruptedError:
                        _handle_exception()
                        if consecutive_failed_count >= max_retries:
                            # Retries exhausted: surface the interruption to
                            # the caller.
                            raise
                        logger.debug('Request interrupted. Retry immediately.')
                        continue
                    except Exception as e:  # pylint: disable=broad-except
                        _handle_exception()
                        if consecutive_failed_count >= max_retries:
                            # Retries exhausted.
                            raise
                        if not is_transient_error(e):
                            # Permanent error, no need to retry.
                            raise
                        logger.debug(
                            f'Retry {func.__name__} due to {e}, '
                            f'attempt {consecutive_failed_count}/{max_retries}')
                        # The counter is 0 only when this attempt made
                        # progress before failing; in that case retry
                        # immediately to reduce the waiting time. Otherwise
                        # back off before the next attempt.
                        if consecutive_failed_count > 0:
                            time.sleep(backoff.current_backoff())

        return cast(F, wrapper)

    return decorator
167
+
168
+
169
def _retry_on_server_unavailable(max_wait_seconds: int = 600,
                                 initial_backoff: float = 5.0,
                                 max_backoff_factor: int = 5):
    """Decorator that waits for the API server to recover.

    The wrapped function is called again, with exponential backoff, every
    time it raises ServerTemporarilyUnavailableError. This is mainly used on
    REST API calls so that they wait while the server is temporarily
    unavailable.

    Args:
        max_wait_seconds: Maximum number of seconds to wait for the server to
            be healthy
        initial_backoff: Initial backoff time in seconds
        max_backoff_factor: Maximum backoff factor for exponential backoff

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the server is still
            unavailable once more than ``max_wait_seconds`` have elapsed.
    """

    def _readable_error_msg(message: str) -> str:
        return (f'{colorama.Fore.YELLOW}API server is temporarily '
                f'unavailable: {message}.\nRetrying...'
                f'{colorama.Style.RESET_ALL}')

    def decorator(func: F) -> F:

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            retry_backoff = common_utils.Backoff(
                initial_backoff=initial_backoff,
                max_backoff_factor=max_backoff_factor)
            started_at = time.time()
            attempt = 0

            with _retry_in_context():
                while True:
                    attempt += 1
                    try:
                        return func(*args, **kwargs)
                    except exceptions.ServerTemporarilyUnavailableError as e:
                        # The spinner is opened only for the waiting period
                        # and closed before func() is called again, so it is
                        # restarted on every retry. This is necessary to
                        # avoid a stuck spinner when func() runs for a long
                        # time without updating the status, e.g. sky logs.
                        spinner_text = _readable_error_msg(e.message)
                        with rich_utils.client_status(spinner_text):
                            waited = time.time() - started_at
                            if waited > max_wait_seconds:
                                timeout_error = (
                                    exceptions.ServerTemporarilyUnavailableError(
                                        'Timeout waiting for the API server '
                                        'to be '
                                        f'available after {max_wait_seconds}s.'
                                    ))
                                raise timeout_error from e

                            sleep_time = retry_backoff.current_backoff()
                            time.sleep(sleep_time)
                            logger.debug('The API server is unavailable. '
                                         f'Retrying {func.__name__} '
                                         f'(attempt {attempt}, '
                                         f'backoff {sleep_time}s).')

        return cast(F, wrapper)

    return decorator
234
+
235
+
236
def handle_server_unavailable(response: 'requests.Response') -> None:
    """Handle 503 (Service Unavailable) error

    The client gets a 503 error in the following cases:
    1. The reverse proxy cannot find any ready backend endpoints to serve the
       request, e.g. when there is a rolling update.
    2. The SkyPilot API server has a temporary resource issue, e.g. when the
       concurrency of the handling process is exhausted.

    We expect the caller (CLI or SDK) to retry in these cases and show a clear
    wait message, so the user can decide whether to keep waiting or abort the
    request.

    Args:
        response: The response to inspect. Any status other than 503 is
            ignored.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the status code is
            503. The message is the ``detail`` field of a JSON object body
            when present, otherwise the text extracted by
            handle_response_text().
    """
    if response.status_code != 503:
        return

    error_msg = ''
    try:
        response_data = response.json()
        if isinstance(response_data, dict):
            error_msg = response_data.get('detail') or ''
    except Exception:  # pylint: disable=broad-except
        # The body is not valid JSON (e.g. an HTML page from a reverse
        # proxy); the plaintext fallback below covers this case.
        pass
    if not error_msg:
        # Also reached when the JSON body carries no usable `detail`, so the
        # user never gets an empty error message if the body has any text.
        error_msg = handle_response_text(response)

    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(error_msg)
263
+
264
+
265
def handle_response_text(response: 'requests.Response') -> str:
    """Extract a human-readable error message from a plaintext response.

    HTML content, which might be returned by a reverse proxy, is special-cased
    to make the message more user-friendly: when a non-empty <title> is
    found, the unescaped title is returned instead of the whole document.

    Args:
        response: A response object exposing ``text`` and ``headers``, or the
            body text itself as a ``str``.

    Returns:
        The HTML title when one can be extracted, otherwise the body text.
        An empty string if there is no body.
    """
    if isinstance(response, str):
        body, response_headers = response, {}
    else:
        body = getattr(response, 'text', '')
        response_headers = getattr(response, 'headers', {}) or {}
        if not isinstance(body, str):
            body = '' if body is None else str(body)
    if not body:
        return ''

    # Trust the Content-Type header first; otherwise sniff the body.
    content_type = response_headers.get('Content-Type', '')
    looks_like_html = (isinstance(content_type, str) and
                       'html' in content_type.lower())
    if not looks_like_html:
        leading = body.lstrip()
        looks_like_html = (leading.startswith('<') and
                           '<title' in leading.lower())
    if not looks_like_html:
        return body

    title_match = _HTML_TITLE_RE.search(body)
    if title_match is None:
        return body
    title = html.unescape(title_match.group(1)).strip()
    # An empty title is not useful; fall back to the full body.
    return title or body
295
+
296
+
297
async def handle_server_unavailable_async(
        response: 'aiohttp.ClientResponse') -> None:
    """Async version: Handle 503 (Service Unavailable) error

    The client gets a 503 error in the following cases:
    1. The reverse proxy cannot find any ready backend endpoints to serve the
       request, e.g. when there is a rolling update.
    2. The SkyPilot API server has a temporary resource issue, e.g. when the
       concurrency of the handling process is exhausted.

    We expect the caller (CLI or SDK) to retry in these cases and show a clear
    wait message, so the user can decide whether to keep waiting or abort the
    request.

    Args:
        response: The response to inspect. Any status other than 503 is
            ignored.

    Raises:
        exceptions.ServerTemporarilyUnavailableError: If the status is 503.
            The message is the ``detail`` field of a JSON object body when
            present, otherwise the text extracted by handle_response_text()
            (which shortens an HTML page to its title).
    """
    if response.status != 503:
        return

    error_msg = ''
    try:
        response_data = await response.json()
        if isinstance(response_data, dict):
            error_msg = response_data.get('detail') or ''
    except Exception:  # pylint: disable=broad-except
        # The body is not JSON (e.g. an HTML page from a reverse proxy); the
        # plaintext fallback below covers this case.
        pass
    if not error_msg:
        try:
            # Same text handling as the sync version, so that HTML error
            # pages are reduced to their title.
            error_msg = handle_response_text(await response.text())
        except Exception:  # pylint: disable=broad-except
            # Best effort only: failing to read the body must not mask the
            # 503 itself, so raise with an empty message instead.
            pass

    with ux_utils.print_exception_no_traceback():
        raise exceptions.ServerTemporarilyUnavailableError(error_msg)
329
+
330
+
331
@_retry_on_server_unavailable()
def request(method, url, **kwargs) -> 'requests.Response':
    """Send a request to the API server, waiting while it is unavailable.

    Delegates to request_without_retry(); the decorator calls it again when
    the server is temporarily unavailable.
    """
    response = request_without_retry(method, url, **kwargs)
    return response
336
+
337
+
338
def request_without_retry(method, url, **kwargs) -> 'requests.Response':
    """Send a single request to the API server, without any retry.

    A 503 response is turned into an exception by
    handle_server_unavailable(). Otherwise the API version and version
    advertised in the response headers, when present, are recorded through
    the ``versions`` module before the response is returned.
    """
    response = _session.request(method, url, **kwargs)
    handle_server_unavailable(response)

    response_headers = response.headers
    server_api_version = response_headers.get(constants.API_VERSION_HEADER)
    if server_api_version is not None:
        versions.set_remote_api_version(int(server_api_version))
    server_version = response_headers.get(constants.VERSION_HEADER)
    if server_version is not None:
        versions.set_remote_version(server_version)
    return response
349
+
350
+
351
+ # Async versions of the above functions
352
+
353
+
354
async def request_async(session: 'aiohttp.ClientSession', method: str, url: str,
                        **kwargs) -> 'aiohttp.ClientResponse':
    """Send an async request to the API server, retrying transient failures.

    Up to three attempts are made. An interrupted request is retried
    immediately; any other error classified as transient is retried after an
    exponential backoff.

    Args:
        session: The aiohttp session used to send the request.
        method: HTTP method.
        url: Request URL.
        **kwargs: Forwarded to request_without_retry_async().

    Returns:
        The response of the first successful attempt.

    Raises:
        The exception of the last attempt once the retries are exhausted, or
        immediately for an error classified as permanent.
    """
    max_retries = 3
    initial_backoff = 1.0
    max_backoff_factor = 5

    backoff = common_utils.Backoff(initial_backoff, max_backoff_factor)
    # Placeholder only; every failure path below overwrites it.
    last_exception: Exception = Exception('Unknown Exception')

    for retry_count in range(max_retries):
        is_last_attempt = retry_count >= max_retries - 1
        try:
            return await request_without_retry_async(session, method, url,
                                                     **kwargs)
        except exceptions.RequestInterruptedError as e:
            last_exception = e
            if is_last_attempt:
                # Retries exhausted: surface the real interruption instead of
                # falling out of the loop with the placeholder exception.
                raise
            logger.debug('Request interrupted. Retry immediately.')
            continue
        except Exception as e:  # pylint: disable=broad-except
            last_exception = e
            if is_last_attempt:
                # Retries exhausted
                raise

            # Check if this is a transient error (similar to sync version logic)
            if not _is_transient_error_async(e):
                # Permanent error, no need to retry
                raise

            logger.debug(f'Retry async request due to {e}, '
                         f'attempt {retry_count + 1}/{max_retries}')
            await asyncio.sleep(backoff.current_backoff())

    # Defensive: unreachable as long as max_retries >= 1, since the last
    # attempt either returns or raises.
    raise last_exception
390
+
391
+
392
async def request_without_retry_async(session: 'aiohttp.ClientSession',
                                      method: str, url: str,
                                      **kwargs) -> 'aiohttp.ClientResponse':
    """Send a single async request to the API server, without any retry.

    Args:
        session: The aiohttp session used to send the request.
        method: HTTP method.
        url: Request URL.
        **kwargs: Forwarded to ``session.request``. ``headers`` may be
            omitted or None; the caller's headers object is never modified.

    Returns:
        The response, after the remote API version and version advertised in
        its headers (when present) have been recorded.

    Raises:
        exceptions.RequestInterruptedError: If the connection could not be
            established or the request timed out.
        exceptions.ServerTemporarilyUnavailableError: If the server answered
            with 503.
        aiohttp.ClientError: Any other client error, unchanged.
    """
    # Add API version headers for compatibility (like sync version does).
    # Work on a copy so that the dict passed in by the caller is not mutated,
    # and so that an explicit `headers=None` is accepted.
    headers = dict(kwargs.get('headers') or {})
    headers[constants.API_VERSION_HEADER] = str(constants.API_VERSION)
    headers[constants.VERSION_HEADER] = versions.get_local_readable_version()
    kwargs['headers'] = headers

    try:
        response = await session.request(method, url, **kwargs)

        # Handle server unavailability (503 status) - same as sync version
        await handle_server_unavailable_async(response)

        # Set remote API version and version from headers - same as sync version
        remote_api_version = response.headers.get(constants.API_VERSION_HEADER)
        remote_version = response.headers.get(constants.VERSION_HEADER)
        if remote_api_version is not None:
            versions.set_remote_api_version(int(remote_api_version))
        if remote_version is not None:
            versions.set_remote_version(remote_version)

        return response

    # Convert aiohttp errors to appropriate SkyPilot exceptions.
    except asyncio.TimeoutError as e:
        # aiohttp reports timeouts as asyncio.TimeoutError (or a subclass
        # such as aiohttp.ServerTimeoutError). aiohttp.ClientTimeout is only
        # the timeout *configuration* object and is never raised.
        raise exceptions.RequestInterruptedError(
            f'Request timeout: {e}') from e
    except aiohttp.ClientConnectorError as e:
        raise exceptions.RequestInterruptedError(
            f'Connection failed: {e}') from e
429
+
430
+
431
def _is_transient_error_async(e: Exception) -> bool:
    """Check if an exception from async request is transient and should be
    retried.

    Mirrors the logic from the sync version's is_transient_error().

    Args:
        e: The exception raised by the async request.

    Returns:
        False only for an aiohttp.ClientResponseError whose status is below
        500; True for every other exception.
    """
    # For response errors, only server error is considered as transient
    # (same as sync version).
    if isinstance(e, aiohttp.ClientResponseError):
        return e.status >= 500

    # Known transient failures: connection errors, timeouts and a temporarily
    # unavailable server. Timeouts are raised as asyncio.TimeoutError;
    # aiohttp.ClientTimeout is a configuration object, not an exception.
    if isinstance(e, (aiohttp.ClientConnectionError, asyncio.TimeoutError,
                      exceptions.ServerTemporarilyUnavailableError)):
        return True

    # It is hard to enumerate all other errors that are transient, e.g.
    # broken pipe, connection refused, etc. Instead, it is safer to assume
    # all other errors might be transient since we only retry for 3 times
    # by default. (Same comment as in sync version)
    return True