skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -4,11 +4,12 @@ import enum
4
4
  import itertools
5
5
  import json
6
6
  import math
7
- import re
8
7
  import typing
9
- from typing import Dict, List, Optional, Set, Union
8
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
10
9
 
11
10
  from sky import skypilot_config
11
+ from sky.skylet import constants
12
+ from sky.utils import common_utils
12
13
  from sky.utils import registry
13
14
  from sky.utils import ux_utils
14
15
 
@@ -50,6 +51,48 @@ class DiskTier(enum.Enum):
50
51
  return types.index(self) <= types.index(other)
51
52
 
52
53
 
54
+ class NetworkTier(enum.Enum):
55
+ """All network tiers supported by SkyPilot."""
56
+ STANDARD = 'standard'
57
+ BEST = 'best'
58
+
59
+ @classmethod
60
+ def supported_tiers(cls) -> List[str]:
61
+ return [tier.value for tier in cls]
62
+
63
+ @classmethod
64
+ def cli_help_message(cls) -> str:
65
+ return (
66
+ f'Network tier. Could be one of {", ".join(cls.supported_tiers())}'
67
+ f'. If {cls.BEST.value} is specified, use the best network tier '
68
+ 'available on the specified instance. '
69
+ f'Default: {cls.STANDARD.value}')
70
+
71
+ @classmethod
72
+ def from_str(cls, tier: str) -> 'NetworkTier':
73
+ if tier not in cls.supported_tiers():
74
+ raise ValueError(f'Invalid network tier: {tier}')
75
+ return cls(tier)
76
+
77
+ def __le__(self, other: 'NetworkTier') -> bool:
78
+ types = list(NetworkTier)
79
+ return types.index(self) <= types.index(other)
80
+
81
+
82
+ class StorageType(enum.Enum):
83
+ """Storage type."""
84
+ # Durable network storage, e.g. GCP persistent disks
85
+ NETWORK = 'network'
86
+ # Local instance storage, e.g. GCP local SSDs
87
+ INSTANCE = 'instance'
88
+
89
+
90
+ class DiskAttachMode(enum.Enum):
91
+ """Disk attach mode."""
92
+ READ_ONLY = 'read_only'
93
+ READ_WRITE = 'read_write'
94
+
95
+
53
96
  @dataclasses.dataclass
54
97
  class ClusterName:
55
98
  display_name: str
@@ -138,35 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
138
181
 
139
182
 
140
183
  def format_resource(resource: 'resources_lib.Resources',
141
- simplify: bool = False) -> str:
142
- if simplify:
143
- cloud = resource.cloud
144
- if resource.accelerators is None:
145
- vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
146
- resource.instance_type)
147
- hardware = f'vCPU={int(vcpu)}'
148
- else:
149
- hardware = f'{resource.accelerators}'
150
- spot = '[Spot]' if resource.use_spot else ''
151
- return f'{cloud}({spot}{hardware})'
184
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
185
+ resource = resource.assert_launchable()
186
+ is_k8s = str(resource.cloud).lower() == 'kubernetes'
187
+ if resource.accelerators is None or is_k8s or not simplified_only:
188
+ vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
189
+ resource.instance_type)
190
+
191
+ elements_simple = []
192
+ elements_full = []
193
+
194
+ if resource.accelerators is not None:
195
+ acc, count = list(resource.accelerators.items())[0]
196
+ elements_simple.append(f'gpus={acc}:{count}')
197
+ elements_full.append(f'gpus={acc}:{count}')
198
+
199
+ if (resource.accelerators is None or is_k8s):
200
+ if vcpu is not None:
201
+ elements_simple.append(f'cpus={int(vcpu)}')
202
+ elements_full.append(f'cpus={int(vcpu)}')
203
+ if mem is not None:
204
+ elements_simple.append(f'mem={int(mem)}')
205
+ elements_full.append(f'mem={int(mem)}')
206
+ elif not simplified_only:
207
+ if vcpu is not None:
208
+ elements_full.append(f'cpus={int(vcpu)}')
209
+ if mem is not None:
210
+ elements_full.append(f'mem={int(mem)}')
211
+
212
+ if not is_k8s:
213
+ instance_type_full = resource.instance_type
214
+ instance_type_simple = common_utils.truncate_long_string(
215
+ instance_type_full, 15)
216
+ elements_simple.append(instance_type_simple)
217
+ elements_full.append(instance_type_full)
218
+ elements_simple.append('...')
219
+ if not simplified_only:
220
+ image_id = resource.image_id
221
+ if image_id is not None:
222
+ if None in image_id:
223
+ elements_full.append(f'image_id={image_id[None]}')
224
+ else:
225
+ elements_full.append(f'image_id={image_id}')
226
+ elements_full.append(f'disk={resource.disk_size}')
227
+ disk_tier = resource.disk_tier
228
+ if disk_tier is not None:
229
+ elements_full.append(f'disk_tier={disk_tier.value}')
230
+ ports = resource.ports
231
+ if ports is not None:
232
+ elements_full.append(f'ports={ports}')
233
+
234
+ spot = '[spot]' if resource.use_spot else ''
235
+ resources_str_simple = (
236
+ f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
237
+ if simplified_only:
238
+ return resources_str_simple, None
152
239
  else:
153
- # accelerator_args is way too long.
154
- # Convert from:
155
- # GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long
156
- # to:
157
- # GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
158
- pattern = ', accelerator_args={.*}'
159
- launched_resource_str = re.sub(pattern, '...', str(resource))
160
- return launched_resource_str
161
-
162
-
163
- def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
164
- simplify: bool = False) -> str:
240
+ resources_str_full = (
241
+ f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
242
+ return resources_str_simple, resources_str_full
243
+
244
+
245
+ def get_readable_resources_repr(
246
+ handle: 'backends.CloudVmRayResourceHandle',
247
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
248
+ resource_str_simple, resource_str_full = format_resource(
249
+ handle.launched_resources, simplified_only)
250
+ if not simplified_only:
251
+ assert resource_str_full is not None
165
252
  if (handle.launched_nodes is not None and
166
253
  handle.launched_resources is not None):
167
- return (f'{handle.launched_nodes}x '
168
- f'{format_resource(handle.launched_resources, simplify)}')
169
- return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
254
+ return (f'{handle.launched_nodes}x{resource_str_simple}',
255
+ None if simplified_only else
256
+ f'{handle.launched_nodes}x{resource_str_full}')
257
+ return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
258
+ _DEFAULT_MESSAGE_HANDLE_INITIALIZING)
170
259
 
171
260
 
172
261
  def make_ray_custom_resources_str(
@@ -208,10 +297,18 @@ def need_to_query_reservations() -> bool:
208
297
  clouds that do not use reservations.
209
298
  """
210
299
  for cloud_str in registry.CLOUD_REGISTRY.keys():
211
- cloud_specific_reservations = skypilot_config.get_nested(
212
- (cloud_str, 'specific_reservations'), None)
213
- cloud_prioritize_reservations = skypilot_config.get_nested(
214
- (cloud_str, 'prioritize_reservations'), False)
300
+ cloud_specific_reservations = (
301
+ skypilot_config.get_effective_region_config(
302
+ cloud=cloud_str,
303
+ region=None,
304
+ keys=('specific_reservations',),
305
+ default_value=None))
306
+ cloud_prioritize_reservations = (
307
+ skypilot_config.get_effective_region_config(
308
+ cloud=cloud_str,
309
+ region=None,
310
+ keys=('prioritize_reservations',),
311
+ default_value=False))
215
312
  if (cloud_specific_reservations is not None or
216
313
  cloud_prioritize_reservations):
217
314
  return True
@@ -248,6 +345,7 @@ def make_launchables_for_valid_region_zones(
248
345
  launchables = []
249
346
  regions = launchable_resources.get_valid_regions_for_launchable()
250
347
  for region in regions:
348
+ assert launchable_resources.cloud is not None, 'Cloud must be specified'
251
349
  optimize_by_zone = (override_optimize_by_zone or
252
350
  launchable_resources.cloud.optimize_by_zone())
253
351
  # It is possible that we force the optimize_by_zone but some clouds
@@ -266,3 +364,122 @@ def make_launchables_for_valid_region_zones(
266
364
  # Batch the requests at the granularity of a single region.
267
365
  launchables.append(launchable_resources.copy(region=region.name))
268
366
  return launchables
367
+
368
+
369
+ def parse_memory_resource(resource_qty_str: Union[str, int, float],
370
+ field_name: str,
371
+ ret_type: type = int,
372
+ unit: str = 'gb',
373
+ allow_plus: bool = False,
374
+ allow_x: bool = False,
375
+ allow_rounding: bool = False) -> str:
376
+ """Returns memory size in chosen units given a resource quantity string.
377
+
378
+ Args:
379
+ resource_qty_str: Resource quantity string
380
+ unit: Unit to convert to
381
+ allow_plus: Whether to allow '+' prefix
382
+ allow_x: Whether to allow 'x' suffix
383
+ """
384
+ assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
385
+
386
+ error_msg = (f'"{field_name}" field should be a '
387
+ f'{constants.MEMORY_SIZE_PATTERN}+?,'
388
+ f' got {resource_qty_str}')
389
+
390
+ resource_str = str(resource_qty_str)
391
+
392
+ # Handle plus and x suffixes, x is only used internally for jobs controller
393
+ plus = ''
394
+ if resource_str.endswith('+'):
395
+ if allow_plus:
396
+ resource_str = resource_str[:-1]
397
+ plus = '+'
398
+ else:
399
+ raise ValueError(error_msg)
400
+
401
+ x = ''
402
+ if resource_str.endswith('x'):
403
+ if allow_x:
404
+ resource_str = resource_str[:-1]
405
+ x = 'x'
406
+ else:
407
+ raise ValueError(error_msg)
408
+
409
+ try:
410
+ # We assume it is already in the wanted units to maintain backwards
411
+ # compatibility
412
+ ret_type(resource_str)
413
+ return f'{resource_str}{plus}{x}'
414
+ except ValueError:
415
+ pass
416
+
417
+ resource_str = resource_str.lower()
418
+ for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
419
+ if resource_str.endswith(mem_unit):
420
+ try:
421
+ value = ret_type(resource_str[:-len(mem_unit)])
422
+ converted = (value * multiplier /
423
+ constants.MEMORY_SIZE_UNITS[unit])
424
+ if not allow_rounding and ret_type(converted) != converted:
425
+ raise ValueError(error_msg)
426
+ converted = ret_type(converted)
427
+ return f'{converted}{plus}{x}'
428
+ except ValueError:
429
+ continue
430
+
431
+ raise ValueError(error_msg)
432
+
433
+
434
+ def parse_time_minutes(time: str) -> int:
435
+ """Convert a time string to minutes.
436
+
437
+ Args:
438
+ time: Time string with optional unit suffix (e.g., '30m', '2h', '1d')
439
+
440
+ Returns:
441
+ Time in minutes as an integer
442
+ """
443
+ time_str = str(time)
444
+
445
+ if time_str.isdecimal():
446
+ # We assume it is already in minutes to maintain backwards
447
+ # compatibility
448
+ return int(time_str)
449
+
450
+ time_str = time_str.lower()
451
+ for unit, multiplier in constants.TIME_UNITS.items():
452
+ if time_str.endswith(unit):
453
+ try:
454
+ value = float(time_str[:-len(unit)])
455
+ final_value = math.ceil(value * multiplier)
456
+ if final_value >= 0:
457
+ return final_value
458
+ except ValueError:
459
+ continue
460
+
461
+ raise ValueError(f'Invalid time format: {time}')
462
+
463
+
464
+ def normalize_any_of_resources_config(
465
+ any_of: List[Dict[str, Any]]) -> Tuple[str, ...]:
466
+ """Normalize a list of any_of resources config to a canonical form.
467
+
468
+ Args:
469
+ any_of: A list of any_of resources config.
470
+
471
+ Returns:
472
+ A normalized tuple representation that can be compared for equality.
473
+ Two lists with the same resource configurations in different orders
474
+ will produce the same normalized result.
475
+ """
476
+ if not any_of:
477
+ return tuple()
478
+
479
+ # Convert each config to JSON string with sorted keys, then sort the list
480
+ normalized_configs = [
481
+ json.dumps(config, sort_keys=True, separators=(',', ':'))
482
+ for config in any_of
483
+ ]
484
+
485
+ return tuple(sorted(normalized_configs))
sky/utils/rich_utils.py CHANGED
@@ -1,28 +1,53 @@
1
1
  """Rich status spinner utils."""
2
2
  import contextlib
3
+ import contextvars
3
4
  import enum
4
5
  import logging
5
6
  import threading
6
7
  import typing
7
- from typing import Dict, Iterator, Optional, Tuple, Union
8
+ from typing import Callable, Iterator, Optional, Tuple, Union
8
9
 
10
+ from sky import exceptions
9
11
  from sky.adaptors import common as adaptors_common
10
12
  from sky.utils import annotations
13
+ from sky.utils import context
11
14
  from sky.utils import message_utils
12
15
  from sky.utils import rich_console_utils
13
16
 
14
17
  if typing.TYPE_CHECKING:
18
+ import aiohttp
15
19
  import requests
16
20
  import rich.console as rich_console
17
21
  else:
18
22
  requests = adaptors_common.LazyImport('requests')
19
23
  rich_console = adaptors_common.LazyImport('rich.console')
24
+ aiohttp = adaptors_common.LazyImport('aiohttp')
25
+
26
+ GeneralStatus = Union['rich_console.Status', 'EncodedStatus']
27
+
28
+ _client_status: Optional[GeneralStatus] = None
29
+ _server_status: contextvars.ContextVar[
30
+ Optional[GeneralStatus]] = contextvars.ContextVar('server_status',
31
+ default=None)
32
+
33
+
34
+ def _get_client_status() -> Optional[GeneralStatus]:
35
+ return _client_status
36
+
37
+
38
+ def _get_server_status() -> Optional[GeneralStatus]:
39
+ return _server_status.get()
40
+
41
+
42
+ def _set_client_status(status: Optional[GeneralStatus]):
43
+ global _client_status
44
+ _client_status = status
45
+
46
+
47
+ def _set_server_status(status: Optional[GeneralStatus]):
48
+ _server_status.set(status)
49
+
20
50
 
21
- _statuses: Dict[str, Optional[Union['EncodedStatus',
22
- 'rich_console.Status']]] = {
23
- 'server': None,
24
- 'client': None,
25
- }
26
51
  _status_nesting_level = 0
27
52
 
28
53
  _logging_lock = threading.RLock()
@@ -35,6 +60,8 @@ class Control(enum.Enum):
35
60
  STOP = 'rich_stop'
36
61
  EXIT = 'rich_exit'
37
62
  UPDATE = 'rich_update'
63
+ HEARTBEAT = 'heartbeat'
64
+ RETRY = 'retry'
38
65
 
39
66
  def encode(self, msg: str) -> str:
40
67
  return f'<{self.value}>{msg}</{self.value}>'
@@ -128,20 +155,22 @@ class _NoOpConsoleStatus:
128
155
  class _RevertibleStatus:
129
156
  """A wrapper for status that can revert to previous message after exit."""
130
157
 
131
- def __init__(self, message: str, status_type: str):
158
+ def __init__(self, message: str, get_status_fn: Callable[[], GeneralStatus],
159
+ set_status_fn: Callable[[Optional[GeneralStatus]], None]):
132
160
  self.previous_message = None
133
- self.status_type = status_type
134
- status = _statuses[status_type]
161
+ self.get_status_fn = get_status_fn
162
+ self.set_status_fn = set_status_fn
163
+ status = self.get_status_fn()
135
164
  if status is not None:
136
165
  self.previous_message = status.status
137
166
  self.message = message
138
167
 
139
168
  def __enter__(self):
140
169
  global _status_nesting_level
141
- _statuses[self.status_type].update(self.message)
170
+ self.get_status_fn().update(self.message)
142
171
  _status_nesting_level += 1
143
- _statuses[self.status_type].__enter__()
144
- return _statuses[self.status_type]
172
+ self.get_status_fn().__enter__()
173
+ return self.get_status_fn()
145
174
 
146
175
  def __exit__(self, exc_type, exc_val, exc_tb):
147
176
  # We use the same lock with the `safe_logger` to avoid the following 2
@@ -160,32 +189,49 @@ class _RevertibleStatus:
160
189
  _status_nesting_level -= 1
161
190
  if _status_nesting_level <= 0:
162
191
  _status_nesting_level = 0
163
- if _statuses[self.status_type] is not None:
164
- _statuses[self.status_type].__exit__(
165
- exc_type, exc_val, exc_tb)
166
- _statuses[self.status_type] = None
192
+ if self.get_status_fn() is not None:
193
+ self.get_status_fn().__exit__(exc_type, exc_val, exc_tb)
194
+ self.set_status_fn(None)
167
195
  else:
168
- _statuses[self.status_type].update(self.previous_message)
196
+ if self.previous_message is not None:
197
+ self.get_status_fn().update(self.previous_message)
169
198
 
170
199
  def update(self, *args, **kwargs):
171
- _statuses[self.status_type].update(*args, **kwargs)
200
+ self.get_status_fn().update(*args, **kwargs)
172
201
 
173
202
  def stop(self):
174
- _statuses[self.status_type].stop()
203
+ self.get_status_fn().stop()
175
204
 
176
205
  def start(self):
177
- _statuses[self.status_type].start()
206
+ self.get_status_fn().start()
207
+
208
+
209
+ def _is_thread_safe() -> bool:
210
+ """Check if the current status context is thread-safe.
211
+
212
+ We are thread-safe if we are on the main thread or the server_status is
213
+ context-local, i.e. an async context has been initialized.
214
+ """
215
+ return (threading.current_thread() is threading.main_thread() or
216
+ context.get() is not None)
178
217
 
179
218
 
180
219
  def safe_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
181
- """A wrapper for multi-threaded console.status."""
220
+ """A wrapper for multi-threaded server-side console.status.
221
+
222
+ This function will encode rich status with control codes and output the
223
+ encoded string to stdout. Client-side decode control codes from server
224
+ output and update the rich status. This function is safe to be called in
225
+ async/multi-threaded context.
226
+
227
+ See also: :func:`client_status`, :class:`EncodedStatus`.
228
+ """
182
229
  from sky import sky_logging # pylint: disable=import-outside-toplevel
183
- if (annotations.is_on_api_server and
184
- threading.current_thread() is threading.main_thread() and
230
+ if (annotations.is_on_api_server and _is_thread_safe() and
185
231
  not sky_logging.is_silent()):
186
- if _statuses['server'] is None:
187
- _statuses['server'] = EncodedStatus(msg)
188
- return _RevertibleStatus(msg, 'server')
232
+ if _get_server_status() is None:
233
+ _set_server_status(EncodedStatus(msg))
234
+ return _RevertibleStatus(msg, _get_server_status, _set_server_status)
189
235
  return _NoOpConsoleStatus()
190
236
 
191
237
 
@@ -196,29 +242,34 @@ def stop_safe_status():
196
242
  stream logs from user program and do not want it to interfere with the
197
243
  spinner display.
198
244
  """
199
- if (threading.current_thread() is threading.main_thread() and
200
- _statuses['server'] is not None):
201
- _statuses['server'].stop()
245
+ if _is_thread_safe():
246
+ return
247
+ server_status = _get_server_status()
248
+ if server_status is not None:
249
+ server_status.stop()
202
250
 
203
251
 
204
252
  def force_update_status(msg: str):
205
253
  """Update the status message even if sky_logging.is_silent() is true."""
206
- if (threading.current_thread() is threading.main_thread() and
207
- _statuses['server'] is not None):
208
- _statuses['server'].update(msg)
254
+ if not _is_thread_safe():
255
+ return
256
+ server_status = _get_server_status()
257
+ if server_status is not None:
258
+ server_status.update(msg)
209
259
 
210
260
 
211
261
  @contextlib.contextmanager
212
262
  def safe_logger():
213
263
  with _logging_lock:
214
- client_status_obj = _statuses['client']
264
+ client_status_obj = _get_client_status()
215
265
 
216
266
  client_status_live = (client_status_obj is not None and
267
+ hasattr(client_status_obj, '_live') and
217
268
  client_status_obj._live.is_started) # pylint: disable=protected-access
218
- if client_status_live:
269
+ if client_status_live and client_status_obj is not None:
219
270
  client_status_obj.stop()
220
271
  yield
221
- if client_status_live:
272
+ if client_status_live and client_status_obj is not None:
222
273
  client_status_obj.start()
223
274
 
224
275
 
@@ -230,13 +281,13 @@ class RichSafeStreamHandler(logging.StreamHandler):
230
281
 
231
282
 
232
283
  def client_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
233
- """A wrapper for multi-threaded console.status."""
284
+ """A wrapper for multi-threaded client-side console.status."""
234
285
  from sky import sky_logging # pylint: disable=import-outside-toplevel
235
286
  if (threading.current_thread() is threading.main_thread() and
236
287
  not sky_logging.is_silent()):
237
- if _statuses['client'] is None:
238
- _statuses['client'] = rich_console_utils.get_console().status(msg)
239
- return _RevertibleStatus(msg, 'client')
288
+ if _get_client_status() is None:
289
+ _set_client_status(rich_console_utils.get_console().status(msg))
290
+ return _RevertibleStatus(msg, _get_client_status, _set_client_status)
240
291
  return _NoOpConsoleStatus()
241
292
 
242
293
 
@@ -320,6 +371,9 @@ def decode_rich_status(
320
371
  yield line
321
372
  continue
322
373
 
374
+ if control == Control.RETRY:
375
+ raise exceptions.RequestInterruptedError(
376
+ 'Streaming interrupted. Please retry.')
323
377
  # control is not None, i.e. it is a rich status control message.
324
378
  if threading.current_thread() is not threading.main_thread():
325
379
  yield None
@@ -341,6 +395,130 @@ def decode_rich_status(
341
395
  decoding_status.__exit__(None, None, None)
342
396
  elif control == Control.START:
343
397
  decoding_status.start()
398
+ elif control == Control.HEARTBEAT:
399
+ # Heartbeat is not displayed to the user, so we do not
400
+ # need to update the status.
401
+ pass
402
+ finally:
403
+ if decoding_status is not None:
404
+ decoding_status.__exit__(None, None, None)
405
+
406
+
407
+ async def decode_rich_status_async(
408
+ response: 'aiohttp.ClientResponse'
409
+ ) -> typing.AsyncIterator[Optional[str]]:
410
+ """Async version of rich_utils.decode_rich_status that decodes rich status
411
+ messages from an aiohttp response.
412
+
413
+ Args:
414
+ response: The aiohttp response.
415
+
416
+ Yields:
417
+ Optional[str]: Decoded lines or None for control messages.
418
+ """
419
+ decoding_status = None
420
+ try:
421
+ last_line = ''
422
+ # Buffer to store incomplete UTF-8 bytes between chunks
423
+ undecoded_buffer = b''
424
+
425
+ # Iterate over the response content in chunks
426
+ async for chunk, _ in response.content.iter_chunks():
427
+ if chunk is None:
428
+ return
429
+
430
+ # Append the new chunk to any leftover bytes from previous iteration
431
+ current_bytes = undecoded_buffer + chunk
432
+ undecoded_buffer = b''
433
+
434
+ # Try to decode the combined bytes
435
+ try:
436
+ encoded_msg = current_bytes.decode('utf-8')
437
+ except UnicodeDecodeError as e:
438
+ # Check if this is potentially an incomplete sequence at the end
439
+ if e.start > 0:
440
+ # Decode the valid part
441
+ encoded_msg = current_bytes[:e.start].decode('utf-8')
442
+
443
+ # Check if the remaining bytes are likely a partial char
444
+ # or actually invalid UTF-8
445
+ remaining_bytes = current_bytes[e.start:]
446
+ if len(remaining_bytes) < 4: # Max UTF-8 char is 4 bytes
447
+ # Likely incomplete - save for next chunk
448
+ undecoded_buffer = remaining_bytes
449
+ else:
450
+ # Likely invalid - replace with replacement character
451
+ encoded_msg += remaining_bytes.decode('utf-8',
452
+ errors='replace')
453
+ undecoded_buffer = b''
454
+ else:
455
+ # Error at the very beginning of the buffer - invalid UTF-8
456
+ encoded_msg = current_bytes.decode('utf-8',
457
+ errors='replace')
458
+ undecoded_buffer = b''
459
+
460
+ lines = encoded_msg.splitlines(keepends=True)
461
+
462
+ # Skip processing if lines is empty to avoid IndexError
463
+ if not lines:
464
+ continue
465
+
466
+ lines[0] = last_line + lines[0]
467
+ last_line = lines[-1]
468
+ # If the last line is not ended with `\r` or `\n` (with ending
469
+ # spaces stripped), it means the last line is not a complete line.
470
+ # We keep the last line in the buffer and continue.
471
+ if (not last_line.strip(' ').endswith('\r') and
472
+ not last_line.strip(' ').endswith('\n')):
473
+ lines = lines[:-1]
474
+ else:
475
+ # Reset the buffer for the next line, as the last line is a
476
+ # complete line.
477
+ last_line = ''
478
+
479
+ for line in lines:
480
+ if line.endswith('\r\n'):
481
+ # Replace `\r\n` with `\n`, as printing a line ends with
482
+ # `\r\n` in linux will cause the line to be empty.
483
+ line = line[:-2] + '\n'
484
+ is_payload, line = message_utils.decode_payload(
485
+ line, raise_for_mismatch=False)
486
+ if line is None:
487
+ continue
488
+ control = None
489
+ if is_payload:
490
+ control, encoded_status = Control.decode(line)
491
+ if control is None:
492
+ yield line
493
+ continue
494
+
495
+ if control == Control.RETRY:
496
+ raise exceptions.RequestInterruptedError(
497
+ 'Streaming interrupted. Please retry.')
498
+ # control is not None, i.e. it is a rich status control message.
499
+ # In async context, we'll handle rich status controls normally
500
+ # since async typically runs in main thread
501
+ if control == Control.INIT:
502
+ decoding_status = client_status(encoded_status)
503
+ else:
504
+ if decoding_status is None:
505
+ # status may not be initialized if a user use --tail for
506
+ # sky api logs.
507
+ continue
508
+ assert decoding_status is not None, (
509
+ f'Rich status not initialized: {line}')
510
+ if control == Control.UPDATE:
511
+ decoding_status.update(encoded_status)
512
+ elif control == Control.STOP:
513
+ decoding_status.stop()
514
+ elif control == Control.EXIT:
515
+ decoding_status.__exit__(None, None, None)
516
+ elif control == Control.START:
517
+ decoding_status.start()
518
+ elif control == Control.HEARTBEAT:
519
+ # Heartbeat is not displayed to the user, so we do not
520
+ # need to update the status.
521
+ pass
344
522
  finally:
345
523
  if decoding_status is not None:
346
524
  decoding_status.__exit__(None, None, None)