skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -4,11 +4,12 @@ import enum
4
4
  import itertools
5
5
  import json
6
6
  import math
7
- import re
8
7
  import typing
9
- from typing import Dict, List, Optional, Set, Union
8
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
10
9
 
11
10
  from sky import skypilot_config
11
+ from sky.skylet import constants
12
+ from sky.utils import common_utils
12
13
  from sky.utils import registry
13
14
  from sky.utils import ux_utils
14
15
 
@@ -50,6 +51,48 @@ class DiskTier(enum.Enum):
50
51
  return types.index(self) <= types.index(other)
51
52
 
52
53
 
54
+ class NetworkTier(enum.Enum):
55
+ """All network tiers supported by SkyPilot."""
56
+ STANDARD = 'standard'
57
+ BEST = 'best'
58
+
59
+ @classmethod
60
+ def supported_tiers(cls) -> List[str]:
61
+ return [tier.value for tier in cls]
62
+
63
+ @classmethod
64
+ def cli_help_message(cls) -> str:
65
+ return (
66
+ f'Network tier. Could be one of {", ".join(cls.supported_tiers())}'
67
+ f'. If {cls.BEST.value} is specified, use the best network tier '
68
+ 'available on the specified instance. '
69
+ f'Default: {cls.STANDARD.value}')
70
+
71
+ @classmethod
72
+ def from_str(cls, tier: str) -> 'NetworkTier':
73
+ if tier not in cls.supported_tiers():
74
+ raise ValueError(f'Invalid network tier: {tier}')
75
+ return cls(tier)
76
+
77
+ def __le__(self, other: 'NetworkTier') -> bool:
78
+ types = list(NetworkTier)
79
+ return types.index(self) <= types.index(other)
80
+
81
+
82
+ class StorageType(enum.Enum):
83
+ """Storage type."""
84
+ # Durable network storage, e.g. GCP persistent disks
85
+ NETWORK = 'network'
86
+ # Local instance storage, e.g. GCP local SSDs
87
+ INSTANCE = 'instance'
88
+
89
+
90
+ class DiskAttachMode(enum.Enum):
91
+ """Disk attach mode."""
92
+ READ_ONLY = 'read_only'
93
+ READ_WRITE = 'read_write'
94
+
95
+
53
96
  @dataclasses.dataclass
54
97
  class ClusterName:
55
98
  display_name: str
@@ -138,35 +181,81 @@ def simplify_ports(ports: List[str]) -> List[str]:
138
181
 
139
182
 
140
183
  def format_resource(resource: 'resources_lib.Resources',
141
- simplify: bool = False) -> str:
142
- if simplify:
143
- cloud = resource.cloud
144
- if resource.accelerators is None:
145
- vcpu, _ = cloud.get_vcpus_mem_from_instance_type(
146
- resource.instance_type)
147
- hardware = f'vCPU={int(vcpu)}'
148
- else:
149
- hardware = f'{resource.accelerators}'
150
- spot = '[Spot]' if resource.use_spot else ''
151
- return f'{cloud}({spot}{hardware})'
184
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
185
+ resource = resource.assert_launchable()
186
+ is_k8s = str(resource.cloud).lower() == 'kubernetes'
187
+ if resource.accelerators is None or is_k8s or not simplified_only:
188
+ vcpu, mem = resource.cloud.get_vcpus_mem_from_instance_type(
189
+ resource.instance_type)
190
+
191
+ elements_simple = []
192
+ elements_full = []
193
+
194
+ if resource.accelerators is not None:
195
+ acc, count = list(resource.accelerators.items())[0]
196
+ elements_simple.append(f'gpus={acc}:{count}')
197
+ elements_full.append(f'gpus={acc}:{count}')
198
+
199
+ if (resource.accelerators is None or is_k8s):
200
+ if vcpu is not None:
201
+ elements_simple.append(f'cpus={int(vcpu)}')
202
+ elements_full.append(f'cpus={int(vcpu)}')
203
+ if mem is not None:
204
+ elements_simple.append(f'mem={int(mem)}')
205
+ elements_full.append(f'mem={int(mem)}')
206
+ elif not simplified_only:
207
+ if vcpu is not None:
208
+ elements_full.append(f'cpus={int(vcpu)}')
209
+ if mem is not None:
210
+ elements_full.append(f'mem={int(mem)}')
211
+
212
+ if not is_k8s:
213
+ instance_type_full = resource.instance_type
214
+ instance_type_simple = common_utils.truncate_long_string(
215
+ instance_type_full, 15)
216
+ elements_simple.append(instance_type_simple)
217
+ elements_full.append(instance_type_full)
218
+ elements_simple.append('...')
219
+ if not simplified_only:
220
+ image_id = resource.image_id
221
+ if image_id is not None:
222
+ if None in image_id:
223
+ elements_full.append(f'image_id={image_id[None]}')
224
+ else:
225
+ elements_full.append(f'image_id={image_id}')
226
+ elements_full.append(f'disk={resource.disk_size}')
227
+ disk_tier = resource.disk_tier
228
+ if disk_tier is not None:
229
+ elements_full.append(f'disk_tier={disk_tier.value}')
230
+ ports = resource.ports
231
+ if ports is not None:
232
+ elements_full.append(f'ports={ports}')
233
+
234
+ spot = '[spot]' if resource.use_spot else ''
235
+ resources_str_simple = (
236
+ f'{spot}({"" if not elements_simple else ", ".join(elements_simple)})')
237
+ if simplified_only:
238
+ return resources_str_simple, None
152
239
  else:
153
- # accelerator_args is way too long.
154
- # Convert from:
155
- # GCP(n1-highmem-8, {'tpu-v2-8': 1}, accelerator_args={'runtime_version': '2.12.0'} # pylint: disable=line-too-long
156
- # to:
157
- # GCP(n1-highmem-8, {'tpu-v2-8': 1}...)
158
- pattern = ', accelerator_args={.*}'
159
- launched_resource_str = re.sub(pattern, '...', str(resource))
160
- return launched_resource_str
161
-
162
-
163
- def get_readable_resources_repr(handle: 'backends.CloudVmRayResourceHandle',
164
- simplify: bool = False) -> str:
240
+ resources_str_full = (
241
+ f'{spot}({"" if not elements_full else ", ".join(elements_full)})')
242
+ return resources_str_simple, resources_str_full
243
+
244
+
245
+ def get_readable_resources_repr(
246
+ handle: 'backends.CloudVmRayResourceHandle',
247
+ simplified_only: bool = False) -> Tuple[str, Optional[str]]:
248
+ resource_str_simple, resource_str_full = format_resource(
249
+ handle.launched_resources, simplified_only)
250
+ if not simplified_only:
251
+ assert resource_str_full is not None
165
252
  if (handle.launched_nodes is not None and
166
253
  handle.launched_resources is not None):
167
- return (f'{handle.launched_nodes}x '
168
- f'{format_resource(handle.launched_resources, simplify)}')
169
- return _DEFAULT_MESSAGE_HANDLE_INITIALIZING
254
+ return (f'{handle.launched_nodes}x{resource_str_simple}',
255
+ None if simplified_only else
256
+ f'{handle.launched_nodes}x{resource_str_full}')
257
+ return (_DEFAULT_MESSAGE_HANDLE_INITIALIZING,
258
+ _DEFAULT_MESSAGE_HANDLE_INITIALIZING)
170
259
 
171
260
 
172
261
  def make_ray_custom_resources_str(
@@ -208,10 +297,18 @@ def need_to_query_reservations() -> bool:
208
297
  clouds that do not use reservations.
209
298
  """
210
299
  for cloud_str in registry.CLOUD_REGISTRY.keys():
211
- cloud_specific_reservations = skypilot_config.get_nested(
212
- (cloud_str, 'specific_reservations'), None)
213
- cloud_prioritize_reservations = skypilot_config.get_nested(
214
- (cloud_str, 'prioritize_reservations'), False)
300
+ cloud_specific_reservations = (
301
+ skypilot_config.get_effective_region_config(
302
+ cloud=cloud_str,
303
+ region=None,
304
+ keys=('specific_reservations',),
305
+ default_value=None))
306
+ cloud_prioritize_reservations = (
307
+ skypilot_config.get_effective_region_config(
308
+ cloud=cloud_str,
309
+ region=None,
310
+ keys=('prioritize_reservations',),
311
+ default_value=False))
215
312
  if (cloud_specific_reservations is not None or
216
313
  cloud_prioritize_reservations):
217
314
  return True
@@ -248,6 +345,7 @@ def make_launchables_for_valid_region_zones(
248
345
  launchables = []
249
346
  regions = launchable_resources.get_valid_regions_for_launchable()
250
347
  for region in regions:
348
+ assert launchable_resources.cloud is not None, 'Cloud must be specified'
251
349
  optimize_by_zone = (override_optimize_by_zone or
252
350
  launchable_resources.cloud.optimize_by_zone())
253
351
  # It is possible that we force the optimize_by_zone but some clouds
@@ -266,3 +364,122 @@ def make_launchables_for_valid_region_zones(
266
364
  # Batch the requests at the granularity of a single region.
267
365
  launchables.append(launchable_resources.copy(region=region.name))
268
366
  return launchables
367
+
368
+
369
def parse_memory_resource(resource_qty_str: Union[str, int, float],
                          field_name: str,
                          ret_type: type = int,
                          unit: str = 'gb',
                          allow_plus: bool = False,
                          allow_x: bool = False,
                          allow_rounding: bool = False) -> str:
    """Normalizes a memory quantity into a string expressed in `unit`.

    Args:
        resource_qty_str: Resource quantity, e.g. a bare number or a number
            followed by one of the suffixes in constants.MEMORY_SIZE_UNITS.
        field_name: Name of the field being parsed; only used in the error
            message.
        ret_type: Numeric type used to parse and render the value.
        unit: Unit to convert to; must be a key of
            constants.MEMORY_SIZE_UNITS.
        allow_plus: Whether a trailing '+' is accepted.
        allow_x: Whether a trailing 'x' is accepted (only used internally for
            the jobs controller).
        allow_rounding: Whether a converted value that is not exactly
            representable as `ret_type` may be rounded by `ret_type`.

    Returns:
        The quantity in `unit`, followed by '+' and/or 'x' if they were
        present in the input. A bare number is returned unchanged.

    Raises:
        ValueError: If a disallowed suffix is present or the quantity cannot
            be parsed.
    """
    assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'

    error_msg = (f'"{field_name}" field should be a '
                 f'{constants.MEMORY_SIZE_PATTERN}+?,'
                 f' got {resource_qty_str}')

    quantity = str(resource_qty_str)

    # Peel off the optional trailing markers: '+' first, then 'x'.
    plus = ''
    if quantity.endswith('+'):
        if not allow_plus:
            raise ValueError(error_msg)
        quantity, plus = quantity[:-1], '+'

    x = ''
    if quantity.endswith('x'):
        if not allow_x:
            raise ValueError(error_msg)
        quantity, x = quantity[:-1], 'x'

    # A bare number is assumed to already be in the wanted unit, for
    # backwards compatibility.
    try:
        ret_type(quantity)
    except ValueError:
        pass
    else:
        return f'{quantity}{plus}{x}'

    lowered = quantity.lower()
    for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
        if not lowered.endswith(mem_unit):
            continue
        try:
            value = ret_type(lowered[:-len(mem_unit)])
            converted = (value * multiplier /
                         constants.MEMORY_SIZE_UNITS[unit])
            rounded = ret_type(converted)
            if rounded != converted and not allow_rounding:
                # Inexact conversion is not allowed; try the next suffix.
                continue
            return f'{rounded}{plus}{x}'
        except ValueError:
            continue

    raise ValueError(error_msg)
432
+
433
+
434
def parse_time_minutes(time: str) -> int:
    """Convert a time string to minutes.

    Args:
        time: Time string with optional unit suffix (e.g., '30m', '2h', '1d').
            A plain decimal string is treated as already being in minutes.

    Returns:
        Time in minutes as an integer, rounded up to the next whole minute.

    Raises:
        ValueError: If the string is neither a plain decimal number nor a
            non-negative, finite number followed by a suffix from
            constants.TIME_UNITS.
    """
    time_str = str(time)

    if time_str.isdecimal():
        # We assume it is already in minutes to maintain backwards
        # compatibility
        return int(time_str)

    time_str = time_str.lower()
    for unit, multiplier in constants.TIME_UNITS.items():
        if time_str.endswith(unit):
            try:
                value = float(time_str[:-len(unit)])
                final_value = math.ceil(value * multiplier)
            except (ValueError, OverflowError):
                # ValueError: non-numeric prefix or NaN.
                # OverflowError: infinite value (e.g. 'infh'); report it as
                # an invalid format instead of leaking OverflowError.
                continue
            if final_value >= 0:
                return final_value

    raise ValueError(f'Invalid time format: {time}')
462
+
463
+
464
def normalize_any_of_resources_config(
        any_of: List[Dict[str, Any]]) -> Tuple[str, ...]:
    """Builds an order-independent canonical form of an any_of config list.

    Args:
        any_of: A list of any_of resources config.

    Returns:
        A sorted tuple of compact JSON strings (one per config, keys sorted).
        Lists holding the same configs in a different order normalize to the
        same tuple; an empty or missing list normalizes to an empty tuple.
    """
    # Serialize every config deterministically, then sort the serialized
    # forms so that the input order does not matter.
    canonical = sorted(
        json.dumps(entry, sort_keys=True, separators=(',', ':'))
        for entry in (any_of or ()))
    return tuple(canonical)
sky/utils/rich_utils.py CHANGED
@@ -1,28 +1,53 @@
1
1
  """Rich status spinner utils."""
2
2
  import contextlib
3
+ import contextvars
3
4
  import enum
4
5
  import logging
5
6
  import threading
6
7
  import typing
7
- from typing import Dict, Iterator, Optional, Tuple, Union
8
+ from typing import Callable, Iterator, Optional, Tuple, Union
8
9
 
10
+ from sky import exceptions
9
11
  from sky.adaptors import common as adaptors_common
10
12
  from sky.utils import annotations
13
+ from sky.utils import context
11
14
  from sky.utils import message_utils
12
15
  from sky.utils import rich_console_utils
13
16
 
14
17
  if typing.TYPE_CHECKING:
18
+ import aiohttp
15
19
  import requests
16
20
  import rich.console as rich_console
17
21
  else:
18
22
  requests = adaptors_common.LazyImport('requests')
19
23
  rich_console = adaptors_common.LazyImport('rich.console')
24
+ aiohttp = adaptors_common.LazyImport('aiohttp')
25
+
26
+ GeneralStatus = Union['rich_console.Status', 'EncodedStatus']
27
+
28
+ _client_status: Optional[GeneralStatus] = None
29
+ _server_status: contextvars.ContextVar[
30
+ Optional[GeneralStatus]] = contextvars.ContextVar('server_status',
31
+ default=None)
32
+
33
+
34
def _get_client_status() -> Optional[GeneralStatus]:
    """Returns the module-level client status, or None if unset."""
    return _client_status
36
+
37
+
38
def _get_server_status() -> Optional[GeneralStatus]:
    """Returns the server status stored in the current context variable."""
    return _server_status.get()
40
+
41
+
42
def _set_client_status(status: Optional[GeneralStatus]):
    """Replaces the module-level client status (None clears it)."""
    global _client_status
    _client_status = status
45
+
46
+
47
def _set_server_status(status: Optional[GeneralStatus]):
    """Stores the server status in the context variable (None clears it)."""
    _server_status.set(status)
49
+
20
50
 
21
- _statuses: Dict[str, Optional[Union['EncodedStatus',
22
- 'rich_console.Status']]] = {
23
- 'server': None,
24
- 'client': None,
25
- }
26
51
  _status_nesting_level = 0
27
52
 
28
53
  _logging_lock = threading.RLock()
@@ -35,6 +60,8 @@ class Control(enum.Enum):
35
60
  STOP = 'rich_stop'
36
61
  EXIT = 'rich_exit'
37
62
  UPDATE = 'rich_update'
63
+ HEARTBEAT = 'heartbeat'
64
+ RETRY = 'retry'
38
65
 
39
66
  def encode(self, msg: str) -> str:
40
67
  return f'<{self.value}>{msg}</{self.value}>'
@@ -128,20 +155,22 @@ class _NoOpConsoleStatus:
128
155
  class _RevertibleStatus:
129
156
  """A wrapper for status that can revert to previous message after exit."""
130
157
 
131
- def __init__(self, message: str, status_type: str):
158
+ def __init__(self, message: str, get_status_fn: Callable[[], GeneralStatus],
159
+ set_status_fn: Callable[[Optional[GeneralStatus]], None]):
132
160
  self.previous_message = None
133
- self.status_type = status_type
134
- status = _statuses[status_type]
161
+ self.get_status_fn = get_status_fn
162
+ self.set_status_fn = set_status_fn
163
+ status = self.get_status_fn()
135
164
  if status is not None:
136
165
  self.previous_message = status.status
137
166
  self.message = message
138
167
 
139
168
  def __enter__(self):
140
169
  global _status_nesting_level
141
- _statuses[self.status_type].update(self.message)
170
+ self.get_status_fn().update(self.message)
142
171
  _status_nesting_level += 1
143
- _statuses[self.status_type].__enter__()
144
- return _statuses[self.status_type]
172
+ self.get_status_fn().__enter__()
173
+ return self.get_status_fn()
145
174
 
146
175
  def __exit__(self, exc_type, exc_val, exc_tb):
147
176
  # We use the same lock with the `safe_logger` to avoid the following 2
@@ -160,32 +189,48 @@ class _RevertibleStatus:
160
189
  _status_nesting_level -= 1
161
190
  if _status_nesting_level <= 0:
162
191
  _status_nesting_level = 0
163
- if _statuses[self.status_type] is not None:
164
- _statuses[self.status_type].__exit__(
165
- exc_type, exc_val, exc_tb)
166
- _statuses[self.status_type] = None
192
+ if self.get_status_fn() is not None:
193
+ self.get_status_fn().__exit__(exc_type, exc_val, exc_tb)
194
+ self.set_status_fn(None)
167
195
  else:
168
- _statuses[self.status_type].update(self.previous_message)
196
+ self.get_status_fn().update(self.previous_message)
169
197
 
170
198
  def update(self, *args, **kwargs):
171
- _statuses[self.status_type].update(*args, **kwargs)
199
+ self.get_status_fn().update(*args, **kwargs)
172
200
 
173
201
  def stop(self):
174
- _statuses[self.status_type].stop()
202
+ self.get_status_fn().stop()
175
203
 
176
204
  def start(self):
177
- _statuses[self.status_type].start()
205
+ self.get_status_fn().start()
206
+
207
+
208
+ def _is_thread_safe() -> bool:
209
+ """Check if the current status context is thread-safe.
210
+
211
+ We are thread-safe if we are on the main thread or the server_status is
212
+ context-local, i.e. an async context has been initialized.
213
+ """
214
+ return (threading.current_thread() is threading.main_thread() or
215
+ context.get() is not None)
178
216
 
179
217
 
180
218
def safe_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
    """Server-side wrapper around console.status for multi-threaded use.

    The rich status is encoded with control codes and written to stdout; the
    client side decodes those control codes from the server output and
    updates its own rich status. This function is safe to call in an
    async/multi-threaded context.

    A no-op status is returned when not on the API server, when the caller is
    not in a thread-safe status context, or when logging is silenced.

    See also: :func:`client_status`, :class:`EncodedStatus`.
    """
    from sky import sky_logging  # pylint: disable=import-outside-toplevel
    status_enabled = (annotations.is_on_api_server and _is_thread_safe() and
                      not sky_logging.is_silent())
    if not status_enabled:
        return _NoOpConsoleStatus()

    # Lazily create the shared server status on first use.
    if _get_server_status() is None:
        _set_server_status(EncodedStatus(msg))
    return _RevertibleStatus(msg, _get_server_status, _set_server_status)
190
235
 
191
236
 
def stop_safe_status():
    """Stops the server-side status spinner, if there is one.

    This is useful before streaming logs from the user program, so that the
    logs do not interfere with the spinner display.

    Does nothing when the caller is not in a thread-safe status context or
    when no server status exists.
    """
    # Only touch the status from a thread-safe context, consistent with
    # `force_update_status`. The guard must be negated: returning when the
    # context *is* thread-safe would make this a no-op on the main thread.
    if not _is_thread_safe():
        return
    server_status = _get_server_status()
    if server_status is not None:
        server_status.stop()
202
249
 
203
250
 
204
251
def force_update_status(msg: str):
    """Updates the server status message, even when logging is silenced.

    Unlike `safe_status`, `sky_logging.is_silent()` is not consulted. Nothing
    happens when the caller is not in a thread-safe status context or when no
    server status exists.
    """
    if _is_thread_safe():
        current = _get_server_status()
        if current is not None:
            current.update(msg)
209
258
 
210
259
 
211
260
  @contextlib.contextmanager
212
261
  def safe_logger():
213
262
  with _logging_lock:
214
- client_status_obj = _statuses['client']
263
+ client_status_obj = _get_client_status()
215
264
 
216
265
  client_status_live = (client_status_obj is not None and
217
266
  client_status_obj._live.is_started) # pylint: disable=protected-access
@@ -230,13 +279,13 @@ class RichSafeStreamHandler(logging.StreamHandler):
230
279
 
231
280
 
232
281
def client_status(msg: str) -> Union['rich_console.Status', _NoOpConsoleStatus]:
    """Client-side wrapper around console.status for multi-threaded use.

    A real status is only handed out on the main thread and when logging is
    not silenced; otherwise a no-op status is returned.
    """
    from sky import sky_logging  # pylint: disable=import-outside-toplevel
    on_main_thread = threading.current_thread() is threading.main_thread()
    if not on_main_thread or sky_logging.is_silent():
        return _NoOpConsoleStatus()

    # Lazily create the shared client status on first use.
    if _get_client_status() is None:
        _set_client_status(rich_console_utils.get_console().status(msg))
    return _RevertibleStatus(msg, _get_client_status, _set_client_status)
241
290
 
242
291
 
@@ -320,6 +369,9 @@ def decode_rich_status(
320
369
  yield line
321
370
  continue
322
371
 
372
+ if control == Control.RETRY:
373
+ raise exceptions.RequestInterruptedError(
374
+ 'Streaming interrupted. Please retry.')
323
375
  # control is not None, i.e. it is a rich status control message.
324
376
  if threading.current_thread() is not threading.main_thread():
325
377
  yield None
@@ -341,6 +393,130 @@ def decode_rich_status(
341
393
  decoding_status.__exit__(None, None, None)
342
394
  elif control == Control.START:
343
395
  decoding_status.start()
396
+ elif control == Control.HEARTBEAT:
397
+ # Heartbeat is not displayed to the user, so we do not
398
+ # need to update the status.
399
+ pass
400
+ finally:
401
+ if decoding_status is not None:
402
+ decoding_status.__exit__(None, None, None)
403
+
404
+
405
async def decode_rich_status_async(
    response: 'aiohttp.ClientResponse'
) -> typing.AsyncIterator[Optional[str]]:
    """Async version of rich_utils.decode_rich_status that decodes rich status
    messages from an aiohttp response.

    The response body is read chunk by chunk, decoded as UTF-8 and split into
    lines. Plain lines are yielded; lines carrying a rich status control
    message are applied to the client-side status instead of being yielded.

    Args:
        response: The aiohttp response.

    Yields:
        Optional[str]: Decoded lines.

    Raises:
        exceptions.RequestInterruptedError: If the server sends a
            `Control.RETRY` message.
    """
    import codecs  # pylint: disable=import-outside-toplevel

    decoding_status = None
    # The incremental decoder keeps the bytes of a multi-byte character that
    # is split across chunks (however small the chunks are) until it is
    # complete, and replaces genuinely invalid bytes with U+FFFD.
    utf8_decoder = codecs.getincrementaldecoder('utf-8')(errors='replace')
    try:
        # Text of a trailing, not yet terminated line from previous chunks.
        # NOTE(review): if the stream ends without a line terminator, this
        # remainder is never yielded - confirm that is intended.
        last_line = ''

        # Iterate over the response content in chunks
        async for chunk, _ in response.content.iter_chunks():
            if chunk is None:
                return

            encoded_msg = utf8_decoder.decode(chunk)
            lines = encoded_msg.splitlines(keepends=True)

            # Skip processing if lines is empty to avoid IndexError
            if not lines:
                continue

            lines[0] = last_line + lines[0]
            last_line = lines[-1]
            # If the last line is not ended with `\r` or `\n` (with ending
            # spaces stripped), it means the last line is not a complete line.
            # We keep the last line in the buffer and continue.
            if last_line.strip(' ').endswith(('\r', '\n')):
                # Reset the buffer for the next line, as the last line is a
                # complete line.
                last_line = ''
            else:
                lines = lines[:-1]

            for line in lines:
                if line.endswith('\r\n'):
                    # Replace `\r\n` with `\n`, as printing a line ends with
                    # `\r\n` in linux will cause the line to be empty.
                    line = line[:-2] + '\n'
                is_payload, line = message_utils.decode_payload(
                    line, raise_for_mismatch=False)
                if line is None:
                    continue
                control = None
                if is_payload:
                    control, encoded_status = Control.decode(line)
                if control is None:
                    yield line
                    continue

                if control == Control.RETRY:
                    raise exceptions.RequestInterruptedError(
                        'Streaming interrupted. Please retry.')
                # control is not None, i.e. it is a rich status control
                # message. Unlike the sync version, no main-thread check is
                # done here; `client_status` performs its own.
                if control == Control.INIT:
                    decoding_status = client_status(encoded_status)
                    continue
                if decoding_status is None:
                    # status may not be initialized if a user use --tail for
                    # sky api logs.
                    continue
                if control == Control.UPDATE:
                    decoding_status.update(encoded_status)
                elif control == Control.STOP:
                    decoding_status.stop()
                elif control == Control.EXIT:
                    decoding_status.__exit__(None, None, None)
                elif control == Control.START:
                    decoding_status.start()
                elif control == Control.HEARTBEAT:
                    # Heartbeat is not displayed to the user, so we do not
                    # need to update the status.
                    pass
    finally:
        # Always close the status so the spinner does not outlive the stream.
        if decoding_status is not None:
            decoding_status.__exit__(None, None, None)