skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/optimizer.py CHANGED
@@ -14,6 +14,7 @@ from sky import clouds
14
14
  from sky import exceptions
15
15
  from sky import resources as resources_lib
16
16
  from sky import sky_logging
17
+ from sky import skypilot_config
17
18
  from sky import task as task_lib
18
19
  from sky.adaptors import common as adaptors_common
19
20
  from sky.clouds import cloud as sky_cloud
@@ -21,6 +22,7 @@ from sky.usage import usage_lib
21
22
  from sky.utils import common
22
23
  from sky.utils import env_options
23
24
  from sky.utils import log_utils
25
+ from sky.utils import registry
24
26
  from sky.utils import resources_utils
25
27
  from sky.utils import rich_utils
26
28
  from sky.utils import subprocess_utils
@@ -73,8 +75,8 @@ class Optimizer:
73
75
  def _egress_cost(src_cloud: clouds.Cloud, dst_cloud: clouds.Cloud,
74
76
  gigabytes: float) -> float:
75
77
  """Returns estimated egress cost."""
76
- if isinstance(src_cloud, DummyCloud) or isinstance(
77
- dst_cloud, DummyCloud):
78
+ if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
79
+ dst_cloud, clouds.DummyCloud):
78
80
  return 0.0
79
81
 
80
82
  if not src_cloud.is_same_cloud(dst_cloud):
@@ -88,8 +90,8 @@ class Optimizer:
88
90
  gigabytes: float) -> float:
89
91
  """Returns estimated egress time in seconds."""
90
92
  # FIXME: estimate bandwidth between each cloud-region pair.
91
- if isinstance(src_cloud, DummyCloud) or isinstance(
92
- dst_cloud, DummyCloud):
93
+ if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
94
+ dst_cloud, clouds.DummyCloud):
93
95
  return 0.0
94
96
  if not src_cloud.is_same_cloud(dst_cloud):
95
97
  # 10Gbps is close to the average of observed b/w from S3
@@ -167,7 +169,7 @@ class Optimizer:
167
169
 
168
170
  def make_dummy(name):
169
171
  dummy = task_lib.Task(name)
170
- dummy.set_resources({DummyResources(DummyCloud(), None)})
172
+ dummy.set_resources({DummyResources(cloud=clouds.DummyCloud())})
171
173
  dummy.set_time_estimator(lambda _: 0)
172
174
  return dummy
173
175
 
@@ -197,7 +199,7 @@ class Optimizer:
197
199
  node: task_lib.Task,
198
200
  resources: resources_lib.Resources,
199
201
  ) -> Tuple[Optional[clouds.Cloud], Optional[clouds.Cloud], Optional[float]]:
200
- if isinstance(parent_resources.cloud, DummyCloud):
202
+ if isinstance(parent_resources.cloud, clouds.DummyCloud):
201
203
  # Special case. The current 'node' is a real
202
204
  # source node, and its input may be on a different
203
205
  # cloud from 'resources'.
@@ -321,10 +323,10 @@ class Optimizer:
321
323
  estimated_runtime = 1 * 3600
322
324
  else:
323
325
  # We assume the time estimator takes in a partial resource
324
- # Resources('V100')
326
+ # Resources(accelerators='V100')
325
327
  # and treats their launchable versions
326
- # Resources(AWS, 'p3.2xlarge'),
327
- # Resources(GCP, '...', 'V100'),
328
+ # Resources(infra='aws', instance_type='p3.2xlarge'),
329
+ # Resources(infra='gcp', accelerators='V100'),
328
330
  # ...
329
331
  # as having the same run time.
330
332
  # FIXME(zongheng): take 'num_nodes' as an arg/into
@@ -376,6 +378,10 @@ class Optimizer:
376
378
  if any(orig_resources.cloud is None
377
379
  for orig_resources in node.resources):
378
380
  source_hint = 'catalog and kubernetes cluster'
381
+ elif all(
382
+ isinstance(orig_resources.cloud, clouds.SSH)
383
+ for orig_resources in node.resources):
384
+ source_hint = 'node pool'
379
385
  elif all(
380
386
  isinstance(orig_resources.cloud, clouds.Kubernetes)
381
387
  for orig_resources in node.resources):
@@ -671,7 +677,7 @@ class Optimizer:
671
677
  plan: Dict[task_lib.Task, resources_lib.Resources],
672
678
  ) -> float:
673
679
  """Estimates the total cost of running the DAG by the plan."""
674
- total_cost = 0
680
+ total_cost = 0.
675
681
  for node in topo_order:
676
682
  resources = plan[node]
677
683
  if node.time_estimator_func is None:
@@ -772,15 +778,27 @@ class Optimizer:
772
778
  f'{colorama.Style.BRIGHT}Estimated total cost: '
773
779
  f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
774
780
 
781
def _instance_type_str(resources: 'resources_lib.Resources') -> str:
    """Return the instance-type text to display for `resources`.

    Callers append the spot suffix to the returned string.

    Args:
        resources: Resources whose `instance_type` is set.

    Returns:
        The instance type itself for non-Kubernetes clouds. For Kubernetes
        (and its subclasses) a '-' placeholder, or an empty string when the
        resources use spot so that only the appended spot suffix is shown.
    """
    shown = resources.instance_type
    assert shown is not None, 'Instance type must be specified'
    if not isinstance(resources.cloud, clouds.Kubernetes):
        return shown
    # Kubernetes-like clouds never display the underlying instance type.
    return '' if resources.use_spot else '-'
789
+
775
790
  def _get_resources_element_list(
776
791
  resources: 'resources_lib.Resources') -> List[str]:
777
792
  accelerators = resources.get_accelerators_str()
778
793
  spot = resources.get_spot_str()
779
794
  cloud = resources.cloud
780
- vcpus, mem = cloud.get_vcpus_mem_from_instance_type(
795
+ assert cloud is not None, 'Cloud must be specified'
796
+ assert (resources.instance_type is not None), \
797
+ 'Instance type must be specified'
798
+ vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
781
799
  resources.instance_type)
782
800
 
783
- def format_number(x):
801
+ def format_number(x: Optional[float]) -> str:
784
802
  if x is None:
785
803
  return '-'
786
804
  elif x.is_integer():
@@ -788,25 +806,23 @@ class Optimizer:
788
806
  else:
789
807
  return f'{x:.1f}'
790
808
 
791
- vcpus = format_number(vcpus)
792
- mem = format_number(mem)
809
+ vcpus = format_number(vcpus_)
810
+ mem = format_number(mem_)
811
+
812
+ # Format infra as CLOUD (REGION/ZONE)
813
+ infra = resources.infra.formatted_str()
793
814
 
794
- if resources.zone is None:
795
- region_or_zone = resources.region
796
- else:
797
- region_or_zone = resources.zone
798
815
  return [
799
- str(cloud),
800
- resources.instance_type + spot,
816
+ infra,
817
+ _instance_type_str(resources) + spot,
801
818
  vcpus,
802
819
  mem,
803
820
  str(accelerators),
804
- str(region_or_zone),
805
821
  ]
806
822
 
807
823
  Row = collections.namedtuple('Row', [
808
- 'cloud', 'instance', 'vcpus', 'mem', 'accelerators',
809
- 'region_or_zone', 'cost_str', 'chosen_str'
824
+ 'infra', 'instance', 'vcpus', 'mem', 'accelerators', 'cost_str',
825
+ 'chosen_str'
810
826
  ])
811
827
 
812
828
  def _get_resources_named_tuple(resources: 'resources_lib.Resources',
@@ -814,11 +830,12 @@ class Optimizer:
814
830
 
815
831
  accelerators = resources.get_accelerators_str()
816
832
  spot = resources.get_spot_str()
833
+ resources = resources.assert_launchable()
817
834
  cloud = resources.cloud
818
- vcpus, mem = cloud.get_vcpus_mem_from_instance_type(
835
+ vcpus_, mem_ = cloud.get_vcpus_mem_from_instance_type(
819
836
  resources.instance_type)
820
837
 
821
- def format_number(x):
838
+ def format_number(x: Optional[float]) -> str:
822
839
  if x is None:
823
840
  return '-'
824
841
  elif x.is_integer():
@@ -826,21 +843,18 @@ class Optimizer:
826
843
  else:
827
844
  return f'{x:.1f}'
828
845
 
829
- vcpus = format_number(vcpus)
830
- mem = format_number(mem)
846
+ vcpus = format_number(vcpus_)
847
+ mem = format_number(mem_)
831
848
 
832
- if resources.zone is None:
833
- region_or_zone = resources.region
834
- else:
835
- region_or_zone = resources.zone
849
+ infra = resources.infra.formatted_str()
836
850
 
837
851
  chosen_str = ''
838
852
  if chosen:
839
853
  chosen_str = (colorama.Fore.GREEN + ' ' + '\u2714' +
840
854
  colorama.Style.RESET_ALL)
841
- row = Row(cloud, resources.instance_type + spot, vcpus, mem,
842
- str(accelerators), str(region_or_zone), cost_str,
843
- chosen_str)
855
+ row = Row(infra,
856
+ _instance_type_str(resources) + spot, vcpus, mem,
857
+ str(accelerators), cost_str, chosen_str)
844
858
 
845
859
  return row
846
860
 
@@ -850,18 +864,23 @@ class Optimizer:
850
864
  'accelerators': f'{resources.accelerators}',
851
865
  'use_spot': resources.use_spot
852
866
  }
867
+
868
+ # Handle special case for Kubernetes and SSH clouds
853
869
  if isinstance(resources.cloud, clouds.Kubernetes):
854
- # Region for Kubernetes is the context name, i.e. different
855
- # Kubernetes clusters. We add region to the key to show all the
856
- # Kubernetes clusters in the optimizer table for better UX.
870
+ # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
871
+ # context name, i.e. different Kubernetes clusters. We add
872
+ # region to the key to show all the Kubernetes clusters in the
873
+ # optimizer table for better UX.
874
+
875
+ if resources.cloud.__class__.__name__ == 'SSH':
876
+ resource_key_dict[
877
+ 'cloud'] = 'SSH' # Force the cloud name to be SSH
857
878
  resource_key_dict['region'] = resources.region
879
+
858
880
  return json.dumps(resource_key_dict, sort_keys=True)
859
881
 
860
882
  # Print the list of resources that the optimizer considered.
861
- resource_fields = [
862
- 'CLOUD', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'ACCELERATORS',
863
- 'REGION/ZONE'
864
- ]
883
+ resource_fields = ['INFRA', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'GPUS']
865
884
  if len(ordered_best_plan) > 1:
866
885
  best_plan_rows = []
867
886
  for t, r in ordered_best_plan.items():
@@ -978,24 +997,36 @@ class Optimizer:
978
997
  @staticmethod
979
998
  def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates):
980
999
  for node, candidate_set in node_to_candidate_map.items():
981
- if node.best_resources:
982
- accelerator = node.best_resources.accelerators
983
- else:
984
- accelerator = list(node.resources)[0].accelerators
1000
+ best_resources = node.best_resources
1001
+ if best_resources is None:
1002
+ best_resources = list(node.resources)[0]
985
1003
  is_multi_instances = False
986
- if accelerator:
987
- acc_name, acc_count = list(accelerator.items())[0]
1004
+ if best_resources.accelerators:
1005
+ acc_name, acc_count = list(
1006
+ best_resources.accelerators.items())[0]
988
1007
  for cloud, candidate_list in candidate_set.items():
989
- if len(candidate_list) > 1:
1008
+ # Filter only the candidates matching the best
1009
+ # resources chosen by the optimizer.
1010
+ best_resources_candidates = [
1011
+ res for res in candidate_list if
1012
+ res.get_accelerators_str() == f'{acc_name}:{acc_count}'
1013
+ ]
1014
+ if len(best_resources_candidates) > 1:
990
1015
  is_multi_instances = True
991
- instance_list = [
992
- res.instance_type for res in candidate_list
993
- ]
1016
+ instance_list = set([
1017
+ res.instance_type
1018
+ for res in best_resources_candidates
1019
+ if res.instance_type is not None
1020
+ ])
1021
+ candidate_str = resources_utils.format_resource(
1022
+ best_resources, simplified_only=True)[0]
1023
+
994
1024
  logger.info(
995
- f'Multiple {cloud} instances satisfy '
996
- f'{acc_name}:{int(acc_count)}. '
997
- f'The cheapest {candidate_list[0]!r} is considered '
998
- f'among:\n{instance_list}.')
1025
+ f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
1026
+ f'satisfy {acc_name}:{int(acc_count)}. '
1027
+ f'The cheapest {candidate_str} is considered '
1028
+ f'among: {", ".join(instance_list)}.'
1029
+ f'{colorama.Style.RESET_ALL}')
999
1030
  if is_multi_instances:
1000
1031
  logger.info(
1001
1032
  f'To list more details, run: sky show-gpus {acc_name}\n')
@@ -1147,11 +1178,6 @@ class DummyResources(resources_lib.Resources):
1147
1178
  return 0
1148
1179
 
1149
1180
 
1150
- class DummyCloud(clouds.Cloud):
1151
- """A dummy Cloud that has zero egress cost from/to."""
1152
- pass
1153
-
1154
-
1155
1181
  def _filter_out_blocked_launchable_resources(
1156
1182
  launchable_resources: Iterable[resources_lib.Resources],
1157
1183
  blocked_resources: Iterable[resources_lib.Resources]):
@@ -1195,10 +1221,14 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
1195
1221
  all_clouds_specified.add(cloud_str)
1196
1222
 
1197
1223
  # Explicitly check again to update the enabled cloud list.
1198
- sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
1199
- quiet=True,
1200
- clouds=list(clouds_need_recheck -
1201
- global_disabled_clouds))
1224
+ clouds_to_check_again = list(clouds_need_recheck -
1225
+ global_disabled_clouds)
1226
+ if len(clouds_to_check_again) > 0:
1227
+ sky_check.check_capability(
1228
+ sky_cloud.CloudCapability.COMPUTE,
1229
+ quiet=True,
1230
+ clouds=clouds_to_check_again,
1231
+ workspace=skypilot_config.get_active_workspace())
1202
1232
  enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
1203
1233
  capability=sky_cloud.CloudCapability.COMPUTE,
1204
1234
  raise_if_no_cloud_access=True)
@@ -1208,7 +1238,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
1208
1238
  if disabled_clouds:
1209
1239
  is_or_are = 'is' if len(disabled_clouds) == 1 else 'are'
1210
1240
  task_name = f' {task.name!r}' if task.name is not None else ''
1211
- msg = (f'Task{task_name} requires {", ".join(disabled_clouds)} '
1241
+ disabled_display_names = []
1242
+ for c in disabled_clouds:
1243
+ cloud_obj_one = registry.CLOUD_REGISTRY.from_str(c)
1244
+ if cloud_obj_one is not None:
1245
+ disabled_display_names.append(cloud_obj_one.display_name())
1246
+ cloud_names = ', '.join(disabled_display_names)
1247
+ msg = (f'Task{task_name} requires {cloud_names} '
1212
1248
  f'which {is_or_are} not enabled. To enable access, change '
1213
1249
  f'the task cloud requirement or run: {colorama.Style.BRIGHT}'
1214
1250
  f'sky check {" ".join(c.lower() for c in disabled_clouds)}'
@@ -1222,6 +1258,62 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
1222
1258
  logger.warning(
1223
1259
  f'{colorama.Fore.YELLOW}{msg}{colorama.Style.RESET_ALL}')
1224
1260
 
1261
+ _check_specified_regions(task)
1262
+
1263
+
1264
def _check_specified_regions(task: task_lib.Task) -> None:
    """Check if specified regions (Kubernetes/SSH contexts) are enabled.

    The check only runs when every resource of the task is on a Kubernetes
    cloud (SSH is a subclass of it). Resources without a region are skipped.

    Args:
        task: The task to check.

    Raises:
        exceptions.ResourcesUnavailableError: If a resource specifies a
            region (context) that is not among the existing allowed contexts
            of its cloud.
    """
    # Only check for Kubernetes/SSH for now
    # Below check works because SSH inherits Kubernetes cloud.
    if not all(
            isinstance(resources.cloud, clouds.Kubernetes)
            for resources in task.resources):
        return

    ssh_prefix = 'ssh-'
    # Kubernetes region is a context if set
    for resources in task.resources:
        region = resources.region
        if region is None:
            continue

        is_ssh = isinstance(resources.cloud, clouds.SSH)
        if is_ssh:
            existing_contexts = clouds.SSH.existing_allowed_contexts()
        else:
            existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
        if region in existing_contexts:
            continue

        if is_ssh:
            # Remove only the literal leading 'ssh-'. str.lstrip('ssh-')
            # treats its argument as a character set and would also eat the
            # start of the name itself (e.g. 'ssh-hpc' -> 'pc').
            if region.startswith(ssh_prefix):
                display_region = region[len(ssh_prefix):]
            else:
                display_region = region
            infra_str = f'SSH/{display_region}'
        else:
            infra_str = f'Kubernetes/{region}'
        logger.warning(f'{infra_str} is not enabled.')

        # Mention the task's volumes, if any, since they are tied to the
        # infra that turned out to be unavailable.
        if task.volume_mounts:
            noun = 'volumes ' if len(task.volume_mounts) > 1 else 'volume '
            names = ', '.join(f'{v.volume_name}' for v in task.volume_mounts)
            requirement = f'{noun}{names} with infra {infra_str}'
        else:
            requirement = f'infra {infra_str}'

        task_name = f' {task.name!r}' if task.name is not None else ''
        msg = (f'Task{task_name} requires {requirement}'
               ' which is not enabled. To enable access, change '
               f'the task infra requirement or run: {colorama.Style.BRIGHT}'
               f'sky check {colorama.Style.RESET_ALL}'
               'to ensure the infra is enabled.')
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesUnavailableError(msg)
1316
+
1225
1317
 
1226
1318
  def _fill_in_launchable_resources(
1227
1319
  task: task_lib.Task,
@@ -1251,8 +1343,7 @@ def _fill_in_launchable_resources(
1251
1343
  launchable: Dict[resources_lib.Resources, List[resources_lib.Resources]] = (
1252
1344
  collections.defaultdict(list))
1253
1345
  all_fuzzy_candidates = set()
1254
- cloud_candidates: _PerCloudCandidates = collections.defaultdict(
1255
- List[resources_lib.Resources])
1346
+ cloud_candidates: _PerCloudCandidates = collections.defaultdict(list)
1256
1347
  resource_hints: Dict[resources_lib.Resources,
1257
1348
  List[str]] = collections.defaultdict(list)
1258
1349
  if blocked_resources is None:
@@ -1283,13 +1374,16 @@ def _fill_in_launchable_resources(
1283
1374
  if feasible_resources.resources_list:
1284
1375
  # Assume feasible_resources is sorted by prices. Guaranteed by
1285
1376
  # the implementation of get_feasible_launchable_resources and
1286
- # the underlying service_catalog filtering
1377
+ # the underlying catalog filtering
1287
1378
  cheapest = feasible_resources.resources_list[0]
1288
1379
  # Generate region/zone-specified resources.
1289
1380
  launchable[resources].extend(
1290
1381
  resources_utils.make_launchables_for_valid_region_zones(
1291
1382
  cheapest))
1292
- cloud_candidates[cloud] = feasible_resources.resources_list
1383
+ # Each cloud can occur multiple times in feasible_list,
1384
+ # for different region/zone.
1385
+ cloud_candidates[cloud].extend(
1386
+ feasible_resources.resources_list)
1293
1387
  else:
1294
1388
  all_fuzzy_candidates.update(
1295
1389
  feasible_resources.fuzzy_candidate_list)
@@ -1299,7 +1393,7 @@ def _fill_in_launchable_resources(
1299
1393
  num_node_str = ''
1300
1394
  if task.num_nodes > 1:
1301
1395
  num_node_str = f'{task.num_nodes}x '
1302
- if not quiet:
1396
+ if not (quiet or resources.no_missing_accel_warnings):
1303
1397
  logger.info(
1304
1398
  f'No resource satisfying {num_node_str}'
1305
1399
  f'{resources.repr_with_region_zone} on {clouds_str}.')
sky/provision/__init__.py CHANGED
@@ -6,8 +6,9 @@ providers supported by SkyPilot need to follow.
6
6
  import functools
7
7
  import inspect
8
8
  import typing
9
- from typing import Any, Dict, List, Optional, Type
9
+ from typing import Any, Dict, List, Optional, Tuple, Type
10
10
 
11
+ from sky import models
11
12
  from sky import sky_logging
12
13
  # These provision.<cloud> modules should never fail even if underlying cloud SDK
13
14
  # dependencies are not installed. This is ensured by using sky.adaptors inside
@@ -18,11 +19,17 @@ from sky.provision import common
18
19
  from sky.provision import cudo
19
20
  from sky.provision import fluidstack
20
21
  from sky.provision import gcp
22
+ from sky.provision import hyperbolic
21
23
  from sky.provision import kubernetes
22
24
  from sky.provision import lambda_cloud
23
25
  from sky.provision import nebius
24
26
  from sky.provision import oci
27
+ from sky.provision import primeintellect
25
28
  from sky.provision import runpod
29
+ from sky.provision import scp
30
+ from sky.provision import seeweb
31
+ from sky.provision import shadeform
32
+ from sky.provision import ssh
26
33
  from sky.provision import vast
27
34
  from sky.provision import vsphere
28
35
  from sky.utils import command_runner
@@ -69,16 +76,24 @@ def _route_to_cloud_impl(func):
69
76
  @_route_to_cloud_impl
70
77
  def query_instances(
71
78
  provider_name: str,
79
+ cluster_name: str,
72
80
  cluster_name_on_cloud: str,
73
81
  provider_config: Optional[Dict[str, Any]] = None,
74
82
  non_terminated_only: bool = True,
75
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
83
+ retry_if_missing: bool = False,
84
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
76
85
  """Query instances.
77
86
 
78
- Returns a dictionary of instance IDs and status.
87
+ Returns a dictionary of instance IDs and a tuple of (status, reason for
88
+ being in status if any).
79
89
 
80
90
  A None status means the instance is marked as "terminated"
81
91
  or "terminating".
92
+
93
+ Args:
94
+ retry_if_missing: Whether to retry the call to the cloud api if the
95
+ cluster is not found when querying the live status on the cloud.
96
+ NOTE: This is currently only used on kubernetes.
82
97
  """
83
98
  raise NotImplementedError
84
99
 
@@ -101,7 +116,67 @@ def bootstrap_instances(
101
116
 
102
117
 
103
118
  @_route_to_cloud_impl
104
- def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
119
+ def apply_volume(provider_name: str,
120
+ volume_config: models.VolumeConfig) -> models.VolumeConfig:
121
+ """Create or register a volume.
122
+
123
+ This function creates or registers a volume with the provided configuration,
124
+ and returns a VolumeConfig object with updated configuration.
125
+ """
126
+ raise NotImplementedError
127
+
128
+
129
+ @_route_to_cloud_impl
130
+ def delete_volume(provider_name: str,
131
+ volume_config: models.VolumeConfig) -> models.VolumeConfig:
132
+ """Delete a volume."""
133
+ raise NotImplementedError
134
+
135
+
136
+ @_route_to_cloud_impl
137
+ def get_volume_usedby(
138
+ provider_name: str,
139
+ volume_config: models.VolumeConfig,
140
+ ) -> Tuple[List[str], List[str]]:
141
+ """Get the usedby of a volume.
142
+
143
+ Returns:
144
+ usedby_pods: List of pods using the volume. These may include pods
145
+ not created by SkyPilot.
146
+ usedby_clusters: List of clusters using the volume.
147
+ """
148
+ raise NotImplementedError
149
+
150
+
151
+ @_route_to_cloud_impl
152
+ def get_all_volumes_usedby(
153
+ provider_name: str, configs: List[models.VolumeConfig]
154
+ ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
155
+ """Get the usedby of all the given volumes.
156
+
157
+ Returns:
158
+ usedby_pods: List of dictionaries, each containing the config keys for
159
+ a volume and a key containing pods using the volume.
160
+ These may include pods not created by SkyPilot.
161
+ usedby_clusters: List of dictionaries, each containing the config keys
162
+ for a volume and a key containing clusters using
163
+ the volume.
164
+ """
165
+ raise NotImplementedError
166
+
167
+
168
+ @_route_to_cloud_impl
169
+ def map_all_volumes_usedby(
170
+ provider_name: str, used_by_pods: Dict[str, Any],
171
+ used_by_clusters: Dict[str, Any],
172
+ config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
173
+ """Map the usedby resources of a volume."""
174
+ raise NotImplementedError
175
+
176
+
177
+ @_route_to_cloud_impl
178
+ def run_instances(provider_name: str, region: str, cluster_name: str,
179
+ cluster_name_on_cloud: str,
105
180
  config: common.ProvisionConfig) -> common.ProvisionRecord:
106
181
  """Start instances with bootstrapped configuration."""
107
182
  raise NotImplementedError
@@ -129,6 +204,17 @@ def terminate_instances(
129
204
  raise NotImplementedError
130
205
 
131
206
 
207
+ @_route_to_cloud_impl
208
+ def cleanup_custom_multi_network(
209
+ provider_name: str,
210
+ cluster_name_on_cloud: str,
211
+ provider_config: Dict[str, Any],
212
+ failover: bool = False,
213
+ ) -> None:
214
+ """Cleanup custom multi-network."""
215
+ raise NotImplementedError
216
+
217
+
132
218
  @_route_to_cloud_impl
133
219
  def open_ports(
134
220
  provider_name: str,