skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/resources.py CHANGED
@@ -1,38 +1,120 @@
1
1
  """Resources: compute requirements of Tasks."""
2
+ import collections
2
3
  import dataclasses
4
+ import re
3
5
  import textwrap
4
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
6
+ import typing
7
+ from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
5
8
 
6
9
  import colorama
7
10
 
11
+ from sky import catalog
8
12
  from sky import check as sky_check
9
13
  from sky import clouds
10
14
  from sky import exceptions
11
15
  from sky import sky_logging
12
16
  from sky import skypilot_config
13
17
  from sky.clouds import cloud as sky_cloud
14
- from sky.clouds import service_catalog
15
18
  from sky.provision import docker_utils
19
+ from sky.provision.gcp import constants as gcp_constants
16
20
  from sky.provision.kubernetes import utils as kubernetes_utils
21
+ from sky.provision.nebius import constants as nebius_constants
22
+ from sky.skylet import autostop_lib
17
23
  from sky.skylet import constants
18
24
  from sky.utils import accelerator_registry
19
25
  from sky.utils import annotations
20
26
  from sky.utils import common_utils
21
27
  from sky.utils import config_utils
28
+ from sky.utils import infra_utils
22
29
  from sky.utils import log_utils
23
30
  from sky.utils import registry
24
31
  from sky.utils import resources_utils
25
32
  from sky.utils import schemas
26
33
  from sky.utils import ux_utils
27
34
 
35
+ if typing.TYPE_CHECKING:
36
+ from sky.utils import volume as volume_lib
37
+
28
38
  logger = sky_logging.init_logger(__name__)
29
39
 
30
- _DEFAULT_DISK_SIZE_GB = 256
40
+ DEFAULT_DISK_SIZE_GB = 256
31
41
 
32
42
  RESOURCE_CONFIG_ALIASES = {
33
43
  'gpus': 'accelerators',
34
44
  }
35
45
 
46
+ MEMORY_SIZE_UNITS = {
47
+ 'b': 1,
48
+ 'k': 2**10,
49
+ 'kb': 2**10,
50
+ 'm': 2**20,
51
+ 'mb': 2**20,
52
+ 'g': 2**30,
53
+ 'gb': 2**30,
54
+ 't': 2**40,
55
+ 'tb': 2**40,
56
+ 'p': 2**50,
57
+ 'pb': 2**50,
58
+ }
59
+
60
+
61
+ @dataclasses.dataclass
62
+ class AutostopConfig:
63
+ """Configuration for autostop."""
64
+ # enabled isn't present in the yaml config, but it's needed for this class
65
+ # to be complete.
66
+ enabled: bool
67
+ # If enabled is False, these values are ignored.
68
+ # Keep the default value to 0 to make the behavior consistent with the CLI
69
+ # flags.
70
+ idle_minutes: int = 0
71
+ down: bool = False
72
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = None
73
+
74
+ def to_yaml_config(self) -> Union[Literal[False], Dict[str, Any]]:
75
+ if not self.enabled:
76
+ return False
77
+ config: Dict[str, Any] = {
78
+ 'idle_minutes': self.idle_minutes,
79
+ 'down': self.down,
80
+ }
81
+ if self.wait_for is not None:
82
+ config['wait_for'] = self.wait_for.value
83
+ return config
84
+
85
+ @classmethod
86
+ def from_yaml_config(
87
+ cls, config: Union[bool, int, str, Dict[str, Any], None]
88
+ ) -> Optional['AutostopConfig']:
89
+ if isinstance(config, bool):
90
+ if config:
91
+ return cls(enabled=True)
92
+ else:
93
+ return cls(enabled=False)
94
+
95
+ if isinstance(config, int):
96
+ return cls(idle_minutes=config, down=False, enabled=True)
97
+
98
+ if isinstance(config, str):
99
+ return cls(idle_minutes=resources_utils.parse_time_minutes(config),
100
+ down=False,
101
+ enabled=True)
102
+
103
+ if isinstance(config, dict):
104
+ # If we have a dict, autostop is enabled. (Only way to disable is
105
+ # with `false`, a bool.)
106
+ autostop_config = cls(enabled=True)
107
+ if 'idle_minutes' in config:
108
+ autostop_config.idle_minutes = config['idle_minutes']
109
+ if 'down' in config:
110
+ autostop_config.down = config['down']
111
+ if 'wait_for' in config:
112
+ autostop_config.wait_for = (
113
+ autostop_lib.AutostopWaitFor.from_str(config['wait_for']))
114
+ return autostop_config
115
+
116
+ return None
117
+
36
118
 
37
119
  class Resources:
38
120
  """Resources: compute requirements of Tasks.
@@ -51,7 +133,7 @@ class Resources:
51
133
  """
52
134
  # If any fields changed, increment the version. For backward compatibility,
53
135
  # modify the __setstate__ method to handle the old version.
54
- _VERSION = 22
136
+ _VERSION = 28
55
137
 
56
138
  def __init__(
57
139
  self,
@@ -59,17 +141,23 @@ class Resources:
59
141
  instance_type: Optional[str] = None,
60
142
  cpus: Union[None, int, float, str] = None,
61
143
  memory: Union[None, int, float, str] = None,
62
- accelerators: Union[None, str, Dict[str, int]] = None,
144
+ accelerators: Union[None, str, Dict[str, Union[int, float]]] = None,
63
145
  accelerator_args: Optional[Dict[str, str]] = None,
146
+ infra: Optional[str] = None,
64
147
  use_spot: Optional[bool] = None,
65
- job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
148
+ job_recovery: Optional[Union[Dict[str, Optional[Union[str, int]]],
149
+ str]] = None,
66
150
  region: Optional[str] = None,
67
151
  zone: Optional[str] = None,
68
- image_id: Union[Dict[str, str], str, None] = None,
69
- disk_size: Optional[int] = None,
152
+ image_id: Union[Dict[Optional[str], str], str, None] = None,
153
+ disk_size: Optional[Union[str, int]] = None,
70
154
  disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
155
+ network_tier: Optional[Union[str, resources_utils.NetworkTier]] = None,
71
156
  ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
72
157
  labels: Optional[Dict[str, str]] = None,
158
+ autostop: Union[bool, int, str, Dict[str, Any], None] = None,
159
+ priority: Optional[int] = None,
160
+ volumes: Optional[List[Dict[str, Any]]] = None,
73
161
  # Internal use only.
74
162
  # pylint: disable=invalid-name
75
163
  _docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
@@ -77,6 +165,7 @@ class Resources:
77
165
  _is_image_managed: Optional[bool] = None,
78
166
  _requires_fuse: Optional[bool] = None,
79
167
  _cluster_config_overrides: Optional[Dict[str, Any]] = None,
168
+ _no_missing_accel_warnings: Optional[bool] = None,
80
169
  ):
81
170
  """Initialize a Resources object.
82
171
 
@@ -87,9 +176,9 @@ class Resources:
87
176
  .. code-block:: python
88
177
 
89
178
  # Fully specified cloud and instance type (is_launchable() is True).
90
- sky.Resources(clouds.AWS(), 'p3.2xlarge')
91
- sky.Resources(clouds.GCP(), 'n1-standard-16')
92
- sky.Resources(clouds.GCP(), 'n1-standard-8', 'V100')
179
+ sky.Resources(infra='aws', instance_type='p3.2xlarge')
180
+ sky.Resources(infra='k8s/my-cluster-ctx', accelerators='V100')
181
+ sky.Resources(infra='gcp/us-central1', accelerators='V100')
93
182
 
94
183
  # Specifying required resources; the system decides the
95
184
  # cloud/instance type. The below are equivalent:
@@ -98,8 +187,9 @@ class Resources:
98
187
  sky.Resources(accelerators={'V100': 1})
99
188
  sky.Resources(cpus='2+', memory='16+', accelerators='V100')
100
189
 
190
+
101
191
  Args:
102
- cloud: the cloud to use.
192
+ cloud: the cloud to use. Deprecated. Use `infra` instead.
103
193
  instance_type: the instance type to use.
104
194
  cpus: the number of CPUs required for the task.
105
195
  If a str, must be a string of the form ``'2'`` or ``'2+'``, where
@@ -113,6 +203,11 @@ class Resources:
113
203
  dict of the form ``{'V100': 2}`` or ``{'tpu-v2-8': 1}``.
114
204
  accelerator_args: accelerator-specific arguments. For example,
115
205
  ``{'tpu_vm': True, 'runtime_version': 'tpu-vm-base'}`` for TPUs.
206
+ infra: a string specifying the infrastructure to use, in the format
207
+ of "cloud/region" or "cloud/region/zone". For example,
208
+ `aws/us-east-1` or `k8s/my-cluster-ctx`. This is an alternative to
209
+ specifying cloud, region, and zone separately. If provided, it
210
+ takes precedence over cloud, region, and zone parameters.
116
211
  use_spot: whether to use spot instances. If None, defaults to
117
212
  False.
118
213
  job_recovery: the job recovery strategy to use for the managed
@@ -125,8 +220,8 @@ class Resources:
125
220
  - max_restarts_on_errors: the max number of restarts on user code
126
221
  errors.
127
222
 
128
- region: the region to use.
129
- zone: the zone to use.
223
+ region: the region to use. Deprecated. Use `infra` instead.
224
+ zone: the zone to use. Deprecated. Use `infra` instead.
130
225
  image_id: the image ID to use. If a str, must be a string
131
226
  of the image id from the cloud, such as AWS:
132
227
  ``'ami-1234567890abcdef0'``, GCP:
@@ -145,6 +240,8 @@ class Resources:
145
240
  disk_size: the size of the OS disk in GiB.
146
241
  disk_tier: the disk performance tier to use. If None, defaults to
147
242
  ``'medium'``.
243
+ network_tier: the network performance tier to use. If None, defaults to
244
+ ``'standard'``.
148
245
  ports: the ports to open on the instance.
149
246
  labels: the labels to apply to the instance. These are useful for
150
247
  assigning metadata that may be used by external tools.
@@ -152,6 +249,12 @@ class Resources:
152
249
  instance tags. On GCP, labels map to instance labels. On
153
250
  Kubernetes, labels map to pod labels. On other clouds, labels are
154
251
  not supported and will be ignored.
252
+ autostop: the autostop configuration to use. For launched resources,
253
+ may or may not correspond to the actual current autostop config.
254
+ priority: the priority for this resource configuration. Must be an
255
+ integer from -1000 to 1000, where higher values indicate higher priority.
256
+ If None, no priority is set.
257
+ volumes: the volumes to mount on the instance.
155
258
  _docker_login_config: the docker configuration to use. This includes
156
259
  the docker username, password, and registry server. If None, skip
157
260
  docker login.
@@ -169,6 +272,25 @@ class Resources:
169
272
  exceptions.NoCloudAccessError: if no public cloud is enabled.
170
273
  """
171
274
  self._version = self._VERSION
275
+
276
+ if infra is not None and (cloud is not None or region is not None or
277
+ zone is not None):
278
+ with ux_utils.print_exception_no_traceback():
279
+ raise ValueError('Cannot specify both `infra` and `cloud`, '
280
+ '`region`, or `zone` parameters. '
281
+ f'Got: infra={infra}, cloud={cloud}, '
282
+ f'region={region}, zone={zone}')
283
+
284
+ # Infra is user facing, and cloud, region, zone in parameters are for
285
+ # backward compatibility. Internally, we keep using cloud, region, zone
286
+ # for simplicity.
287
+ if infra is not None:
288
+ infra_info = infra_utils.InfraInfo.from_str(infra)
289
+ # Infra takes precedence over individually specified parameters
290
+ cloud = registry.CLOUD_REGISTRY.from_str(infra_info.cloud)
291
+ region = infra_info.region
292
+ zone = infra_info.zone
293
+
172
294
  self._cloud = cloud
173
295
  self._region: Optional[str] = region
174
296
  self._zone: Optional[str] = zone
@@ -177,7 +299,8 @@ class Resources:
177
299
 
178
300
  self._use_spot_specified = use_spot is not None
179
301
  self._use_spot = use_spot if use_spot is not None else False
180
- self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
302
+ self._job_recovery: Optional[Dict[str, Optional[Union[str,
303
+ int]]]] = None
181
304
  if job_recovery is not None:
182
305
  if isinstance(job_recovery, str):
183
306
  job_recovery = {'strategy': job_recovery}
@@ -188,20 +311,17 @@ class Resources:
188
311
  if strategy_name == 'none':
189
312
  self._job_recovery = None
190
313
  else:
191
- if strategy_name is not None:
314
+ if isinstance(strategy_name, str):
192
315
  job_recovery['strategy'] = strategy_name.upper()
193
316
  self._job_recovery = job_recovery
194
317
 
195
318
  if disk_size is not None:
196
- if round(disk_size) != disk_size:
197
- with ux_utils.print_exception_no_traceback():
198
- raise ValueError(
199
- f'OS disk size must be an integer. Got: {disk_size}.')
200
- self._disk_size = int(disk_size)
319
+ self._disk_size = int(
320
+ resources_utils.parse_memory_resource(disk_size, 'disk_size'))
201
321
  else:
202
- self._disk_size = _DEFAULT_DISK_SIZE_GB
322
+ self._disk_size = DEFAULT_DISK_SIZE_GB
203
323
 
204
- self._image_id = image_id
324
+ self._image_id: Optional[Dict[Optional[str], str]] = None
205
325
  if isinstance(image_id, str):
206
326
  self._image_id = {self._region: image_id.strip()}
207
327
  elif isinstance(image_id, dict):
@@ -209,8 +329,13 @@ class Resources:
209
329
  self._image_id = {self._region: image_id[None].strip()}
210
330
  else:
211
331
  self._image_id = {
212
- k.strip(): v.strip() for k, v in image_id.items()
332
+ typing.cast(str, k).strip(): v.strip()
333
+ for k, v in image_id.items()
213
334
  }
335
+ else:
336
+ self._image_id = image_id
337
+ if isinstance(self._cloud, clouds.Kubernetes):
338
+ _maybe_add_docker_prefix_to_image_id(self._image_id)
214
339
  self._is_image_managed = _is_image_managed
215
340
 
216
341
  if isinstance(disk_tier, str):
@@ -224,11 +349,25 @@ class Resources:
224
349
  disk_tier = resources_utils.DiskTier(disk_tier_str)
225
350
  self._disk_tier = disk_tier
226
351
 
352
+ if isinstance(network_tier, str):
353
+ network_tier_str = str(network_tier).lower()
354
+ supported_tiers = [
355
+ tier.value for tier in resources_utils.NetworkTier
356
+ ]
357
+ if network_tier_str not in supported_tiers:
358
+ with ux_utils.print_exception_no_traceback():
359
+ raise ValueError(
360
+ f'Invalid network_tier {network_tier_str!r}. '
361
+ f'Network tier must be one of '
362
+ f'{", ".join(supported_tiers)}.')
363
+ network_tier = resources_utils.NetworkTier(network_tier_str)
364
+ self._network_tier = network_tier
365
+
227
366
  if ports is not None:
228
367
  if isinstance(ports, tuple):
229
368
  ports = list(ports)
230
369
  if not isinstance(ports, list):
231
- ports = [ports]
370
+ ports = [str(ports)]
232
371
  ports = resources_utils.simplify_ports(
233
372
  [str(port) for port in ports])
234
373
  if not ports:
@@ -250,11 +389,18 @@ class Resources:
250
389
  self._requires_fuse = _requires_fuse
251
390
 
252
391
  self._cluster_config_overrides = _cluster_config_overrides
253
- self._cached_repr = None
392
+ self._cached_repr: Optional[str] = None
393
+ self._no_missing_accel_warnings = _no_missing_accel_warnings
394
+
395
+ # Initialize _priority before calling the setter
396
+ self._priority: Optional[int] = None
254
397
 
255
398
  self._set_cpus(cpus)
256
399
  self._set_memory(memory)
257
400
  self._set_accelerators(accelerators, accelerator_args)
401
+ self._set_autostop_config(autostop)
402
+ self._set_priority(priority)
403
+ self._set_volumes(volumes)
258
404
 
259
405
  def validate(self):
260
406
  """Validate the resources and infer the missing fields if possible."""
@@ -265,6 +411,7 @@ class Resources:
265
411
  self._try_validate_managed_job_attributes()
266
412
  self._try_validate_image_id()
267
413
  self._try_validate_disk_tier()
414
+ self._try_validate_volumes()
268
415
  self._try_validate_ports()
269
416
  self._try_validate_labels()
270
417
 
@@ -273,7 +420,7 @@ class Resources:
273
420
  # if it fails to fetch some account specific catalog information (e.g., AWS
274
421
  # zone mapping). It is fine to use the default catalog as this function is
275
422
  # only for display purposes.
276
- @service_catalog.fallback_to_default_catalog
423
+ @catalog.fallback_to_default_catalog
277
424
  def __repr__(self) -> str:
278
425
  """Returns a string representation for display.
279
426
 
@@ -330,8 +477,12 @@ class Resources:
330
477
  if self.disk_tier is not None:
331
478
  disk_tier = f', disk_tier={self.disk_tier.value}'
332
479
 
480
+ network_tier = ''
481
+ if self.network_tier is not None:
482
+ network_tier = f', network_tier={self.network_tier.value}'
483
+
333
484
  disk_size = ''
334
- if self.disk_size != _DEFAULT_DISK_SIZE_GB:
485
+ if self.disk_size != DEFAULT_DISK_SIZE_GB:
335
486
  disk_size = f', disk_size={self.disk_size}'
336
487
 
337
488
  ports = ''
@@ -349,7 +500,7 @@ class Resources:
349
500
  hardware_str = (
350
501
  f'{instance_type}{use_spot}'
351
502
  f'{cpus}{memory}{accelerators}{accelerator_args}{image_id}'
352
- f'{disk_tier}{disk_size}{ports}')
503
+ f'{disk_tier}{network_tier}{disk_size}{ports}')
353
504
  # It may have leading ',' (for example, instance_type not set) or empty
354
505
  # spaces. Remove them.
355
506
  while hardware_str and hardware_str[0] in (',', ' '):
@@ -366,7 +517,10 @@ class Resources:
366
517
  def repr_with_region_zone(self) -> str:
367
518
  region_str = ''
368
519
  if self.region is not None:
369
- region_str = f', region={self.region}'
520
+ region_name = self.region
521
+ if self.region.startswith('ssh-'):
522
+ region_name = common_utils.removeprefix(self.region, 'ssh-')
523
+ region_str = f', region={region_name}'
370
524
  zone_str = ''
371
525
  if self.zone is not None:
372
526
  zone_str = f', zone={self.zone}'
@@ -378,19 +532,24 @@ class Resources:
378
532
  return repr_str
379
533
 
380
534
@property
def infra(self) -> infra_utils.InfraInfo:
    """The cloud, region and zone of this resource bundled as an InfraInfo.

    The cloud is passed in its ``str()`` form, or as None when no cloud is
    set; region and zone are forwarded unchanged.
    """
    cloud_name = None
    if self.cloud is not None:
        cloud_name = str(self.cloud)
    return infra_utils.InfraInfo(cloud_name, self.region, self.zone)
538
+
539
+ @property
540
+ def cloud(self) -> Optional[clouds.Cloud]:
382
541
  return self._cloud
383
542
 
384
543
  @property
385
- def region(self):
544
+ def region(self) -> Optional[str]:
386
545
  return self._region
387
546
 
388
547
  @property
389
- def zone(self):
548
+ def zone(self) -> Optional[str]:
390
549
  return self._zone
391
550
 
392
551
  @property
393
- def instance_type(self):
552
+ def instance_type(self) -> Optional[str]:
394
553
  return self._instance_type
395
554
 
396
555
  @property
@@ -432,9 +591,9 @@ class Resources:
432
591
  def accelerators(self) -> Optional[Dict[str, Union[int, float]]]:
433
592
  """Returns the accelerators field directly or by inferring.
434
593
 
435
- For example, Resources(AWS, 'p3.2xlarge') has its accelerators field
436
- set to None, but this function will infer {'V100': 1} from the instance
437
- type.
594
+ For example, Resources(infra='aws', instance_type='p3.2xlarge') has its
595
+ accelerators field set to None, but this function will infer {'V100': 1}
596
+ from the instance type.
438
597
  """
439
598
  if self._accelerators is not None:
440
599
  return self._accelerators
@@ -444,7 +603,7 @@ class Resources:
444
603
  return None
445
604
 
446
605
  @property
447
- def accelerator_args(self) -> Optional[Dict[str, str]]:
606
+ def accelerator_args(self) -> Optional[Dict[str, Any]]:
448
607
  return self._accelerator_args
449
608
 
450
609
  @property
@@ -456,7 +615,7 @@ class Resources:
456
615
  return self._use_spot_specified
457
616
 
458
617
  @property
459
- def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
618
+ def job_recovery(self) -> Optional[Dict[str, Optional[Union[str, int]]]]:
460
619
  return self._job_recovery
461
620
 
462
621
  @property
@@ -464,13 +623,17 @@ class Resources:
464
623
  return self._disk_size
465
624
 
466
625
  @property
467
- def image_id(self) -> Optional[Dict[str, str]]:
626
+ def image_id(self) -> Optional[Dict[Optional[str], str]]:
468
627
  return self._image_id
469
628
 
470
629
  @property
471
- def disk_tier(self) -> resources_utils.DiskTier:
630
+ def disk_tier(self) -> Optional[resources_utils.DiskTier]:
472
631
  return self._disk_tier
473
632
 
633
@property
def network_tier(self) -> Optional[resources_utils.NetworkTier]:
    """The network tier stored on this resource, or None when unset."""
    tier = self._network_tier
    return tier
636
+
474
637
  @property
475
638
  def ports(self) -> Optional[List[str]]:
476
639
  return self._ports
@@ -479,6 +642,28 @@ class Resources:
479
642
  def labels(self) -> Optional[Dict[str, str]]:
480
643
  return self._labels
481
644
 
645
@property
def volumes(self) -> Optional[List[Dict[str, Any]]]:
    """The volume specs stored on this resource, or None when there are none."""
    stored_volumes = self._volumes
    return stored_volumes
648
+
649
@property
def autostop_config(self) -> Optional[AutostopConfig]:
    """The autostop config that was requested for this resource.

    Warning: this reflects the config originally used to launch the
    resources; the autostop setting currently in effect may differ from it.
    """
    requested = self._autostop_config
    return requested
658
+
659
@property
def priority(self) -> Optional[int]:
    """The priority assigned to this resource configuration, if any.

    A larger value means a higher priority. Valid range is -1000 to 1000.
    """
    stored_priority = self._priority
    return stored_priority
666
+
482
667
  @property
483
668
  def is_image_managed(self) -> Optional[bool]:
484
669
  return self._is_image_managed
@@ -489,15 +674,32 @@ class Resources:
489
674
  return False
490
675
  return self._requires_fuse
491
676
 
677
@property
def no_missing_accel_warnings(self) -> bool:
    """Whether quiet mode is forced for this resource.

    An unset (None) flag is reported as False.
    """
    flag = self._no_missing_accel_warnings
    return False if flag is None else flag
683
+
684
def set_requires_fuse(self, value: bool) -> None:
    """Record whether this resource needs FUSE mounting support.

    Args:
        value: True if the resource requires FUSE mounting support.
    """
    # TODO(zeping): This violates the immutability of Resources.
    # Refactor to use Resources.copy instead.
    setattr(self, '_requires_fuse', value)
693
+
492
694
  @property
493
695
  def cluster_config_overrides(self) -> Dict[str, Any]:
494
696
  if self._cluster_config_overrides is None:
495
697
  return {}
496
698
  return self._cluster_config_overrides
497
699
 
498
- @requires_fuse.setter
499
- def requires_fuse(self, value: Optional[bool]) -> None:
500
- self._requires_fuse = value
700
@property
def docker_login_config(self) -> Optional[docker_utils.DockerLoginConfig]:
    """The docker login config stored on this resource, or None when unset."""
    login_config = self._docker_login_config
    return login_config
501
703
 
502
704
  @property
503
705
  def docker_username_for_runpod(self) -> Optional[str]:
@@ -541,25 +743,27 @@ class Resources:
541
743
  self._memory = None
542
744
  return
543
745
 
544
- self._memory = str(memory)
545
- if isinstance(memory, str):
546
- if memory.endswith(('+', 'x')):
547
- # 'x' is used internally for make sure our resources used by
548
- # jobs controller (memory: 3x) to have enough memory based on
549
- # the vCPUs.
550
- num_memory_gb = memory[:-1]
551
- else:
552
- num_memory_gb = memory
553
-
554
- try:
555
- memory_gb = float(num_memory_gb)
556
- except ValueError:
557
- with ux_utils.print_exception_no_traceback():
558
- raise ValueError(
559
- f'The "memory" field should be either a number or '
560
- f'a string "<number>+". Found: {memory!r}') from None
746
+ memory = resources_utils.parse_memory_resource(str(memory),
747
+ 'memory',
748
+ ret_type=float,
749
+ allow_plus=True,
750
+ allow_x=True)
751
+ self._memory = memory
752
+ if memory.endswith(('+', 'x')):
753
+ # 'x' is used internally to make sure our resources used by
754
+ # jobs controller (memory: 3x) to have enough memory based on
755
+ # the vCPUs.
756
+ num_memory_gb = memory[:-1]
561
757
  else:
562
- memory_gb = float(memory)
758
+ num_memory_gb = memory
759
+
760
+ try:
761
+ memory_gb = float(num_memory_gb)
762
+ except ValueError:
763
+ with ux_utils.print_exception_no_traceback():
764
+ raise ValueError(
765
+ f'The "memory" field should be either a number or '
766
+ f'a string "<number>+". Found: {memory!r}') from None
563
767
 
564
768
  if memory_gb <= 0:
565
769
  with ux_utils.print_exception_no_traceback():
@@ -568,8 +772,8 @@ class Resources:
568
772
 
569
773
  def _set_accelerators(
570
774
  self,
571
- accelerators: Union[None, str, Dict[str, int]],
572
- accelerator_args: Optional[Dict[str, str]],
775
+ accelerators: Union[None, str, Dict[str, Union[int, float]]],
776
+ accelerator_args: Optional[Dict[str, Any]],
573
777
  ) -> None:
574
778
  """Sets accelerators.
575
779
 
@@ -582,6 +786,8 @@ class Resources:
582
786
  if ':' not in accelerators:
583
787
  accelerators = {accelerators: 1}
584
788
  else:
789
+ assert isinstance(accelerators,
790
+ str), (type(accelerators), accelerators)
585
791
  splits = accelerators.split(':')
586
792
  parse_error = ('The "accelerators" field as a str '
587
793
  'should be <name> or <name>:<cnt>. '
@@ -599,22 +805,29 @@ class Resources:
599
805
 
600
806
  acc, _ = list(accelerators.items())[0]
601
807
  if 'tpu' in acc.lower():
808
+ # TODO(syang): GCP TPU names are supported on both GCP and
809
+ # kubernetes (GKE), but this logic automatically assumes
810
+ # GCP TPUs can only be used on GCP.
811
+ # Fix the logic such that GCP TPU names can failover between
812
+ # GCP and kubernetes.
602
813
  if self.cloud is None:
603
- if kubernetes_utils.is_tpu_on_gke(acc):
814
+ if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
604
815
  self._cloud = clouds.Kubernetes()
605
816
  else:
606
817
  self._cloud = clouds.GCP()
607
- assert (self.cloud.is_same_cloud(clouds.GCP()) or
608
- self.cloud.is_same_cloud(clouds.Kubernetes())), (
609
- 'Cloud must be GCP or Kubernetes for TPU '
610
- 'accelerators.')
818
+ assert self.cloud is not None and (
819
+ self.cloud.is_same_cloud(clouds.GCP()) or
820
+ self.cloud.is_same_cloud(clouds.Kubernetes())), (
821
+ 'Cloud must be GCP or Kubernetes for TPU '
822
+ 'accelerators.')
611
823
 
612
824
  if accelerator_args is None:
613
825
  accelerator_args = {}
614
826
 
615
827
  use_tpu_vm = accelerator_args.get('tpu_vm', True)
616
828
  if (self.cloud.is_same_cloud(clouds.GCP()) and
617
- not kubernetes_utils.is_tpu_on_gke(acc)):
829
+ not kubernetes_utils.is_tpu_on_gke(acc,
830
+ normalize=False)):
618
831
  if 'runtime_version' not in accelerator_args:
619
832
 
620
833
  def _get_default_runtime_version() -> str:
@@ -641,15 +854,159 @@ class Resources:
641
854
  'Cannot specify instance type (got '
642
855
  f'{self.instance_type!r}) for TPU VM.')
643
856
 
644
- self._accelerators = accelerators
645
- self._accelerator_args = accelerator_args
857
+ self._accelerators: Optional[Dict[str, Union[int,
858
+ float]]] = accelerators
859
+ self._accelerator_args: Optional[Dict[str, Any]] = accelerator_args
860
+
861
def _set_autostop_config(
    self,
    autostop: Union[bool, int, str, Dict[str, Any], None],
) -> None:
    """Parse the given autostop value and store the result on this resource.

    Args:
        autostop: The raw autostop setting; it is handed unchanged to
            ``AutostopConfig.from_yaml_config`` and whatever that returns
            (possibly None) becomes ``self._autostop_config``.
    """
    parsed_config = AutostopConfig.from_yaml_config(autostop)
    self._autostop_config = parsed_config
866
+
867
+ def _set_priority(self, priority: Optional[int]) -> None:
868
+ """Sets the priority for this resource configuration.
869
+
870
+ Args:
871
+ priority: Priority value from -1000 to 1000, where higher values
872
+ indicate higher priority. If None, no priority is set.
873
+ """
874
+ if priority is not None:
875
+ if not constants.MIN_PRIORITY <= priority <= constants.MAX_PRIORITY:
876
+ with ux_utils.print_exception_no_traceback():
877
+ raise ValueError(
878
+ f'Priority must be between {constants.MIN_PRIORITY} and'
879
+ f' {constants.MAX_PRIORITY}. Found: {priority}')
880
+ self._priority = priority
881
+
882
+ def _set_volumes(
883
+ self,
884
+ volumes: Optional[List[Dict[str, Any]]],
885
+ ) -> None:
886
+ if not volumes:
887
+ self._volumes = None
888
+ return
889
+ valid_volumes = []
890
+ supported_tiers = [tier.value for tier in resources_utils.DiskTier]
891
+ supported_storage_types = [
892
+ storage_type.value for storage_type in resources_utils.StorageType
893
+ ]
894
+ supported_attach_modes = [
895
+ attach_mode.value for attach_mode in resources_utils.DiskAttachMode
896
+ ]
897
+ network_type = resources_utils.StorageType.NETWORK
898
+ read_write_mode = resources_utils.DiskAttachMode.READ_WRITE
899
+ for volume in volumes:
900
+ if 'path' not in volume:
901
+ with ux_utils.print_exception_no_traceback():
902
+ raise ValueError(f'Invalid volume {volume!r}. '
903
+ f'Volume must have a "path" field.')
904
+ if 'storage_type' not in volume:
905
+ volume['storage_type'] = network_type
906
+ else:
907
+ if isinstance(volume['storage_type'], str):
908
+ storage_type_str = str(volume['storage_type']).lower()
909
+ if storage_type_str not in supported_storage_types:
910
+ logger.warning(
911
+ f'Invalid storage_type {storage_type_str!r}. '
912
+ f'Set it to '
913
+ f'{network_type.value}.')
914
+ volume['storage_type'] = network_type
915
+ else:
916
+ volume['storage_type'] = resources_utils.StorageType(
917
+ storage_type_str)
918
+ if 'auto_delete' not in volume:
919
+ volume['auto_delete'] = False
920
+ if 'attach_mode' in volume:
921
+ if isinstance(volume['attach_mode'], str):
922
+ attach_mode_str = str(volume['attach_mode']).lower()
923
+ if attach_mode_str not in supported_attach_modes:
924
+ logger.warning(
925
+ f'Invalid attach_mode {attach_mode_str!r}. '
926
+ f'Set it to {read_write_mode.value}.')
927
+ volume['attach_mode'] = read_write_mode
928
+ else:
929
+ volume['attach_mode'] = resources_utils.DiskAttachMode(
930
+ attach_mode_str)
931
+ else:
932
+ volume['attach_mode'] = read_write_mode
933
+ if volume['storage_type'] == network_type:
934
+ # TODO(luca): add units to this disk_size as well
935
+ if ('disk_size' in volume and
936
+ round(volume['disk_size']) != volume['disk_size']):
937
+ with ux_utils.print_exception_no_traceback():
938
+ raise ValueError(f'Volume size must be an integer. '
939
+ f'Got: {volume["size"]}.')
940
+ if 'name' not in volume:
941
+ with ux_utils.print_exception_no_traceback():
942
+ raise ValueError(f'Network volume {volume["path"]} '
943
+ f'must have "name" field.')
944
+ elif 'name' in volume:
945
+ logger.info(f'Volume {volume["path"]} is a local disk. '
946
+ f'The "name" field will be ignored.')
947
+ del volume['name']
948
+ if 'disk_tier' in volume:
949
+ if isinstance(volume['disk_tier'], str):
950
+ disk_tier_str = str(volume['disk_tier']).lower()
951
+ if disk_tier_str not in supported_tiers:
952
+ logger.warning(
953
+ f'Invalid disk_tier {disk_tier_str!r}. '
954
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
955
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
956
+ else:
957
+ volume['disk_tier'] = resources_utils.DiskTier(
958
+ disk_tier_str)
959
+ elif volume['storage_type'] == network_type:
960
+ logger.debug(
961
+ f'No disk_tier specified for volume {volume["path"]}. '
962
+ f'Set it to {resources_utils.DiskTier.BEST.value}.')
963
+ volume['disk_tier'] = resources_utils.DiskTier.BEST
964
+
965
+ valid_volumes.append(volume)
966
+ self._volumes = valid_volumes
967
+
968
def override_autostop_config(
        self,
        down: bool = False,
        idle_minutes: Optional[int] = None,
        wait_for: Optional[autostop_lib.AutostopWaitFor] = None) -> None:
    """Apply overrides on top of this resource's autostop config.

    Nothing happens unless ``down`` is truthy or ``idle_minutes`` is given.
    When an override is requested and no config exists yet, an enabled
    default config is created first.

    Args:
        down: If true, override the autostop config to use autodown.
        idle_minutes: If not None, override the idle minutes to autostop or
            autodown.
        wait_for: If not None, override the wait mode.
    """
    # NOTE(review): a call that passes only `wait_for` is a no-op because of
    # this guard -- confirm that is intended.
    override_requested = down or idle_minutes is not None
    if not override_requested:
        return

    if self._autostop_config is None:
        self._autostop_config = AutostopConfig(enabled=True)
    config = self._autostop_config

    if down:
        config.down = down
    if idle_minutes is not None:
        config.idle_minutes = idle_minutes
    if wait_for is not None:
        config.wait_for = wait_for
646
991
 
647
992
def is_launchable(self) -> bool:
    """Tell whether both a cloud and an instance type are set."""
    if self.cloud is None:
        return False
    return self._instance_type is not None
649
995
 
996
def assert_launchable(self) -> 'LaunchableResources':
    """Assert ``is_launchable()`` and return self typed as launchable.

    This exists so that mypy can treat the object as launchable afterwards.

    Note: the ``cast`` to ``LaunchableResources`` only matters to static
    type checking. At runtime no type is enforced and the very same
    ``Resources`` instance is returned.
    """
    assert self.is_launchable(), self
    launchable = typing.cast(LaunchableResources, self)
    return launchable
1005
+
650
1006
def need_cleanup_after_preemption_or_failure(self) -> bool:
    """Ask the cloud whether this resource needs cleanup after a
    preemption or failure.

    The resource must be launchable; the decision itself is delegated to
    the cloud's method of the same name.
    """
    assert self.is_launchable(), self
    cloud = self.cloud
    assert cloud is not None, 'Cloud must be specified'
    return cloud.need_cleanup_after_preemption_or_failure(self)
654
1011
 
655
1012
  def _try_canonicalize_accelerators(self) -> None:
@@ -706,10 +1063,10 @@ class Resources:
706
1063
  else:
707
1064
  table = log_utils.create_table(['Cloud', 'Hint'])
708
1065
  table.add_row(['-----', '----'])
709
- for cloud, error in cloud_to_errors.items():
1066
+ for cloud_msg, error in cloud_to_errors.items():
710
1067
  reason_str = '\n'.join(textwrap.wrap(
711
1068
  str(error), 80))
712
- table.add_row([str(cloud), reason_str])
1069
+ table.add_row([cloud_msg, reason_str])
713
1070
  hint = table.get_string()
714
1071
  raise ValueError(
715
1072
  f'Invalid (region {self._region!r}, zone '
@@ -741,17 +1098,22 @@ class Resources:
741
1098
  ssh_proxy_command dict with region names as keys).
742
1099
  """
743
1100
  assert self.is_launchable(), self
744
-
745
- regions = self._cloud.regions_with_offering(self._instance_type,
746
- self.accelerators,
747
- self._use_spot,
748
- self._region, self._zone)
1101
+ assert self.cloud is not None, 'Cloud must be specified'
1102
+ assert self._instance_type is not None, (
1103
+ 'Instance type must be specified')
1104
+ regions = self.cloud.regions_with_offering(self._instance_type,
1105
+ self.accelerators,
1106
+ self._use_spot, self._region,
1107
+ self._zone, self)
749
1108
  if self._image_id is not None and None not in self._image_id:
750
1109
  regions = [r for r in regions if r.name in self._image_id]
751
1110
 
752
1111
  # Filter the regions by the skypilot_config
753
- ssh_proxy_command_config = skypilot_config.get_nested(
754
- (str(self._cloud).lower(), 'ssh_proxy_command'), None)
1112
+ ssh_proxy_command_config = skypilot_config.get_effective_region_config(
1113
+ cloud=str(self._cloud).lower(),
1114
+ region=None,
1115
+ keys=('ssh_proxy_command',),
1116
+ default_value=None)
755
1117
  if (isinstance(ssh_proxy_command_config, str) or
756
1118
  ssh_proxy_command_config is None):
757
1119
  # All regions are valid as the regions are not specified for the
@@ -845,6 +1207,10 @@ class Resources:
845
1207
  cpus, mem = self.cloud.get_vcpus_mem_from_instance_type(
846
1208
  self._instance_type)
847
1209
  if self._cpus is not None:
1210
+ assert cpus is not None, (
1211
+ f'Can\'t get vCPUs from instance type: '
1212
+ f'{self._instance_type}, check catalog or '
1213
+ f'specify cpus directly.')
848
1214
  if self._cpus.endswith('+'):
849
1215
  if cpus < float(self._cpus[:-1]):
850
1216
  with ux_utils.print_exception_no_traceback():
@@ -859,6 +1225,10 @@ class Resources:
859
1225
  f'number of vCPUs. {self.instance_type} has {cpus} '
860
1226
  f'vCPUs, but {self._cpus} is requested.')
861
1227
  if self.memory is not None:
1228
+ assert mem is not None, (
1229
+ f'Can\'t get memory from instance type: '
1230
+ f'{self._instance_type}, check catalog or '
1231
+ f'specify memory directly.')
862
1232
  if self.memory.endswith(('+', 'x')):
863
1233
  if mem < float(self.memory[:-1]):
864
1234
  with ux_utils.print_exception_no_traceback():
@@ -882,16 +1252,22 @@ class Resources:
882
1252
  if self._job_recovery is None or self._job_recovery['strategy'] is None:
883
1253
  return
884
1254
  # Validate the job recovery strategy
1255
+ assert isinstance(self._job_recovery['strategy'],
1256
+ str), 'Job recovery strategy must be a string'
885
1257
  registry.JOBS_RECOVERY_STRATEGY_REGISTRY.from_str(
886
1258
  self._job_recovery['strategy'])
887
1259
 
888
1260
def extract_docker_image(self) -> Optional[str]:
    """Return the docker image name encoded in ``image_id``, if any.

    An image is only extracted when ``image_id`` holds exactly one entry
    whose key is either this resource's region or None (any region), and
    whose value starts with ``docker:``. The returned string is that value
    with the ``docker:`` prefix stripped. Every other case yields None.
    """
    images = self.image_id
    if images is None or len(images) != 1:
        return None

    (image_key, image_value), = images.items()
    # The single key must match the region or be None (any region).
    if image_key is not None and image_key != self.region:
        return None

    prefix = 'docker:'
    if image_value.startswith(prefix):
        return image_value[len(prefix):]
    return None
896
1272
 
897
1273
  def _try_validate_image_id(self) -> None:
@@ -900,6 +1276,39 @@ class Resources:
900
1276
  Raises:
901
1277
  ValueError: if the attribute is invalid.
902
1278
  """
1279
+
1280
+ if self._network_tier == resources_utils.NetworkTier.BEST:
1281
+ if isinstance(self._cloud, clouds.GCP):
1282
+ # Handle GPU Direct TCPX requirement for docker images
1283
+ if self._image_id is None:
1284
+ self._image_id = {
1285
+ self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
1286
+ }
1287
+ elif isinstance(self._cloud, clouds.Nebius):
1288
+ if self._image_id is None:
1289
+ self._image_id = {
1290
+ self._region: nebius_constants.INFINIBAND_IMAGE_ID
1291
+ }
1292
+ elif self._image_id:
1293
+ # Custom image specified - validate it's a docker image
1294
+ # Check if any of the specified images are not docker images
1295
+ non_docker_images = []
1296
+ for region, image_id in self._image_id.items():
1297
+ if not image_id.startswith('docker:'):
1298
+ non_docker_images.append(
1299
+ f'{image_id} (region: {region})')
1300
+
1301
+ if non_docker_images:
1302
+ with ux_utils.print_exception_no_traceback():
1303
+ raise ValueError(
1304
+ f'When using network_tier=BEST, image_id '
1305
+ f'must be a docker image. '
1306
+ f'Found non-docker images: '
1307
+ f'{", ".join(non_docker_images)}. '
1308
+ f'Please either: (1) use a docker image '
1309
+ f'(prefix with "docker:"), or '
1310
+ f'(2) leave image_id empty to use the default')
1311
+
903
1312
  if self._image_id is None:
904
1313
  return
905
1314
 
@@ -916,37 +1325,51 @@ class Resources:
916
1325
  'Cloud must be specified when image_id is provided.')
917
1326
 
918
1327
  try:
919
- self._cloud.check_features_are_supported(
1328
+ self.cloud.check_features_are_supported(
920
1329
  self,
921
1330
  requested_features={
922
1331
  clouds.CloudImplementationFeatures.IMAGE_ID
923
1332
  })
924
1333
  except exceptions.NotSupportedError as e:
1334
+ # Provide a more helpful error message for Lambda cloud
1335
+ if self.cloud.is_same_cloud(clouds.Lambda()):
1336
+ with ux_utils.print_exception_no_traceback():
1337
+ raise ValueError(
1338
+ 'Lambda cloud only supports Docker images. '
1339
+ 'Please prefix your image with "docker:" '
1340
+ '(e.g., image_id: docker:your-image-name).') from e
925
1341
  with ux_utils.print_exception_no_traceback():
926
1342
  raise ValueError(
927
1343
  'image_id is only supported for AWS/GCP/Azure/IBM/OCI/'
928
- 'Kubernetes, please explicitly specify the cloud.') from e
1344
+ 'Kubernetes. For Lambda cloud, use "docker:" prefix for '
1345
+ 'Docker images.') from e
929
1346
 
930
1347
  if self._region is not None:
931
- if self._region not in self._image_id:
1348
+ # If the image_id has None as key (region-agnostic),
1349
+ # use it for any region
1350
+ if None in self._image_id:
1351
+ # Replace None key with the actual region
1352
+ self._image_id = {self._region: self._image_id[None]}
1353
+ elif self._region not in self._image_id:
932
1354
  with ux_utils.print_exception_no_traceback():
933
1355
  raise ValueError(
934
1356
  f'image_id {self._image_id} should contain the image '
935
1357
  f'for the specified region {self._region}.')
936
- # Narrow down the image_id to the specified region.
937
- self._image_id = {self._region: self._image_id[self._region]}
1358
+ else:
1359
+ # Narrow down the image_id to the specified region.
1360
+ self._image_id = {self._region: self._image_id[self._region]}
938
1361
 
939
1362
  # Check the image_id's are valid.
940
1363
  for region, image_id in self._image_id.items():
941
1364
  if (image_id.startswith('skypilot:') and
942
- not self._cloud.is_image_tag_valid(image_id, region)):
1365
+ not self.cloud.is_image_tag_valid(image_id, region)):
943
1366
  region_str = f' ({region})' if region else ''
944
1367
  with ux_utils.print_exception_no_traceback():
945
1368
  raise ValueError(
946
1369
  f'Image tag {image_id!r} is not valid, please make sure'
947
1370
  f' the tag exists in {self._cloud}{region_str}.')
948
1371
 
949
- if (self._cloud.is_same_cloud(clouds.AWS()) and
1372
+ if (self.cloud.is_same_cloud(clouds.AWS()) and
950
1373
  not image_id.startswith('skypilot:') and region is None):
951
1374
  with ux_utils.print_exception_no_traceback():
952
1375
  raise ValueError(
@@ -984,6 +1407,47 @@ class Resources:
984
1407
  f'Disk tier {self.disk_tier.value} is not supported '
985
1408
  f'for instance type {self.instance_type}.') from None
986
1409
 
1410
def _try_validate_volumes(self) -> None:
    """Validate the ``volumes`` attribute, if one is set.

    Raises:
        ValueError: if volumes are given without a cloud, on a cloud that
            is not GCP, with a disk tier the cloud rejects for this
            instance type, or with a named network-storage volume while
            neither region nor zone is set.
    """
    volumes = self.volumes
    if volumes is None:
        return

    cloud = self.cloud
    if cloud is None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Cloud must be specified when '
                             'volumes are provided.')
    if not cloud.is_same_cloud(clouds.GCP()):
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Volumes are only supported for GCP'
                             f' not for {self.cloud}.')

    has_named_network_volume = False
    for volume in volumes:
        if ('name' in volume and volume['storage_type']
                == resources_utils.StorageType.NETWORK):
            has_named_network_volume = True
        if 'disk_tier' not in volume:
            continue
        # TODO(hailong): check instance local SSD
        # support for instance_type.
        # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#machine-series-lssd # pylint: disable=line-too-long
        try:
            cloud.check_disk_tier_enabled(self.instance_type,
                                          volume['disk_tier'])
        except exceptions.NotSupportedError:
            with ux_utils.print_exception_no_traceback():
                raise ValueError(
                    f'Disk tier {volume["disk_tier"].value} is not '
                    f'supported for instance type {self.instance_type}.'
                ) from None

    # The location check runs only after every disk tier was accepted.
    location_missing = self._region is None and self._zone is None
    if has_named_network_volume and location_missing:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('When specifying the volume name, please'
                             ' also specify the region or zone.')
1450
+
987
1451
  def _try_validate_ports(self) -> None:
988
1452
  """Try to validate the ports attribute.
989
1453
 
@@ -1051,6 +1515,9 @@ class Resources:
1051
1515
  """Returns cost in USD for the runtime in seconds."""
1052
1516
  hours = seconds / 3600
1053
1517
  # Instance.
1518
+ assert self.cloud is not None, 'Cloud must be specified'
1519
+ assert self._instance_type is not None, (
1520
+ 'Instance type must be specified')
1054
1521
  hourly_cost = self.cloud.instance_type_to_hourly_cost(
1055
1522
  self._instance_type, self.use_spot, self._region, self._zone)
1056
1523
  # Accelerators (if any).
@@ -1071,11 +1538,15 @@ class Resources:
1071
1538
  def get_spot_str(self) -> str:
1072
1539
  return '[Spot]' if self.use_spot else ''
1073
1540
 
1074
- def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
1075
- region: clouds.Region,
1076
- zones: Optional[List[clouds.Zone]],
1077
- num_nodes: int,
1078
- dryrun: bool) -> Dict[str, Optional[str]]:
1541
+ def make_deploy_variables(
1542
+ self,
1543
+ cluster_name: resources_utils.ClusterName,
1544
+ region: clouds.Region,
1545
+ zones: Optional[List[clouds.Zone]],
1546
+ num_nodes: int,
1547
+ dryrun: bool,
1548
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
1549
+ ) -> Dict[str, Optional[str]]:
1079
1550
  """Converts planned sky.Resources to resource variables.
1080
1551
 
1081
1552
  These variables are divided into two categories: cloud-specific and
@@ -1095,8 +1566,9 @@ class Resources:
1095
1566
  docker_image = self.extract_docker_image()
1096
1567
 
1097
1568
  # Cloud specific variables
1569
+ assert self.cloud is not None, 'Cloud must be specified'
1098
1570
  cloud_specific_variables = self.cloud.make_deploy_resources_variables(
1099
- self, cluster_name, region, zones, num_nodes, dryrun)
1571
+ self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)
1100
1572
 
1101
1573
  # TODO(andyl): Should we print some warnings if users' envs share
1102
1574
  # same names with the cloud specific variables, but not enabled
@@ -1147,11 +1619,26 @@ class Resources:
1147
1619
  # to each cloud if any cloud supports reservations for spot.
1148
1620
  return {}
1149
1621
  specific_reservations = set(
1150
- skypilot_config.get_nested(
1151
- (str(self.cloud).lower(), 'specific_reservations'), set()))
1622
+ skypilot_config.get_effective_region_config(
1623
+ cloud=str(self.cloud).lower(),
1624
+ region=self.region,
1625
+ keys=('specific_reservations',),
1626
+ default_value=set()))
1627
+
1628
+ if isinstance(self.cloud, clouds.DummyCloud):
1629
+ return self.cloud.get_reservations_available_resources(
1630
+ instance_type='',
1631
+ region='',
1632
+ zone=None,
1633
+ specific_reservations=specific_reservations)
1634
+
1635
+ assert (self.cloud is not None and self.instance_type is not None and
1636
+ self.region is not None), (
1637
+ f'Cloud, instance type, region must be specified. '
1638
+ f'Resources={self}, cloud={self.cloud}, '
1639
+ f'instance_type={self.instance_type}, region={self.region}')
1152
1640
  return self.cloud.get_reservations_available_resources(
1153
- self._instance_type, self._region, self._zone,
1154
- specific_reservations)
1641
+ self.instance_type, self.region, self.zone, specific_reservations)
1155
1642
 
1156
1643
  def less_demanding_than(
1157
1644
  self,
@@ -1171,6 +1658,9 @@ class Resources:
1171
1658
  if isinstance(other, list):
1172
1659
  resources_list = [self.less_demanding_than(o) for o in other]
1173
1660
  return requested_num_nodes <= sum(resources_list)
1661
+
1662
+ assert other.cloud is not None, 'Other cloud must be specified'
1663
+
1174
1664
  if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
1175
1665
  return False
1176
1666
  # self.cloud <= other.cloud
@@ -1234,6 +1724,12 @@ class Resources:
1234
1724
  if not (self.disk_tier <= other.disk_tier): # pylint: disable=superfluous-parens
1235
1725
  return False
1236
1726
 
1727
+ if self.network_tier is not None:
1728
+ if other.network_tier is None:
1729
+ return False
1730
+ if not self.network_tier <= other.network_tier:
1731
+ return False
1732
+
1237
1733
  if check_ports:
1238
1734
  if self.ports is not None:
1239
1735
  if other.ports is None:
@@ -1259,6 +1755,7 @@ class Resources:
1259
1755
  If a field in `blocked` is None, it should be considered as a wildcard
1260
1756
  for that field.
1261
1757
  """
1758
+ assert self.cloud is not None, 'Cloud must be specified'
1262
1759
  is_matched = True
1263
1760
  if (blocked.cloud is not None and
1264
1761
  not self.cloud.is_same_cloud(blocked.cloud)):
@@ -1273,6 +1770,8 @@ class Resources:
1273
1770
  if (blocked.accelerators is not None and
1274
1771
  self.accelerators != blocked.accelerators):
1275
1772
  is_matched = False
1773
+ if blocked.use_spot is not None and self.use_spot != blocked.use_spot:
1774
+ is_matched = False
1276
1775
  return is_matched
1277
1776
 
1278
1777
  def is_empty(self) -> bool:
@@ -1285,8 +1784,9 @@ class Resources:
1285
1784
  self._accelerators is None,
1286
1785
  self._accelerator_args is None,
1287
1786
  not self._use_spot_specified,
1288
- self._disk_size == _DEFAULT_DISK_SIZE_GB,
1787
+ self._disk_size == DEFAULT_DISK_SIZE_GB,
1289
1788
  self._disk_tier is None,
1789
+ self._network_tier is None,
1290
1790
  self._image_id is None,
1291
1791
  self._ports is None,
1292
1792
  self._docker_login_config is None,
@@ -1297,7 +1797,7 @@ class Resources:
1297
1797
  use_spot = self.use_spot if self._use_spot_specified else None
1298
1798
 
1299
1799
  current_override_configs = self._cluster_config_overrides
1300
- if self._cluster_config_overrides is None:
1800
+ if current_override_configs is None:
1301
1801
  current_override_configs = {}
1302
1802
  new_override_configs = override.pop('_cluster_config_overrides', {})
1303
1803
  overlaid_configs = skypilot_config.overlay_skypilot_config(
@@ -1310,6 +1810,10 @@ class Resources:
1310
1810
  if elem is not None:
1311
1811
  override_configs.set_nested(key, elem)
1312
1812
 
1813
+ current_autostop_config = None
1814
+ if self.autostop_config is not None:
1815
+ current_autostop_config = self.autostop_config.to_yaml_config()
1816
+
1313
1817
  override_configs = dict(override_configs) if override_configs else None
1314
1818
  resources = Resources(
1315
1819
  cloud=override.pop('cloud', self.cloud),
@@ -1326,8 +1830,13 @@ class Resources:
1326
1830
  zone=override.pop('zone', self.zone),
1327
1831
  image_id=override.pop('image_id', self.image_id),
1328
1832
  disk_tier=override.pop('disk_tier', self.disk_tier),
1833
+ network_tier=override.pop('network_tier', self.network_tier),
1329
1834
  ports=override.pop('ports', self.ports),
1330
1835
  labels=override.pop('labels', self.labels),
1836
+ autostop=override.pop('autostop', current_autostop_config),
1837
+ priority=override.pop('priority', self.priority),
1838
+ volumes=override.pop('volumes', self.volumes),
1839
+ infra=override.pop('infra', None),
1331
1840
  _docker_login_config=override.pop('_docker_login_config',
1332
1841
  self._docker_login_config),
1333
1842
  _docker_username_for_runpod=override.pop(
@@ -1337,6 +1846,8 @@ class Resources:
1337
1846
  self._is_image_managed),
1338
1847
  _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
1339
1848
  _cluster_config_overrides=override_configs,
1849
+ _no_missing_accel_warnings=override.pop(
1850
+ 'no_missing_accel_warnings', self._no_missing_accel_warnings),
1340
1851
  )
1341
1852
  assert not override
1342
1853
  return resources
@@ -1361,12 +1872,21 @@ class Resources:
1361
1872
  if (self.disk_tier is not None and
1362
1873
  self.disk_tier != resources_utils.DiskTier.BEST):
1363
1874
  features.add(clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
1875
+ if (self.network_tier is not None and
1876
+ self.network_tier == resources_utils.NetworkTier.BEST):
1877
+ features.add(clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER)
1364
1878
  if self.extract_docker_image() is not None:
1365
1879
  features.add(clouds.CloudImplementationFeatures.DOCKER_IMAGE)
1366
1880
  elif self.image_id is not None:
1367
1881
  features.add(clouds.CloudImplementationFeatures.IMAGE_ID)
1368
1882
  if self.ports is not None:
1369
1883
  features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
1884
+ if self.volumes is not None:
1885
+ for volume in self.volumes:
1886
+ if 'disk_tier' in volume and volume[
1887
+ 'disk_tier'] != resources_utils.DiskTier.BEST:
1888
+ features.add(
1889
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER)
1370
1890
  return features
1371
1891
 
1372
1892
  @staticmethod
@@ -1393,10 +1913,75 @@ class Resources:
1393
1913
  config[canonical] = config[alias]
1394
1914
  del config[alias]
1395
1915
 
1916
@classmethod
def _parse_accelerators_from_str(
        cls, accelerators: str) -> List[Tuple[str, bool]]:
    """Parse an accelerators string into a list of possible accelerators.

    The string is split on ':' and matched against these forms, where
    <memory> is digits followed by MB/GB/TB (case-insensitive) and an
    optional trailing '+':

      * '<manufacturer>:<memory>:<count>'
      * '<memory>:<count>'
      * '<manufacturer>:<memory>'
      * '<memory>'

    Any other string is treated as an accelerator name and returned
    unchanged.

    Args:
        accelerators: The accelerator specification to parse.

    Returns:
        A list of possible accelerators. Each element is a tuple of
        (accelerator_name, was_user_specified). was_user_specified is True
        if the accelerator was directly named by the user (for example
        "H100:2" would be True, but "80GB+" would be False since it doesn't
        mention the name of the accelerator).

    Raises:
        AssertionError: if ``accelerators`` is not a string, or if a
            three-part spec has a middle part that is not a memory size.
        ValueError: if the count part is not an integer.
    """
    # sanity check
    assert isinstance(accelerators, str), accelerators

    # Compiled once here; every branch below tests against the same pattern.
    memory_pattern = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')

    manufacturer: Optional[str] = None
    memory: Optional[str] = None
    count = 1

    parts = accelerators.split(':')
    if len(parts) == 3:
        manufacturer, memory, count_str = parts
        count = int(count_str)
        assert memory_pattern.match(memory), (
            'If specifying a GPU manufacturer, you must also '
            'specify the memory size')
    elif len(parts) == 2 and memory_pattern.match(parts[0]):
        memory, count_str = parts
        count = int(count_str)
    elif len(parts) == 2 and memory_pattern.match(parts[1]):
        manufacturer, memory = parts
    elif len(parts) == 1 and memory_pattern.match(parts[0]):
        memory = parts[0]
    else:
        # it is just an accelerator name, not a memory size
        return [(accelerators, True)]

    # we know we have some case of manufacturer, memory, count, now we
    # need to convert that to a list of possible accelerators
    memory_parsed = resources_utils.parse_memory_resource(memory,
                                                          'accelerators',
                                                          allow_plus=True)
    # A trailing '+' on the parsed value is stripped and passed on as a flag.
    # NOTE(review): presumably it means "at least this much memory" - confirm
    # against accelerator_registry.get_devices_by_memory.
    plus = memory_parsed[-1] == '+'
    if plus:
        memory_parsed = memory_parsed[:-1]
    memory_gb = int(memory_parsed)

    # Candidates found by memory size were not named by the user, hence
    # was_user_specified=False.
    return [(f'{device}:{count}', False)
            for device in accelerator_registry.get_devices_by_memory(
                memory_gb, plus, manufacturer=manufacturer)]
1970
+
1396
1971
  @classmethod
1397
1972
  def from_yaml_config(
1398
1973
  cls, config: Optional[Dict[str, Any]]
1399
1974
  ) -> Union[Set['Resources'], List['Resources']]:
1975
+ """Creates Resources objects from a YAML config.
1976
+
1977
+ Args:
1978
+ config: A dict of resource config.
1979
+
1980
+ Returns:
1981
+ A set of Resources objects if any_of is specified, otherwise a list
1982
+ of Resources objects if ordered is specified, otherwise a set with
1983
+ a single Resources object.
1984
+ """
1400
1985
  if config is None:
1401
1986
  return {Resources()}
1402
1987
 
@@ -1453,13 +2038,48 @@ class Resources:
1453
2038
  accelerators = config.get('accelerators')
1454
2039
  if config and accelerators is not None:
1455
2040
  if isinstance(accelerators, str):
1456
- accelerators = {accelerators}
2041
+ accelerators_list = cls._parse_accelerators_from_str(
2042
+ accelerators)
1457
2043
  elif isinstance(accelerators, dict):
1458
- accelerators = [
2044
+ accelerator_names = [
1459
2045
  f'{k}:{v}' if v is not None else f'{k}'
1460
2046
  for k, v in accelerators.items()
1461
2047
  ]
1462
- accelerators = set(accelerators)
2048
+ accelerators_list = []
2049
+ for accel_name in accelerator_names:
2050
+ parsed_accels = cls._parse_accelerators_from_str(accel_name)
2051
+ accelerators_list.extend(parsed_accels)
2052
+ elif isinstance(accelerators, list) or isinstance(
2053
+ accelerators, set):
2054
+ accelerators_list = []
2055
+ for accel_name in accelerators:
2056
+ parsed_accels = cls._parse_accelerators_from_str(accel_name)
2057
+ accelerators_list.extend(parsed_accels)
2058
+ else:
2059
+ assert False, ('Invalid accelerators type:'
2060
+ f'{type(accelerators)}')
2061
+ # now that accelerators is a list, we need to decide which to
2062
+ # include in the final set, however, there may be multiple copies
2063
+ # of the same accelerator, some given by name by the user and the
2064
+ # other copy being given by memory size. In this case, we only care
2065
+ # about the user specified ones (so we can give a warning if it
2066
+ # doesn't exist).
2067
+ accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
2068
+ for accel, user_specified in accelerators_list:
2069
+ # If this accelerator is not in dict yet, or if current one is
2070
+ # user specified and existing one is not, update the entry
2071
+ accel_to_user_specified[accel] = (user_specified or
2072
+ accel_to_user_specified.get(
2073
+ accel, False))
2074
+
2075
+ # only time we care about ordered is when we are given a list,
2076
+ # otherwise we default to a set
2077
+ accelerators_type = list if isinstance(accelerators, list) else set
2078
+ accelerators = accelerators_type([
2079
+ (accel, user_specified)
2080
+ for accel, user_specified in accel_to_user_specified.items()
2081
+ ])
2082
+
1463
2083
  if len(accelerators) > 1 and ordered_configs:
1464
2084
  with ux_utils.print_exception_no_traceback():
1465
2085
  raise ValueError(
@@ -1469,7 +2089,7 @@ class Resources:
1469
2089
  not isinstance(accelerators, set)):
1470
2090
  with ux_utils.print_exception_no_traceback():
1471
2091
  raise ValueError(
1472
- 'Cannot specify multiple "accelerators" with prefered '
2092
+ 'Cannot specify multiple "accelerators" with preferred '
1473
2093
  'order (i.e., list of accelerators) with "any_of" '
1474
2094
  'in resources.')
1475
2095
 
@@ -1485,23 +2105,35 @@ class Resources:
1485
2105
  # In Task, we store a list of resources, each with 1 accelerator.
1486
2106
  # This for loop is for format conversion.
1487
2107
  tmp_resources_list = []
1488
- for acc in accelerators:
2108
+ for acc, user_specified in accelerators:
1489
2109
  tmp_resource = config.copy()
1490
2110
  tmp_resource['accelerators'] = acc
2111
+ if not user_specified:
2112
+ tmp_resource['_no_missing_accel_warnings'] = True
1491
2113
  tmp_resources_list.append(
1492
2114
  Resources._from_yaml_config_single(tmp_resource))
1493
2115
 
1494
2116
  assert isinstance(accelerators, (list, set)), accelerators
1495
2117
  return type(accelerators)(tmp_resources_list)
1496
-
1497
2118
  return {Resources._from_yaml_config_single(config)}
1498
2119
 
1499
2120
  @classmethod
1500
2121
  def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
2122
+ resources_fields: Dict[str, Any] = {}
2123
+
2124
+ # Extract infra field if present
2125
+ infra = config.pop('infra', None)
2126
+ resources_fields['infra'] = infra
1501
2127
 
1502
- resources_fields = {}
2128
+ # Keep backward compatibility with cloud, region, zone
2129
+ # Note: if both `infra` and any of `cloud`, `region`, `zone` are
2130
+ # specified, it will raise an error during the Resources.__init__
2131
+ # validation.
1503
2132
  resources_fields['cloud'] = registry.CLOUD_REGISTRY.from_str(
1504
2133
  config.pop('cloud', None))
2134
+ resources_fields['region'] = config.pop('region', None)
2135
+ resources_fields['zone'] = config.pop('zone', None)
2136
+
1505
2137
  resources_fields['instance_type'] = config.pop('instance_type', None)
1506
2138
  resources_fields['cpus'] = config.pop('cpus', None)
1507
2139
  resources_fields['memory'] = config.pop('memory', None)
@@ -1519,12 +2151,14 @@ class Resources:
1519
2151
  # exclusive by the schema validation.
1520
2152
  resources_fields['job_recovery'] = config.pop('job_recovery', None)
1521
2153
  resources_fields['disk_size'] = config.pop('disk_size', None)
1522
- resources_fields['region'] = config.pop('region', None)
1523
- resources_fields['zone'] = config.pop('zone', None)
1524
2154
  resources_fields['image_id'] = config.pop('image_id', None)
1525
2155
  resources_fields['disk_tier'] = config.pop('disk_tier', None)
2156
+ resources_fields['network_tier'] = config.pop('network_tier', None)
1526
2157
  resources_fields['ports'] = config.pop('ports', None)
1527
2158
  resources_fields['labels'] = config.pop('labels', None)
2159
+ resources_fields['autostop'] = config.pop('autostop', None)
2160
+ resources_fields['priority'] = config.pop('priority', None)
2161
+ resources_fields['volumes'] = config.pop('volumes', None)
1528
2162
  resources_fields['_docker_login_config'] = config.pop(
1529
2163
  '_docker_login_config', None)
1530
2164
  resources_fields['_docker_username_for_runpod'] = config.pop(
@@ -1543,7 +2177,11 @@ class Resources:
1543
2177
  resources_fields['accelerator_args'] = dict(
1544
2178
  resources_fields['accelerator_args'])
1545
2179
  if resources_fields['disk_size'] is not None:
1546
- resources_fields['disk_size'] = int(resources_fields['disk_size'])
2180
+ # although it will end up being an int, we don't know at this point
2181
+ # if it has units or not, so we store it as a string
2182
+ resources_fields['disk_size'] = str(resources_fields['disk_size'])
2183
+ resources_fields['_no_missing_accel_warnings'] = config.pop(
2184
+ '_no_missing_accel_warnings', None)
1547
2185
 
1548
2186
  assert not config, f'Invalid resource args: {config.keys()}'
1549
2187
  return Resources(**resources_fields)
@@ -1556,7 +2194,10 @@ class Resources:
1556
2194
  if value is not None and value != 'None':
1557
2195
  config[key] = value
1558
2196
 
1559
- add_if_not_none('cloud', str(self.cloud))
2197
+ # Construct infra field if cloud is set
2198
+ infra = self.infra.to_str()
2199
+ add_if_not_none('infra', infra)
2200
+
1560
2201
  add_if_not_none('instance_type', self.instance_type)
1561
2202
  add_if_not_none('cpus', self._cpus)
1562
2203
  add_if_not_none('memory', self.memory)
@@ -1567,13 +2208,34 @@ class Resources:
1567
2208
  add_if_not_none('use_spot', self.use_spot)
1568
2209
  add_if_not_none('job_recovery', self.job_recovery)
1569
2210
  add_if_not_none('disk_size', self.disk_size)
1570
- add_if_not_none('region', self.region)
1571
- add_if_not_none('zone', self.zone)
1572
2211
  add_if_not_none('image_id', self.image_id)
1573
2212
  if self.disk_tier is not None:
1574
2213
  config['disk_tier'] = self.disk_tier.value
2214
+ if self.network_tier is not None:
2215
+ config['network_tier'] = self.network_tier.value
1575
2216
  add_if_not_none('ports', self.ports)
1576
2217
  add_if_not_none('labels', self.labels)
2218
+ if self.volumes is not None:
2219
+ # Convert DiskTier/StorageType enum to string value for each volume
2220
+ volumes = []
2221
+ for volume in self.volumes:
2222
+ volume_copy = volume.copy()
2223
+ if 'disk_tier' in volume_copy:
2224
+ volume_copy['disk_tier'] = volume_copy['disk_tier'].value
2225
+ if 'storage_type' in volume_copy:
2226
+ volume_copy['storage_type'] = volume_copy[
2227
+ 'storage_type'].value
2228
+ if 'attach_mode' in volume_copy:
2229
+ volume_copy['attach_mode'] = volume_copy[
2230
+ 'attach_mode'].value
2231
+ volumes.append(volume_copy)
2232
+ config['volumes'] = volumes
2233
+ if self._autostop_config is not None:
2234
+ config['autostop'] = self._autostop_config.to_yaml_config()
2235
+
2236
+ add_if_not_none('_no_missing_accel_warnings',
2237
+ self._no_missing_accel_warnings)
2238
+ add_if_not_none('priority', self.priority)
1577
2239
  if self._docker_login_config is not None:
1578
2240
  config['_docker_login_config'] = dataclasses.asdict(
1579
2241
  self._docker_login_config)
@@ -1611,7 +2273,7 @@ class Resources:
1611
2273
  accelerator_args = state.pop('accelerator_args', None)
1612
2274
  state['_accelerator_args'] = accelerator_args
1613
2275
 
1614
- disk_size = state.pop('disk_size', _DEFAULT_DISK_SIZE_GB)
2276
+ disk_size = state.pop('disk_size', DEFAULT_DISK_SIZE_GB)
1615
2277
  state['_disk_size'] = disk_size
1616
2278
 
1617
2279
  if version < 2:
@@ -1729,4 +2391,68 @@ class Resources:
1729
2391
  self._docker_username_for_runpod = state.pop(
1730
2392
  '_docker_username_for_runpod', None)
1731
2393
 
2394
+ if version < 23:
2395
+ self._autostop_config = None
2396
+
2397
+ if version < 24:
2398
+ self._volumes = None
2399
+
2400
+ if version < 25:
2401
+ if isinstance(state.get('_cloud', None), clouds.Kubernetes):
2402
+ _maybe_add_docker_prefix_to_image_id(state['_image_id'])
2403
+
2404
+ if version < 26:
2405
+ self._network_tier = state.get('_network_tier', None)
2406
+
2407
+ if version < 27:
2408
+ self._priority = None
2409
+
2410
+ if version < 28:
2411
+ self._no_missing_accel_warnings = state.get(
2412
+ '_no_missing_accel_warnings', None)
2413
+
1732
2414
  self.__dict__.update(state)
2415
+
2416
+
2417
class LaunchableResources(Resources):
    """A class representing resources that can be launched on a cloud provider.

    This class is primarily a type hint for MyPy to indicate that an instance
    of `Resources` is launchable (i.e., `cloud` and `instance_type` are not
    None). It should not be instantiated directly.
    """

    def __init__(self, *args, **kwargs) -> None:  # pylint: disable=super-init-not-called,unused-argument
        # Raise explicitly rather than `assert False`: assert statements are
        # removed under `python -O`, which would silently allow constructing
        # an instance without running Resources.__init__.
        raise AssertionError(
            'LaunchableResources should not be instantiated directly. '
            'It is only used for type checking by MyPy.')

    @property
    def cloud(self) -> clouds.Cloud:
        """The cloud of this resource; asserts that it is set."""
        assert self._cloud is not None, 'Cloud must be specified'
        return self._cloud

    @property
    def instance_type(self) -> str:
        """The instance type of this resource; asserts that it is set."""
        assert self._instance_type is not None, (
            'Instance type must be specified')
        return self._instance_type

    def copy(self, **override) -> 'LaunchableResources':
        """Ensure MyPy understands the return type is LaunchableResources.

        This method is not expected to be called at runtime, as
        LaunchableResources should not be directly instantiated. It primarily
        serves as a type hint for static analysis.
        """
        self.assert_launchable()
        # typing.cast has no runtime effect; the object returned is whatever
        # the parent copy() produced.
        return typing.cast(LaunchableResources, super().copy(**override))
2450
+
2451
+
2452
def _maybe_add_docker_prefix_to_image_id(
        image_id_dict: Optional[Dict[Optional[str], str]]) -> None:
    """Add a ``docker:`` prefix, in place, to every value that lacks one.

    Args:
        image_id_dict: Mapping whose string values are updated in place.
            Nothing happens when it is None.
    """
    if image_id_dict is None:
        return
    # Collect the keys first, then rewrite their values.
    keys_missing_prefix = [
        key for key, image in image_id_dict.items()
        if not image.startswith('docker:')
    ]
    for key in keys_missing_prefix:
        image_id_dict[key] = f'docker:{image_id_dict[key]}'