skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/cloud.py CHANGED
@@ -11,13 +11,14 @@ import collections
11
11
  import enum
12
12
  import math
13
13
  import typing
14
- from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
14
+ from typing import (Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple,
15
+ Union)
15
16
 
16
17
  from typing_extensions import assert_never
17
18
 
19
+ from sky import catalog
18
20
  from sky import exceptions
19
21
  from sky import skypilot_config
20
- from sky.clouds import service_catalog
21
22
  from sky.utils import log_utils
22
23
  from sky.utils import resources_utils
23
24
  from sky.utils import timeline
@@ -26,6 +27,7 @@ from sky.utils import ux_utils
26
27
  if typing.TYPE_CHECKING:
27
28
  from sky import resources as resources_lib
28
29
  from sky.utils import status_lib
30
+ from sky.utils import volume as volume_lib
29
31
 
30
32
 
31
33
  class CloudImplementationFeatures(enum.Enum):
@@ -44,6 +46,7 @@ class CloudImplementationFeatures(enum.Enum):
44
46
  DOCKER_IMAGE = 'docker_image'
45
47
  SPOT_INSTANCE = 'spot_instance'
46
48
  CUSTOM_DISK_TIER = 'custom_disk_tier'
49
+ CUSTOM_NETWORK_TIER = 'custom_network_tier'
47
50
  OPEN_PORTS = 'open_ports'
48
51
  STORAGE_MOUNTING = 'storage_mounting'
49
52
  HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
@@ -52,6 +55,9 @@ class CloudImplementationFeatures(enum.Enum):
52
55
  AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
53
56
  AUTOSTOP = 'autostop' # Pod/VM can stop itself
54
57
  AUTODOWN = 'autodown' # Pod/VM can down itself
58
+ # Pod/VM can have customized multiple network interfaces
59
+ # e.g. GCP GPUDirect TCPX
60
+ CUSTOM_MULTI_NETWORK = 'custom_multi_network'
55
61
 
56
62
 
57
63
  # Use str, enum.Enum to allow CloudCapability to be used as a string.
@@ -138,6 +144,9 @@ class Cloud:
138
144
  _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
139
145
  _BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
140
146
  _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
147
+ _SUPPORTED_NETWORK_TIERS = {
148
+ resources_utils.NetworkTier.STANDARD, resources_utils.NetworkTier.BEST
149
+ }
141
150
  _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False
142
151
 
143
152
  # The version of provisioner and status query. This is used to determine
@@ -176,14 +185,19 @@ class Cloud:
176
185
  #### Regions/Zones ####
177
186
 
178
187
  @classmethod
179
- def regions_with_offering(cls, instance_type: str,
180
- accelerators: Optional[Dict[str, int]],
181
- use_spot: bool, region: Optional[str],
182
- zone: Optional[str]) -> List[Region]:
188
+ def regions_with_offering(
189
+ cls,
190
+ instance_type: str,
191
+ accelerators: Optional[Dict[str, int]],
192
+ use_spot: bool,
193
+ region: Optional[str],
194
+ zone: Optional[str],
195
+ resources: Optional['resources_lib.Resources'] = None,
196
+ ) -> List[Region]:
183
197
  """Returns the regions that offer the specified resources.
184
198
 
185
199
  The order of the regions follow the order of the regions returned by
186
- service_catalog/common.py#get_region_zones().
200
+ sky/catalog/common.py#get_region_zones().
187
201
  When region or zone is not None, the returned value will be limited to
188
202
  the specified region/zone.
189
203
 
@@ -302,7 +316,8 @@ class Cloud:
302
316
  zones: Optional[List['Zone']],
303
317
  num_nodes: int,
304
318
  dryrun: bool = False,
305
- ) -> Dict[str, Optional[str]]:
319
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
320
+ ) -> Dict[str, Any]:
306
321
  """Converts planned sky.Resources to cloud-specific resource variables.
307
322
 
308
323
  These variables are used to fill the node type section (instance type,
@@ -331,14 +346,23 @@ class Cloud:
331
346
  raise NotImplementedError
332
347
 
333
348
  @classmethod
334
- def get_default_instance_type(
335
- cls,
336
- cpus: Optional[str] = None,
337
- memory: Optional[str] = None,
338
- disk_tier: Optional[resources_utils.DiskTier] = None
349
+ def get_arch_from_instance_type(
350
+ cls,
351
+ instance_type: str,
339
352
  ) -> Optional[str]:
340
- """Returns the default instance type with the given #vCPUs, memory and
341
- disk tier.
353
+ """Returns the arch of the instance type, if any."""
354
+ raise NotImplementedError
355
+
356
+ @classmethod
357
+ def get_default_instance_type(cls,
358
+ cpus: Optional[str] = None,
359
+ memory: Optional[str] = None,
360
+ disk_tier: Optional[
361
+ resources_utils.DiskTier] = None,
362
+ region: Optional[str] = None,
363
+ zone: Optional[str] = None) -> Optional[str]:
364
+ """Returns the default instance type with the given #vCPUs, memory,
365
+ disk tier, region, and zone.
342
366
 
343
367
  For example, if cpus='4', this method returns the default instance type
344
368
  with 4 vCPUs. If cpus='4+', this method returns the default instance
@@ -362,9 +386,9 @@ class Cloud:
362
386
  @classmethod
363
387
  def is_image_tag_valid(cls, image_tag: str, region: Optional[str]) -> bool:
364
388
  """Validates that the image tag is valid for this cloud."""
365
- return service_catalog.is_image_tag_valid(image_tag,
366
- region,
367
- clouds=cls._REPR.lower())
389
+ return catalog.is_image_tag_valid(image_tag,
390
+ region,
391
+ clouds=cls._REPR.lower())
368
392
 
369
393
  @classmethod
370
394
  def is_label_valid(cls, label_key: str,
@@ -385,6 +409,21 @@ class Cloud:
385
409
  del label_key, label_value
386
410
  return True, None
387
411
 
412
+ @classmethod
413
+ def is_volume_name_valid(cls,
414
+ volume_name: str) -> Tuple[bool, Optional[str]]:
415
+ """Validates that the volume name is valid for this cloud.
416
+
417
+ Returns:
418
+ A tuple of a boolean indicating whether the volume name is valid
419
+ and an optional string describing the reason if the volume name
420
+ is invalid.
421
+ """
422
+ # If a cloud does not support volumes, the name is ignored. Only clouds
423
+ # that support volume implement this method.
424
+ del volume_name
425
+ return True, None
426
+
388
427
  @timeline.event
389
428
  def get_feasible_launchable_resources(
390
429
  self,
@@ -456,12 +495,14 @@ class Cloud:
456
495
 
457
496
  @classmethod
458
497
  def check_credentials(
459
- cls,
460
- cloud_capability: CloudCapability) -> Tuple[bool, Optional[str]]:
498
+ cls, cloud_capability: CloudCapability
499
+ ) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
461
500
  """Checks if the user has access credentials to this cloud.
462
501
 
463
- Returns a boolean of whether the user can access this cloud, and a
464
- string describing the reason if the user cannot access.
502
+ Returns a boolean of whether the user can access this cloud, and:
503
+ - For SSH and Kubernetes, a dictionary that maps context names to
504
+ the status of the context.
505
+ - For others, a string describing the reason if cannot access.
465
506
 
466
507
  Raises NotSupportedError if the capability is
467
508
  not supported by this cloud.
@@ -473,19 +514,30 @@ class Cloud:
473
514
  assert_never(cloud_capability)
474
515
 
475
516
  @classmethod
476
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
517
+ def _check_compute_credentials(
518
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
477
519
  """Checks if the user has access credentials to
478
520
  this cloud's compute service."""
479
521
  raise exceptions.NotSupportedError(
480
522
  f'{cls._REPR} does not support {CloudCapability.COMPUTE.value}.')
481
523
 
482
524
  @classmethod
483
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
525
+ def _check_storage_credentials(
526
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
484
527
  """Checks if the user has access credentials to
485
528
  this cloud's storage service."""
486
529
  raise exceptions.NotSupportedError(
487
530
  f'{cls._REPR} does not support {CloudCapability.STORAGE.value}.')
488
531
 
532
+ @classmethod
533
+ def expand_infras(cls) -> List[str]:
534
+ """Returns a list of enabled infrastructures for this cloud.
535
+
536
+ For Kubernetes and SSH, return a list of resource pools.
537
+ For all other clouds, return a list containing only the cloud's canonical name.
538
+ """
539
+ return [cls.canonical_name()]
540
+
489
541
  # TODO(zhwu): Make the return type immutable.
490
542
  @classmethod
491
543
  def get_user_identities(cls) -> Optional[List[List[str]]]:
@@ -607,13 +659,13 @@ class Cloud:
607
659
  Raises:
608
660
  ValueError: If region or zone is invalid or not supported.
609
661
  """
610
- return service_catalog.validate_region_zone(region,
611
- zone,
612
- clouds=self._REPR.lower())
662
+ return catalog.validate_region_zone(region,
663
+ zone,
664
+ clouds=self._REPR.lower())
613
665
 
614
666
  def need_cleanup_after_preemption_or_failure(
615
667
  self, resources: 'resources_lib.Resources') -> bool:
616
- """Whether a resource needs cleanup after preeemption or failure.
668
+ """Whether a resource needs cleanup after preemption or failure.
617
669
 
618
670
  In most cases, spot resources do not need cleanup after preemption,
619
671
  as long as the cluster can be relaunched with the same name and tag,
@@ -627,8 +679,11 @@ class Cloud:
627
679
 
628
680
  @classmethod
629
681
  def check_features_are_supported(
630
- cls, resources: 'resources_lib.Resources',
631
- requested_features: Set[CloudImplementationFeatures]) -> None:
682
+ cls,
683
+ resources: 'resources_lib.Resources',
684
+ requested_features: Set[CloudImplementationFeatures],
685
+ region: Optional[str] = None,
686
+ ) -> None:
632
687
  """Errors out if the cloud does not support all requested features.
633
688
 
634
689
  For instance, Lambda Cloud does not support stop, so
@@ -646,11 +701,14 @@ class Cloud:
646
701
  requested features.
647
702
  """
648
703
  unsupported_features2reason = cls._unsupported_features_for_resources(
649
- resources)
704
+ resources, region)
650
705
 
651
706
  # Docker image is not compatible with ssh proxy command.
652
- if skypilot_config.get_nested(
653
- (str(cls._REPR).lower(), 'ssh_proxy_command'), None) is not None:
707
+ if skypilot_config.get_effective_region_config(
708
+ cloud=str(cls).lower(),
709
+ region=None,
710
+ keys=('ssh_proxy_command',),
711
+ default_value=None) is not None:
654
712
  unsupported_features2reason.update({
655
713
  CloudImplementationFeatures.DOCKER_IMAGE: (
656
714
  f'Docker image is currently not supported on {cls._REPR} '
@@ -673,7 +731,9 @@ class Cloud:
673
731
 
674
732
  @classmethod
675
733
  def _unsupported_features_for_resources(
676
- cls, resources: 'resources_lib.Resources'
734
+ cls,
735
+ resources: 'resources_lib.Resources',
736
+ region: Optional[str] = None,
677
737
  ) -> Dict[CloudImplementationFeatures, str]:
678
738
  """The features not supported based on the resources provided.
679
739
 
@@ -684,7 +744,7 @@ class Cloud:
684
744
  A dict of {feature: reason} for the features not supported by the
685
745
  cloud implementation.
686
746
  """
687
- del resources
747
+ del resources, region
688
748
  raise NotImplementedError
689
749
 
690
750
  @classmethod
@@ -701,6 +761,26 @@ class Cloud:
701
761
  raise exceptions.NotSupportedError(
702
762
  f'{disk_tier} is not supported by {cls._REPR}.')
703
763
 
764
+ @classmethod
765
+ def check_network_tier_enabled(
766
+ cls, instance_type: Optional[str],
767
+ network_tier: resources_utils.NetworkTier) -> None:
768
+ """Errors out if the network tier is not supported by the
769
+ cloud provider.
770
+
771
+ For BEST tier: always succeeds, will use best available tier.
772
+
773
+ Raises:
774
+ exceptions.NotSupportedError: If the network tier is not supported.
775
+ """
776
+ del instance_type # unused
777
+
778
+ # For other tiers, check if supported
779
+ if network_tier not in cls._SUPPORTED_NETWORK_TIERS:
780
+ with ux_utils.print_exception_no_traceback():
781
+ raise exceptions.NotSupportedError(
782
+ f'{network_tier} is not supported by {cls._REPR}.')
783
+
704
784
  @classmethod
705
785
  def _translate_disk_tier(
706
786
  cls, disk_tier: Optional[resources_utils.DiskTier]
@@ -721,7 +801,7 @@ class Cloud:
721
801
  Raises:
722
802
  ResourcesMismatchError: If the accelerator is not supported.
723
803
  """
724
- assert resources.is_launchable(), resources
804
+ resources = resources.assert_launchable()
725
805
 
726
806
  def _equal_accelerators(
727
807
  acc_requested: Optional[Dict[str, Union[int, float]]],
@@ -738,12 +818,21 @@ class Cloud:
738
818
  if acc_from_instance_type is None:
739
819
  return False
740
820
 
741
- for acc in acc_requested:
742
- if acc not in acc_from_instance_type:
821
+ for requested_acc in acc_requested:
822
+ for instance_acc in acc_from_instance_type:
823
+ # The requested accelerator can be canonicalized based on
824
+ # the accelerator registry, which may not have the same case
825
+ # as the cloud's catalog, e.g., 'RTXPro6000' in Shadeform
826
+ # catalog, and 'RTXPRO6000' in RunPod catalog.
827
+ if requested_acc.lower() == instance_acc.lower():
828
+ # Found the requested accelerator in the instance type.
829
+ break
830
+ else:
831
+ # Requested accelerator not found in instance type.
743
832
  return False
744
833
  # Avoid float point precision issue.
745
- if not math.isclose(acc_requested[acc],
746
- acc_from_instance_type[acc]):
834
+ if not math.isclose(acc_requested[requested_acc],
835
+ acc_from_instance_type[instance_acc]):
747
836
  return False
748
837
  return True
749
838
 
@@ -877,6 +966,11 @@ class Cloud:
877
966
  def canonical_name(cls) -> str:
878
967
  return cls.__name__.lower()
879
968
 
969
+ @classmethod
970
+ def display_name(cls) -> str:
971
+ """Name of the cloud used in messages displayed to the user."""
972
+ return cls.canonical_name()
973
+
880
974
  def __repr__(self):
881
975
  return self._REPR
882
976
 
@@ -887,6 +981,12 @@ class Cloud:
887
981
  return state
888
982
 
889
983
 
984
+ class DummyCloud(Cloud):
985
+ """A dummy Cloud that has zero egress cost from/to for optimization
986
+ purpose."""
987
+ pass
988
+
989
+
890
990
  # === Helper functions ===
891
991
  def cloud_in_iterable(cloud: Cloud, cloud_list: Iterable[Cloud]) -> bool:
892
992
  """Returns whether the cloud is in the given cloud list."""
sky/clouds/cudo.py CHANGED
@@ -3,8 +3,9 @@ import subprocess
3
3
  import typing
4
4
  from typing import Dict, Iterator, List, Optional, Tuple, Union
5
5
 
6
+ from sky import catalog
6
7
  from sky import clouds
7
- from sky.clouds import service_catalog
8
+ from sky.adaptors import common
8
9
  from sky.utils import common_utils
9
10
  from sky.utils import registry
10
11
  from sky.utils import resources_utils
@@ -12,6 +13,7 @@ from sky.utils import resources_utils
12
13
  if typing.TYPE_CHECKING:
13
14
  # Renaming to avoid shadowing variables.
14
15
  from sky import resources as resources_lib
16
+ from sky.utils import volume as volume_lib
15
17
 
16
18
  _CREDENTIAL_FILES = [
17
19
  # credential files for Cudo,
@@ -59,6 +61,8 @@ class Cudo(clouds.Cloud):
59
61
  ('Spot is not supported, as Cudo API does not implement spot.'),
60
62
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
61
63
  ('Custom disk tier is currently not supported on Cudo Compute'),
64
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
65
+ ('Custom network tier is currently not supported on Cudo Compute'),
62
66
  clouds.CloudImplementationFeatures.IMAGE_ID:
63
67
  ('Image ID is currently not supported on Cudo. '),
64
68
  clouds.CloudImplementationFeatures.DOCKER_IMAGE:
@@ -70,6 +74,9 @@ class Cudo(clouds.Cloud):
70
74
  ),
71
75
  clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
72
76
  ('High availability controllers are not supported on Cudo.'),
77
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
78
+ ('Customized multiple network interfaces are not supported on Cudo.'
79
+ ),
73
80
  }
74
81
  _MAX_CLUSTER_NAME_LEN_LIMIT = 60
75
82
 
@@ -80,7 +87,9 @@ class Cudo(clouds.Cloud):
80
87
 
81
88
  @classmethod
82
89
  def _unsupported_features_for_resources(
83
- cls, resources: 'resources_lib.Resources'
90
+ cls,
91
+ resources: 'resources_lib.Resources',
92
+ region: Optional[str] = None,
84
93
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
85
94
  """The features not supported based on the resources provided.
86
95
 
@@ -99,16 +108,21 @@ class Cudo(clouds.Cloud):
99
108
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
100
109
 
101
110
  @classmethod
102
- def regions_with_offering(cls, instance_type,
103
- accelerators: Optional[Dict[str, int]],
104
- use_spot: bool, region: Optional[str],
105
- zone: Optional[str]) -> List[clouds.Region]:
111
+ def regions_with_offering(
112
+ cls,
113
+ instance_type,
114
+ accelerators: Optional[Dict[str, int]],
115
+ use_spot: bool,
116
+ region: Optional[str],
117
+ zone: Optional[str],
118
+ resources: Optional['resources_lib.Resources'] = None,
119
+ ) -> List[clouds.Region]:
106
120
  assert zone is None, 'Cudo does not support zones.'
107
121
  del accelerators, zone # unused
108
122
  if use_spot:
109
123
  return []
110
124
 
111
- regions = service_catalog.get_region_zones_for_instance_type(
125
+ regions = catalog.get_region_zones_for_instance_type(
112
126
  instance_type, use_spot, 'cudo')
113
127
 
114
128
  if region is not None:
@@ -121,8 +135,8 @@ class Cudo(clouds.Cloud):
121
135
  instance_type: str,
122
136
  ) -> Tuple[Optional[float], Optional[float]]:
123
137
 
124
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
125
- clouds='cudo')
138
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
139
+ clouds='cudo')
126
140
 
127
141
  @classmethod
128
142
  def zones_provision_loop(
@@ -149,11 +163,11 @@ class Cudo(clouds.Cloud):
149
163
  use_spot: bool,
150
164
  region: Optional[str] = None,
151
165
  zone: Optional[str] = None) -> float:
152
- return service_catalog.get_hourly_cost(instance_type,
153
- use_spot=use_spot,
154
- region=region,
155
- zone=zone,
156
- clouds='cudo')
166
+ return catalog.get_hourly_cost(instance_type,
167
+ use_spot=use_spot,
168
+ region=region,
169
+ zone=zone,
170
+ clouds='cudo')
157
171
 
158
172
  def accelerators_to_hourly_cost(self,
159
173
  accelerators: Dict[str, int],
@@ -169,23 +183,27 @@ class Cudo(clouds.Cloud):
169
183
  return 0.0
170
184
 
171
185
  @classmethod
172
- def get_default_instance_type(
173
- cls,
174
- cpus: Optional[str] = None,
175
- memory: Optional[str] = None,
176
- disk_tier: Optional[resources_utils.DiskTier] = None
177
- ) -> Optional[str]:
178
- return service_catalog.get_default_instance_type(cpus=cpus,
179
- memory=memory,
180
- clouds='cudo')
186
+ def get_default_instance_type(cls,
187
+ cpus: Optional[str] = None,
188
+ memory: Optional[str] = None,
189
+ disk_tier: Optional[
190
+ resources_utils.DiskTier] = None,
191
+ region: Optional[str] = None,
192
+ zone: Optional[str] = None) -> Optional[str]:
193
+ return catalog.get_default_instance_type(cpus=cpus,
194
+ memory=memory,
195
+ disk_tier=disk_tier,
196
+ region=region,
197
+ zone=zone,
198
+ clouds='cudo')
181
199
 
182
200
  @classmethod
183
201
  def get_accelerators_from_instance_type(
184
202
  cls,
185
203
  instance_type: str,
186
204
  ) -> Optional[Dict[str, Union[int, float]]]:
187
- return service_catalog.get_accelerators_from_instance_type(
188
- instance_type, clouds='cudo')
205
+ return catalog.get_accelerators_from_instance_type(instance_type,
206
+ clouds='cudo')
189
207
 
190
208
  @classmethod
191
209
  def get_zone_shell_cmd(cls) -> Optional[str]:
@@ -199,10 +217,12 @@ class Cudo(clouds.Cloud):
199
217
  zones: Optional[List['clouds.Zone']],
200
218
  num_nodes: int,
201
219
  dryrun: bool = False,
220
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
202
221
  ) -> Dict[str, Optional[str]]:
203
222
  del zones, cluster_name # unused
204
- r = resources
205
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
223
+ resources = resources.assert_launchable()
224
+ acc_dict = self.get_accelerators_from_instance_type(
225
+ resources.instance_type)
206
226
  custom_resources = resources_utils.make_ray_custom_resources_str(
207
227
  acc_dict)
208
228
 
@@ -243,7 +263,9 @@ class Cudo(clouds.Cloud):
243
263
  default_instance_type = Cudo.get_default_instance_type(
244
264
  cpus=resources.cpus,
245
265
  memory=resources.memory,
246
- disk_tier=resources.disk_tier)
266
+ disk_tier=resources.disk_tier,
267
+ region=resources.region,
268
+ zone=resources.zone)
247
269
  if default_instance_type is None:
248
270
  return resources_utils.FeasibleResources([], [], None)
249
271
  else:
@@ -252,16 +274,16 @@ class Cudo(clouds.Cloud):
252
274
 
253
275
  assert len(accelerators) == 1, resources
254
276
  acc, acc_count = list(accelerators.items())[0]
255
- (instance_list, fuzzy_candidate_list
256
- ) = service_catalog.get_instance_type_for_accelerator(
257
- acc,
258
- acc_count,
259
- use_spot=resources.use_spot,
260
- cpus=resources.cpus,
261
- memory=resources.memory,
262
- region=resources.region,
263
- zone=resources.zone,
264
- clouds='cudo')
277
+ (instance_list,
278
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
279
+ acc,
280
+ acc_count,
281
+ use_spot=resources.use_spot,
282
+ cpus=resources.cpus,
283
+ memory=resources.memory,
284
+ region=resources.region,
285
+ zone=resources.zone,
286
+ clouds='cudo')
265
287
  if instance_list is None:
266
288
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
267
289
  None)
@@ -269,17 +291,13 @@ class Cudo(clouds.Cloud):
269
291
  fuzzy_candidate_list, None)
270
292
 
271
293
  @classmethod
272
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
294
+ def _check_compute_credentials(
295
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
273
296
  """Checks if the user has access credentials to
274
297
  Cudo's compute service."""
275
- try:
276
- # pylint: disable=import-outside-toplevel,unused-import
277
- from cudo_compute import cudo_api
278
- except (ImportError, subprocess.CalledProcessError) as e:
279
- return False, (
280
- f'{cls._DEPENDENCY_HINT}\n'
281
- f'{cls._INDENT_PREFIX}'
282
- f'{common_utils.format_exception(e, use_bracket=True)}')
298
+ if not common.can_import_modules(['cudo_compute']):
299
+ return False, (f'{cls._DEPENDENCY_HINT}\n'
300
+ f'{cls._INDENT_PREFIX}')
283
301
 
284
302
  try:
285
303
  _run_output('cudoctl --version')
@@ -292,7 +310,7 @@ class Cudo(clouds.Cloud):
292
310
  from cudo_compute import cudo_api
293
311
  from cudo_compute.rest import ApiException
294
312
  try:
295
- _, error = cudo_api.client()
313
+ _, error = cudo_api.make_client()
296
314
  except FileNotFoundError as e:
297
315
  return False, (
298
316
  'Cudo credentials are not set. '
@@ -334,7 +352,7 @@ class Cudo(clouds.Cloud):
334
352
  return None
335
353
 
336
354
  def instance_type_exists(self, instance_type: str) -> bool:
337
- return service_catalog.instance_type_exists(instance_type, 'cudo')
355
+ return catalog.instance_type_exists(instance_type, 'cudo')
338
356
 
339
357
  def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
340
- return service_catalog.validate_region_zone(region, zone, clouds='cudo')
358
+ return catalog.validate_region_zone(region, zone, clouds='cudo')