skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/clouds/cloud.py CHANGED
@@ -11,13 +11,14 @@ import collections
11
11
  import enum
12
12
  import math
13
13
  import typing
14
- from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
14
+ from typing import (Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple,
15
+ Union)
15
16
 
16
17
  from typing_extensions import assert_never
17
18
 
19
+ from sky import catalog
18
20
  from sky import exceptions
19
21
  from sky import skypilot_config
20
- from sky.clouds import service_catalog
21
22
  from sky.utils import log_utils
22
23
  from sky.utils import resources_utils
23
24
  from sky.utils import timeline
@@ -26,6 +27,7 @@ from sky.utils import ux_utils
26
27
  if typing.TYPE_CHECKING:
27
28
  from sky import resources as resources_lib
28
29
  from sky.utils import status_lib
30
+ from sky.utils import volume as volume_lib
29
31
 
30
32
 
31
33
  class CloudImplementationFeatures(enum.Enum):
@@ -44,6 +46,7 @@ class CloudImplementationFeatures(enum.Enum):
44
46
  DOCKER_IMAGE = 'docker_image'
45
47
  SPOT_INSTANCE = 'spot_instance'
46
48
  CUSTOM_DISK_TIER = 'custom_disk_tier'
49
+ CUSTOM_NETWORK_TIER = 'custom_network_tier'
47
50
  OPEN_PORTS = 'open_ports'
48
51
  STORAGE_MOUNTING = 'storage_mounting'
49
52
  HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
@@ -52,6 +55,9 @@ class CloudImplementationFeatures(enum.Enum):
52
55
  AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
53
56
  AUTOSTOP = 'autostop' # Pod/VM can stop itself
54
57
  AUTODOWN = 'autodown' # Pod/VM can down itself
58
+ # Pod/VM can have customized multiple network interfaces
59
+ # e.g. GCP GPUDirect TCPX
60
+ CUSTOM_MULTI_NETWORK = 'custom_multi_network'
55
61
 
56
62
 
57
63
  # Use str, enum.Enum to allow CloudCapability to be used as a string.
@@ -138,6 +144,9 @@ class Cloud:
138
144
  _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
139
145
  _BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
140
146
  _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
147
+ _SUPPORTED_NETWORK_TIERS = {
148
+ resources_utils.NetworkTier.STANDARD, resources_utils.NetworkTier.BEST
149
+ }
141
150
  _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False
142
151
 
143
152
  # The version of provisioner and status query. This is used to determine
@@ -176,14 +185,19 @@ class Cloud:
176
185
  #### Regions/Zones ####
177
186
 
178
187
  @classmethod
179
- def regions_with_offering(cls, instance_type: str,
180
- accelerators: Optional[Dict[str, int]],
181
- use_spot: bool, region: Optional[str],
182
- zone: Optional[str]) -> List[Region]:
188
+ def regions_with_offering(
189
+ cls,
190
+ instance_type: str,
191
+ accelerators: Optional[Dict[str, int]],
192
+ use_spot: bool,
193
+ region: Optional[str],
194
+ zone: Optional[str],
195
+ resources: Optional['resources_lib.Resources'] = None,
196
+ ) -> List[Region]:
183
197
  """Returns the regions that offer the specified resources.
184
198
 
185
199
  The order of the regions follow the order of the regions returned by
186
- service_catalog/common.py#get_region_zones().
200
+ sky/catalog/common.py#get_region_zones().
187
201
  When region or zone is not None, the returned value will be limited to
188
202
  the specified region/zone.
189
203
 
@@ -302,7 +316,8 @@ class Cloud:
302
316
  zones: Optional[List['Zone']],
303
317
  num_nodes: int,
304
318
  dryrun: bool = False,
305
- ) -> Dict[str, Optional[str]]:
319
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
320
+ ) -> Dict[str, Any]:
306
321
  """Converts planned sky.Resources to cloud-specific resource variables.
307
322
 
308
323
  These variables are used to fill the node type section (instance type,
@@ -331,14 +346,23 @@ class Cloud:
331
346
  raise NotImplementedError
332
347
 
333
348
  @classmethod
334
- def get_default_instance_type(
335
- cls,
336
- cpus: Optional[str] = None,
337
- memory: Optional[str] = None,
338
- disk_tier: Optional[resources_utils.DiskTier] = None
349
+ def get_arch_from_instance_type(
350
+ cls,
351
+ instance_type: str,
339
352
  ) -> Optional[str]:
340
- """Returns the default instance type with the given #vCPUs, memory and
341
- disk tier.
353
+ """Returns the arch of the instance type, if any."""
354
+ raise NotImplementedError
355
+
356
+ @classmethod
357
+ def get_default_instance_type(cls,
358
+ cpus: Optional[str] = None,
359
+ memory: Optional[str] = None,
360
+ disk_tier: Optional[
361
+ resources_utils.DiskTier] = None,
362
+ region: Optional[str] = None,
363
+ zone: Optional[str] = None) -> Optional[str]:
364
+ """Returns the default instance type with the given #vCPUs, memory,
365
+ disk tier, region, and zone.
342
366
 
343
367
  For example, if cpus='4', this method returns the default instance type
344
368
  with 4 vCPUs. If cpus='4+', this method returns the default instance
@@ -362,9 +386,9 @@ class Cloud:
362
386
  @classmethod
363
387
  def is_image_tag_valid(cls, image_tag: str, region: Optional[str]) -> bool:
364
388
  """Validates that the image tag is valid for this cloud."""
365
- return service_catalog.is_image_tag_valid(image_tag,
366
- region,
367
- clouds=cls._REPR.lower())
389
+ return catalog.is_image_tag_valid(image_tag,
390
+ region,
391
+ clouds=cls._REPR.lower())
368
392
 
369
393
  @classmethod
370
394
  def is_label_valid(cls, label_key: str,
@@ -385,6 +409,21 @@ class Cloud:
385
409
  del label_key, label_value
386
410
  return True, None
387
411
 
412
+ @classmethod
413
+ def is_volume_name_valid(cls,
414
+ volume_name: str) -> Tuple[bool, Optional[str]]:
415
+ """Validates that the volume name is valid for this cloud.
416
+
417
+ Returns:
418
+ A tuple of a boolean indicating whether the volume name is valid
419
+ and an optional string describing the reason if the volume name
420
+ is invalid.
421
+ """
422
+ # If a cloud does not support volume, they are ignored. Only clouds
423
+ # that support volume implement this method.
424
+ del volume_name
425
+ return True, None
426
+
388
427
  @timeline.event
389
428
  def get_feasible_launchable_resources(
390
429
  self,
@@ -456,12 +495,14 @@ class Cloud:
456
495
 
457
496
  @classmethod
458
497
  def check_credentials(
459
- cls,
460
- cloud_capability: CloudCapability) -> Tuple[bool, Optional[str]]:
498
+ cls, cloud_capability: CloudCapability
499
+ ) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
461
500
  """Checks if the user has access credentials to this cloud.
462
501
 
463
- Returns a boolean of whether the user can access this cloud, and a
464
- string describing the reason if the user cannot access.
502
+ Returns a boolean of whether the user can access this cloud, and:
503
+ - For SSH and Kubernetes, a dictionary that maps context names to
504
+ the status of the context.
505
+ - For others, a string describing the reason if cannot access.
465
506
 
466
507
  Raises NotSupportedError if the capability is
467
508
  not supported by this cloud.
@@ -473,19 +514,30 @@ class Cloud:
473
514
  assert_never(cloud_capability)
474
515
 
475
516
  @classmethod
476
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
517
+ def _check_compute_credentials(
518
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
477
519
  """Checks if the user has access credentials to
478
520
  this cloud's compute service."""
479
521
  raise exceptions.NotSupportedError(
480
522
  f'{cls._REPR} does not support {CloudCapability.COMPUTE.value}.')
481
523
 
482
524
  @classmethod
483
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
525
+ def _check_storage_credentials(
526
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
484
527
  """Checks if the user has access credentials to
485
528
  this cloud's storage service."""
486
529
  raise exceptions.NotSupportedError(
487
530
  f'{cls._REPR} does not support {CloudCapability.STORAGE.value}.')
488
531
 
532
+ @classmethod
533
+ def expand_infras(cls) -> List[str]:
534
+ """Returns a list of enabled infrastructures for this cloud.
535
+
536
+ For Kubernetes and SSH, return a list of resource pools.
537
+ For all other clouds, return self.
538
+ """
539
+ return [cls.canonical_name()]
540
+
489
541
  # TODO(zhwu): Make the return type immutable.
490
542
  @classmethod
491
543
  def get_user_identities(cls) -> Optional[List[List[str]]]:
@@ -607,13 +659,13 @@ class Cloud:
607
659
  Raises:
608
660
  ValueError: If region or zone is invalid or not supported.
609
661
  """
610
- return service_catalog.validate_region_zone(region,
611
- zone,
612
- clouds=self._REPR.lower())
662
+ return catalog.validate_region_zone(region,
663
+ zone,
664
+ clouds=self._REPR.lower())
613
665
 
614
666
  def need_cleanup_after_preemption_or_failure(
615
667
  self, resources: 'resources_lib.Resources') -> bool:
616
- """Whether a resource needs cleanup after preeemption or failure.
668
+ """Whether a resource needs cleanup after preemption or failure.
617
669
 
618
670
  In most cases, spot resources do not need cleanup after preemption,
619
671
  as long as the cluster can be relaunched with the same name and tag,
@@ -627,8 +679,11 @@ class Cloud:
627
679
 
628
680
  @classmethod
629
681
  def check_features_are_supported(
630
- cls, resources: 'resources_lib.Resources',
631
- requested_features: Set[CloudImplementationFeatures]) -> None:
682
+ cls,
683
+ resources: 'resources_lib.Resources',
684
+ requested_features: Set[CloudImplementationFeatures],
685
+ region: Optional[str] = None,
686
+ ) -> None:
632
687
  """Errors out if the cloud does not support all requested features.
633
688
 
634
689
  For instance, Lambda Cloud does not support stop, so
@@ -646,11 +701,14 @@ class Cloud:
646
701
  requested features.
647
702
  """
648
703
  unsupported_features2reason = cls._unsupported_features_for_resources(
649
- resources)
704
+ resources, region)
650
705
 
651
706
  # Docker image is not compatible with ssh proxy command.
652
- if skypilot_config.get_nested(
653
- (str(cls._REPR).lower(), 'ssh_proxy_command'), None) is not None:
707
+ if skypilot_config.get_effective_region_config(
708
+ cloud=str(cls).lower(),
709
+ region=None,
710
+ keys=('ssh_proxy_command',),
711
+ default_value=None) is not None:
654
712
  unsupported_features2reason.update({
655
713
  CloudImplementationFeatures.DOCKER_IMAGE: (
656
714
  f'Docker image is currently not supported on {cls._REPR} '
@@ -673,7 +731,9 @@ class Cloud:
673
731
 
674
732
  @classmethod
675
733
  def _unsupported_features_for_resources(
676
- cls, resources: 'resources_lib.Resources'
734
+ cls,
735
+ resources: 'resources_lib.Resources',
736
+ region: Optional[str] = None,
677
737
  ) -> Dict[CloudImplementationFeatures, str]:
678
738
  """The features not supported based on the resources provided.
679
739
 
@@ -684,7 +744,7 @@ class Cloud:
684
744
  A dict of {feature: reason} for the features not supported by the
685
745
  cloud implementation.
686
746
  """
687
- del resources
747
+ del resources, region
688
748
  raise NotImplementedError
689
749
 
690
750
  @classmethod
@@ -701,6 +761,26 @@ class Cloud:
701
761
  raise exceptions.NotSupportedError(
702
762
  f'{disk_tier} is not supported by {cls._REPR}.')
703
763
 
764
+ @classmethod
765
+ def check_network_tier_enabled(
766
+ cls, instance_type: Optional[str],
767
+ network_tier: resources_utils.NetworkTier) -> None:
768
+ """Errors out if the network tier is not supported by the
769
+ cloud provider.
770
+
771
+ For BEST tier: always succeeds, will use best available tier.
772
+
773
+ Raises:
774
+ exceptions.NotSupportedError: If the network tier is not supported.
775
+ """
776
+ del instance_type # unused
777
+
778
+ # For other tiers, check if supported
779
+ if network_tier not in cls._SUPPORTED_NETWORK_TIERS:
780
+ with ux_utils.print_exception_no_traceback():
781
+ raise exceptions.NotSupportedError(
782
+ f'{network_tier} is not supported by {cls._REPR}.')
783
+
704
784
  @classmethod
705
785
  def _translate_disk_tier(
706
786
  cls, disk_tier: Optional[resources_utils.DiskTier]
@@ -721,7 +801,7 @@ class Cloud:
721
801
  Raises:
722
802
  ResourcesMismatchError: If the accelerator is not supported.
723
803
  """
724
- assert resources.is_launchable(), resources
804
+ resources = resources.assert_launchable()
725
805
 
726
806
  def _equal_accelerators(
727
807
  acc_requested: Optional[Dict[str, Union[int, float]]],
@@ -877,6 +957,11 @@ class Cloud:
877
957
  def canonical_name(cls) -> str:
878
958
  return cls.__name__.lower()
879
959
 
960
+ @classmethod
961
+ def display_name(cls) -> str:
962
+ """Name of the cloud used in messages displayed to the user."""
963
+ return cls.canonical_name()
964
+
880
965
  def __repr__(self):
881
966
  return self._REPR
882
967
 
@@ -887,6 +972,12 @@ class Cloud:
887
972
  return state
888
973
 
889
974
 
975
+ class DummyCloud(Cloud):
976
+ """A dummy Cloud that has zero egress cost from/to for optimization
977
+ purpose."""
978
+ pass
979
+
980
+
890
981
  # === Helper functions ===
891
982
  def cloud_in_iterable(cloud: Cloud, cloud_list: Iterable[Cloud]) -> bool:
892
983
  """Returns whether the cloud is in the given cloud list."""
sky/clouds/cudo.py CHANGED
@@ -3,8 +3,9 @@ import subprocess
3
3
  import typing
4
4
  from typing import Dict, Iterator, List, Optional, Tuple, Union
5
5
 
6
+ from sky import catalog
6
7
  from sky import clouds
7
- from sky.clouds import service_catalog
8
+ from sky.adaptors import common
8
9
  from sky.utils import common_utils
9
10
  from sky.utils import registry
10
11
  from sky.utils import resources_utils
@@ -12,6 +13,7 @@ from sky.utils import resources_utils
12
13
  if typing.TYPE_CHECKING:
13
14
  # Renaming to avoid shadowing variables.
14
15
  from sky import resources as resources_lib
16
+ from sky.utils import volume as volume_lib
15
17
 
16
18
  _CREDENTIAL_FILES = [
17
19
  # credential files for Cudo,
@@ -59,6 +61,8 @@ class Cudo(clouds.Cloud):
59
61
  ('Spot is not supported, as Cudo API does not implement spot.'),
60
62
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
61
63
  ('Custom disk tier is currently not supported on Cudo Compute'),
64
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
65
+ ('Custom network tier is currently not supported on Cudo Compute'),
62
66
  clouds.CloudImplementationFeatures.IMAGE_ID:
63
67
  ('Image ID is currently not supported on Cudo. '),
64
68
  clouds.CloudImplementationFeatures.DOCKER_IMAGE:
@@ -70,6 +74,9 @@ class Cudo(clouds.Cloud):
70
74
  ),
71
75
  clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
72
76
  ('High availability controllers are not supported on Cudo.'),
77
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
78
+ ('Customized multiple network interfaces are not supported on Cudo.'
79
+ ),
73
80
  }
74
81
  _MAX_CLUSTER_NAME_LEN_LIMIT = 60
75
82
 
@@ -80,7 +87,9 @@ class Cudo(clouds.Cloud):
80
87
 
81
88
  @classmethod
82
89
  def _unsupported_features_for_resources(
83
- cls, resources: 'resources_lib.Resources'
90
+ cls,
91
+ resources: 'resources_lib.Resources',
92
+ region: Optional[str] = None,
84
93
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
85
94
  """The features not supported based on the resources provided.
86
95
 
@@ -99,16 +108,21 @@ class Cudo(clouds.Cloud):
99
108
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
100
109
 
101
110
  @classmethod
102
- def regions_with_offering(cls, instance_type,
103
- accelerators: Optional[Dict[str, int]],
104
- use_spot: bool, region: Optional[str],
105
- zone: Optional[str]) -> List[clouds.Region]:
111
+ def regions_with_offering(
112
+ cls,
113
+ instance_type,
114
+ accelerators: Optional[Dict[str, int]],
115
+ use_spot: bool,
116
+ region: Optional[str],
117
+ zone: Optional[str],
118
+ resources: Optional['resources_lib.Resources'] = None,
119
+ ) -> List[clouds.Region]:
106
120
  assert zone is None, 'Cudo does not support zones.'
107
121
  del accelerators, zone # unused
108
122
  if use_spot:
109
123
  return []
110
124
 
111
- regions = service_catalog.get_region_zones_for_instance_type(
125
+ regions = catalog.get_region_zones_for_instance_type(
112
126
  instance_type, use_spot, 'cudo')
113
127
 
114
128
  if region is not None:
@@ -121,8 +135,8 @@ class Cudo(clouds.Cloud):
121
135
  instance_type: str,
122
136
  ) -> Tuple[Optional[float], Optional[float]]:
123
137
 
124
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
125
- clouds='cudo')
138
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
139
+ clouds='cudo')
126
140
 
127
141
  @classmethod
128
142
  def zones_provision_loop(
@@ -149,11 +163,11 @@ class Cudo(clouds.Cloud):
149
163
  use_spot: bool,
150
164
  region: Optional[str] = None,
151
165
  zone: Optional[str] = None) -> float:
152
- return service_catalog.get_hourly_cost(instance_type,
153
- use_spot=use_spot,
154
- region=region,
155
- zone=zone,
156
- clouds='cudo')
166
+ return catalog.get_hourly_cost(instance_type,
167
+ use_spot=use_spot,
168
+ region=region,
169
+ zone=zone,
170
+ clouds='cudo')
157
171
 
158
172
  def accelerators_to_hourly_cost(self,
159
173
  accelerators: Dict[str, int],
@@ -169,23 +183,27 @@ class Cudo(clouds.Cloud):
169
183
  return 0.0
170
184
 
171
185
  @classmethod
172
- def get_default_instance_type(
173
- cls,
174
- cpus: Optional[str] = None,
175
- memory: Optional[str] = None,
176
- disk_tier: Optional[resources_utils.DiskTier] = None
177
- ) -> Optional[str]:
178
- return service_catalog.get_default_instance_type(cpus=cpus,
179
- memory=memory,
180
- clouds='cudo')
186
+ def get_default_instance_type(cls,
187
+ cpus: Optional[str] = None,
188
+ memory: Optional[str] = None,
189
+ disk_tier: Optional[
190
+ resources_utils.DiskTier] = None,
191
+ region: Optional[str] = None,
192
+ zone: Optional[str] = None) -> Optional[str]:
193
+ return catalog.get_default_instance_type(cpus=cpus,
194
+ memory=memory,
195
+ disk_tier=disk_tier,
196
+ region=region,
197
+ zone=zone,
198
+ clouds='cudo')
181
199
 
182
200
  @classmethod
183
201
  def get_accelerators_from_instance_type(
184
202
  cls,
185
203
  instance_type: str,
186
204
  ) -> Optional[Dict[str, Union[int, float]]]:
187
- return service_catalog.get_accelerators_from_instance_type(
188
- instance_type, clouds='cudo')
205
+ return catalog.get_accelerators_from_instance_type(instance_type,
206
+ clouds='cudo')
189
207
 
190
208
  @classmethod
191
209
  def get_zone_shell_cmd(cls) -> Optional[str]:
@@ -199,10 +217,12 @@ class Cudo(clouds.Cloud):
199
217
  zones: Optional[List['clouds.Zone']],
200
218
  num_nodes: int,
201
219
  dryrun: bool = False,
220
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
202
221
  ) -> Dict[str, Optional[str]]:
203
222
  del zones, cluster_name # unused
204
- r = resources
205
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
223
+ resources = resources.assert_launchable()
224
+ acc_dict = self.get_accelerators_from_instance_type(
225
+ resources.instance_type)
206
226
  custom_resources = resources_utils.make_ray_custom_resources_str(
207
227
  acc_dict)
208
228
 
@@ -243,7 +263,9 @@ class Cudo(clouds.Cloud):
243
263
  default_instance_type = Cudo.get_default_instance_type(
244
264
  cpus=resources.cpus,
245
265
  memory=resources.memory,
246
- disk_tier=resources.disk_tier)
266
+ disk_tier=resources.disk_tier,
267
+ region=resources.region,
268
+ zone=resources.zone)
247
269
  if default_instance_type is None:
248
270
  return resources_utils.FeasibleResources([], [], None)
249
271
  else:
@@ -252,16 +274,16 @@ class Cudo(clouds.Cloud):
252
274
 
253
275
  assert len(accelerators) == 1, resources
254
276
  acc, acc_count = list(accelerators.items())[0]
255
- (instance_list, fuzzy_candidate_list
256
- ) = service_catalog.get_instance_type_for_accelerator(
257
- acc,
258
- acc_count,
259
- use_spot=resources.use_spot,
260
- cpus=resources.cpus,
261
- memory=resources.memory,
262
- region=resources.region,
263
- zone=resources.zone,
264
- clouds='cudo')
277
+ (instance_list,
278
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
279
+ acc,
280
+ acc_count,
281
+ use_spot=resources.use_spot,
282
+ cpus=resources.cpus,
283
+ memory=resources.memory,
284
+ region=resources.region,
285
+ zone=resources.zone,
286
+ clouds='cudo')
265
287
  if instance_list is None:
266
288
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
267
289
  None)
@@ -269,17 +291,13 @@ class Cudo(clouds.Cloud):
269
291
  fuzzy_candidate_list, None)
270
292
 
271
293
  @classmethod
272
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
294
+ def _check_compute_credentials(
295
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
273
296
  """Checks if the user has access credentials to
274
297
  Cudo's compute service."""
275
- try:
276
- # pylint: disable=import-outside-toplevel,unused-import
277
- from cudo_compute import cudo_api
278
- except (ImportError, subprocess.CalledProcessError) as e:
279
- return False, (
280
- f'{cls._DEPENDENCY_HINT}\n'
281
- f'{cls._INDENT_PREFIX}'
282
- f'{common_utils.format_exception(e, use_bracket=True)}')
298
+ if not common.can_import_modules(['cudo_compute']):
299
+ return False, (f'{cls._DEPENDENCY_HINT}\n'
300
+ f'{cls._INDENT_PREFIX}')
283
301
 
284
302
  try:
285
303
  _run_output('cudoctl --version')
@@ -292,7 +310,7 @@ class Cudo(clouds.Cloud):
292
310
  from cudo_compute import cudo_api
293
311
  from cudo_compute.rest import ApiException
294
312
  try:
295
- _, error = cudo_api.client()
313
+ _, error = cudo_api.make_client()
296
314
  except FileNotFoundError as e:
297
315
  return False, (
298
316
  'Cudo credentials are not set. '
@@ -334,7 +352,7 @@ class Cudo(clouds.Cloud):
334
352
  return None
335
353
 
336
354
  def instance_type_exists(self, instance_type: str) -> bool:
337
- return service_catalog.instance_type_exists(instance_type, 'cudo')
355
+ return catalog.instance_type_exists(instance_type, 'cudo')
338
356
 
339
357
  def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
340
- return service_catalog.validate_region_zone(region, zone, clouds='cudo')
358
+ return catalog.validate_region_zone(region, zone, clouds='cudo')