skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/common.py CHANGED
@@ -6,6 +6,7 @@ import os
6
6
  from typing import Any, Dict, List, Optional, Tuple
7
7
 
8
8
  from sky import sky_logging
9
+ from sky.utils import env_options
9
10
  from sky.utils import resources_utils
10
11
 
11
12
  # NOTE: we can use pydantic instead of dataclasses or namedtuples, because
@@ -96,6 +97,8 @@ class InstanceInfo:
96
97
  external_ip: Optional[str]
97
98
  tags: Dict[str, str]
98
99
  ssh_port: int = 22
100
+ # The internal service address of the instance on Kubernetes.
101
+ internal_svc: Optional[str] = None
99
102
 
100
103
  def get_feasible_ip(self) -> str:
101
104
  """Get the most feasible IPs of the instance. This function returns
@@ -238,12 +241,21 @@ class Endpoint:
238
241
 
239
242
  @dataclasses.dataclass
240
243
  class SocketEndpoint(Endpoint):
241
- """Socket endpoint accesible via a host and a port."""
244
+ """Socket endpoint accessible via a host and a port."""
242
245
  port: Optional[int]
243
246
  host: str = ''
244
247
 
245
248
  def url(self, override_ip: Optional[str] = None) -> str:
246
249
  host = override_ip if override_ip else self.host
250
+ if env_options.Options.RUNNING_IN_BUILDKITE.get(
251
+ ) and 'localhost' in host:
252
+ # In Buildkite CI, we run a kind (Kubernetes in Docker) cluster.
253
+ # The controller pod runs inside this kind cluster, which itself
254
+ # runs in a container. When the pod tries to access 'localhost',
255
+ # it can't reach the host machine's localhost. Using
256
+ # 'host.docker.internal' allows the pod to properly communicate
257
+ # with services running on the host machine's localhost.
258
+ host = 'host.docker.internal'
247
259
  return f'{host}{":" + str(self.port) if self.port else ""}'
248
260
 
249
261
 
@@ -1,7 +1,7 @@
1
1
  """Cudo Compute VM spec helper for SkyPilot."""
2
2
  import csv
3
3
 
4
- from sky.clouds.service_catalog.common import get_catalog_path
4
+ from sky.catalog.common import get_catalog_path
5
5
 
6
6
  VMS_CSV = 'cudo/vms.csv'
7
7
 
@@ -1,22 +1,28 @@
1
1
  """Cudo catalog helper."""
2
2
 
3
3
  cudo_gpu_model = {
4
- 'NVIDIA V100': 'V100',
5
- 'NVIDIA A40': 'A40',
6
- 'RTX 3080': 'RTX3080',
7
- 'RTX A4000': 'RTXA4000',
8
- 'RTX A4500': 'RTXA4500',
4
+ 'H100 NVL': 'H100',
5
+ 'H100 SXM': 'H100-SXM',
6
+ 'L40S (compute mode)': 'L40S',
7
+ 'L40S (graphics mode)': 'L40S',
8
+ 'A40 (compute mode)': 'A40',
9
+ 'A40 (graphics mode)': 'A40',
9
10
  'RTX A5000': 'RTXA5000',
10
11
  'RTX A6000': 'RTXA6000',
12
+ 'A100 80GB PCIe': 'A100',
13
+ 'A800 PCIe': 'A800',
14
+ 'V100': 'V100',
11
15
  }
12
16
 
13
17
  cudo_gpu_mem = {
14
- 'RTX3080': 12,
18
+ 'H100': 94,
19
+ 'H100-SXM': 80,
20
+ 'L40S': 48,
15
21
  'A40': 48,
16
- 'RTXA4000': 16,
17
- 'RTXA4500': 20,
18
22
  'RTXA5000': 24,
19
23
  'RTXA6000': 48,
24
+ 'A100': 80,
25
+ 'A800': 80,
20
26
  'V100': 16,
21
27
  }
22
28
 
@@ -4,7 +4,7 @@ from typing import Dict
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.adaptors import cudo
7
- import sky.provision.cudo.cudo_utils as utils
7
+ from sky.provision.cudo import cudo_utils as utils
8
8
 
9
9
  logger = sky_logging.init_logger(__name__)
10
10
 
@@ -28,12 +28,10 @@ def launch(name: str, data_center_id: str, ssh_key: str, machine_type: str,
28
28
  size_gib=disk_size),
29
29
  metadata=tags)
30
30
 
31
- try:
32
- api = cudo.cudo.cudo_api.virtual_machines()
33
- vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
34
- return vm.to_dict()['id']
35
- except cudo.cudo.rest.ApiException as e:
36
- raise e
31
+ api = cudo.cudo.cudo_api.virtual_machines()
32
+ vm = api.create_vm(cudo.cudo.cudo_api.project_id_throwable(), request)
33
+
34
+ return vm.to_dict()['id']
37
35
 
38
36
 
39
37
  def remove(instance_id: str):
@@ -54,11 +52,8 @@ def remove(instance_id: str):
54
52
  state = 'unknown'
55
53
  project_id = cudo.cudo.cudo_api.project_id_throwable()
56
54
  while retry_count < max_retries:
57
- try:
58
- vm = api.get_vm(project_id, instance_id)
59
- state = vm.to_dict()['vm']['short_state']
60
- except cudo.cudo.rest.ApiException as e:
61
- raise e
55
+ vm = api.get_vm(project_id, instance_id)
56
+ state = vm.to_dict()['vm']['short_state']
62
57
 
63
58
  if state in terminate_ok:
64
59
  break
@@ -69,76 +64,82 @@ def remove(instance_id: str):
69
64
  'Timeout error, could not terminate due to VM state: {}'.format(
70
65
  state))
71
66
 
72
- try:
73
- api.terminate_vm(project_id, instance_id)
74
- except cudo.cudo.rest.ApiException as e:
75
- raise e
67
+ api.terminate_vm(project_id, instance_id)
76
68
 
77
69
 
78
70
  def set_tags(instance_id: str, tags: Dict):
79
71
  """Sets the tags for the given instance."""
80
- try:
81
- api = cudo.cudo.cudo_api.virtual_machines()
82
- api.update_vm_metadata(
83
- cudo.cudo.cudo_api.project_id(), instance_id,
84
- cudo.cudo.UpdateVMMetadataBody(
85
- metadata=tags,
86
- merge=True)) # TODO (skypilot team) merge or overwrite?
87
- except cudo.cudo.rest.ApiException as e:
88
- raise e
72
+ api = cudo.cudo.cudo_api.virtual_machines()
73
+ api.update_vm_metadata(
74
+ cudo.cudo.cudo_api.project_id(), instance_id,
75
+ cudo.cudo.UpdateVMMetadataBody(
76
+ metadata=tags,
77
+ merge=True)) # TODO (skypilot team) merge or overwrite?
89
78
 
90
79
 
91
80
  def get_instance(vm_id):
92
- try:
93
- api = cudo.cudo.cudo_api.virtual_machines()
94
- vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
95
- vm_dict = vm.to_dict()
96
- return vm_dict
97
- except cudo.cudo.rest.ApiException as e:
98
- raise e
81
+ api = cudo.cudo.cudo_api.virtual_machines()
82
+ vm = api.get_vm(cudo.cudo.cudo_api.project_id_throwable(), vm_id)
83
+ vm_dict = vm.to_dict()
84
+ return vm_dict
99
85
 
100
86
 
101
87
  def list_instances():
102
- try:
103
- api = cudo.cudo.cudo_api.virtual_machines()
104
- vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
105
- instances = {}
106
- for vm in vms.to_dict()['vms']:
107
- ex_ip = vm['external_ip_address']
108
- in_ip = vm['internal_ip_address']
109
- if not in_ip:
110
- in_ip = ex_ip
111
- instance = {
112
- # active_state, init_state, lcm_state, short_state
113
- 'status': vm['short_state'],
114
- 'tags': vm['metadata'],
115
- 'name': vm['id'],
116
- 'ip': ex_ip,
117
- 'external_ip': ex_ip,
118
- 'internal_ip': in_ip
119
- }
120
- instances[vm['id']] = instance
121
- return instances
122
- except cudo.cudo.rest.ApiException as e:
123
- raise e
88
+ api = cudo.cudo.cudo_api.virtual_machines()
89
+ vms = api.list_vms(cudo.cudo.cudo_api.project_id_throwable())
90
+ instances = {}
91
+ for vm in vms.to_dict()['vms']:
92
+ ex_ip = vm['external_ip_address']
93
+ in_ip = vm['internal_ip_address']
94
+ if not in_ip:
95
+ in_ip = ex_ip
96
+ instance = {
97
+ # active_state, init_state, lcm_state, short_state
98
+ 'status': vm['short_state'],
99
+ 'tags': vm['metadata'],
100
+ 'name': vm['id'],
101
+ 'ip': ex_ip,
102
+ 'external_ip': ex_ip,
103
+ 'internal_ip': in_ip
104
+ }
105
+ instances[vm['id']] = instance
106
+ return instances
124
107
 
125
108
 
126
109
  def vm_available(to_start_count, gpu_count, gpu_model, data_center_id, mem,
127
110
  cpus):
128
- try:
129
- gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
130
- api = cudo.cudo.cudo_api.virtual_machines()
131
- types = api.list_vm_machine_types(mem,
132
- cpus,
133
- gpu=gpu_count,
134
- gpu_model=gpu_model,
135
- data_center_id=data_center_id)
136
- types_dict = types.to_dict()
137
- hc = types_dict['host_configs']
138
- total_count = sum(item['count_vm_available'] for item in hc)
139
- if total_count < to_start_count:
140
- raise Exception(
141
- 'Too many VMs requested, try another gpu type or region')
142
- return total_count
143
- except cudo.cudo.rest.ApiException as e:
144
- raise e
111
+ gpu_model = utils.skypilot_gpu_to_cudo_gpu(gpu_model)
112
+ api = cudo.cudo.cudo_api.virtual_machines()
113
+ types = api.list_vm_machine_types2()
114
+ types_dict = types.to_dict()
115
+ machine_types = types_dict['machine_types']
116
+
117
+ # Filter machine types based on requirements
118
+ matching_types = []
119
+ for machine_type in machine_types:
120
+ # Check if this machine type matches our requirements
121
+ if (machine_type['data_center_id'] == data_center_id and
122
+ machine_type['gpu_model'] == gpu_model and
123
+ machine_type['min_vcpu'] <= cpus <= machine_type.get(
124
+ 'max_vcpu_free', float('inf')) and
125
+ machine_type['min_memory_gib'] <= mem <= machine_type.get(
126
+ 'max_memory_gib_free', float('inf'))):
127
+
128
+ # Calculate available VMs based on resource constraints
129
+ max_vms_by_vcpu = machine_type[
130
+ 'total_vcpu_free'] // cpus if cpus > 0 else float('inf')
131
+ max_vms_by_memory = machine_type[
132
+ 'total_memory_gib_free'] // mem if mem > 0 else float('inf')
133
+ max_vms_by_gpu = machine_type[
134
+ 'total_gpu_free'] // gpu_count if gpu_count > 0 else float(
135
+ 'inf')
136
+
137
+ available_vms = min(max_vms_by_vcpu, max_vms_by_memory,
138
+ max_vms_by_gpu)
139
+ matching_types.append(available_vms)
140
+
141
+ total_count = sum(matching_types)
142
+ if total_count < to_start_count:
143
+ raise Exception(
144
+ 'Too many VMs requested, try another gpu type or region')
145
+ return total_count
@@ -1,7 +1,7 @@
1
1
  """Cudo Compute instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -40,10 +40,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
40
40
  return head_instance_id
41
41
 
42
42
 
43
- def run_instances(region: str, cluster_name_on_cloud: str,
43
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
44
44
  config: common.ProvisionConfig) -> common.ProvisionRecord:
45
45
  """Runs instances for the given cluster."""
46
-
46
+ del cluster_name # unused
47
47
  pending_status = ['pend', 'init', 'prol', 'boot']
48
48
 
49
49
  while True:
@@ -191,11 +191,14 @@ def get_cluster_info(
191
191
 
192
192
 
193
193
  def query_instances(
194
+ cluster_name: str,
194
195
  cluster_name_on_cloud: str,
195
196
  provider_config: Optional[Dict[str, Any]] = None,
196
197
  non_terminated_only: bool = True,
197
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
198
+ retry_if_missing: bool = False,
199
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
198
200
  """See sky/provision/__init__.py"""
201
+ del cluster_name, retry_if_missing # unused
199
202
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
200
203
  instances = _filter_instances(cluster_name_on_cloud, None)
201
204
 
@@ -210,12 +213,13 @@ def query_instances(
210
213
  'done': status_lib.ClusterStatus.STOPPED,
211
214
  'poff': status_lib.ClusterStatus.STOPPED,
212
215
  }
213
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
216
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
217
+ Optional[str]]] = {}
214
218
  for inst_id, inst in instances.items():
215
219
  status = status_map[inst['status']]
216
220
  if non_terminated_only and status is None:
217
221
  continue
218
- statuses[inst_id] = status
222
+ statuses[inst_id] = (status, None)
219
223
  return statuses
220
224
 
221
225
 
@@ -1,7 +1,7 @@
1
1
  """DigitalOcean instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
  import uuid
6
6
 
7
7
  from sky import sky_logging
@@ -26,10 +26,10 @@ def _get_head_instance(
26
26
  return None
27
27
 
28
28
 
29
- def run_instances(region: str, cluster_name_on_cloud: str,
29
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
30
30
  config: common.ProvisionConfig) -> common.ProvisionRecord:
31
31
  """Runs instances for the given cluster."""
32
-
32
+ del cluster_name # unused
33
33
  pending_status = ['new']
34
34
  newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
35
35
  pending_status + ['off'])
@@ -242,11 +242,14 @@ def get_cluster_info(
242
242
 
243
243
 
244
244
  def query_instances(
245
+ cluster_name: str,
245
246
  cluster_name_on_cloud: str,
246
247
  provider_config: Optional[Dict[str, Any]] = None,
247
248
  non_terminated_only: bool = True,
248
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
249
+ retry_if_missing: bool = False,
250
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
249
251
  """See sky/provision/__init__.py"""
252
+ del cluster_name, retry_if_missing # unused
250
253
  # terminated instances are not retrieved by the
251
254
  # API making `non_terminated_only` argument moot.
252
255
  del non_terminated_only
@@ -260,10 +263,11 @@ def query_instances(
260
263
  'active': status_lib.ClusterStatus.UP,
261
264
  'off': status_lib.ClusterStatus.STOPPED,
262
265
  }
263
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
266
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
267
+ Optional[str]]] = {}
264
268
  for instance_meta in instances.values():
265
269
  status = status_map[instance_meta['status']]
266
- statuses[instance_meta['name']] = status
270
+ statuses[instance_meta['name']] = (status, None)
267
271
  return statuses
268
272
 
269
273
 
sky/provision/do/utils.py CHANGED
@@ -17,6 +17,7 @@ from sky.provision import constants as provision_constants
17
17
  from sky.provision.do import constants
18
18
  from sky.utils import annotations
19
19
  from sky.utils import common_utils
20
+ from sky.utils import yaml_utils
20
21
 
21
22
  logger = sky_logging.init_logger(__name__)
22
23
 
@@ -30,7 +31,7 @@ POSSIBLE_CREDENTIALS_PATHS = [
30
31
  INITIAL_BACKOFF_SECONDS = 10
31
32
  MAX_BACKOFF_FACTOR = 10
32
33
  MAX_ATTEMPTS = 6
33
- SSH_KEY_NAME_ON_DO = f'sky-key-{common_utils.get_user_hash()}'
34
+ SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'
34
35
 
35
36
  _client = None
36
37
  _ssh_key_id = None
@@ -61,7 +62,7 @@ def _init_client():
61
62
  if get_credentials_path() is None:
62
63
  raise DigitalOceanError(
63
64
  'No credentials found, please run `doctl auth init`')
64
- credentials = common_utils.read_yaml(get_credentials_path())
65
+ credentials = yaml_utils.read_yaml(get_credentials_path())
65
66
  default_token = credentials.get('access-token', None)
66
67
  if default_token is not None:
67
68
  try:
@@ -125,7 +126,7 @@ def ssh_key_id(public_key: str):
125
126
 
126
127
  request = {
127
128
  'public_key': public_key,
128
- 'name': SSH_KEY_NAME_ON_DO,
129
+ 'name': SSH_KEY_NAME_ON_DO_PREFIX + common_utils.get_user_hash(),
129
130
  }
130
131
  _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
131
132
  return _ssh_key_id