skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -15,15 +15,22 @@ import colorama
15
15
  import sky
16
16
  from sky import clouds
17
17
  from sky import exceptions
18
+ from sky import global_user_state
19
+ from sky import logs
18
20
  from sky import provision
21
+ from sky import resources as resources_lib
19
22
  from sky import sky_logging
23
+ from sky import skypilot_config
20
24
  from sky.adaptors import aws
21
25
  from sky.backends import backend_utils
26
+ from sky.jobs.server import utils as server_jobs_utils
22
27
  from sky.provision import common as provision_common
23
28
  from sky.provision import instance_setup
24
29
  from sky.provision import logging as provision_logging
25
30
  from sky.provision import metadata_utils
31
+ from sky.provision import volume as provision_volume
26
32
  from sky.skylet import constants
33
+ from sky.utils import common
27
34
  from sky.utils import common_utils
28
35
  from sky.utils import message_utils
29
36
  from sky.utils import resources_utils
@@ -53,6 +60,11 @@ def _bulk_provision(
53
60
  region_name = region.name
54
61
 
55
62
  start = time.time()
63
+
64
+ provision_volume.provision_ephemeral_volumes(cloud, region_name,
65
+ cluster_name.name_on_cloud,
66
+ bootstrap_config)
67
+
56
68
  # TODO(suquark): Should we cache the bootstrapped result?
57
69
  # Currently it is not necessary as bootstrapping takes
58
70
  # only ~3s, caching it seems over-engineering and could
@@ -64,6 +76,7 @@ def _bulk_provision(
64
76
 
65
77
  provision_record = provision.run_instances(provider_name,
66
78
  region_name,
79
+ str(cluster_name),
67
80
  cluster_name.name_on_cloud,
68
81
  config=config)
69
82
 
@@ -71,7 +84,8 @@ def _bulk_provision(
71
84
  logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
72
85
  rich_utils.force_update_status(
73
86
  ux_utils.spinner_message('Launching - Checking instance status',
74
- str(provision_logging.config.log_path)))
87
+ str(provision_logging.config.log_path),
88
+ cluster_name=str(cluster_name)))
75
89
  # AWS would take a very short time (<<1s) updating the state of the
76
90
  # instance.
77
91
  time.sleep(1)
@@ -95,6 +109,12 @@ def _bulk_provision(
95
109
  f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
96
110
  f'seconds.')
97
111
 
112
+ # Add cluster event for provisioning completion.
113
+ global_user_state.add_cluster_event(
114
+ str(cluster_name), status_lib.ClusterStatus.INIT,
115
+ f'Instances launched on {cloud.display_name()} in {region}',
116
+ global_user_state.ClusterEventType.STATUS_CHANGE)
117
+
98
118
  return provision_record
99
119
 
100
120
 
@@ -117,7 +137,7 @@ def bulk_provision(
117
137
  Cloud specific exceptions: If the provisioning process failed, cloud-
118
138
  specific exceptions will be raised by the cloud APIs.
119
139
  """
120
- original_config = common_utils.read_yaml(cluster_yaml)
140
+ original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
121
141
  head_node_type = original_config['head_node_type']
122
142
  bootstrap_config = provision_common.ProvisionConfig(
123
143
  provider_config=original_config['provider'],
@@ -155,7 +175,7 @@ def bulk_provision(
155
175
  # This error is a user error instead of a provisioning failure.
156
176
  # And there is no possibility to fix it by teardown.
157
177
  raise
158
- except Exception: # pylint: disable=broad-except
178
+ except Exception as exc: # pylint: disable=broad-except
159
179
  zone_str = 'all zones'
160
180
  if zones:
161
181
  zone_str = ','.join(zone.name for zone in zones)
@@ -177,14 +197,18 @@ def bulk_provision(
177
197
  provider_config=original_config['provider'])
178
198
  break
179
199
  except NotImplementedError as e:
180
- verb = 'terminate' if terminate else 'stop'
200
+ assert not terminate, (
201
+ 'Terminating must be supported by all clouds')
202
+ exc_msg = common_utils.format_exception(exc).replace(
203
+ '\n', ' ')
181
204
  # If the underlying cloud does not support stopping
182
205
  # instances, we should stop failover as well.
183
206
  raise provision_common.StopFailoverError(
184
- 'During provisioner\'s failover, '
185
- f'{terminate_str.lower()} {cluster_name!r} failed. '
186
- f'We cannot {verb} the resources launched, as it is '
187
- f'not supported by {cloud}. Please try launching the '
207
+ f'Provisioning cluster {cluster_name.display_name} '
208
+ f'failed: {exc_msg}. Failover is stopped for safety '
209
+ 'because the cluster was previously in UP state but '
210
+ f'{cloud} does not support stopping instances to '
211
+ 'preserve the cluster state. Please try launching the '
188
212
  'cluster again, or terminate it with: '
189
213
  f'sky down {cluster_name.display_name}') from e
190
214
  except Exception as e: # pylint: disable=broad-except
@@ -219,6 +243,7 @@ def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName,
219
243
  provision.terminate_instances(cloud_name, cluster_name.name_on_cloud,
220
244
  provider_config)
221
245
  metadata_utils.remove_cluster_metadata(cluster_name.name_on_cloud)
246
+ provision_volume.delete_ephemeral_volumes(provider_config)
222
247
  else:
223
248
  provision.stop_instances(cloud_name, cluster_name.name_on_cloud,
224
249
  provider_config)
@@ -228,9 +253,9 @@ def _ssh_probe_command(ip: str,
228
253
  ssh_port: int,
229
254
  ssh_user: str,
230
255
  ssh_private_key: str,
256
+ ssh_probe_timeout: int,
231
257
  ssh_proxy_command: Optional[str] = None) -> List[str]:
232
- # NOTE: Ray uses 'uptime' command and 10s timeout, we use the same
233
- # setting here.
258
+ # NOTE: Ray uses 'uptime' command, we use the same setting here.
234
259
  command = [
235
260
  'ssh',
236
261
  '-T',
@@ -244,7 +269,7 @@ def _ssh_probe_command(ip: str,
244
269
  '-o',
245
270
  'PasswordAuthentication=no',
246
271
  '-o',
247
- 'ConnectTimeout=10s',
272
+ f'ConnectTimeout={ssh_probe_timeout}s',
248
273
  '-o',
249
274
  f'UserKnownHostsFile={os.devnull}',
250
275
  '-o',
@@ -277,6 +302,7 @@ def _wait_ssh_connection_direct(ip: str,
277
302
  ssh_port: int,
278
303
  ssh_user: str,
279
304
  ssh_private_key: str,
305
+ ssh_probe_timeout: int,
280
306
  ssh_control_name: Optional[str] = None,
281
307
  ssh_proxy_command: Optional[str] = None,
282
308
  **kwargs) -> Tuple[bool, str]:
@@ -305,6 +331,7 @@ def _wait_ssh_connection_direct(ip: str,
305
331
  if success:
306
332
  return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
307
333
  ssh_private_key,
334
+ ssh_probe_timeout,
308
335
  ssh_control_name,
309
336
  ssh_proxy_command)
310
337
  except socket.timeout: # this is the most expected exception
@@ -312,7 +339,7 @@ def _wait_ssh_connection_direct(ip: str,
312
339
  except Exception as e: # pylint: disable=broad-except
313
340
  stderr = f'Error: {common_utils.format_exception(e)}'
314
341
  command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
315
- ssh_proxy_command)
342
+ ssh_probe_timeout, ssh_proxy_command)
316
343
  logger.debug(f'Waiting for SSH to {ip}. Try: '
317
344
  f'{_shlex_join(command)}. '
318
345
  f'{stderr}')
@@ -323,6 +350,7 @@ def _wait_ssh_connection_indirect(ip: str,
323
350
  ssh_port: int,
324
351
  ssh_user: str,
325
352
  ssh_private_key: str,
353
+ ssh_probe_timeout: int,
326
354
  ssh_control_name: Optional[str] = None,
327
355
  ssh_proxy_command: Optional[str] = None,
328
356
  **kwargs) -> Tuple[bool, str]:
@@ -333,14 +361,14 @@ def _wait_ssh_connection_indirect(ip: str,
333
361
  """
334
362
  del ssh_control_name, kwargs # unused
335
363
  command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
336
- ssh_proxy_command)
364
+ ssh_probe_timeout, ssh_proxy_command)
337
365
  message = f'Waiting for SSH using command: {_shlex_join(command)}'
338
366
  logger.debug(message)
339
367
  try:
340
368
  proc = subprocess.run(command,
341
369
  shell=False,
342
370
  check=False,
343
- timeout=10,
371
+ timeout=ssh_probe_timeout,
344
372
  stdout=subprocess.DEVNULL,
345
373
  stderr=subprocess.PIPE)
346
374
  if proc.returncode != 0:
@@ -383,8 +411,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
383
411
  def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
384
412
  ip, ssh_port = ip_ssh_port
385
413
  success = False
414
+ ssh_probe_timeout = skypilot_config.get_nested(
415
+ ('provision', 'ssh_timeout'), 10)
386
416
  while not success:
387
- success, stderr = waiter(ip, ssh_port, **ssh_credentials)
417
+ success, stderr = waiter(ip,
418
+ ssh_port,
419
+ **ssh_credentials,
420
+ ssh_probe_timeout=ssh_probe_timeout)
388
421
  if not success and time.time() - start > timeout:
389
422
  with ux_utils.print_exception_no_traceback():
390
423
  raise RuntimeError(
@@ -403,16 +436,27 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
403
436
 
404
437
 
405
438
  def _post_provision_setup(
406
- cloud_name: str, cluster_name: resources_utils.ClusterName,
407
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
439
+ launched_resources: resources_lib.Resources,
440
+ cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
441
+ provision_record: provision_common.ProvisionRecord,
408
442
  custom_resource: Optional[str]) -> provision_common.ClusterInfo:
409
- config_from_yaml = common_utils.read_yaml(cluster_yaml)
443
+ config_from_yaml = global_user_state.get_cluster_yaml_dict(
444
+ handle_cluster_yaml)
410
445
  provider_config = config_from_yaml.get('provider')
446
+ cloud_name = repr(launched_resources.cloud)
411
447
  cluster_info = provision.get_cluster_info(cloud_name,
412
448
  provision_record.region,
413
449
  cluster_name.name_on_cloud,
414
450
  provider_config=provider_config)
415
451
 
452
+ # Update cluster info in handle so cluster instance ids are set. This
453
+ # allows us to expose provision logs to debug nodes that failed during post
454
+ # provision setup.
455
+ handle = global_user_state.get_handle_from_cluster_name(
456
+ cluster_name.display_name)
457
+ handle.cached_cluster_info = cluster_info
458
+ global_user_state.update_cluster_handle(cluster_name.display_name, handle)
459
+
416
460
  if cluster_info.num_instances > 1:
417
461
  # Only worker nodes have logs in the per-instance log directory. Head
418
462
  # node's log will be redirected to the main log file.
@@ -437,23 +481,24 @@ def _post_provision_setup(
437
481
  # TODO(suquark): Move wheel build here in future PRs.
438
482
  # We don't set docker_user here, as we are configuring the VM itself.
439
483
  ssh_credentials = backend_utils.ssh_credential_from_yaml(
440
- cluster_yaml, ssh_user=cluster_info.ssh_user)
484
+ handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
441
485
  docker_config = config_from_yaml.get('docker', {})
442
486
 
443
487
  with rich_utils.safe_status(
444
- ux_utils.spinner_message(
445
- 'Launching - Waiting for SSH access',
446
- provision_logging.config.log_path)) as status:
488
+ ux_utils.spinner_message('Launching - Waiting for SSH access',
489
+ provision_logging.config.log_path,
490
+ cluster_name=str(cluster_name))) as status:
447
491
  # If on Kubernetes, skip SSH check since the pods are guaranteed to be
448
492
  # ready by the provisioner, and we use kubectl instead of SSH to run the
449
493
  # commands and rsync on the pods. SSH will still be ready after a while
450
494
  # for the users to SSH into the pod.
451
- if cloud_name.lower() != 'kubernetes':
495
+ is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
496
+ if not is_k8s_cloud:
452
497
  logger.debug(
453
498
  f'\nWaiting for SSH to be available for {cluster_name!r} ...')
454
499
  wait_for_ssh(cluster_info, ssh_credentials)
455
500
  logger.debug(f'SSH Connection ready for {cluster_name!r}')
456
- vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
501
+ vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
457
502
  plural = '' if len(cluster_info.instances) == 1 else 's'
458
503
  verb = 'is' if len(cluster_info.instances) == 1 else 'are'
459
504
  indent_str = (ux_utils.INDENT_SYMBOL
@@ -472,7 +517,8 @@ def _post_provision_setup(
472
517
  status.update(
473
518
  ux_utils.spinner_message(
474
519
  'Launching - Initializing docker container',
475
- provision_logging.config.log_path))
520
+ provision_logging.config.log_path,
521
+ cluster_name=str(cluster_name)))
476
522
  docker_user = instance_setup.initialize_docker(
477
523
  cluster_name.name_on_cloud,
478
524
  docker_config=docker_config,
@@ -489,6 +535,25 @@ def _post_provision_setup(
489
535
  logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
490
536
  f'Docker container is up.{colorama.Style.RESET_ALL}')
491
537
 
538
+ # Check version compatibility for jobs controller clusters
539
+ if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
540
+ # TODO(zeping): remove this in v0.12.0
541
+ # This only happens in upgrade from <0.9.3 to > 0.10.0
542
+ # After 0.10.0 no incompatibility issue
543
+ # See https://github.com/skypilot-org/skypilot/pull/6096
544
+ # For more details
545
+ status.update(
546
+ ux_utils.spinner_message(
547
+ 'Checking controller version compatibility'))
548
+
549
+ try:
550
+ server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
551
+ except exceptions.ClusterNotUpError:
552
+ # Controller is not up yet during initial provisioning, that
553
+ # also means no non-terminal jobs, so no incompatibility in
554
+ # this case.
555
+ pass
556
+
492
557
  # We mount the metadata with sky wheel for speedup.
493
558
  # NOTE: currently we mount all credentials for all nodes, because
494
559
  # (1) jobs controllers need permission to launch/down nodes of
@@ -502,7 +567,8 @@ def _post_provision_setup(
502
567
 
503
568
  runtime_preparation_str = (ux_utils.spinner_message(
504
569
  'Preparing SkyPilot runtime ({step}/3 - {step_name})',
505
- provision_logging.config.log_path))
570
+ provision_logging.config.log_path,
571
+ cluster_name=str(cluster_name)))
506
572
  status.update(
507
573
  runtime_preparation_str.format(step=1, step_name='initializing'))
508
574
  instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -636,19 +702,32 @@ def _post_provision_setup(
636
702
  logger.debug('Ray cluster is ready. Skip starting ray cluster on '
637
703
  'worker nodes.')
638
704
 
639
- instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
640
- cluster_info, ssh_credentials)
705
+ logging_agent = logs.get_logging_agent()
706
+ if logging_agent:
707
+ status.update(
708
+ ux_utils.spinner_message('Setting up logging agent',
709
+ provision_logging.config.log_path,
710
+ cluster_name=str(cluster_name)))
711
+ instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
712
+ cluster_info,
713
+ ssh_credentials)
714
+
715
+ instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
716
+ ssh_credentials,
717
+ launched_resources)
641
718
 
642
719
  logger.info(
643
720
  ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
644
- provision_logging.config.log_path))
721
+ provision_logging.config.log_path,
722
+ cluster_name=str(cluster_name)))
645
723
  return cluster_info
646
724
 
647
725
 
648
726
  @timeline.event
649
727
  def post_provision_runtime_setup(
650
- cloud_name: str, cluster_name: resources_utils.ClusterName,
651
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
728
+ launched_resources: resources_lib.Resources,
729
+ cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
730
+ provision_record: provision_common.ProvisionRecord,
652
731
  custom_resource: Optional[str],
653
732
  log_dir: str) -> provision_common.ClusterInfo:
654
733
  """Run internal setup commands after provisioning and before user setup.
@@ -659,6 +738,7 @@ def post_provision_runtime_setup(
659
738
  and other necessary files to the VM.
660
739
  3. Run setup commands to install dependencies.
661
740
  4. Start ray cluster and skylet.
741
+ 5. (Optional) Setup logging agent.
662
742
 
663
743
  Raises:
664
744
  RuntimeError: If the setup process encounters any error.
@@ -666,11 +746,12 @@ def post_provision_runtime_setup(
666
746
  with provision_logging.setup_provision_logging(log_dir):
667
747
  try:
668
748
  logger.debug(_TITLE.format('System Setup After Provision'))
669
- return _post_provision_setup(cloud_name,
670
- cluster_name,
671
- cluster_yaml=cluster_yaml,
672
- provision_record=provision_record,
673
- custom_resource=custom_resource)
749
+ return _post_provision_setup(
750
+ launched_resources,
751
+ cluster_name,
752
+ handle_cluster_yaml=handle_cluster_yaml,
753
+ provision_record=provision_record,
754
+ custom_resource=custom_resource)
674
755
  except Exception: # pylint: disable=broad-except
675
756
  logger.error(
676
757
  ux_utils.error_message(
@@ -9,3 +9,8 @@ from sky.provision.runpod.instance import run_instances
9
9
  from sky.provision.runpod.instance import stop_instances
10
10
  from sky.provision.runpod.instance import terminate_instances
11
11
  from sky.provision.runpod.instance import wait_instances
12
+ from sky.provision.runpod.volume import apply_volume
13
+ from sky.provision.runpod.volume import delete_volume
14
+ from sky.provision.runpod.volume import get_all_volumes_usedby
15
+ from sky.provision.runpod.volume import get_volume_usedby
16
+ from sky.provision.runpod.volume import map_all_volumes_usedby
@@ -1,6 +1,6 @@
1
1
  """RunPod instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
44
44
  return head_instance_id
45
45
 
46
46
 
47
- def run_instances(region: str, cluster_name_on_cloud: str,
47
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
48
48
  config: common.ProvisionConfig) -> common.ProvisionRecord:
49
49
  """Runs instances for the given cluster."""
50
-
50
+ del cluster_name # unused
51
51
  pending_status = ['CREATED', 'RESTARTING']
52
52
 
53
53
  while True:
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
80
80
  created_instance_ids=[])
81
81
 
82
82
  created_instance_ids = []
83
+ volume_mounts = config.node_config.get('VolumeMounts', [])
84
+ network_volume_id = None
85
+ volume_mount_path = None
86
+ if volume_mounts:
87
+ if len(volume_mounts) > 1:
88
+ logger.warning(
89
+ f'RunPod only supports one network volume mount, '
90
+ f'but {len(volume_mounts)} are specified. Only the first one '
91
+ f'will be used.')
92
+ volume_mount = volume_mounts[0]
93
+ network_volume_id = volume_mount.get('VolumeIdOnCloud')
94
+ volume_mount_path = volume_mount.get('MountPath')
95
+ if network_volume_id is None or volume_mount_path is None:
96
+ raise RuntimeError(
97
+ 'Network volume ID and mount path must be specified.')
83
98
  for _ in range(to_start_count):
84
99
  node_type = 'head' if head_instance_id is None else 'worker'
85
100
  try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
97
112
  bid_per_gpu=config.node_config['BidPerGPU'],
98
113
  docker_login_config=config.provider_config.get(
99
114
  'docker_login_config'),
115
+ network_volume_id=network_volume_id,
116
+ volume_mount_path=volume_mount_path,
100
117
  )
101
118
  except Exception as e: # pylint: disable=broad-except
102
119
  logger.warning(f'run_instances error: {e}')
@@ -201,11 +218,14 @@ def get_cluster_info(
201
218
 
202
219
 
203
220
  def query_instances(
221
+ cluster_name: str,
204
222
  cluster_name_on_cloud: str,
205
223
  provider_config: Optional[Dict[str, Any]] = None,
206
224
  non_terminated_only: bool = True,
207
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
225
+ retry_if_missing: bool = False,
226
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
208
227
  """See sky/provision/__init__.py"""
228
+ del cluster_name, retry_if_missing # unused
209
229
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
210
230
  instances = _filter_instances(cluster_name_on_cloud, None)
211
231
 
@@ -215,12 +235,13 @@ def query_instances(
215
235
  'PAUSED': status_lib.ClusterStatus.INIT,
216
236
  'RUNNING': status_lib.ClusterStatus.UP,
217
237
  }
218
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
238
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
239
+ Optional[str]]] = {}
219
240
  for inst_id, inst in instances.items():
220
241
  status = status_map[inst['status']]
221
242
  if non_terminated_only and status is None:
222
243
  continue
223
- statuses[inst_id] = status
244
+ statuses[inst_id] = (status, None)
224
245
  return statuses
225
246
 
226
247
 
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
7
  from sky import sky_logging
8
8
  from sky.adaptors import runpod
9
9
  from sky.provision import docker_utils
10
- import sky.provision.runpod.api.commands as runpod_commands
10
+ from sky.provision.runpod.api import commands as runpod_commands
11
11
  from sky.skylet import constants
12
12
  from sky.utils import common_utils
13
13
 
@@ -263,25 +263,36 @@ def _create_template_for_docker_login(
263
263
  return login_config.format_image(image_name), create_template_resp['id']
264
264
 
265
265
 
266
- def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
267
- zone: str, disk_size: int, image_name: str,
268
- ports: Optional[List[int]], public_key: str,
269
- preemptible: Optional[bool], bid_per_gpu: float,
270
- docker_login_config: Optional[Dict[str, str]]) -> str:
266
+ def launch(
267
+ cluster_name: str,
268
+ node_type: str,
269
+ instance_type: str,
270
+ region: str,
271
+ zone: str,
272
+ disk_size: int,
273
+ image_name: str,
274
+ ports: Optional[List[int]],
275
+ public_key: str,
276
+ preemptible: Optional[bool],
277
+ bid_per_gpu: float,
278
+ docker_login_config: Optional[Dict[str, str]],
279
+ *,
280
+ network_volume_id: Optional[str] = None,
281
+ volume_mount_path: Optional[str] = None,
282
+ ) -> str:
271
283
  """Launches an instance with the given parameters.
272
284
 
273
- Converts the instance_type to the RunPod GPU name, finds the specs for the
274
- GPU, and launches the instance.
285
+ For CPU instances, we directly use the instance_type for launching the
286
+ instance.
287
+
288
+ For GPU instances, we convert the instance_type to the RunPod GPU name,
289
+ and find the specs for the GPU, before launching the instance.
275
290
 
276
291
  Returns:
277
292
  instance_id: The instance ID.
278
293
  """
279
294
  name = f'{cluster_name}-{node_type}'
280
- gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
281
- gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
282
- cloud_type = instance_type.split('_')[2]
283
295
 
284
- gpu_specs = runpod.runpod.get_gpu(gpu_type)
285
296
  # TODO(zhwu): keep this aligned with setups in
286
297
  # `provision.kubernetes.instance.py`
287
298
  setup_cmd = (
@@ -329,12 +340,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
329
340
  params = {
330
341
  'name': name,
331
342
  'image_name': image_name_formatted,
332
- 'gpu_type_id': gpu_type,
333
- 'cloud_type': cloud_type,
334
343
  'container_disk_in_gb': disk_size,
335
- 'min_vcpu_count': 4 * gpu_quantity,
336
- 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
337
- 'gpu_count': gpu_quantity,
338
344
  'country_code': region,
339
345
  'data_center_id': zone,
340
346
  'ports': ports_str,
@@ -343,12 +349,39 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
343
349
  'template_id': template_id,
344
350
  }
345
351
 
352
+ # Optional network volume mount.
353
+ if volume_mount_path is not None:
354
+ params['volume_mount_path'] = volume_mount_path
355
+ if network_volume_id is not None:
356
+ params['network_volume_id'] = network_volume_id
357
+
358
+ # GPU instance types start with f'{gpu_count}x',
359
+ # CPU instance types start with 'cpu'.
360
+ is_cpu_instance = instance_type.startswith('cpu')
361
+ if is_cpu_instance:
362
+ # RunPod CPU instances can be uniquely identified by the instance_id.
363
+ params.update({
364
+ 'instance_id': instance_type,
365
+ })
366
+ else:
367
+ gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
368
+ gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
369
+ cloud_type = instance_type.split('_')[2]
370
+ gpu_specs = runpod.runpod.get_gpu(gpu_type)
371
+ params.update({
372
+ 'gpu_type_id': gpu_type,
373
+ 'cloud_type': cloud_type,
374
+ 'min_vcpu_count': 4 * gpu_quantity,
375
+ 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
376
+ 'gpu_count': gpu_quantity,
377
+ })
378
+
346
379
  if preemptible is None or not preemptible:
347
380
  new_instance = runpod.runpod.create_pod(**params)
348
381
  else:
349
382
  new_instance = runpod_commands.create_spot_pod(
350
383
  bid_per_gpu=bid_per_gpu,
351
- **params,
384
+ **params, # type: ignore[arg-type]
352
385
  )
353
386
 
354
387
  return new_instance['id']