skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -15,15 +15,21 @@ import colorama
15
15
  import sky
16
16
  from sky import clouds
17
17
  from sky import exceptions
18
+ from sky import global_user_state
19
+ from sky import logs
18
20
  from sky import provision
21
+ from sky import resources as resources_lib
19
22
  from sky import sky_logging
23
+ from sky import skypilot_config
20
24
  from sky.adaptors import aws
21
25
  from sky.backends import backend_utils
26
+ from sky.jobs.server import utils as server_jobs_utils
22
27
  from sky.provision import common as provision_common
23
28
  from sky.provision import instance_setup
24
29
  from sky.provision import logging as provision_logging
25
30
  from sky.provision import metadata_utils
26
31
  from sky.skylet import constants
32
+ from sky.utils import common
27
33
  from sky.utils import common_utils
28
34
  from sky.utils import message_utils
29
35
  from sky.utils import resources_utils
@@ -64,6 +70,7 @@ def _bulk_provision(
64
70
 
65
71
  provision_record = provision.run_instances(provider_name,
66
72
  region_name,
73
+ str(cluster_name),
67
74
  cluster_name.name_on_cloud,
68
75
  config=config)
69
76
 
@@ -71,7 +78,8 @@ def _bulk_provision(
71
78
  logger.debug(f'\nWaiting for instances of {cluster_name!r} to be ready...')
72
79
  rich_utils.force_update_status(
73
80
  ux_utils.spinner_message('Launching - Checking instance status',
74
- str(provision_logging.config.log_path)))
81
+ str(provision_logging.config.log_path),
82
+ cluster_name=str(cluster_name)))
75
83
  # AWS would take a very short time (<<1s) updating the state of the
76
84
  # instance.
77
85
  time.sleep(1)
@@ -95,6 +103,12 @@ def _bulk_provision(
95
103
  f'\nProvisioning {cluster_name!r} took {time.time() - start:.2f} '
96
104
  f'seconds.')
97
105
 
106
+ # Add cluster event for provisioning completion.
107
+ global_user_state.add_cluster_event(
108
+ str(cluster_name), status_lib.ClusterStatus.INIT,
109
+ f'Instances launched on {cloud.display_name()} in {region}',
110
+ global_user_state.ClusterEventType.STATUS_CHANGE)
111
+
98
112
  return provision_record
99
113
 
100
114
 
@@ -117,7 +131,7 @@ def bulk_provision(
117
131
  Cloud specific exceptions: If the provisioning process failed, cloud-
118
132
  specific exceptions will be raised by the cloud APIs.
119
133
  """
120
- original_config = common_utils.read_yaml(cluster_yaml)
134
+ original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
121
135
  head_node_type = original_config['head_node_type']
122
136
  bootstrap_config = provision_common.ProvisionConfig(
123
137
  provider_config=original_config['provider'],
@@ -155,7 +169,7 @@ def bulk_provision(
155
169
  # This error is a user error instead of a provisioning failure.
156
170
  # And there is no possibility to fix it by teardown.
157
171
  raise
158
- except Exception: # pylint: disable=broad-except
172
+ except Exception as exc: # pylint: disable=broad-except
159
173
  zone_str = 'all zones'
160
174
  if zones:
161
175
  zone_str = ','.join(zone.name for zone in zones)
@@ -177,14 +191,18 @@ def bulk_provision(
177
191
  provider_config=original_config['provider'])
178
192
  break
179
193
  except NotImplementedError as e:
180
- verb = 'terminate' if terminate else 'stop'
194
+ assert not terminate, (
195
+ 'Terminating must be supported by all clouds')
196
+ exc_msg = common_utils.format_exception(exc).replace(
197
+ '\n', ' ')
181
198
  # If the underlying cloud does not support stopping
182
199
  # instances, we should stop failover as well.
183
200
  raise provision_common.StopFailoverError(
184
- 'During provisioner\'s failover, '
185
- f'{terminate_str.lower()} {cluster_name!r} failed. '
186
- f'We cannot {verb} the resources launched, as it is '
187
- f'not supported by {cloud}. Please try launching the '
201
+ f'Provisioning cluster {cluster_name.display_name} '
202
+ f'failed: {exc_msg}. Failover is stopped for safety '
203
+ 'because the cluster was previously in UP state but '
204
+ f'{cloud} does not support stopping instances to '
205
+ 'preserve the cluster state. Please try launching the '
188
206
  'cluster again, or terminate it with: '
189
207
  f'sky down {cluster_name.display_name}') from e
190
208
  except Exception as e: # pylint: disable=broad-except
@@ -228,9 +246,9 @@ def _ssh_probe_command(ip: str,
228
246
  ssh_port: int,
229
247
  ssh_user: str,
230
248
  ssh_private_key: str,
249
+ ssh_probe_timeout: int,
231
250
  ssh_proxy_command: Optional[str] = None) -> List[str]:
232
- # NOTE: Ray uses 'uptime' command and 10s timeout, we use the same
233
- # setting here.
251
+ # NOTE: Ray uses 'uptime' command, we use the same setting here.
234
252
  command = [
235
253
  'ssh',
236
254
  '-T',
@@ -244,7 +262,7 @@ def _ssh_probe_command(ip: str,
244
262
  '-o',
245
263
  'PasswordAuthentication=no',
246
264
  '-o',
247
- 'ConnectTimeout=10s',
265
+ f'ConnectTimeout={ssh_probe_timeout}s',
248
266
  '-o',
249
267
  f'UserKnownHostsFile={os.devnull}',
250
268
  '-o',
@@ -277,6 +295,7 @@ def _wait_ssh_connection_direct(ip: str,
277
295
  ssh_port: int,
278
296
  ssh_user: str,
279
297
  ssh_private_key: str,
298
+ ssh_probe_timeout: int,
280
299
  ssh_control_name: Optional[str] = None,
281
300
  ssh_proxy_command: Optional[str] = None,
282
301
  **kwargs) -> Tuple[bool, str]:
@@ -305,6 +324,7 @@ def _wait_ssh_connection_direct(ip: str,
305
324
  if success:
306
325
  return _wait_ssh_connection_indirect(ip, ssh_port, ssh_user,
307
326
  ssh_private_key,
327
+ ssh_probe_timeout,
308
328
  ssh_control_name,
309
329
  ssh_proxy_command)
310
330
  except socket.timeout: # this is the most expected exception
@@ -312,7 +332,7 @@ def _wait_ssh_connection_direct(ip: str,
312
332
  except Exception as e: # pylint: disable=broad-except
313
333
  stderr = f'Error: {common_utils.format_exception(e)}'
314
334
  command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
315
- ssh_proxy_command)
335
+ ssh_probe_timeout, ssh_proxy_command)
316
336
  logger.debug(f'Waiting for SSH to {ip}. Try: '
317
337
  f'{_shlex_join(command)}. '
318
338
  f'{stderr}')
@@ -323,6 +343,7 @@ def _wait_ssh_connection_indirect(ip: str,
323
343
  ssh_port: int,
324
344
  ssh_user: str,
325
345
  ssh_private_key: str,
346
+ ssh_probe_timeout: int,
326
347
  ssh_control_name: Optional[str] = None,
327
348
  ssh_proxy_command: Optional[str] = None,
328
349
  **kwargs) -> Tuple[bool, str]:
@@ -333,14 +354,14 @@ def _wait_ssh_connection_indirect(ip: str,
333
354
  """
334
355
  del ssh_control_name, kwargs # unused
335
356
  command = _ssh_probe_command(ip, ssh_port, ssh_user, ssh_private_key,
336
- ssh_proxy_command)
357
+ ssh_probe_timeout, ssh_proxy_command)
337
358
  message = f'Waiting for SSH using command: {_shlex_join(command)}'
338
359
  logger.debug(message)
339
360
  try:
340
361
  proc = subprocess.run(command,
341
362
  shell=False,
342
363
  check=False,
343
- timeout=10,
364
+ timeout=ssh_probe_timeout,
344
365
  stdout=subprocess.DEVNULL,
345
366
  stderr=subprocess.PIPE)
346
367
  if proc.returncode != 0:
@@ -383,8 +404,13 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
383
404
  def _retry_ssh_thread(ip_ssh_port: Tuple[str, int]):
384
405
  ip, ssh_port = ip_ssh_port
385
406
  success = False
407
+ ssh_probe_timeout = skypilot_config.get_nested(
408
+ ('provision', 'ssh_timeout'), 10)
386
409
  while not success:
387
- success, stderr = waiter(ip, ssh_port, **ssh_credentials)
410
+ success, stderr = waiter(ip,
411
+ ssh_port,
412
+ **ssh_credentials,
413
+ ssh_probe_timeout=ssh_probe_timeout)
388
414
  if not success and time.time() - start > timeout:
389
415
  with ux_utils.print_exception_no_traceback():
390
416
  raise RuntimeError(
@@ -403,16 +429,27 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
403
429
 
404
430
 
405
431
  def _post_provision_setup(
406
- cloud_name: str, cluster_name: resources_utils.ClusterName,
407
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
432
+ launched_resources: resources_lib.Resources,
433
+ cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
434
+ provision_record: provision_common.ProvisionRecord,
408
435
  custom_resource: Optional[str]) -> provision_common.ClusterInfo:
409
- config_from_yaml = common_utils.read_yaml(cluster_yaml)
436
+ config_from_yaml = global_user_state.get_cluster_yaml_dict(
437
+ handle_cluster_yaml)
410
438
  provider_config = config_from_yaml.get('provider')
439
+ cloud_name = repr(launched_resources.cloud)
411
440
  cluster_info = provision.get_cluster_info(cloud_name,
412
441
  provision_record.region,
413
442
  cluster_name.name_on_cloud,
414
443
  provider_config=provider_config)
415
444
 
445
+ # Update cluster info in handle so cluster instance ids are set. This
446
+ # allows us to expose provision logs to debug nodes that failed during post
447
+ # provision setup.
448
+ handle = global_user_state.get_handle_from_cluster_name(
449
+ cluster_name.display_name)
450
+ handle.cached_cluster_info = cluster_info
451
+ global_user_state.update_cluster_handle(cluster_name.display_name, handle)
452
+
416
453
  if cluster_info.num_instances > 1:
417
454
  # Only worker nodes have logs in the per-instance log directory. Head
418
455
  # node's log will be redirected to the main log file.
@@ -437,23 +474,24 @@ def _post_provision_setup(
437
474
  # TODO(suquark): Move wheel build here in future PRs.
438
475
  # We don't set docker_user here, as we are configuring the VM itself.
439
476
  ssh_credentials = backend_utils.ssh_credential_from_yaml(
440
- cluster_yaml, ssh_user=cluster_info.ssh_user)
477
+ handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
441
478
  docker_config = config_from_yaml.get('docker', {})
442
479
 
443
480
  with rich_utils.safe_status(
444
- ux_utils.spinner_message(
445
- 'Launching - Waiting for SSH access',
446
- provision_logging.config.log_path)) as status:
481
+ ux_utils.spinner_message('Launching - Waiting for SSH access',
482
+ provision_logging.config.log_path,
483
+ cluster_name=str(cluster_name))) as status:
447
484
  # If on Kubernetes, skip SSH check since the pods are guaranteed to be
448
485
  # ready by the provisioner, and we use kubectl instead of SSH to run the
449
486
  # commands and rsync on the pods. SSH will still be ready after a while
450
487
  # for the users to SSH into the pod.
451
- if cloud_name.lower() != 'kubernetes':
488
+ is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
489
+ if not is_k8s_cloud:
452
490
  logger.debug(
453
491
  f'\nWaiting for SSH to be available for {cluster_name!r} ...')
454
492
  wait_for_ssh(cluster_info, ssh_credentials)
455
493
  logger.debug(f'SSH Connection ready for {cluster_name!r}')
456
- vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
494
+ vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
457
495
  plural = '' if len(cluster_info.instances) == 1 else 's'
458
496
  verb = 'is' if len(cluster_info.instances) == 1 else 'are'
459
497
  indent_str = (ux_utils.INDENT_SYMBOL
@@ -472,7 +510,8 @@ def _post_provision_setup(
472
510
  status.update(
473
511
  ux_utils.spinner_message(
474
512
  'Launching - Initializing docker container',
475
- provision_logging.config.log_path))
513
+ provision_logging.config.log_path,
514
+ cluster_name=str(cluster_name)))
476
515
  docker_user = instance_setup.initialize_docker(
477
516
  cluster_name.name_on_cloud,
478
517
  docker_config=docker_config,
@@ -489,6 +528,25 @@ def _post_provision_setup(
489
528
  logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}'
490
529
  f'Docker container is up.{colorama.Style.RESET_ALL}')
491
530
 
531
+ # Check version compatibility for jobs controller clusters
532
+ if cluster_name.display_name.startswith(common.JOB_CONTROLLER_PREFIX):
533
+ # TODO(zeping): remove this in v0.12.0
534
+ # This only happens when upgrading from <0.9.3 to >0.10.0
535
+ # After 0.10.0 no incompatibility issue
536
+ # See https://github.com/skypilot-org/skypilot/pull/6096
537
+ # For more details
538
+ status.update(
539
+ ux_utils.spinner_message(
540
+ 'Checking controller version compatibility'))
541
+
542
+ try:
543
+ server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
544
+ except exceptions.ClusterNotUpError:
545
+ # Controller is not up yet during initial provisioning, that
546
+ # also means no non-terminal jobs, so no incompatibility in
547
+ # this case.
548
+ pass
549
+
492
550
  # We mount the metadata with sky wheel for speedup.
493
551
  # NOTE: currently we mount all credentials for all nodes, because
494
552
  # (1) jobs controllers need permission to launch/down nodes of
@@ -502,7 +560,8 @@ def _post_provision_setup(
502
560
 
503
561
  runtime_preparation_str = (ux_utils.spinner_message(
504
562
  'Preparing SkyPilot runtime ({step}/3 - {step_name})',
505
- provision_logging.config.log_path))
563
+ provision_logging.config.log_path,
564
+ cluster_name=str(cluster_name)))
506
565
  status.update(
507
566
  runtime_preparation_str.format(step=1, step_name='initializing'))
508
567
  instance_setup.internal_file_mounts(cluster_name.name_on_cloud,
@@ -636,19 +695,32 @@ def _post_provision_setup(
636
695
  logger.debug('Ray cluster is ready. Skip starting ray cluster on '
637
696
  'worker nodes.')
638
697
 
639
- instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
640
- cluster_info, ssh_credentials)
698
+ logging_agent = logs.get_logging_agent()
699
+ if logging_agent:
700
+ status.update(
701
+ ux_utils.spinner_message('Setting up logging agent',
702
+ provision_logging.config.log_path,
703
+ cluster_name=str(cluster_name)))
704
+ instance_setup.setup_logging_on_cluster(logging_agent, cluster_name,
705
+ cluster_info,
706
+ ssh_credentials)
707
+
708
+ instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
709
+ ssh_credentials,
710
+ launched_resources)
641
711
 
642
712
  logger.info(
643
713
  ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
644
- provision_logging.config.log_path))
714
+ provision_logging.config.log_path,
715
+ cluster_name=str(cluster_name)))
645
716
  return cluster_info
646
717
 
647
718
 
648
719
  @timeline.event
649
720
  def post_provision_runtime_setup(
650
- cloud_name: str, cluster_name: resources_utils.ClusterName,
651
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
721
+ launched_resources: resources_lib.Resources,
722
+ cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
723
+ provision_record: provision_common.ProvisionRecord,
652
724
  custom_resource: Optional[str],
653
725
  log_dir: str) -> provision_common.ClusterInfo:
654
726
  """Run internal setup commands after provisioning and before user setup.
@@ -659,6 +731,7 @@ def post_provision_runtime_setup(
659
731
  and other necessary files to the VM.
660
732
  3. Run setup commands to install dependencies.
661
733
  4. Start ray cluster and skylet.
734
+ 5. (Optional) Setup logging agent.
662
735
 
663
736
  Raises:
664
737
  RuntimeError: If the setup process encounters any error.
@@ -666,11 +739,12 @@ def post_provision_runtime_setup(
666
739
  with provision_logging.setup_provision_logging(log_dir):
667
740
  try:
668
741
  logger.debug(_TITLE.format('System Setup After Provision'))
669
- return _post_provision_setup(cloud_name,
670
- cluster_name,
671
- cluster_yaml=cluster_yaml,
672
- provision_record=provision_record,
673
- custom_resource=custom_resource)
742
+ return _post_provision_setup(
743
+ launched_resources,
744
+ cluster_name,
745
+ handle_cluster_yaml=handle_cluster_yaml,
746
+ provision_record=provision_record,
747
+ custom_resource=custom_resource)
674
748
  except Exception: # pylint: disable=broad-except
675
749
  logger.error(
676
750
  ux_utils.error_message(
@@ -9,3 +9,8 @@ from sky.provision.runpod.instance import run_instances
9
9
  from sky.provision.runpod.instance import stop_instances
10
10
  from sky.provision.runpod.instance import terminate_instances
11
11
  from sky.provision.runpod.instance import wait_instances
12
+ from sky.provision.runpod.volume import apply_volume
13
+ from sky.provision.runpod.volume import delete_volume
14
+ from sky.provision.runpod.volume import get_all_volumes_usedby
15
+ from sky.provision.runpod.volume import get_volume_usedby
16
+ from sky.provision.runpod.volume import map_all_volumes_usedby
@@ -1,6 +1,6 @@
1
1
  """RunPod instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
44
44
  return head_instance_id
45
45
 
46
46
 
47
- def run_instances(region: str, cluster_name_on_cloud: str,
47
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
48
48
  config: common.ProvisionConfig) -> common.ProvisionRecord:
49
49
  """Runs instances for the given cluster."""
50
-
50
+ del cluster_name # unused
51
51
  pending_status = ['CREATED', 'RESTARTING']
52
52
 
53
53
  while True:
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
80
80
  created_instance_ids=[])
81
81
 
82
82
  created_instance_ids = []
83
+ volume_mounts = config.node_config.get('VolumeMounts', [])
84
+ network_volume_id = None
85
+ volume_mount_path = None
86
+ if volume_mounts:
87
+ if len(volume_mounts) > 1:
88
+ logger.warning(
89
+ f'RunPod only supports one network volume mount, '
90
+ f'but {len(volume_mounts)} are specified. Only the first one '
91
+ f'will be used.')
92
+ volume_mount = volume_mounts[0]
93
+ network_volume_id = volume_mount.get('VolumeIdOnCloud')
94
+ volume_mount_path = volume_mount.get('MountPath')
95
+ if network_volume_id is None or volume_mount_path is None:
96
+ raise RuntimeError(
97
+ 'Network volume ID and mount path must be specified.')
83
98
  for _ in range(to_start_count):
84
99
  node_type = 'head' if head_instance_id is None else 'worker'
85
100
  try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
97
112
  bid_per_gpu=config.node_config['BidPerGPU'],
98
113
  docker_login_config=config.provider_config.get(
99
114
  'docker_login_config'),
115
+ network_volume_id=network_volume_id,
116
+ volume_mount_path=volume_mount_path,
100
117
  )
101
118
  except Exception as e: # pylint: disable=broad-except
102
119
  logger.warning(f'run_instances error: {e}')
@@ -201,11 +218,14 @@ def get_cluster_info(
201
218
 
202
219
 
203
220
  def query_instances(
221
+ cluster_name: str,
204
222
  cluster_name_on_cloud: str,
205
223
  provider_config: Optional[Dict[str, Any]] = None,
206
224
  non_terminated_only: bool = True,
207
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
225
+ retry_if_missing: bool = False,
226
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
208
227
  """See sky/provision/__init__.py"""
228
+ del cluster_name, retry_if_missing # unused
209
229
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
210
230
  instances = _filter_instances(cluster_name_on_cloud, None)
211
231
 
@@ -215,12 +235,13 @@ def query_instances(
215
235
  'PAUSED': status_lib.ClusterStatus.INIT,
216
236
  'RUNNING': status_lib.ClusterStatus.UP,
217
237
  }
218
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
238
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
239
+ Optional[str]]] = {}
219
240
  for inst_id, inst in instances.items():
220
241
  status = status_map[inst['status']]
221
242
  if non_terminated_only and status is None:
222
243
  continue
223
- statuses[inst_id] = status
244
+ statuses[inst_id] = (status, None)
224
245
  return statuses
225
246
 
226
247
 
@@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple
7
7
  from sky import sky_logging
8
8
  from sky.adaptors import runpod
9
9
  from sky.provision import docker_utils
10
- import sky.provision.runpod.api.commands as runpod_commands
10
+ from sky.provision.runpod.api import commands as runpod_commands
11
11
  from sky.skylet import constants
12
12
  from sky.utils import common_utils
13
13
 
@@ -263,25 +263,36 @@ def _create_template_for_docker_login(
263
263
  return login_config.format_image(image_name), create_template_resp['id']
264
264
 
265
265
 
266
- def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
267
- zone: str, disk_size: int, image_name: str,
268
- ports: Optional[List[int]], public_key: str,
269
- preemptible: Optional[bool], bid_per_gpu: float,
270
- docker_login_config: Optional[Dict[str, str]]) -> str:
266
+ def launch(
267
+ cluster_name: str,
268
+ node_type: str,
269
+ instance_type: str,
270
+ region: str,
271
+ zone: str,
272
+ disk_size: int,
273
+ image_name: str,
274
+ ports: Optional[List[int]],
275
+ public_key: str,
276
+ preemptible: Optional[bool],
277
+ bid_per_gpu: float,
278
+ docker_login_config: Optional[Dict[str, str]],
279
+ *,
280
+ network_volume_id: Optional[str] = None,
281
+ volume_mount_path: Optional[str] = None,
282
+ ) -> str:
271
283
  """Launches an instance with the given parameters.
272
284
 
273
- Converts the instance_type to the RunPod GPU name, finds the specs for the
274
- GPU, and launches the instance.
285
+ For CPU instances, we directly use the instance_type for launching the
286
+ instance.
287
+
288
+ For GPU instances, we convert the instance_type to the RunPod GPU name,
289
+ and find the specs for the GPU, before launching the instance.
275
290
 
276
291
  Returns:
277
292
  instance_id: The instance ID.
278
293
  """
279
294
  name = f'{cluster_name}-{node_type}'
280
- gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
281
- gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
282
- cloud_type = instance_type.split('_')[2]
283
295
 
284
- gpu_specs = runpod.runpod.get_gpu(gpu_type)
285
296
  # TODO(zhwu): keep this align with setups in
286
297
  # `provision.kubernetes.instance.py`
287
298
  setup_cmd = (
@@ -329,12 +340,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
329
340
  params = {
330
341
  'name': name,
331
342
  'image_name': image_name_formatted,
332
- 'gpu_type_id': gpu_type,
333
- 'cloud_type': cloud_type,
334
343
  'container_disk_in_gb': disk_size,
335
- 'min_vcpu_count': 4 * gpu_quantity,
336
- 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
337
- 'gpu_count': gpu_quantity,
338
344
  'country_code': region,
339
345
  'data_center_id': zone,
340
346
  'ports': ports_str,
@@ -343,12 +349,39 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
343
349
  'template_id': template_id,
344
350
  }
345
351
 
352
+ # Optional network volume mount.
353
+ if volume_mount_path is not None:
354
+ params['volume_mount_path'] = volume_mount_path
355
+ if network_volume_id is not None:
356
+ params['network_volume_id'] = network_volume_id
357
+
358
+ # GPU instance types start with f'{gpu_count}x',
359
+ # CPU instance types start with 'cpu'.
360
+ is_cpu_instance = instance_type.startswith('cpu')
361
+ if is_cpu_instance:
362
+ # RunPod CPU instances can be uniquely identified by the instance_id.
363
+ params.update({
364
+ 'instance_id': instance_type,
365
+ })
366
+ else:
367
+ gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
368
+ gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
369
+ cloud_type = instance_type.split('_')[2]
370
+ gpu_specs = runpod.runpod.get_gpu(gpu_type)
371
+ params.update({
372
+ 'gpu_type_id': gpu_type,
373
+ 'cloud_type': cloud_type,
374
+ 'min_vcpu_count': 4 * gpu_quantity,
375
+ 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
376
+ 'gpu_count': gpu_quantity,
377
+ })
378
+
346
379
  if preemptible is None or not preemptible:
347
380
  new_instance = runpod.runpod.create_pod(**params)
348
381
  else:
349
382
  new_instance = runpod_commands.create_spot_pod(
350
383
  bid_per_gpu=bid_per_gpu,
351
- **params,
384
+ **params, # type: ignore[arg-type]
352
385
  )
353
386
 
354
387
  return new_instance['id']