skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,398 @@
1
+ """Prime Intellect library wrapper for SkyPilot."""
2
+
3
+ import json
4
+ import os
5
+ import shlex
6
+ import time
7
+ from typing import Any, Dict, List, Optional, Tuple, Union
8
+ import uuid
9
+
10
+ import requests
11
+
12
+ from sky.catalog import common as catalog_common
13
+ from sky.utils import common_utils
14
+
15
+ _df = None
16
+ _lookup_dict = None
17
+
18
+ # Base URL for Prime Intellect API (used as default if not configured).
19
+ DEFAULT_BASE_URL = 'https://api.primeintellect.ai'
20
+ CREDENTIALS_PATH = '~/.prime/config.json'
21
+ INITIAL_BACKOFF_SECONDS = 10
22
+ MAX_BACKOFF_FACTOR = 10
23
+ MAX_ATTEMPTS = 6
24
+
25
+
26
class PrimeintellectAPIError(Exception):
    """Base exception for Prime Intellect API errors.

    Carries the HTTP status code and the decoded response body (when
    available) alongside the human-readable message.
    """

    def __init__(self,
                 message: str,
                 status_code: Optional[int] = None,
                 response_data: Optional[Dict[str, Any]] = None):
        super().__init__(message)
        # HTTP status of the failed request, if known.
        self.status_code = status_code
        # Decoded JSON error body, if the response had one.
        self.response_data = response_data
36
+
37
+
38
class PrimeintellectResourcesUnavailableError(PrimeintellectAPIError):
    """Raised when resources are unavailable on Prime Intellect."""
41
+
42
+
43
+ def _parse_api_error(response: Any) -> Tuple[str, bool]:
44
+ """Parse API error response to extract meaningful error messages.
45
+
46
+ Returns:
47
+ Tuple[str, bool]:
48
+ - str: A human-readable error message parsed from the API response.
49
+ - bool: True if the error indicates resource unavailability (e.g.,
50
+ capacity issues or quota/limit exceeded), otherwise False.
51
+ """
52
+ try:
53
+ if hasattr(response, 'json'):
54
+ error_data = response.json()
55
+ else:
56
+ error_data = response
57
+
58
+ if isinstance(error_data, dict):
59
+ # Try to extract error message from common error response fields
60
+ error_message = error_data.get('message', '')
61
+ if not error_message:
62
+ error_message = error_data.get('error', '')
63
+ if not error_message:
64
+ error_message = error_data.get('detail', '')
65
+
66
+ # Check if it's a resource unavailability error
67
+ if any(keyword in error_message.lower() for keyword in [
68
+ 'no capacity', 'capacity', 'unavailable', 'out of stock',
69
+ 'insufficient', 'not available', 'quota exceeded',
70
+ 'limit exceeded'
71
+ ]):
72
+ return error_message, True
73
+
74
+ return error_message, False
75
+
76
+ return str(error_data), False
77
+ except Exception: # pylint: disable=broad-except
78
+ return f'HTTP {response.status_code} {response.reason}', False
79
+
80
+
81
def _try_request_with_backoff(
        method: str,
        url: str,
        headers: Dict[str, str],
        data: Optional[Union[str, Dict[str, Any]]] = None) -> Dict[str, Any]:
    """Issue an HTTP request to the Prime Intellect API, retrying on 429.

    Args:
        method: One of 'get', 'post', 'put', 'patch', 'delete'. For 'get'
            the ``data`` is sent as query params; for body-carrying methods
            it is sent as the JSON body; 'delete' ignores it.
        url: Fully-qualified endpoint URL.
        headers: HTTP headers (authorization, content type).
        data: Optional query params or JSON payload.

    Returns:
        The decoded JSON response on success.

    Raises:
        ValueError: If ``method`` is not supported.
        PrimeintellectResourcesUnavailableError: If the API reports a
            capacity/quota problem.
        PrimeintellectAPIError: For any other non-OK response.
    """
    backoff = common_utils.Backoff(initial_backoff=INITIAL_BACKOFF_SECONDS,
                                   max_backoff_factor=MAX_BACKOFF_FACTOR)
    timeout = 30  # Seconds; applies to every attempt.
    for i in range(MAX_ATTEMPTS):
        if method == 'get':
            response = requests.get(url,
                                    headers=headers,
                                    params=data,
                                    timeout=timeout)
        elif method in ('post', 'put', 'patch'):
            # These three differ only in the requests function used; the
            # payload is always sent as the JSON body.
            response = getattr(requests, method)(url,
                                                 headers=headers,
                                                 json=data,
                                                 timeout=timeout)
        elif method == 'delete':
            response = requests.delete(url, headers=headers, timeout=timeout)
        else:
            raise ValueError(f'Unsupported requests method: {method}')
        # If rate limited, wait and try again (unless out of attempts).
        if response.status_code == 429 and i != MAX_ATTEMPTS - 1:
            time.sleep(backoff.current_backoff())
            continue
        if response.ok:
            return response.json()
        # Parse the error response for meaningful messages.
        err, is_resource_unavailable = _parse_api_error(response)
        if not err:
            err = (f'API request failed: {method} {url}: '
                   f'{response.status_code} {response.reason}')
        else:
            err = f'API request failed: {err}'
        # Decode the error body defensively: a non-JSON error body would
        # otherwise raise while constructing the exception and mask it.
        try:
            response_data: Optional[Dict[str, Any]] = response.json()
        except ValueError:
            response_data = None
        error_cls = (PrimeintellectResourcesUnavailableError
                     if is_resource_unavailable else PrimeintellectAPIError)
        raise error_cls(err,
                        status_code=response.status_code,
                        response_data=response_data)
    return {}
145
+
146
+
147
def get_upstream_cloud_id(instance_type: str) -> Optional[str]:
    """Return the catalog UpstreamCloudId for ``instance_type``, if any.

    The catalog CSV is read lazily on first call and cached at module
    level, so repeated lookups do not re-read the file.
    """
    global _df, _lookup_dict
    if _df is None:
        catalog = catalog_common.read_catalog('primeintellect/vms.csv')
        _df = catalog
        _lookup_dict = (
            catalog.set_index('InstanceType')['UpstreamCloudId'].to_dict())
    return _lookup_dict.get(instance_type)
154
+
155
+
156
class PrimeIntellectAPIClient:
    """Client for interacting with Prime Intellect API."""

    def __init__(self) -> None:
        # Credentials file is expected to be produced by the Prime
        # Intellect CLI login flow.
        self.credentials = os.path.expanduser(CREDENTIALS_PATH)
        assert os.path.exists(self.credentials), 'Credentials not found'
        with open(self.credentials, 'r', encoding='utf-8') as f:
            self._credentials = json.load(f)
        self.api_key = self._credentials.get('api_key')
        self.team_id = self._credentials.get('team_id')
        self.base_url = self._credentials.get('base_url', DEFAULT_BASE_URL)
        self.headers = {
            'Authorization': f'Bearer {self.api_key}',
            'Content-Type': 'application/json'
        }

    def list_instances(self, **search_kwargs) -> List[Dict[str, Any]]:
        """List pods visible to this account, filtered by ``search_kwargs``."""
        payload = _try_request_with_backoff('get',
                                            f'{self.base_url}/api/v1/pods',
                                            headers=self.headers,
                                            data=search_kwargs)
        return payload['data']

    def get_instance_details(self, instance_id: str) -> Dict[str, Any]:
        """Fetch the full record for a single pod."""
        return _try_request_with_backoff(
            'get',
            f'{self.base_url}/api/v1/pods/{instance_id}',
            headers=self.headers)

    def launch(self,
               name: str,
               instance_type: str,
               region: str,
               availability_zone: str,
               disk_size: int,
               vcpus: int = 0,
               memory: int = 0) -> Dict[str, Any]:
        """Create a pod/instance via Prime Intellect API.

        ``instance_type`` is a catalog string of the form
        ``<provider>__<accelerator>__<vcpus>__<memory>[_SPOT]``:

        - ``<provider>``: upstream provider tag (e.g., "primecompute").
        - ``<accelerator>``: ``<N>x<GPU_MODEL>`` for GPU nodes (e.g.,
          "8xH100_80GB"), or the literal "CPU_NODE" for CPU-only nodes.
        - ``<vcpus>`` / ``<memory>``: integer strings (vCPUs, memory GB).
        - The optional ``_SPOT`` suffix is ignored here; spot/pricing
          behavior is handled elsewhere.

        Only the provider and accelerator components are parsed here; the
        vCPU/memory values come from the ``vcpus``/``memory`` arguments,
        and the full string is used to map to the catalog's
        UpstreamCloudId. For CPU-only pods, gpuType='CPU_NODE' with
        gpuCount=1 is the platform's sentinel for "no GPU"; the backend
        does not treat it as a physical GPU.

        Args:
            name: User-visible name of the pod.
            instance_type: Catalog instance type string (format above).
            region: Country/region code used by Prime Intellect.
            availability_zone: Data center ID (zone) within the region.
            disk_size: Boot disk size in GB.
            vcpus: Optional explicit vCPU override; sent only if > 0.
            memory: Optional explicit memory override in GB; sent only if
                > 0.

        Returns:
            The API response JSON as a dict.
        """
        cloud_id = get_upstream_cloud_id(instance_type)
        assert cloud_id, 'cloudId cannot be None'
        assert availability_zone, 'availability_zone cannot be None'

        # Only the first two components (provider, accelerator) are needed
        # to build the request payload; see the docstring above.
        provider, accel_spec, _, _ = instance_type.split('__', 3)
        if 'CPU_NODE' in accel_spec:
            gpu_type, gpu_count = 'CPU_NODE', 1
        else:
            count_and_model = accel_spec.split('x', 1)
            gpu_count = int(count_and_model[0])
            gpu_type = count_and_model[1]

        payload: Dict[str, Any] = {
            'pod': {
                'name': name,
                'cloudId': cloud_id,
                'socket': 'PCIe',
                'gpuType': gpu_type,
                'gpuCount': int(gpu_count),
                'diskSize': disk_size,
                # Prime Intellect API historically required maxPrice.
                # Set to 0 to indicate on-demand/non-spot pricing.
                'maxPrice': 0,
            },
            'provider': {
                'type': provider,
            }
        }

        pod_spec = payload['pod']
        if vcpus > 0:
            pod_spec['vcpus'] = vcpus
        if memory > 0:
            pod_spec['memory'] = memory

        if region != 'UNSPECIFIED':
            pod_spec['country'] = region
        if availability_zone != 'UNSPECIFIED':
            pod_spec['dataCenterId'] = availability_zone

        if self.team_id is not None and self.team_id != '':
            payload['team'] = {'teamId': self.team_id}

        return _try_request_with_backoff(
            'post',
            f'{self.base_url}/api/v1/pods',
            headers=self.headers,
            data=payload,
        )

    def remove(self, instance_id: str) -> Dict[str, Any]:
        """Terminate (delete) a pod by its id."""
        return _try_request_with_backoff(
            'delete',
            f'{self.base_url}/api/v1/pods/{instance_id}',
            headers=self.headers,
        )

    def list_ssh_keys(self) -> List[Dict[str, Any]]:
        """Return all SSH keys registered on the account."""
        payload = _try_request_with_backoff('get',
                                            f'{self.base_url}/api/v1/ssh_keys',
                                            headers=self.headers)
        return payload['data']

    def get_or_add_ssh_key(self, ssh_pub_key: str = '') -> Dict[str, str]:
        """Add ssh key if not already added."""
        # Compare only the key type and key material (first two fields);
        # trailing comments may differ between copies of the same key.
        wanted = ssh_pub_key.strip().split()[:2]
        for key in self.list_ssh_keys():
            if key['publicKey'].strip().split()[:2] == wanted:
                return {'name': key['name'], 'ssh_key': ssh_pub_key}

        # Not registered yet: upload it under a fresh recognizable name.
        # (uuid4().hex is the dashless hex form of str(uuid4()).)
        ssh_key_name = 'skypilot-' + uuid.uuid4().hex[:8]
        _try_request_with_backoff(
            'post',
            f'{self.base_url}/api/v1/ssh_keys',
            headers=self.headers,
            data={
                'name': ssh_key_name,
                'publicKey': ssh_pub_key
            },
        )
        return {'name': ssh_key_name, 'ssh_key': ssh_pub_key}
326
+
327
+
328
+ def parse_ssh_connection(ssh_connection: Any) -> Tuple[Optional[str], int]:
329
+ """Parse and extract SSH username and port from a connection field.
330
+
331
+ The provider may return the SSH connection in multiple shapes. This helper
332
+ robustly extracts the SSH username and port while tolerating extra flags or
333
+ various tokenizations.
334
+
335
+ Accepted formats (examples):
336
+ - String with port flag:
337
+ "ubuntu@1.2.3.4 -p 2222 [-o <flag> ...]"
338
+ - String without explicit port (defaults to 22):
339
+ "ubuntu@1.2.3.4"
340
+ - String with host:port:
341
+ "ubuntu@1.2.3.4:2222"
342
+ - List with a single target:
343
+ ["ubuntu@1.2.3.4"]
344
+ - List of tokens (e.g., split form):
345
+ ["ubuntu@1.2.3.4", "-p", "2222"]
346
+
347
+ Args:
348
+ ssh_connection: The raw field from the API; can be a string or a list
349
+ of strings.
350
+
351
+ Returns:
352
+ (ssh_user, ssh_port): username if found, else None; port if found,
353
+ else 22.
354
+ """
355
+ ssh_user: Optional[str] = None
356
+ ssh_port: int = 22
357
+
358
+ # Normalize into a list of tokens for easier processing.
359
+ tokens: List[str] = []
360
+ if isinstance(ssh_connection, str):
361
+ try:
362
+ tokens = shlex.split(ssh_connection)
363
+ except Exception: # pylint: disable=broad-except
364
+ tokens = [ssh_connection]
365
+ elif isinstance(ssh_connection, list):
366
+ for elem in ssh_connection:
367
+ if isinstance(elem, str):
368
+ try:
369
+ tokens.extend(shlex.split(elem))
370
+ except Exception: # pylint: disable=broad-except
371
+ tokens.append(elem)
372
+ else:
373
+ # Unknown type; return defaults.
374
+ return ssh_user, ssh_port
375
+
376
+ # Find the first token containing '@' as the user@host candidate.
377
+ user_host: Optional[str] = next((t for t in tokens if '@' in t), None)
378
+ if user_host:
379
+ ssh_user = user_host.split('@', 1)[0].strip()
380
+ # Try host:port format (after '@').
381
+ host_part = user_host.split('@', 1)[1]
382
+ if ':' in host_part:
383
+ _, maybe_port = host_part.rsplit(':', 1)
384
+ try:
385
+ ssh_port = int(maybe_port)
386
+ except ValueError:
387
+ pass
388
+
389
+ # Check for '-p <port>' pair anywhere in the tokens. This takes priority.
390
+ if '-p' in tokens:
391
+ idx = tokens.index('-p')
392
+ if idx + 1 < len(tokens):
393
+ try:
394
+ ssh_port = int(tokens[idx + 1])
395
+ except ValueError:
396
+ pass
397
+
398
+ return ssh_user, ssh_port
@@ -18,6 +18,7 @@ from sky import exceptions
18
18
  from sky import global_user_state
19
19
  from sky import logs
20
20
  from sky import provision
21
+ from sky import resources as resources_lib
21
22
  from sky import sky_logging
22
23
  from sky import skypilot_config
23
24
  from sky.adaptors import aws
@@ -27,6 +28,7 @@ from sky.provision import common as provision_common
27
28
  from sky.provision import instance_setup
28
29
  from sky.provision import logging as provision_logging
29
30
  from sky.provision import metadata_utils
31
+ from sky.provision import volume as provision_volume
30
32
  from sky.skylet import constants
31
33
  from sky.utils import common
32
34
  from sky.utils import common_utils
@@ -58,6 +60,11 @@ def _bulk_provision(
58
60
  region_name = region.name
59
61
 
60
62
  start = time.time()
63
+
64
+ provision_volume.provision_ephemeral_volumes(cloud, region_name,
65
+ cluster_name.name_on_cloud,
66
+ bootstrap_config)
67
+
61
68
  # TODO(suquark): Should we cache the bootstrapped result?
62
69
  # Currently it is not necessary as bootstrapping takes
63
70
  # only ~3s, caching it seems over-engineering and could
@@ -69,6 +76,7 @@ def _bulk_provision(
69
76
 
70
77
  provision_record = provision.run_instances(provider_name,
71
78
  region_name,
79
+ str(cluster_name),
72
80
  cluster_name.name_on_cloud,
73
81
  config=config)
74
82
 
@@ -149,9 +157,9 @@ def bulk_provision(
149
157
  logger.debug(f'SkyPilot version: {sky.__version__}; '
150
158
  f'commit: {sky.__commit__}')
151
159
  logger.debug(_TITLE.format('Provisioning'))
152
- logger.debug(
153
- 'Provision config:\n'
154
- f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
160
+ redacted_config = bootstrap_config.get_redacted_config()
161
+ logger.debug('Provision config:\n'
162
+ f'{json.dumps(redacted_config, indent=2)}')
155
163
  return _bulk_provision(cloud, region, cluster_name,
156
164
  bootstrap_config)
157
165
  except exceptions.NoClusterLaunchedError:
@@ -235,6 +243,7 @@ def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName,
235
243
  provision.terminate_instances(cloud_name, cluster_name.name_on_cloud,
236
244
  provider_config)
237
245
  metadata_utils.remove_cluster_metadata(cluster_name.name_on_cloud)
246
+ provision_volume.delete_ephemeral_volumes(provider_config)
238
247
  else:
239
248
  provision.stop_instances(cloud_name, cluster_name.name_on_cloud,
240
249
  provider_config)
@@ -427,18 +436,27 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
427
436
 
428
437
 
429
438
  def _post_provision_setup(
430
- cloud_name: str, cluster_name: resources_utils.ClusterName,
431
- handle_cluster_yaml: str,
439
+ launched_resources: resources_lib.Resources,
440
+ cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
432
441
  provision_record: provision_common.ProvisionRecord,
433
442
  custom_resource: Optional[str]) -> provision_common.ClusterInfo:
434
443
  config_from_yaml = global_user_state.get_cluster_yaml_dict(
435
444
  handle_cluster_yaml)
436
445
  provider_config = config_from_yaml.get('provider')
446
+ cloud_name = repr(launched_resources.cloud)
437
447
  cluster_info = provision.get_cluster_info(cloud_name,
438
448
  provision_record.region,
439
449
  cluster_name.name_on_cloud,
440
450
  provider_config=provider_config)
441
451
 
452
+ # Update cluster info in handle so cluster instance ids are set. This
453
+ # allows us to expose provision logs to debug nodes that failed during post
454
+ # provision setup.
455
+ handle = global_user_state.get_handle_from_cluster_name(
456
+ cluster_name.display_name)
457
+ handle.cached_cluster_info = cluster_info
458
+ global_user_state.update_cluster_handle(cluster_name.display_name, handle)
459
+
442
460
  if cluster_info.num_instances > 1:
443
461
  # Only worker nodes have logs in the per-instance log directory. Head
444
462
  # node's log will be redirected to the main log file.
@@ -474,12 +492,13 @@ def _post_provision_setup(
474
492
  # ready by the provisioner, and we use kubectl instead of SSH to run the
475
493
  # commands and rsync on the pods. SSH will still be ready after a while
476
494
  # for the users to SSH into the pod.
477
- if cloud_name.lower() != 'kubernetes':
495
+ is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
496
+ if not is_k8s_cloud:
478
497
  logger.debug(
479
498
  f'\nWaiting for SSH to be available for {cluster_name!r} ...')
480
499
  wait_for_ssh(cluster_info, ssh_credentials)
481
500
  logger.debug(f'SSH Connection ready for {cluster_name!r}')
482
- vm_str = 'Instance' if cloud_name.lower() != 'kubernetes' else 'Pod'
501
+ vm_str = 'Instance' if not is_k8s_cloud else 'Pod'
483
502
  plural = '' if len(cluster_info.instances) == 1 else 's'
484
503
  verb = 'is' if len(cluster_info.instances) == 1 else 'are'
485
504
  indent_str = (ux_utils.INDENT_SYMBOL
@@ -526,6 +545,7 @@ def _post_provision_setup(
526
545
  status.update(
527
546
  ux_utils.spinner_message(
528
547
  'Checking controller version compatibility'))
548
+
529
549
  try:
530
550
  server_jobs_utils.check_version_mismatch_and_non_terminal_jobs()
531
551
  except exceptions.ClusterNotUpError:
@@ -615,10 +635,15 @@ def _post_provision_setup(
615
635
  status.update(
616
636
  runtime_preparation_str.format(step=3, step_name='runtime'))
617
637
 
638
+ skip_ray_setup = False
618
639
  ray_port = constants.SKY_REMOTE_RAY_PORT
619
640
  head_ray_needs_restart = True
620
641
  ray_cluster_healthy = False
621
- if (not provision_record.is_instance_just_booted(
642
+ if (launched_resources.cloud is not None and
643
+ not launched_resources.cloud.uses_ray()):
644
+ skip_ray_setup = True
645
+ logger.debug('Skip Ray cluster setup as cloud does not use Ray.')
646
+ elif (not provision_record.is_instance_just_booted(
622
647
  head_instance.instance_id)):
623
648
  # Check if head node Ray is alive
624
649
  (ray_port, ray_cluster_healthy,
@@ -643,7 +668,9 @@ def _post_provision_setup(
643
668
  'async setup to complete...')
644
669
  time.sleep(1)
645
670
 
646
- if head_ray_needs_restart:
671
+ if skip_ray_setup:
672
+ logger.debug('Skip Ray cluster setup on the head node.')
673
+ elif head_ray_needs_restart:
647
674
  logger.debug('Starting Ray on the entire cluster.')
648
675
  instance_setup.start_ray_on_head_node(
649
676
  cluster_name.name_on_cloud,
@@ -666,7 +693,9 @@ def _post_provision_setup(
666
693
  # We don't need to restart ray on worker nodes if the ray cluster is
667
694
  # already healthy, i.e. the head node has expected number of nodes
668
695
  # connected to the ray cluster.
669
- if cluster_info.num_instances > 1 and not ray_cluster_healthy:
696
+ if skip_ray_setup:
697
+ logger.debug('Skip Ray cluster setup on the worker nodes.')
698
+ elif cluster_info.num_instances > 1 and not ray_cluster_healthy:
670
699
  instance_setup.start_ray_on_worker_nodes(
671
700
  cluster_name.name_on_cloud,
672
701
  no_restart=not head_ray_needs_restart,
@@ -692,8 +721,9 @@ def _post_provision_setup(
692
721
  cluster_info,
693
722
  ssh_credentials)
694
723
 
695
- instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
696
- cluster_info, ssh_credentials)
724
+ instance_setup.start_skylet_on_head_node(cluster_name, cluster_info,
725
+ ssh_credentials,
726
+ launched_resources)
697
727
 
698
728
  logger.info(
699
729
  ux_utils.finishing_message(f'Cluster launched: {cluster_name}.',
@@ -704,8 +734,8 @@ def _post_provision_setup(
704
734
 
705
735
  @timeline.event
706
736
  def post_provision_runtime_setup(
707
- cloud_name: str, cluster_name: resources_utils.ClusterName,
708
- handle_cluster_yaml: str,
737
+ launched_resources: resources_lib.Resources,
738
+ cluster_name: resources_utils.ClusterName, handle_cluster_yaml: str,
709
739
  provision_record: provision_common.ProvisionRecord,
710
740
  custom_resource: Optional[str],
711
741
  log_dir: str) -> provision_common.ClusterInfo:
@@ -726,7 +756,7 @@ def post_provision_runtime_setup(
726
756
  try:
727
757
  logger.debug(_TITLE.format('System Setup After Provision'))
728
758
  return _post_provision_setup(
729
- cloud_name,
759
+ launched_resources,
730
760
  cluster_name,
731
761
  handle_cluster_yaml=handle_cluster_yaml,
732
762
  provision_record=provision_record,
@@ -11,4 +11,6 @@ from sky.provision.runpod.instance import terminate_instances
11
11
  from sky.provision.runpod.instance import wait_instances
12
12
  from sky.provision.runpod.volume import apply_volume
13
13
  from sky.provision.runpod.volume import delete_volume
14
+ from sky.provision.runpod.volume import get_all_volumes_usedby
14
15
  from sky.provision.runpod.volume import get_volume_usedby
16
+ from sky.provision.runpod.volume import map_all_volumes_usedby
@@ -44,10 +44,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
44
44
  return head_instance_id
45
45
 
46
46
 
47
- def run_instances(region: str, cluster_name_on_cloud: str,
47
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
48
48
  config: common.ProvisionConfig) -> common.ProvisionRecord:
49
49
  """Runs instances for the given cluster."""
50
-
50
+ del cluster_name # unused
51
51
  pending_status = ['CREATED', 'RESTARTING']
52
52
 
53
53
  while True:
@@ -222,9 +222,10 @@ def query_instances(
222
222
  cluster_name_on_cloud: str,
223
223
  provider_config: Optional[Dict[str, Any]] = None,
224
224
  non_terminated_only: bool = True,
225
+ retry_if_missing: bool = False,
225
226
  ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
226
227
  """See sky/provision/__init__.py"""
227
- del cluster_name # unused
228
+ del cluster_name, retry_if_missing # unused
228
229
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
229
230
  instances = _filter_instances(cluster_name_on_cloud, None)
230
231