skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/ssh_node_pools/deploy/deploy.py (new file)
@@ -0,0 +1,952 @@
+"""SSH-based Kubernetes Cluster Deployment Script"""
+# pylint: disable=line-too-long
+import base64
+import concurrent.futures as cf
+import os
+import re
+import shlex
+import shutil
+import tempfile
+from typing import List, Optional
+
+import colorama
+import yaml
+
+from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools import utils as ssh_utils
+from sky.ssh_node_pools.deploy import tunnel_utils
+from sky.ssh_node_pools.deploy import utils as deploy_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
+
+RESET_ALL = colorama.Style.RESET_ALL
+
+# Get the directory of this script
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+logger = sky_logging.init_logger(__name__)
+
+
+def progress_message(message):
+    """Show a progress message."""
+    logger.info(f'{colorama.Fore.YELLOW}➜ {message}{RESET_ALL}')
+
+
+def success_message(message):
+    """Show a success message."""
+    logger.info(f'{colorama.Fore.GREEN}✔ {message}{RESET_ALL}')
+
+
+def force_update_status(message):
+    """Force update rich spinner status."""
+    rich_utils.force_update_status(ux_utils.spinner_message(message))
+
+
+def run(cleanup: bool = False,
+        infra: Optional[str] = None,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH):
+    """Deploy a Kubernetes cluster on SSH targets.
+
+    This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
+    Kubernetes cluster on the specified machines.
+
+    Args:
+        cleanup: Whether to clean up the cluster instead of deploying.
+        infra: Name of the cluster in ssh_node_pools.yaml to use.
+            If None, the first cluster in the file will be used.
+        kubeconfig_path: Path to save the Kubernetes configuration file.
+            If None, the default ~/.kube/config will be used.
+    """
+    deploy_utils.check_ssh_cluster_dependencies()
+    action = 'Cleanup' if cleanup else 'Deployment'
+    msg_str = f'Initializing SSH Node Pools {action}...'
+
+    with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
+        try:
+            deploy_multiple_clusters(infra=infra,
+                                     cleanup=cleanup,
+                                     kubeconfig_path=kubeconfig_path)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(str(e))
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Failed to deploy SkyPilot on some Node Pools.') from e
+
+    # Add empty line for ux-purposes.
+    logger.info('')
+    if cleanup:
+        logger.info(
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools cleaned up successfully.'))
+    else:
+        logger.info(
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools set up successfully. ',
+                follow_up_message=(
+                    f'Run `{colorama.Style.BRIGHT}'
+                    f'sky check ssh'
+                    f'{colorama.Style.RESET_ALL}` to verify access, '
+                    f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
+                    f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
+
+
+def deploy_multiple_clusters(
+        infra: Optional[str],
+        ssh_node_pools_file: str = constants.DEFAULT_SSH_NODE_POOLS_PATH,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH,
+        cleanup: bool = True):
+
+    kubeconfig_path = kubeconfig_path or constants.DEFAULT_KUBECONFIG_PATH
+    kubeconfig_path = os.path.expanduser(kubeconfig_path)
+
+    failed_clusters = []
+    successful_clusters = []
+
+    # Using YAML configuration
+    targets = ssh_utils.load_ssh_targets(ssh_node_pools_file)
+    clusters_config = ssh_utils.get_cluster_config(
+        targets, infra, file_path=ssh_node_pools_file)
+
+    # Print information about clusters being processed
+    num_clusters = len(clusters_config)
+    cluster_names = list(clusters_config.keys())
+    cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
+    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{RESET_ALL}')
+
+    # Process each cluster
+    for cluster_name, cluster_config in clusters_config.items():
+        try:
+            action = 'Cleaning up' if cleanup else 'Deploying'
+            force_update_status(f'{action} Node Pool: {cluster_name}')
+            hosts_info = ssh_utils.prepare_hosts_info(cluster_name,
+                                                      cluster_config)
+
+            if not hosts_info:
+                logger.warning(
+                    f'{colorama.Fore.RED}Error: No valid hosts found '
+                    f'for cluster {cluster_name!r}. Skipping.{RESET_ALL}')
+                continue
+
+            context_name = f'ssh-{cluster_name}'
+
+            # Check cluster history
+            os.makedirs(constants.NODE_POOLS_INFO_DIR, exist_ok=True)
+            history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                             f'{context_name}-history.yaml')
+
+            history = None
+            if os.path.exists(history_yaml_file):
+                logger.debug(f'Loading history from {history_yaml_file}')
+                with open(history_yaml_file, 'r', encoding='utf-8') as f:
+                    history = yaml.safe_load(f)
+            else:
+                logger.debug(f'No history found for {context_name}.')
+
+            history_workers_info = None
+            history_worker_nodes = None
+            history_use_ssh_config = None
+            # Do not support changing anything besides hosts for now
+            if history is not None:
+                for key in ['user', 'identity_file', 'password']:
+                    if not cleanup and history.get(key) != cluster_config.get(
+                            key):
+                        raise ValueError(
+                            f'Cluster configuration has changed for field {key!r}. '
+                            f'Previous value: {history.get(key)}, '
+                            f'Current value: {cluster_config.get(key)}')
+                history_hosts_info = ssh_utils.prepare_hosts_info(
+                    cluster_name, history)
+                if not cleanup and history_hosts_info[0] != hosts_info[0]:
+                    raise ValueError(
+                        f'Cluster configuration has changed for master node. '
+                        f'Previous value: {history_hosts_info[0]}, '
+                        f'Current value: {hosts_info[0]}')
+                history_workers_info = history_hosts_info[1:] if len(
+                    history_hosts_info) > 1 else []
+                history_worker_nodes = [h['ip'] for h in history_workers_info]
+                history_use_ssh_config = [
+                    h.get('use_ssh_config', False) for h in history_workers_info
+                ]
+
+            # Use the first host as the head node and the rest as worker nodes
+            head_host = hosts_info[0]
+            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+            head_node = head_host['ip']
+            worker_nodes = [h['ip'] for h in worker_hosts]
+            ssh_user = head_host['user']
+            ssh_key = head_host['identity_file']
+            head_use_ssh_config = head_host.get('use_ssh_config', False)
+            worker_use_ssh_config = [
+                h.get('use_ssh_config', False) for h in worker_hosts
+            ]
+            password = head_host['password']
+
+            # Deploy this cluster
+            unsuccessful_workers = deploy_single_cluster(
+                cluster_name,
+                head_node,
+                worker_nodes,
+                ssh_user,
+                ssh_key,
+                context_name,
+                password,
+                head_use_ssh_config,
+                worker_use_ssh_config,
+                kubeconfig_path,
+                cleanup,
+                worker_hosts=worker_hosts,
+                history_worker_nodes=history_worker_nodes,
+                history_workers_info=history_workers_info,
+                history_use_ssh_config=history_use_ssh_config)
+
+            if not cleanup:
+                successful_hosts = []
+                for host in cluster_config['hosts']:
+                    if isinstance(host, str):
+                        host_node = host
+                    else:
+                        host_node = host['ip']
+                    if host_node not in unsuccessful_workers:
+                        successful_hosts.append(host)
+                cluster_config['hosts'] = successful_hosts
+                with open(history_yaml_file, 'w', encoding='utf-8') as f:
+                    logger.debug(f'Writing history to {history_yaml_file}')
+                    yaml.dump(cluster_config, f)
+
+            action = 'cleanup' if cleanup else 'deployment'
+            logger.info(
+                f'{colorama.Fore.CYAN}Completed {action} for cluster: {cluster_name}{colorama.Style.RESET_ALL}'
+            )
+            successful_clusters.append(cluster_name)
+        except Exception as e:  # pylint: disable=broad-except
+            reason = str(e)
+            failed_clusters.append((cluster_name, reason))
+            action = 'cleaning' if cleanup else 'deploying'
+            logger.debug(
+                f'Error {action} SSH Node Pool `{cluster_name}`: {reason}')
+
+    if failed_clusters:
+        action = 'clean' if cleanup else 'deploy'
+        msg = f'{colorama.Fore.GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {RESET_ALL}'
+        msg += f'{colorama.Fore.RED}Failed to {action} {len(failed_clusters)} cluster(s): {RESET_ALL}'
+        for cluster_name, reason in failed_clusters:
+            msg += f'\n {cluster_name}: {reason}'
+        raise RuntimeError(msg)
+
+
+def deploy_single_cluster(cluster_name,
+                          head_node,
+                          worker_nodes,
+                          ssh_user,
+                          ssh_key,
+                          context_name,
+                          password,
+                          head_use_ssh_config,
+                          worker_use_ssh_config,
+                          kubeconfig_path,
+                          cleanup,
+                          worker_hosts=None,
+                          history_worker_nodes=None,
+                          history_workers_info=None,
+                          history_use_ssh_config=None) -> List[str]:
+    """Deploy or clean up a single Kubernetes cluster.
+
+    Returns: List of unsuccessful worker nodes.
+    """
+    history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                     f'{context_name}-history.yaml')
+    cert_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                  f'{context_name}-cert.pem')
+    key_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                 f'{context_name}-key.pem')
+    tunnel_log_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                        f'{context_name}-tunnel.log')
+
+    # Generate the askpass block if password is provided
+    askpass_block = create_askpass_script(password)
+
+    # Token for k3s
+    # TODO (kyuds): make this configurable?
+    k3s_token = constants.K3S_TOKEN
+
+    # Pre-flight checks
+    logger.info(f'Checking SSH connection to head node ({head_node})...')
+    result = deploy_utils.run_remote(
+        head_node,
+        f'echo \'SSH connection successful ({head_node})\'',
+        ssh_user,
+        ssh_key,
+        use_ssh_config=head_use_ssh_config)
+    if result is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to SSH to head node ({head_node}). '
+                f'Please check the SSH configuration and logs for more details.'
+            )
+    elif result.startswith('SSH connection successful'):
+        success_message(f'SSH connection established to head node {head_node}.')
+
+    # Checking history
+    history_exists = (history_worker_nodes is not None and
+                      history_workers_info is not None and
+                      history_use_ssh_config is not None)
+
+    # Cleanup history worker nodes
+    worker_nodes_to_cleanup = []
+    remove_worker_cmds = []
+    if history_exists:
+        for history_node, history_info, use_ssh_config in zip(
+                history_worker_nodes, history_workers_info,
+                history_use_ssh_config):
+            if worker_hosts is not None and history_info not in worker_hosts:
+                logger.debug(
+                    f'Worker node {history_node} not found in YAML config. '
+                    'Removing from history...')
+                worker_nodes_to_cleanup.append(
+                    dict(
+                        node=history_node,
+                        user=ssh_user
+                        if history_info is None else history_info['user'],
+                        ssh_key=ssh_key if history_info is None else
+                        history_info['identity_file'],
+                        askpass_block=(askpass_block if history_info is None
+                                       else create_askpass_script(
+                                           history_info['password'])),
+                        use_ssh_config=use_ssh_config,
+                    ))
+                remove_worker_cmds.append(
+                    f'kubectl delete node -l skypilot-ip={history_node}')
+    # If this is a create operation and there exists some stale log,
+    # cleanup the log for a new file to store new logs.
+    if not cleanup and os.path.exists(tunnel_log_file_path):
+        os.remove(tunnel_log_file_path)
+
+    # If --cleanup flag is set, uninstall k3s and exit
+    if cleanup:
+        # Pickup all nodes
+        worker_nodes_to_cleanup.clear()
+        for node, info, use_ssh_config in zip(worker_nodes, worker_hosts,
+                                              worker_use_ssh_config):
+            worker_nodes_to_cleanup.append(
+                dict(
+                    node=node,
+                    user=ssh_user if info is None else info['user'],
+                    ssh_key=ssh_key if info is None else info['identity_file'],
+                    askpass_block=(askpass_block if info is None else
+                                   create_askpass_script(info['password'])),
+                    use_ssh_config=use_ssh_config,
+                ))
+
+        # Clean up head node
+        cleanup_node(head_node,
+                     ssh_user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=head_use_ssh_config,
+                     is_worker=False)
+        # Clean up worker nodes
+        force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
+    with cf.ThreadPoolExecutor() as executor:
+        executor.map(lambda kwargs: cleanup_node(**kwargs),
+                     worker_nodes_to_cleanup)
+
+    with cf.ThreadPoolExecutor() as executor:
+        executor.map(lambda cmd: deploy_utils.run_command(cmd, shell=True),
+                     remove_worker_cmds)
+
+    if cleanup:
+        # Remove the context from local kubeconfig if it exists
+        if os.path.isfile(kubeconfig_path):
+            logger.debug(
+                f'Removing context {context_name!r} from local kubeconfig...')
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-context', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-cluster', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-user', context_name],
+                shell=False,
+                silent=True)
+
+            # Update the current context to the first available context
+            contexts = deploy_utils.run_command([
+                'kubectl', 'config', 'view', '-o',
+                'jsonpath=\'{.contexts[0].name}\''
+            ],
+                                                shell=False,
+                                                silent=True)
+            if contexts:
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'use-context', contexts],
+                    shell=False,
+                    silent=True)
+            else:
+                # If no context is available, simply unset the current context
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'unset', 'current-context'],
+                    shell=False,
+                    silent=True)
+
+            logger.debug(
+                f'Context {context_name!r} removed from local kubeconfig.')
+
+        for file in [history_yaml_file, cert_file_path, key_file_path]:
+            if os.path.exists(file):
+                os.remove(file)
+
+        # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
+        # will restart the ssh tunnel if it's not running.
+        tunnel_utils.cleanup_kubectl_ssh_tunnel(cluster_name, context_name)
+
+        success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
+        return []
+
+    logger.debug('Checking TCP Forwarding Options...')
+    cmd = (
+        'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
+        f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
+        'else '
+        'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' '  # pylint: disable=anomalous-backslash-in-string
+        '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
+        f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
+        'fi')
+    result = deploy_utils.run_remote(head_node,
+                                     shlex.quote(cmd),
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config,
+                                     use_shell=True)
+    if result is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to setup TCP forwarding on head node ({head_node}). '
+                f'Please check the SSH configuration.')
+
+    # Get effective IP for master node if using SSH config - needed for workers to connect
+    if head_use_ssh_config:
+        effective_master_ip = deploy_utils.get_effective_host_ip(head_node)
+        logger.info(f'{colorama.Fore.GREEN}Resolved head node {head_node} '
+                    f'to {effective_master_ip} from SSH config{RESET_ALL}')
+    else:
+        effective_master_ip = head_node
+
+    # Step 1: Install k3s on the head node
+    # Check if head node has a GPU
+    install_gpu = False
+    force_update_status(
+        f'Deploying SkyPilot runtime on head node ({head_node}).')
+    cmd = f"""
+        {askpass_block}
+        curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
+        mkdir -p ~/.kube &&
+        sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
+        sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
+        for i in {{1..3}}; do
+            if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
+                break
+            else
+                echo 'Waiting for nodes to be ready...'
+                sleep 5
+            fi
+        done
+        if [ $i -eq 3 ]; then
+            echo 'Failed to wait for nodes to be ready after 3 attempts'
+            exit 1
+        fi
+        """
+    result = deploy_utils.run_remote(head_node,
+                                     cmd,
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config)
+    if result is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to deploy K3s on head node ({head_node}).')
+    success_message(
+        f'SkyPilot runtime successfully deployed on head node ({head_node}).')
+
+    # Check if head node has a GPU
+    install_gpu = False
+    if deploy_utils.check_gpu(head_node,
+                              ssh_user,
+                              ssh_key,
+                              use_ssh_config=head_use_ssh_config,
+                              is_head=True):
+        install_gpu = True
+
+    # Fetch the head node's internal IP (this will be passed to worker nodes)
+    master_addr = deploy_utils.run_remote(head_node,
+                                          'hostname -I | awk \'{print $1}\'',
+                                          ssh_user,
+                                          ssh_key,
+                                          use_ssh_config=head_use_ssh_config)
+    if master_addr is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+                               f'Please check the SSH configuration.')
+    logger.debug(f'Master node internal IP: {master_addr}')
+
+    # Step 2: Install k3s on worker nodes and join them to the master node
+    def deploy_worker(args):
+        (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
+         askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
+
+        # If using YAML config with specific worker info
+        if worker_hosts and i < len(worker_hosts):
+            if history_workers_info is not None and worker_hosts[
+                    i] in history_workers_info:
+                logger.info(
+                    f'{colorama.Style.DIM}✔ SkyPilot runtime already deployed on worker node {node}. '
+                    f'Skipping...{RESET_ALL}')
+                return node, True, False
+            worker_user = worker_hosts[i]['user']
+            worker_key = worker_hosts[i]['identity_file']
+            worker_password = worker_hosts[i]['password']
+            worker_askpass = create_askpass_script(worker_password)
+            worker_config = worker_use_ssh_config[i]
+        else:
+            worker_user = ssh_user
+            worker_key = ssh_key
+            worker_askpass = askpass_block
+            worker_config = worker_use_ssh_config[i]
+
+        return start_agent_node(node,
+                                master_addr,
+                                k3s_token,
+                                worker_user,
+                                worker_key,
+                                worker_askpass,
+                                use_ssh_config=worker_config)
+
+    unsuccessful_workers = []
+
+    # Deploy workers in parallel using thread pool
+    force_update_status(
+        f'Deploying SkyPilot runtime on worker nodes [{cluster_name}]')
+    with cf.ThreadPoolExecutor() as executor:
+        futures = []
+        for i, node in enumerate(worker_nodes):
+            args = (i, node, worker_hosts, history_workers_info, ssh_user,
+                    ssh_key, askpass_block, worker_use_ssh_config, master_addr,
+                    k3s_token)
+            futures.append(executor.submit(deploy_worker, args))
+
+        # Check if worker node has a GPU
+        for future in cf.as_completed(futures):
+            node, suc, has_gpu = future.result()
+            install_gpu = install_gpu or has_gpu
+            if not suc:
+                unsuccessful_workers.append(node)
+
+    # Step 3: Configure local kubectl to connect to the cluster
+    force_update_status(f'Setting up SkyPilot configuration [{cluster_name}]')
+
+    # Create temporary directory for kubeconfig operations
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
+
+        # Get the kubeconfig from remote server
+        if head_use_ssh_config:
+            scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
+        else:
+            scp_cmd = [
+                'scp', '-o', 'StrictHostKeyChecking=no', '-o',
+                'IdentitiesOnly=yes', '-i', ssh_key,
+                f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
+            ]
+        deploy_utils.run_command(scp_cmd, shell=False)
+
+        # Create the directory for the kubeconfig file if it doesn't exist
+        deploy_utils.ensure_directory_exists(kubeconfig_path)
+
+        # Create empty kubeconfig if it doesn't exist
+        if not os.path.isfile(kubeconfig_path):
+            open(kubeconfig_path, 'a', encoding='utf-8').close()
+
+        # Modify the temporary kubeconfig to update server address and context name
+        modified_config = os.path.join(temp_dir, 'modified_config')
+        with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
+            with open(modified_config, 'w', encoding='utf-8') as f_out:
+                in_cluster = False
+                in_user = False
+                client_cert_data = None
+                client_key_data = None
+
+                for line in f_in:
+                    if 'clusters:' in line:
+                        in_cluster = True
+                        in_user = False
+                    elif 'users:' in line:
+                        in_cluster = False
+                        in_user = True
+                    elif 'contexts:' in line:
+                        in_cluster = False
+                        in_user = False
+
+                    # Skip certificate authority data in cluster section
+                    if in_cluster and 'certificate-authority-data:' in line:
+                        continue
+                    # Skip client certificate data in user section but extract it
+                    elif in_user and 'client-certificate-data:' in line:
+                        client_cert_data = line.split(':', 1)[1].strip()
+                        continue
+                    # Skip client key data in user section but extract it
+                    elif in_user and 'client-key-data:' in line:
+                        client_key_data = line.split(':', 1)[1].strip()
+                        continue
+                    elif in_cluster and 'server:' in line:
+                        # Initially just set to the effective master IP
+                        # (will be changed to localhost by setup_kubectl_ssh_tunnel later)
+                        f_out.write(
+                            f'    server: https://{effective_master_ip}:6443\n')
+                        f_out.write('    insecure-skip-tls-verify: true\n')
+                        continue
+
+                    # Replace default context names with user-provided context name
+                    line = line.replace('name: default',
+                                        f'name: {context_name}')
+                    line = line.replace('cluster: default',
+                                        f'cluster: {context_name}')
+                    line = line.replace('user: default',
+                                        f'user: {context_name}')
+                    line = line.replace('current-context: default',
+                                        f'current-context: {context_name}')
+
+                    f_out.write(line)
+
+                # Save certificate data if available
+
+                if client_cert_data:
+                    # Decode base64 data and save as PEM
+                    try:
+                        # Clean up the certificate data by removing whitespace
+                        clean_cert_data = ''.join(client_cert_data.split())
+                        cert_pem = base64.b64decode(clean_cert_data).decode(
+                            'utf-8')
+
+                        # Check if the data already looks like a PEM file
+                        has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
+                        has_end = '-----END CERTIFICATE-----' in cert_pem
+
+                        if not has_begin or not has_end:
+                            logger.debug(
+                                'Warning: Certificate data missing PEM markers, attempting to fix...'
+                            )
+                            # Add PEM markers if missing
+                            if not has_begin:
+                                cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
+                            if not has_end:
+                                cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
+
+                        # Write the certificate
+                        with open(cert_file_path, 'w',
+                                  encoding='utf-8') as cert_file:
+                            cert_file.write(cert_pem)
+
+                        # Verify the file was written correctly
+                        if os.path.getsize(cert_file_path) > 0:
+                            logger.debug(
+                                f'Successfully saved certificate data ({len(cert_pem)} bytes)'
+                            )
+
+                            # Quick validation of PEM format
+                            with open(cert_file_path, 'r',
+                                      encoding='utf-8') as f:
+                                content = f.readlines()
+                                first_line = content[0].strip(
+                                ) if content else ''
+                                last_line = content[-1].strip(
+                                ) if content else ''
+
+                                if not first_line.startswith(
+                                        '-----BEGIN') or not last_line.startswith(
+                                            '-----END'):
+                                    logger.debug(
+                                        'Warning: Certificate may not be in proper PEM format'
+                                    )
+                        else:
+                            logger.error(
+                                f'{colorama.Fore.RED}Error: '
+                                f'Certificate file is empty{RESET_ALL}')
+                    except Exception as e:  # pylint: disable=broad-except
+                        logger.error(f'{colorama.Fore.RED}'
+                                     f'Error processing certificate data: {e}'
+                                     f'{RESET_ALL}')
+
+                if client_key_data:
+                    # Decode base64 data and save as PEM
+                    try:
+                        # Clean up the key data by removing whitespace
+                        clean_key_data = ''.join(client_key_data.split())
+                        key_pem = base64.b64decode(clean_key_data).decode(
+                            'utf-8')
+
+                        # Check if the data already looks like a PEM file
+
+                        # Check for EC key format
+                        if 'EC PRIVATE KEY' in key_pem:
+                            # Handle EC KEY format directly
+                            match_ec = re.search(
+                                r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
+                                key_pem, re.DOTALL)
+                            if match_ec:
+                                # Extract and properly format EC key
+                                key_content = match_ec.group(1).strip()
+                                key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
+                            else:
+                                # Extract content and assume EC format
+                                key_content = re.sub(r'-----BEGIN.*?-----', '',
+                                                     key_pem)
+                                key_content = re.sub(r'-----END.*?-----.*', '',
+                                                     key_content).strip()
+                                key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
+                        else:
+                            # Handle regular private key format
+                            has_begin = any(marker in key_pem for marker in [
+                                '-----BEGIN PRIVATE KEY-----',
+                                '-----BEGIN RSA PRIVATE KEY-----'
+                            ])
+                            has_end = any(marker in key_pem for marker in [
+                                '-----END PRIVATE KEY-----',
+                                '-----END RSA PRIVATE KEY-----'
+                            ])
+
+                            if not has_begin or not has_end:
+                                logger.debug(
+                                    'Warning: Key data missing PEM markers, attempting to fix...'
+                                )
+                                # Add PEM markers if missing
+                                if not has_begin:
+                                    key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
+                                if not has_end:
+                                    key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
+                                # Remove any trailing characters after END marker
+                                key_pem = re.sub(
+                                    r'(-----END PRIVATE KEY-----).*', r'\1',
+                                    key_pem)
+
+                        # Write the key
+                        with open(key_file_path, 'w',
+                                  encoding='utf-8') as key_file:
+                            key_file.write(key_pem)
+
+                        # Verify the file was written correctly
+                        if os.path.getsize(key_file_path) > 0:
+                            logger.debug(
+                                f'Successfully saved key data ({len(key_pem)} bytes)'
+                            )
+
+                            # Quick validation of PEM format
+                            with open(key_file_path, 'r',
+                                      encoding='utf-8') as f:
+                                content = f.readlines()
+                                first_line = content[0].strip(
+                                ) if content else ''
+                                last_line = content[-1].strip(
+                                ) if content else ''
+
+                                if not first_line.startswith(
+                                        '-----BEGIN') or not last_line.startswith(
+                                            '-----END'):
+                                    logger.debug(
+                                        'Warning: Key may not be in proper PEM format'
+                                    )
+                        else:
+                            logger.error(f'{colorama.Fore.RED}Error: '
+                                         f'Key file is empty{RESET_ALL}')
+                    except Exception as e:  # pylint: disable=broad-except
+                        logger.error(f'{colorama.Fore.RED}'
+                                     f'Error processing key data: {e}'
+                                     f'{RESET_ALL}')
+
+        # First check if context name exists and delete it if it does
+        # TODO(romilb): Should we throw an error here instead?
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-context', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-cluster', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-user', context_name],
+            shell=False,
+            silent=True)
+
+        # Merge the configurations using kubectl
+        merged_config = os.path.join(temp_dir, 'merged_config')
+        os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
+        with open(merged_config, 'w', encoding='utf-8') as merged_file:
+            kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
+            result = deploy_utils.run_command(kubectl_cmd, shell=False)
+            if result:
+                merged_file.write(result)
+
+        # Replace the kubeconfig with the merged config
+        shutil.move(merged_config, kubeconfig_path)
+
+        # Set the new context as the current context
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'use-context', context_name],
+            shell=False,
+            silent=True)
+
+    # Always set up SSH tunnel since we assume only port 22 is accessible
+    tunnel_utils.setup_kubectl_ssh_tunnel(head_node,
+                                          ssh_user,
+                                          ssh_key,
+                                          context_name,
+                                          use_ssh_config=head_use_ssh_config)
+
+    logger.debug(f'kubectl configured with new context \'{context_name}\'.')
+    success_message(f'SkyPilot runtime is up [{cluster_name}].')
+
+    # Install GPU operator if a GPU was detected on any node
+    if install_gpu:
+        force_update_status(f'Configuring NVIDIA GPUs [{cluster_name}]')
+        cmd = f"""
+            {askpass_block}
+            curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
+            chmod 700 get_helm.sh &&
+            ./get_helm.sh &&
+            helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
+            kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
+            sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
+            helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
+                --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
+                --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
+                --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
+                --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
+                --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
+                --set 'toolkit.env[2].value=nvidia' &&
+            echo 'Waiting for GPU operator installation...' &&
+            while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
+                echo 'Waiting for GPU operator...'
+                sleep 5
+            done
+            echo 'GPU operator installed successfully.'
+            """
+        result = deploy_utils.run_remote(head_node,
+                                         cmd,
+                                         ssh_user,
+                                         ssh_key,
+                                         use_ssh_config=head_use_ssh_config)
+        if result is None:
+            logger.error(f'{colorama.Fore.RED}Failed to install GPU Operator.'
+                         f'{RESET_ALL}')
+        else:
+            success_message('GPU Operator installed.')
+    else:
+        logger.debug('No GPUs detected. Skipping GPU Operator installation.')
+
+    # The env var KUBECONFIG ensures sky check uses the right kubeconfig
+    os.environ['KUBECONFIG'] = kubeconfig_path
+    deploy_utils.run_command(['sky', 'check', 'ssh'], shell=False)
+
+    success_message('SkyPilot configured successfully.')
+
+    if unsuccessful_workers:
+        quoted_unsuccessful_workers = [
+            f'"{worker}"' for worker in unsuccessful_workers
+        ]
+
+        logger.info(f'{colorama.Fore.YELLOW}'
+                    'Failed to deploy Kubernetes on the following nodes: '
+                    f'{", ".join(quoted_unsuccessful_workers)}. Please check '
+                    f'the logs for more details.{RESET_ALL}')
+    else:
+        success_message(f'Node Pool `{cluster_name}` deployed successfully.')
+
+    return unsuccessful_workers
+
+
+def create_askpass_script(password):
+    """Create an askpass script block for sudo with password."""
+    if not password:
+        return ''
+
+    return f"""
+# Create temporary askpass script
+ASKPASS_SCRIPT=$(mktemp)
+trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
+cat > $ASKPASS_SCRIPT << EOF
+#!/bin/bash
+echo {password}
+EOF
+chmod 700 $ASKPASS_SCRIPT
+# Use askpass
+export SUDO_ASKPASS=$ASKPASS_SCRIPT
+"""
+
+
+def cleanup_node(node,
+                 user,
+                 ssh_key,
+                 askpass_block,
+                 use_ssh_config=False,
+                 is_worker=True):
+    """Uninstall k3s and clean up the state on a node."""
+    ntype = 'worker' if is_worker else 'head'
+    force_update_status(f'Cleaning up {ntype} node ({node})...')
+    script = f'k3s{"-agent" if is_worker else ""}-uninstall.sh'
+    cmd = f"""
+        {askpass_block}
+        echo 'Uninstalling k3s...' &&
+        sudo -A /usr/local/bin/{script} || true &&
+        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+        """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}Failed to clean up {ntype} '
+                     f'node ({node}).{RESET_ALL}')
+    else:
+        success_message(f'Node {node} cleaned up successfully.')
+
+
+def start_agent_node(node,
+                     master_addr,
+                     k3s_token,
+                     user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=False):
+    """Start a k3s agent node.
+    Returns: if the start is successful, and whether the node has a GPU."""
+    logger.info(f'Deploying worker node ({node}).')
+    cmd = f"""
+        {askpass_block}
+        curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
+        K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
+        """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}✗ Failed to deploy K3s on worker '
+                     f'node ({node}).{RESET_ALL}')
+        return node, False, False
+    success_message(
+        f'SkyPilot runtime successfully deployed on worker node ({node}).')
+    # Check if worker node has a GPU
+    if deploy_utils.check_gpu(node,
+                              user,
+                              ssh_key,
+                              use_ssh_config=use_ssh_config):
+        logger.info(f'{colorama.Fore.YELLOW}GPU detected on worker node '
+                    f'({node}).{RESET_ALL}')
+        return node, True, True
+    return node, True, False
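
For reference, below is a minimal usage sketch of the new SSH Node Pools deploy module shown in the diff above; it is not part of the wheel. The per-pool YAML layout (top-level user, identity_file, and a hosts list whose first entry becomes the head node) is inferred from how deploy_multiple_clusters() and ssh_utils.prepare_hosts_info() consume the config, and ~/.sky/ssh_node_pools.yaml is the default path named in the run() docstring; treat the exact schema as an assumption rather than documented behavior.

# Hypothetical usage sketch (assumptions: YAML field names inferred from the
# code above; 'my-pool', the IPs, and the key path are placeholders).
import os

import yaml

from sky.ssh_node_pools.deploy import deploy

pool_config = {
    'my-pool': {
        'user': 'ubuntu',  # assumed field name, per prepare_hosts_info usage
        'identity_file': '~/.ssh/id_rsa',  # assumed field name
        'hosts': ['10.0.0.1', '10.0.0.2'],  # first host is used as the head node
    }
}

config_path = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, 'w', encoding='utf-8') as f:
    yaml.safe_dump(pool_config, f)

# Installs k3s on the pool's hosts and registers an 'ssh-my-pool' kubeconfig
# context; deploy_single_cluster() runs 'sky check ssh' at the end.
deploy.run(cleanup=False, infra='my-pool')

# Tear the pool down again later:
# deploy.run(cleanup=True, infra='my-pool')

The same entry point is what the packaged CLI path exercises; calling run() with cleanup=True reverses the deployment, uninstalling k3s and removing the generated kubeconfig context.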