skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py (deleted)
@@ -1,1299 +0,0 @@
1
- """SSH-based Kubernetes Cluster Deployment Script"""
2
- # Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. # pylint: disable=line-too-long
3
- import argparse
4
- import base64
5
- import concurrent.futures as cf
6
- import os
7
- import random
8
- import re
9
- import shlex
10
- import shutil
11
- import subprocess
12
- import sys
13
- import tempfile
14
- from typing import List, Set
15
-
16
- import yaml
17
-
18
- from sky.utils import ux_utils
19
- from sky.utils.kubernetes import ssh_utils
20
-
21
- # Colors for nicer UX
22
- RED = '\033[0;31m'
23
- GREEN = '\033[0;32m'
24
- YELLOW = '\033[1;33m'
25
- WARNING_YELLOW = '\x1b[33m'
26
- NC = '\033[0m' # No color
27
-
28
- DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
29
- SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
30
- NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
31
-
32
- # Get the directory of this script
33
- SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
34
-
35
-
36
- def parse_args():
37
- parser = argparse.ArgumentParser(
38
- description='Deploy a Kubernetes cluster on remote machines.')
39
- parser.add_argument(
40
- '--infra', help='Name of the cluster in ssh_node_pools.yaml to use')
41
- parser.add_argument(
42
- '--ssh-node-pools-file',
43
- dest='ssh_node_pools_file',
44
- default=ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
45
- help=
46
- f'Path to SSH node pools YAML file (default: {ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH})'
47
- )
48
- parser.add_argument(
49
- '--kubeconfig-path',
50
- dest='kubeconfig_path',
51
- default=DEFAULT_KUBECONFIG_PATH,
52
- help=
53
- f'Path to save the kubeconfig file (default: {DEFAULT_KUBECONFIG_PATH})'
54
- )
55
- parser.add_argument(
56
- '--use-ssh-config',
57
- dest='use_ssh_config',
58
- action='store_true',
59
- help='Use SSH config for host settings instead of explicit parameters')
60
- #TODO(romilb): The `sky local up --ips` command is deprecated and these args are now captured in the ssh_node_pools.yaml file.
61
- # Remove these args after 0.11.0 release.
62
- parser.add_argument(
63
- '--ips-file',
64
- dest='ips_file',
65
- help=
66
- '[Deprecated, use --ssh-node-pools-file instead] File containing IP addresses or SSH host entries (one per line)'
67
- )
68
- parser.add_argument(
69
- '--user',
70
- help=
71
- '[Deprecated, use --ssh-node-pools-file instead] Username to use for SSH (overridden by SSH config if host exists there)'
72
- )
73
- parser.add_argument(
74
- '--ssh-key',
75
- dest='ssh_key',
76
- help=
77
- '[Deprecated, use --ssh-node-pools-file instead] Path to SSH private key (overridden by SSH config if host exists there)'
78
- )
79
- parser.add_argument(
80
- '--context-name',
81
- dest='context_name',
82
- default='default',
83
- help=
84
- '[Deprecated, use --ssh-node-pools-file instead] Kubernetes context name'
85
- )
86
- parser.add_argument('--cleanup',
87
- action='store_true',
88
- help='Clean up the cluster')
89
- parser.add_argument(
90
- '--password',
91
- help='[Deprecated, use --ssh-node-pools-file instead] Password for sudo'
92
- )
93
-
94
- return parser.parse_args()
95
-
96
-
97
- def run_command(cmd, shell=False):
98
- """Run a local command and return the output."""
99
- process = subprocess.run(cmd,
100
- shell=shell,
101
- capture_output=True,
102
- text=True,
103
- check=False)
104
- if process.returncode != 0:
105
- print(f'{RED}Error executing command: {cmd}{NC}')
106
- print(f'STDOUT: {process.stdout}')
107
- print(f'STDERR: {process.stderr}')
108
- return None
109
- return process.stdout.strip()
110
-
111
-
112
- def get_effective_host_ip(hostname: str) -> str:
113
- """Get the effective IP for a hostname from SSH config."""
114
- try:
115
- result = subprocess.run(['ssh', '-G', hostname],
116
- capture_output=True,
117
- text=True,
118
- check=False)
119
- if result.returncode == 0:
120
- for line in result.stdout.splitlines():
121
- if line.startswith('hostname '):
122
- return line.split(' ', 1)[1].strip()
123
- except Exception: # pylint: disable=broad-except
124
- pass
125
- return hostname # Return the original hostname if lookup fails
126
-
127
-
128
- def run_remote(node,
129
- cmd,
130
- user='',
131
- ssh_key='',
132
- connect_timeout=30,
133
- use_ssh_config=False,
134
- print_output=False,
135
- use_shell=False):
136
- """Run a command on a remote machine via SSH."""
137
- if use_ssh_config:
138
- # Use SSH config for connection parameters
139
- ssh_cmd = ['ssh', node, cmd]
140
- else:
141
- # Use explicit parameters
142
- ssh_cmd = [
143
- 'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
144
- '-o', f'ConnectTimeout={connect_timeout}', '-o',
145
- 'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
146
- ]
147
-
148
- if ssh_key:
149
- if not os.path.isfile(ssh_key):
150
- raise ValueError(f'SSH key not found: {ssh_key}')
151
- ssh_cmd.extend(['-i', ssh_key])
152
-
153
- ssh_cmd.append(f'{user}@{node}' if user else node)
154
- ssh_cmd.append(cmd)
155
-
156
- if use_shell:
157
- ssh_cmd = ' '.join(ssh_cmd)
158
-
159
- process = subprocess.run(ssh_cmd,
160
- capture_output=True,
161
- text=True,
162
- check=False,
163
- shell=use_shell)
164
- if process.returncode != 0:
165
- print(f'{RED}Error executing command {cmd} on {node}:{NC}')
166
- print(f'STDERR: {process.stderr}')
167
- return None
168
- if print_output:
169
- print(process.stdout)
170
- return process.stdout.strip()
171
-
172
-
173
- def create_askpass_script(password):
174
- """Create an askpass script block for sudo with password."""
175
- if not password:
176
- return ''
177
-
178
- return f"""
179
- # Create temporary askpass script
180
- ASKPASS_SCRIPT=$(mktemp)
181
- trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
182
- cat > $ASKPASS_SCRIPT << EOF
183
- #!/bin/bash
184
- echo {password}
185
- EOF
186
- chmod 700 $ASKPASS_SCRIPT
187
- # Use askpass
188
- export SUDO_ASKPASS=$ASKPASS_SCRIPT
189
- """
190
-
191
-
192
- def progress_message(message):
193
- """Show a progress message."""
194
- print(f'{YELLOW}➜ {message}{NC}')
195
-
196
-
197
- def success_message(message):
198
- """Show a success message."""
199
- print(f'{GREEN}✔ {message}{NC}')
200
-
201
-
202
- def cleanup_server_node(node,
203
- user,
204
- ssh_key,
205
- askpass_block,
206
- use_ssh_config=False):
207
- """Uninstall k3s and clean up the state on a server node."""
208
- print(f'{YELLOW}Cleaning up head node {node}...{NC}')
209
- cmd = f"""
210
- {askpass_block}
211
- echo 'Uninstalling k3s...' &&
212
- sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
213
- sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
214
- """
215
- result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
216
- if result is None:
217
- print(f'{RED}Failed to clean up head node ({node}).{NC}')
218
- else:
219
- success_message(f'Node {node} cleaned up successfully.')
220
-
221
-
222
- def cleanup_agent_node(node,
223
- user,
224
- ssh_key,
225
- askpass_block,
226
- use_ssh_config=False):
227
- """Uninstall k3s and clean up the state on an agent node."""
228
- print(f'{YELLOW}Cleaning up worker node {node}...{NC}')
229
- cmd = f"""
230
- {askpass_block}
231
- echo 'Uninstalling k3s...' &&
232
- sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
233
- sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
234
- """
235
- result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
236
- if result is None:
237
- print(f'{RED}Failed to clean up worker node ({node}).{NC}')
238
- else:
239
- success_message(f'Node {node} cleaned up successfully.')
240
-
241
-
242
- def start_agent_node(node,
243
- master_addr,
244
- k3s_token,
245
- user,
246
- ssh_key,
247
- askpass_block,
248
- use_ssh_config=False):
249
- """Start a k3s agent node.
250
- Returns: if the start is successful, and if the node has a GPU."""
251
- cmd = f"""
252
- {askpass_block}
253
- curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
254
- K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
255
- """
256
- result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
257
- if result is None:
258
- print(f'{RED}Failed to deploy K3s on worker node ({node}).{NC}')
259
- return node, False, False
260
- success_message(f'Kubernetes deployed on worker node ({node}).')
261
- # Check if worker node has a GPU
262
- if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
263
- print(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
264
- return node, True, True
265
- return node, True, False
266
-
267
-
268
- def check_gpu(node, user, ssh_key, use_ssh_config=False):
269
- """Check if a node has a GPU."""
270
- cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
271
- result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
272
- return result is not None
273
-
274
-
275
- def ensure_directory_exists(path):
276
- """Ensure the directory for the specified file path exists."""
277
- directory = os.path.dirname(path)
278
- if directory and not os.path.exists(directory):
279
- os.makedirs(directory, exist_ok=True)
280
-
281
-
282
- def get_used_localhost_ports() -> Set[int]:
283
- """Get SSH port forwardings already in use on localhost"""
284
- used_ports = set()
285
-
286
- # Get ports from netstat (works on macOS and Linux)
287
- try:
288
- if sys.platform == 'darwin':
289
- # macOS
290
- result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
291
- capture_output=True,
292
- text=True,
293
- check=False)
294
- else:
295
- # Linux and other Unix-like systems
296
- result = subprocess.run(['netstat', '-tln'],
297
- capture_output=True,
298
- text=True,
299
- check=False)
300
-
301
- if result.returncode == 0:
302
- # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
303
- for line in result.stdout.splitlines():
304
- if '127.0.0.1:' in line or 'localhost:' in line:
305
- match = re.search(r':(64\d\d)\s', line)
306
- if match:
307
- port = int(match.group(1))
308
- if 6400 <= port <= 6500: # Only consider our range
309
- used_ports.add(port)
310
- except (subprocess.SubprocessError, FileNotFoundError):
311
- # If netstat fails, try another approach
312
- pass
313
-
314
- # Also check ports from existing kubeconfig entries
315
- try:
316
- result = subprocess.run([
317
- 'kubectl', 'config', 'view', '-o',
318
- 'jsonpath=\'{.clusters[*].cluster.server}\''
319
- ],
320
- capture_output=True,
321
- text=True,
322
- check=False)
323
-
324
- if result.returncode == 0:
325
- # Look for localhost URLs with ports
326
- for url in result.stdout.split():
327
- if 'localhost:' in url or '127.0.0.1:' in url:
328
- match = re.search(r':(\d+)', url)
329
- if match:
330
- port = int(match.group(1))
331
- if 6400 <= port <= 6500: # Only consider our range
332
- used_ports.add(port)
333
- except subprocess.SubprocessError:
334
- pass
335
-
336
- return used_ports
337
-
338
-
339
- def get_available_port(start: int = 6443, end: int = 6499) -> int:
340
- """Get an available port in the given range that's not used by other tunnels"""
341
- used_ports = get_used_localhost_ports()
342
-
343
- # Try to use port 6443 first if available for the first cluster
344
- if start == 6443 and start not in used_ports:
345
- return start
346
-
347
- # Otherwise find any available port in the range
348
- available_ports = list(set(range(start, end + 1)) - used_ports)
349
-
350
- if not available_ports:
351
- # If all ports are used, pick a random one from our range
352
- # (we'll terminate any existing connection in the setup)
353
- return random.randint(start, end)
354
-
355
- # Sort to get deterministic allocation
356
- available_ports.sort()
357
- return available_ports[0]
358
-
359
-
360
- def setup_kubectl_ssh_tunnel(head_node,
361
- ssh_user,
362
- ssh_key,
363
- context_name,
364
- use_ssh_config=False):
365
- """Set up kubeconfig exec credential plugin for SSH tunnel"""
366
- progress_message('Setting up SSH tunnel for Kubernetes API access...')
367
-
368
- # Get an available port for this cluster
369
- port = get_available_port()
370
-
371
- # Paths to scripts
372
- tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')
373
-
374
- # Make sure scripts are executable
375
- os.chmod(tunnel_script, 0o755)
376
-
377
- # Certificate files
378
- client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
379
- f'{context_name}-cert.pem')
380
- client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
381
- f'{context_name}-key.pem')
382
-
383
- # Update kubeconfig to use localhost with the selected port
384
- run_command([
385
- 'kubectl', 'config', 'set-cluster', context_name,
386
- f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
387
- ])
388
-
389
- # Build the exec args list based on auth method
390
- exec_args = [
391
- '--exec-command', tunnel_script, '--exec-api-version',
392
- 'client.authentication.k8s.io/v1beta1'
393
- ]
394
-
395
- # Set credential TTL to force frequent tunnel checks
396
- ttl_seconds = 30
397
-
398
- # Verify if we have extracted certificate data files
399
- has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
400
- client_key_file)
401
- if has_cert_files:
402
- print(
403
- f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
404
- )
405
-
406
- if use_ssh_config:
407
- run_command(
408
- ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
409
- [
410
- '--exec-arg=--context', f'--exec-arg={context_name}',
411
- '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
412
- f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
413
- '--exec-arg=--host', f'--exec-arg={head_node}'
414
- ])
415
- else:
416
- run_command(['kubectl', 'config', 'set-credentials', context_name] +
417
- exec_args + [
418
- '--exec-arg=--context', f'--exec-arg={context_name}',
419
- '--exec-arg=--port', f'--exec-arg={port}',
420
- '--exec-arg=--ttl', f'--exec-arg={ttl_seconds}',
421
- '--exec-arg=--host', f'--exec-arg={head_node}',
422
- '--exec-arg=--user', f'--exec-arg={ssh_user}',
423
- '--exec-arg=--ssh-key', f'--exec-arg={ssh_key}'
424
- ])
425
-
426
- success_message(
427
- f'SSH tunnel configured through kubectl credential plugin on port {port}'
428
- )
429
- print(
430
- f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
431
- )
432
- print(
433
- f'{GREEN}This tunnel will be automatically established when needed.{NC}'
434
- )
435
- print(
436
- f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
437
- )
438
-
439
- return port
440
-
441
-
442
- def cleanup_kubectl_ssh_tunnel(context_name):
443
- """Clean up the SSH tunnel for a specific context"""
444
- progress_message(f'Cleaning up SSH tunnel for context {context_name}...')
445
-
446
- # Path to cleanup script
447
- cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
448
-
449
- # Make sure script is executable
450
- if os.path.exists(cleanup_script):
451
- os.chmod(cleanup_script, 0o755)
452
-
453
- # Run the cleanup script
454
- subprocess.run([cleanup_script, context_name],
455
- stdout=subprocess.DEVNULL,
456
- stderr=subprocess.DEVNULL,
457
- check=False)
458
-
459
- success_message(f'SSH tunnel for context {context_name} cleaned up')
460
- else:
461
- print(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
462
-
463
-
464
- def main():
465
- args = parse_args()
466
-
467
- kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
468
- global_use_ssh_config = args.use_ssh_config
469
-
470
- failed_clusters = []
471
- successful_clusters = []
472
-
473
- # Print cleanup mode marker if applicable
474
- if args.cleanup:
475
- print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
476
-
477
- # Check if using YAML configuration or command line arguments
478
- if args.ips_file:
479
- # Using command line arguments - legacy mode
480
- if args.ssh_key and not os.path.isfile(
481
- args.ssh_key) and not global_use_ssh_config:
482
- with ux_utils.print_exception_no_traceback():
483
- raise ValueError(f'SSH key not found: {args.ssh_key}')
484
-
485
- if not os.path.isfile(args.ips_file):
486
- with ux_utils.print_exception_no_traceback():
487
- raise ValueError(f'IPs file not found: {args.ips_file}')
488
-
489
- with open(args.ips_file, 'r', encoding='utf-8') as f:
490
- hosts = [line.strip() for line in f if line.strip()]
491
-
492
- if not hosts:
493
- with ux_utils.print_exception_no_traceback():
494
- raise ValueError(
495
- 'Hosts file is empty or not formatted correctly.')
496
-
497
- head_node = hosts[0]
498
- worker_nodes = hosts[1:]
499
- ssh_user = args.user if not global_use_ssh_config else ''
500
- ssh_key = args.ssh_key if not global_use_ssh_config else ''
501
- context_name = args.context_name
502
- password = args.password
503
-
504
- # Check if hosts are in SSH config
505
- head_use_ssh_config = global_use_ssh_config or ssh_utils.check_host_in_ssh_config(
506
- head_node)
507
- worker_use_ssh_config = [
508
- global_use_ssh_config or ssh_utils.check_host_in_ssh_config(node)
509
- for node in worker_nodes
510
- ]
511
-
512
- # Single cluster deployment for legacy mode
513
- deploy_cluster(head_node, worker_nodes, ssh_user, ssh_key, context_name,
514
- password, head_use_ssh_config, worker_use_ssh_config,
515
- kubeconfig_path, args.cleanup)
516
- else:
517
- # Using YAML configuration
518
- targets = ssh_utils.load_ssh_targets(args.ssh_node_pools_file)
519
- clusters_config = ssh_utils.get_cluster_config(
520
- targets, args.infra, file_path=args.ssh_node_pools_file)
521
-
522
- # Print information about clusters being processed
523
- num_clusters = len(clusters_config)
524
- cluster_names = list(clusters_config.keys())
525
- cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
526
- print(f'SKYPILOT_CLUSTER_INFO: {cluster_info}')
527
-
528
- # Process each cluster
529
- for cluster_name, cluster_config in clusters_config.items():
530
- try:
531
- print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
532
- print(
533
- f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
534
- hosts_info = ssh_utils.prepare_hosts_info(
535
- cluster_name, cluster_config)
536
-
537
- if not hosts_info:
538
- print(
539
- f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
540
- )
541
- continue
542
-
543
- # Generate a unique context name for each cluster
544
- context_name = args.context_name
545
- if context_name == 'default':
546
- context_name = 'ssh-' + cluster_name
547
-
548
- # Check cluster history
549
- os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
550
- history_yaml_file = os.path.join(
551
- NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
552
-
553
- history = None
554
- if os.path.exists(history_yaml_file):
555
- print(
556
- f'{YELLOW}Loading history from {history_yaml_file}{NC}')
557
- with open(history_yaml_file, 'r', encoding='utf-8') as f:
558
- history = yaml.safe_load(f)
559
- else:
560
- print(f'{YELLOW}No history found for {context_name}.{NC}')
561
-
562
- history_workers_info = None
563
- history_worker_nodes = None
564
- history_use_ssh_config = None
565
- # Do not support changing anything besides hosts for now
566
- if history is not None:
567
- for key in ['user', 'identity_file', 'password']:
568
- if not args.cleanup and history.get(
569
- key) != cluster_config.get(key):
570
- raise ValueError(
571
- f'Cluster configuration has changed for field {key!r}. '
572
- f'Previous value: {history.get(key)}, '
573
- f'Current value: {cluster_config.get(key)}')
574
- history_hosts_info = ssh_utils.prepare_hosts_info(
575
- cluster_name, history)
576
- if not args.cleanup and history_hosts_info[0] != hosts_info[
577
- 0]:
578
- raise ValueError(
579
- f'Cluster configuration has changed for master node. '
580
- f'Previous value: {history_hosts_info[0]}, '
581
- f'Current value: {hosts_info[0]}')
582
- history_workers_info = history_hosts_info[1:] if len(
583
- history_hosts_info) > 1 else []
584
- history_worker_nodes = [
585
- h['ip'] for h in history_workers_info
586
- ]
587
- history_use_ssh_config = [
588
- h.get('use_ssh_config', False)
589
- for h in history_workers_info
590
- ]
591
-
592
- # Use the first host as the head node and the rest as worker nodes
593
- head_host = hosts_info[0]
594
- worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
595
-
596
- head_node = head_host['ip']
597
- worker_nodes = [h['ip'] for h in worker_hosts]
598
- ssh_user = head_host['user']
599
- ssh_key = head_host['identity_file']
600
- head_use_ssh_config = global_use_ssh_config or head_host.get(
601
- 'use_ssh_config', False)
602
- worker_use_ssh_config = [
603
- global_use_ssh_config or h.get('use_ssh_config', False)
604
- for h in worker_hosts
605
- ]
606
- password = head_host['password']
607
-
608
- # Deploy this cluster
609
- unsuccessful_workers = deploy_cluster(
610
- head_node,
611
- worker_nodes,
612
- ssh_user,
613
- ssh_key,
614
- context_name,
615
- password,
616
- head_use_ssh_config,
617
- worker_use_ssh_config,
618
- kubeconfig_path,
619
- args.cleanup,
620
- worker_hosts=worker_hosts,
621
- history_worker_nodes=history_worker_nodes,
622
- history_workers_info=history_workers_info,
623
- history_use_ssh_config=history_use_ssh_config)
624
-
625
- if not args.cleanup:
626
- successful_hosts = []
627
- for host in cluster_config['hosts']:
628
- if isinstance(host, str):
629
- host_node = host
630
- else:
631
- host_node = host['ip']
632
- if host_node not in unsuccessful_workers:
633
- successful_hosts.append(host)
634
- cluster_config['hosts'] = successful_hosts
635
- with open(history_yaml_file, 'w', encoding='utf-8') as f:
636
- print(
637
- f'{YELLOW}Writing history to {history_yaml_file}{NC}'
638
- )
639
- yaml.dump(cluster_config, f)
640
-
641
- print(
642
- f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
643
- )
644
- successful_clusters.append(cluster_name)
645
- except Exception as e: # pylint: disable=broad-except
646
- reason = str(e)
647
- failed_clusters.append((cluster_name, reason))
648
- print(
649
- f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
650
- ) # Print for internal logging
651
-
652
- if failed_clusters:
653
- action = 'clean' if args.cleanup else 'deploy'
654
- msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
655
- msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
656
- for cluster_name, reason in failed_clusters:
657
- msg += f'\n {cluster_name}: {reason}'
658
- raise RuntimeError(msg)
659
-
660
-
661
- def deploy_cluster(head_node,
662
- worker_nodes,
663
- ssh_user,
664
- ssh_key,
665
- context_name,
666
- password,
667
- head_use_ssh_config,
668
- worker_use_ssh_config,
669
- kubeconfig_path,
670
- cleanup,
671
- worker_hosts=None,
672
- history_worker_nodes=None,
673
- history_workers_info=None,
674
- history_use_ssh_config=None) -> List[str]:
675
- """Deploy or clean up a single Kubernetes cluster.
676
-
677
- Returns: List of unsuccessful worker nodes.
678
- """
679
- history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
680
- f'{context_name}-history.yaml')
681
- cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
682
- f'{context_name}-cert.pem')
683
- key_file_path = os.path.join(NODE_POOLS_INFO_DIR, f'{context_name}-key.pem')
684
- tunnel_log_file_path = os.path.join(NODE_POOLS_INFO_DIR,
685
- f'{context_name}-tunnel.log')
686
-
687
- # Generate the askpass block if password is provided
688
- askpass_block = create_askpass_script(password)
689
-
690
- # Token for k3s
691
- k3s_token = 'mytoken' # Any string can be used as the token
692
-
693
- # Pre-flight checks
694
- print(f'{YELLOW}Checking SSH connection to head node...{NC}')
695
- result = run_remote(
696
- head_node,
697
- f'echo \'SSH connection successful ({head_node})\'',
698
- ssh_user,
699
- ssh_key,
700
- use_ssh_config=head_use_ssh_config,
701
- # For SkySSHUpLineProcessor
702
- print_output=True)
703
- if not cleanup and result is None:
704
- with ux_utils.print_exception_no_traceback():
705
- raise RuntimeError(
706
- f'Failed to SSH to head node ({head_node}). '
707
- f'Please check the SSH configuration and logs for more details.'
708
- )
709
-
710
- # Checking history
711
- history_exists = (history_worker_nodes is not None and
712
- history_workers_info is not None and
713
- history_use_ssh_config is not None)
714
-
715
- # Cleanup history worker nodes
716
- worker_nodes_to_cleanup = []
717
- remove_worker_cmds = []
718
- if history_exists:
719
- for history_node, history_info, use_ssh_config in zip(
720
- history_worker_nodes, history_workers_info,
721
- history_use_ssh_config):
722
- if worker_hosts is not None and history_info not in worker_hosts:
723
- print(
724
- f'{YELLOW}Worker node {history_node} not found in YAML config. '
725
- f'Removing from history...{NC}')
726
- worker_nodes_to_cleanup.append(
727
- dict(
728
- node=history_node,
729
- user=ssh_user
730
- if history_info is None else history_info['user'],
731
- ssh_key=ssh_key if history_info is None else
732
- history_info['identity_file'],
733
- askpass_block=(askpass_block if history_info is None
734
- else create_askpass_script(
735
- history_info['password'])),
736
- use_ssh_config=use_ssh_config,
737
- ))
738
- remove_worker_cmds.append(
739
- f'kubectl delete node -l skypilot-ip={history_node}')
740
- # If this is a create operation and there exists some stale log,
741
- # cleanup the log for a new file to store new logs.
742
- if not cleanup and os.path.exists(tunnel_log_file_path):
743
- os.remove(tunnel_log_file_path)
744
-
745
- # If --cleanup flag is set, uninstall k3s and exit
746
- if cleanup:
747
- # Pickup all nodes
748
- worker_nodes_to_cleanup.clear()
749
- for node, info, use_ssh_config in zip(worker_nodes, worker_hosts,
750
- worker_use_ssh_config):
751
- worker_nodes_to_cleanup.append(
752
- dict(
753
- node=node,
754
- user=ssh_user if info is None else info['user'],
755
- ssh_key=ssh_key if info is None else info['identity_file'],
756
- askpass_block=(askpass_block if info is None else
757
- create_askpass_script(info['password'])),
758
- use_ssh_config=use_ssh_config,
759
- ))
760
-
761
- print(f'{YELLOW}Starting cleanup...{NC}')
762
-
763
- # Clean up head node
764
- cleanup_server_node(head_node,
765
- ssh_user,
766
- ssh_key,
767
- askpass_block,
768
- use_ssh_config=head_use_ssh_config)
769
- # Clean up worker nodes
770
- with cf.ThreadPoolExecutor() as executor:
771
- executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
772
- worker_nodes_to_cleanup)
773
-
774
- with cf.ThreadPoolExecutor() as executor:
775
-
776
- def run_cleanup_cmd(cmd):
777
- print('Cleaning up worker nodes:', cmd)
778
- run_command(cmd, shell=True)
779
-
780
- executor.map(run_cleanup_cmd, remove_worker_cmds)
781
-
782
- if cleanup:
783
-
784
- # Remove the context from local kubeconfig if it exists
785
- if os.path.isfile(kubeconfig_path):
786
- progress_message(
787
- f'Removing context {context_name!r} from local kubeconfig...')
788
- run_command(['kubectl', 'config', 'delete-context', context_name],
789
- shell=False)
790
- run_command(['kubectl', 'config', 'delete-cluster', context_name],
791
- shell=False)
792
- run_command(['kubectl', 'config', 'delete-user', context_name],
793
- shell=False)
794
-
795
- # Update the current context to the first available context
796
- contexts = run_command([
797
- 'kubectl', 'config', 'view', '-o',
798
- 'jsonpath=\'{.contexts[0].name}\''
799
- ],
800
- shell=False)
801
- if contexts:
802
- run_command(['kubectl', 'config', 'use-context', contexts],
803
- shell=False)
804
- else:
805
- # If no context is available, simply unset the current context
806
- run_command(['kubectl', 'config', 'unset', 'current-context'],
807
- shell=False)
808
-
809
- success_message(
810
- f'Context {context_name!r} removed from local kubeconfig.')
811
-
812
- for file in [history_yaml_file, cert_file_path, key_file_path]:
813
- if os.path.exists(file):
814
- os.remove(file)
815
-
816
- # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
817
- # will restart the ssh tunnel if it's not running.
818
- cleanup_kubectl_ssh_tunnel(context_name)
819
-
820
- print(f'{GREEN}Cleanup completed successfully.{NC}')
821
-
822
- # Print completion marker for current cluster
823
- print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
824
-
825
- return []
826
-
827
- print(f'{YELLOW}Checking TCP Forwarding Options...{NC}')
828
- cmd = (
829
- 'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
830
- f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
831
- 'else '
832
- 'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' ' # pylint: disable=anomalous-backslash-in-string
833
- '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
834
- f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
835
- 'fi')
836
- result = run_remote(
837
- head_node,
838
- shlex.quote(cmd),
839
- ssh_user,
840
- ssh_key,
841
- use_ssh_config=head_use_ssh_config,
842
- # For SkySSHUpLineProcessor
843
- print_output=True,
844
- use_shell=True)
845
- if result is None:
846
- with ux_utils.print_exception_no_traceback():
847
- raise RuntimeError(
848
- f'Failed to setup TCP forwarding on head node ({head_node}). '
849
- f'Please check the SSH configuration.')
850
-
851
- # Get effective IP for master node if using SSH config - needed for workers to connect
852
- if head_use_ssh_config:
853
- effective_master_ip = get_effective_host_ip(head_node)
854
- print(
855
- f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
856
- )
857
- else:
858
- effective_master_ip = head_node
859
-
860
- # Step 1: Install k3s on the head node
861
- # Check if head node has a GPU
862
- install_gpu = False
863
- progress_message(f'Deploying Kubernetes on head node ({head_node})...')
864
- cmd = f"""
865
- {askpass_block}
866
- curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
867
- mkdir -p ~/.kube &&
868
- sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
869
- sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
- for i in {{1..3}}; do
- if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
- break
- else
- echo 'Waiting for nodes to be ready...'
- sleep 5
- fi
- done
- if [ $i -eq 3 ]; then
- echo 'Failed to wait for nodes to be ready after 3 attempts'
- exit 1
- fi
- """
- result = run_remote(head_node,
- cmd,
- ssh_user,
- ssh_key,
- use_ssh_config=head_use_ssh_config)
- if result is None:
- with ux_utils.print_exception_no_traceback():
- raise RuntimeError(
- f'Failed to deploy K3s on head node ({head_node}).')
- success_message(f'K3s deployed on head node ({head_node}).')
-
- # Check if head node has a GPU
- install_gpu = False
- if check_gpu(head_node,
- ssh_user,
- ssh_key,
- use_ssh_config=head_use_ssh_config):
- print(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
- install_gpu = True
-
- # Fetch the head node's internal IP (this will be passed to worker nodes)
- master_addr = run_remote(head_node,
- 'hostname -I | awk \'{print $1}\'',
- ssh_user,
- ssh_key,
- use_ssh_config=head_use_ssh_config)
- if master_addr is None:
- with ux_utils.print_exception_no_traceback():
- raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
- f'Please check the SSH configuration.')
- print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
-
- # Step 2: Install k3s on worker nodes and join them to the master node
- def deploy_worker(args):
- (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
- askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
- progress_message(f'Deploying Kubernetes on worker node ({node})...')
-
- # If using YAML config with specific worker info
- if worker_hosts and i < len(worker_hosts):
- if history_workers_info is not None and worker_hosts[
- i] in history_workers_info:
- print(
- f'{YELLOW}Worker node ({node}) already exists in history. '
- f'Skipping...{NC}')
- return node, True, False
- worker_user = worker_hosts[i]['user']
- worker_key = worker_hosts[i]['identity_file']
- worker_password = worker_hosts[i]['password']
- worker_askpass = create_askpass_script(worker_password)
- worker_config = worker_use_ssh_config[i]
- else:
- worker_user = ssh_user
- worker_key = ssh_key
- worker_askpass = askpass_block
- worker_config = worker_use_ssh_config[i]
-
- return start_agent_node(node,
- master_addr,
- k3s_token,
- worker_user,
- worker_key,
- worker_askpass,
- use_ssh_config=worker_config)
-
- unsuccessful_workers = []
-
- # Deploy workers in parallel using thread pool
- with cf.ThreadPoolExecutor() as executor:
- futures = []
- for i, node in enumerate(worker_nodes):
- args = (i, node, worker_hosts, history_workers_info, ssh_user,
- ssh_key, askpass_block, worker_use_ssh_config, master_addr,
- k3s_token)
- futures.append(executor.submit(deploy_worker, args))
-
- # Check if worker node has a GPU
- for future in cf.as_completed(futures):
- node, suc, has_gpu = future.result()
- install_gpu = install_gpu or has_gpu
- if not suc:
- unsuccessful_workers.append(node)
-
- # Step 3: Configure local kubectl to connect to the cluster
- progress_message('Configuring local kubectl to connect to the cluster...')
-
- # Create temporary directory for kubeconfig operations
- with tempfile.TemporaryDirectory() as temp_dir:
- temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
-
- # Get the kubeconfig from remote server
- if head_use_ssh_config:
- scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
- else:
- scp_cmd = [
- 'scp', '-o', 'StrictHostKeyChecking=no', '-o',
- 'IdentitiesOnly=yes', '-i', ssh_key,
- f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
- ]
- run_command(scp_cmd, shell=False)
-
- # Create the directory for the kubeconfig file if it doesn't exist
- ensure_directory_exists(kubeconfig_path)
-
- # Create empty kubeconfig if it doesn't exist
- if not os.path.isfile(kubeconfig_path):
- open(kubeconfig_path, 'a', encoding='utf-8').close()
-
- # Modify the temporary kubeconfig to update server address and context name
- modified_config = os.path.join(temp_dir, 'modified_config')
- with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
- with open(modified_config, 'w', encoding='utf-8') as f_out:
- in_cluster = False
- in_user = False
- client_cert_data = None
- client_key_data = None
-
- for line in f_in:
- if 'clusters:' in line:
- in_cluster = True
- in_user = False
- elif 'users:' in line:
- in_cluster = False
- in_user = True
- elif 'contexts:' in line:
- in_cluster = False
- in_user = False
-
- # Skip certificate authority data in cluster section
- if in_cluster and 'certificate-authority-data:' in line:
- continue
- # Skip client certificate data in user section but extract it
- elif in_user and 'client-certificate-data:' in line:
- client_cert_data = line.split(':', 1)[1].strip()
- continue
- # Skip client key data in user section but extract it
- elif in_user and 'client-key-data:' in line:
- client_key_data = line.split(':', 1)[1].strip()
- continue
- elif in_cluster and 'server:' in line:
- # Initially just set to the effective master IP
- # (will be changed to localhost by setup_kubectl_ssh_tunnel later)
- f_out.write(
- f' server: https://{effective_master_ip}:6443\n')
- f_out.write(' insecure-skip-tls-verify: true\n')
- continue
-
- # Replace default context names with user-provided context name
- line = line.replace('name: default',
- f'name: {context_name}')
- line = line.replace('cluster: default',
- f'cluster: {context_name}')
- line = line.replace('user: default',
- f'user: {context_name}')
- line = line.replace('current-context: default',
- f'current-context: {context_name}')
-
- f_out.write(line)
-
- # Save certificate data if available
-
- if client_cert_data:
- # Decode base64 data and save as PEM
- try:
- # Clean up the certificate data by removing whitespace
- clean_cert_data = ''.join(client_cert_data.split())
- cert_pem = base64.b64decode(clean_cert_data).decode(
- 'utf-8')
-
- # Check if the data already looks like a PEM file
- has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
- has_end = '-----END CERTIFICATE-----' in cert_pem
-
- if not has_begin or not has_end:
- print(
- f'{YELLOW}Warning: Certificate data missing PEM markers, attempting to fix...{NC}'
- )
- # Add PEM markers if missing
- if not has_begin:
- cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
- if not has_end:
- cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
-
- # Write the certificate
- with open(cert_file_path, 'w',
- encoding='utf-8') as cert_file:
- cert_file.write(cert_pem)
-
- # Verify the file was written correctly
- if os.path.getsize(cert_file_path) > 0:
- print(
- f'{GREEN}Successfully saved certificate data ({len(cert_pem)} bytes){NC}'
- )
-
- # Quick validation of PEM format
- with open(cert_file_path, 'r',
- encoding='utf-8') as f:
- content = f.readlines()
- first_line = content[0].strip(
- ) if content else ''
- last_line = content[-1].strip(
- ) if content else ''
-
- if not first_line.startswith(
- '-----BEGIN') or not last_line.startswith(
- '-----END'):
- print(
- f'{YELLOW}Warning: Certificate may not be in proper PEM format{NC}'
- )
- else:
- print(f'{RED}Error: Certificate file is empty{NC}')
- except Exception as e: # pylint: disable=broad-except
- print(
- f'{RED}Error processing certificate data: {e}{NC}')
-
- if client_key_data:
- # Decode base64 data and save as PEM
- try:
- # Clean up the key data by removing whitespace
- clean_key_data = ''.join(client_key_data.split())
- key_pem = base64.b64decode(clean_key_data).decode(
- 'utf-8')
-
- # Check if the data already looks like a PEM file
-
- # Check for EC key format
- if 'EC PRIVATE KEY' in key_pem:
- # Handle EC KEY format directly
- match_ec = re.search(
- r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
- key_pem, re.DOTALL)
- if match_ec:
- # Extract and properly format EC key
- key_content = match_ec.group(1).strip()
- key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
- else:
- # Extract content and assume EC format
- key_content = re.sub(r'-----BEGIN.*?-----', '',
- key_pem)
- key_content = re.sub(r'-----END.*?-----.*', '',
- key_content).strip()
- key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
- else:
- # Handle regular private key format
- has_begin = any(marker in key_pem for marker in [
- '-----BEGIN PRIVATE KEY-----',
- '-----BEGIN RSA PRIVATE KEY-----'
- ])
- has_end = any(marker in key_pem for marker in [
- '-----END PRIVATE KEY-----',
- '-----END RSA PRIVATE KEY-----'
- ])
-
- if not has_begin or not has_end:
- print(
- f'{YELLOW}Warning: Key data missing PEM markers, attempting to fix...{NC}'
- )
- # Add PEM markers if missing
- if not has_begin:
- key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
- if not has_end:
- key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
- # Remove any trailing characters after END marker
- key_pem = re.sub(
- r'(-----END PRIVATE KEY-----).*', r'\1',
- key_pem)
-
- # Write the key
- with open(key_file_path, 'w',
- encoding='utf-8') as key_file:
- key_file.write(key_pem)
-
- # Verify the file was written correctly
- if os.path.getsize(key_file_path) > 0:
- print(
- f'{GREEN}Successfully saved key data ({len(key_pem)} bytes){NC}'
- )
-
- # Quick validation of PEM format
- with open(key_file_path, 'r',
- encoding='utf-8') as f:
- content = f.readlines()
- first_line = content[0].strip(
- ) if content else ''
- last_line = content[-1].strip(
- ) if content else ''
-
- if not first_line.startswith(
- '-----BEGIN') or not last_line.startswith(
- '-----END'):
- print(
- f'{YELLOW}Warning: Key may not be in proper PEM format{NC}'
- )
- else:
- print(f'{RED}Error: Key file is empty{NC}')
- except Exception as e: # pylint: disable=broad-except
- print(f'{RED}Error processing key data: {e}{NC}')
-
- # First check if context name exists and delete it if it does
- # TODO(romilb): Should we throw an error here instead?
- run_command(['kubectl', 'config', 'delete-context', context_name],
- shell=False)
- run_command(['kubectl', 'config', 'delete-cluster', context_name],
- shell=False)
- run_command(['kubectl', 'config', 'delete-user', context_name],
- shell=False)
-
- # Merge the configurations using kubectl
- merged_config = os.path.join(temp_dir, 'merged_config')
- os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
- with open(merged_config, 'w', encoding='utf-8') as merged_file:
- kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
- result = run_command(kubectl_cmd, shell=False)
- if result:
- merged_file.write(result)
-
- # Replace the kubeconfig with the merged config
- shutil.move(merged_config, kubeconfig_path)
-
- # Set the new context as the current context
- run_command(['kubectl', 'config', 'use-context', context_name],
- shell=False)
-
- # Always set up SSH tunnel since we assume only port 22 is accessible
- setup_kubectl_ssh_tunnel(head_node,
- ssh_user,
- ssh_key,
- context_name,
- use_ssh_config=head_use_ssh_config)
-
- success_message(f'kubectl configured with new context \'{context_name}\'.')
-
- print(
- f'Cluster deployment completed. Kubeconfig saved to {kubeconfig_path}')
- print('You can now run \'kubectl get nodes\' to verify the setup.')
-
- # Install GPU operator if a GPU was detected on any node
- if install_gpu:
- print(
- f'{YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...{NC}'
- )
- cmd = f"""
- {askpass_block}
- curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
- chmod 700 get_helm.sh &&
- ./get_helm.sh &&
- helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
- kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
- sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
- helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
- --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
- --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
- --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
- --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
- --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
- --set 'toolkit.env[2].value=nvidia' &&
- echo 'Waiting for GPU operator installation...' &&
- while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
- echo 'Waiting for GPU operator...'
- sleep 5
- done
- echo 'GPU operator installed successfully.'
- """
- result = run_remote(head_node,
- cmd,
- ssh_user,
- ssh_key,
- use_ssh_config=head_use_ssh_config)
- if result is None:
- print(f'{RED}Failed to install GPU Operator.{NC}')
- else:
- success_message('GPU Operator installed.')
- else:
- print(
- f'{YELLOW}No GPUs detected. Skipping GPU Operator installation.{NC}'
- )
-
- # Configure SkyPilot
- progress_message('Configuring SkyPilot...')
-
- # The env var KUBECONFIG ensures sky check uses the right kubeconfig
- os.environ['KUBECONFIG'] = kubeconfig_path
- run_command(['sky', 'check', 'kubernetes'], shell=False)
-
- success_message('SkyPilot configured successfully.')
-
- # Display final success message
- print(
- f'{GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}'
- )
- print(
- 'You can now interact with your Kubernetes cluster through SkyPilot: ')
- print(' • List available GPUs: sky show-gpus --cloud kubernetes')
- print(
- ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
- )
- print(
- ' • Connect to pod with VSCode: code --remote ssh-remote+devbox "/home"'
- )
- # Print completion marker for current cluster
- print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
-
- if unsuccessful_workers:
- quoted_unsuccessful_workers = [
- f'"{worker}"' for worker in unsuccessful_workers
- ]
-
- print(
- f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
- f'{", ".join(quoted_unsuccessful_workers)}. Please check '
- f'the logs for more details.{NC}')
-
- return unsuccessful_workers
-
-
- if __name__ == '__main__':
- main()