skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,5 @@
  """SSH-based Kubernetes Cluster Deployment Script"""
- # Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. # pylint: disable=line-too-long
- import argparse
+ # pylint: disable=line-too-long
  import base64
  import concurrent.futures as cf
  import os
@@ -11,10 +10,13 @@ import shutil
  import subprocess
  import sys
  import tempfile
- from typing import List, Set
+ from typing import List, Optional, Set

+ import colorama
  import yaml

+ from sky import sky_logging
+ from sky.utils import rich_utils
  from sky.utils import ux_utils
  from sky.utils.kubernetes import ssh_utils

@@ -24,6 +26,9 @@ GREEN = '\033[0;32m'
  YELLOW = '\033[1;33m'
  WARNING_YELLOW = '\x1b[33m'
  NC = '\033[0m' # No color
+ DIM = colorama.Style.DIM
+ CYAN = colorama.Fore.CYAN
+ RESET_ALL = colorama.Style.RESET_ALL

  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
  SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
@@ -32,69 +37,10 @@ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
  # Get the directory of this script
  SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

+ logger = sky_logging.init_logger(__name__)

- def parse_args():
- parser = argparse.ArgumentParser(
- description='Deploy a Kubernetes cluster on remote machines.')
- parser.add_argument(
- '--infra', help='Name of the cluster in ssh_node_pools.yaml to use')
- parser.add_argument(
- '--ssh-node-pools-file',
- dest='ssh_node_pools_file',
- default=ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
- help=
- f'Path to SSH node pools YAML file (default: {ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH})'
- )
- parser.add_argument(
- '--kubeconfig-path',
- dest='kubeconfig_path',
- default=DEFAULT_KUBECONFIG_PATH,
- help=
- f'Path to save the kubeconfig file (default: {DEFAULT_KUBECONFIG_PATH})'
- )
- parser.add_argument(
- '--use-ssh-config',
- dest='use_ssh_config',
- action='store_true',
- help='Use SSH config for host settings instead of explicit parameters')
- #TODO(romilb): The `sky local up --ips` command is deprecated and these args are now captured in the ssh_node_pools.yaml file.
- # Remove these args after 0.11.0 release.
- parser.add_argument(
- '--ips-file',
- dest='ips_file',
- help=
- '[Deprecated, use --ssh-node-pools-file instead] File containing IP addresses or SSH host entries (one per line)'
- )
- parser.add_argument(
- '--user',
- help=
- '[Deprecated, use --ssh-node-pools-file instead] Username to use for SSH (overridden by SSH config if host exists there)'
- )
- parser.add_argument(
- '--ssh-key',
- dest='ssh_key',
- help=
- '[Deprecated, use --ssh-node-pools-file instead] Path to SSH private key (overridden by SSH config if host exists there)'
- )
- parser.add_argument(
- '--context-name',
- dest='context_name',
- default='default',
- help=
- '[Deprecated, use --ssh-node-pools-file instead] Kubernetes context name'
- )
- parser.add_argument('--cleanup',
- action='store_true',
- help='Clean up the cluster')
- parser.add_argument(
- '--password',
- help='[Deprecated, use --ssh-node-pools-file instead] Password for sudo'
- )
-
- return parser.parse_args()

-
- def run_command(cmd, shell=False):
+ def run_command(cmd, shell=False, silent=False):
  """Run a local command and return the output."""
  process = subprocess.run(cmd,
  shell=shell,
@@ -102,9 +48,10 @@ def run_command(cmd, shell=False):
  text=True,
  check=False)
  if process.returncode != 0:
- print(f'{RED}Error executing command: {cmd}{NC}')
- print(f'STDOUT: {process.stdout}')
- print(f'STDERR: {process.stderr}')
+ if not silent:
+ logger.error(f'{RED}Error executing command: {cmd}{NC}\n'
+ f'STDOUT: {process.stdout}\n'
+ f'STDERR: {process.stderr}')
  return None
  return process.stdout.strip()

@@ -132,8 +79,12 @@ def run_remote(node,
  connect_timeout=30,
  use_ssh_config=False,
  print_output=False,
- use_shell=False):
- """Run a command on a remote machine via SSH."""
+ use_shell=False,
+ silent=False):
+ """Run a command on a remote machine via SSH.
+
+ silent is used for gpu checking (will show error logs when no gpus are found)"""
+ ssh_cmd: List[str]
  if use_ssh_config:
  # Use SSH config for connection parameters
  ssh_cmd = ['ssh', node, cmd]
@@ -153,20 +104,19 @@ def run_remote(node,
  ssh_cmd.append(f'{user}@{node}' if user else node)
  ssh_cmd.append(cmd)

- if use_shell:
- ssh_cmd = ' '.join(ssh_cmd)
-
- process = subprocess.run(ssh_cmd,
+ subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
+ process = subprocess.run(subprocess_cmd,
  capture_output=True,
  text=True,
  check=False,
  shell=use_shell)
  if process.returncode != 0:
- print(f'{RED}Error executing command {cmd} on {node}:{NC}')
- print(f'STDERR: {process.stderr}')
+ if not silent:
+ logger.error(f'{RED}Error executing command {cmd} on {node}:{NC} '
+ f'{process.stderr}')
  return None
  if print_output:
- print(process.stdout)
+ logger.info(process.stdout)
  return process.stdout.strip()


@@ -191,12 +141,17 @@ export SUDO_ASKPASS=$ASKPASS_SCRIPT

  def progress_message(message):
  """Show a progress message."""
- print(f'{YELLOW}➜ {message}{NC}')
+ logger.info(f'{YELLOW}➜ {message}{NC}')


  def success_message(message):
  """Show a success message."""
- print(f'{GREEN}✔ {message}{NC}')
+ logger.info(f'{GREEN}✔ {message}{NC}')
+
+
+ def force_update_status(message):
+ """Force update rich spinner status."""
+ rich_utils.force_update_status(ux_utils.spinner_message(message))


  def cleanup_server_node(node,
@@ -205,7 +160,7 @@ def cleanup_server_node(node,
  askpass_block,
  use_ssh_config=False):
  """Uninstall k3s and clean up the state on a server node."""
- print(f'{YELLOW}Cleaning up head node {node}...{NC}')
+ force_update_status(f'Cleaning up head node ({node})...')
  cmd = f"""
  {askpass_block}
  echo 'Uninstalling k3s...' &&
@@ -214,7 +169,7 @@ def cleanup_server_node(node,
  """
  result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
  if result is None:
- print(f'{RED}Failed to clean up head node ({node}).{NC}')
+ logger.error(f'{RED}Failed to clean up head node ({node}).{NC}')
  else:
  success_message(f'Node {node} cleaned up successfully.')

@@ -225,7 +180,7 @@ def cleanup_agent_node(node,
  askpass_block,
  use_ssh_config=False):
  """Uninstall k3s and clean up the state on an agent node."""
- print(f'{YELLOW}Cleaning up worker node {node}...{NC}')
+ force_update_status(f'Cleaning up worker node ({node})...')
  cmd = f"""
  {askpass_block}
  echo 'Uninstalling k3s...' &&
@@ -234,7 +189,7 @@ def cleanup_agent_node(node,
  """
  result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
  if result is None:
- print(f'{RED}Failed to clean up worker node ({node}).{NC}')
+ logger.error(f'{RED}Failed to clean up worker node ({node}).{NC}')
  else:
  success_message(f'Node {node} cleaned up successfully.')

@@ -248,6 +203,7 @@ def start_agent_node(node,
  use_ssh_config=False):
  """Start a k3s agent node.
  Returns: if the start is successful, and if the node has a GPU."""
+ logger.info(f'Deploying worker node ({node}).')
  cmd = f"""
  {askpass_block}
  curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
@@ -255,12 +211,14 @@ def start_agent_node(node,
  """
  result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
  if result is None:
- print(f'{RED}Failed to deploy K3s on worker node ({node}).{NC}')
+ logger.error(
+ f'{RED}✗ Failed to deploy K3s on worker node ({node}).{NC}')
  return node, False, False
- success_message(f'Kubernetes deployed on worker node ({node}).')
+ success_message(
+ f'SkyPilot runtime successfully deployed on worker node ({node}).')
  # Check if worker node has a GPU
  if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
- print(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
+ logger.info(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
  return node, True, True
  return node, True, False

@@ -268,7 +226,12 @@ def start_agent_node(node,
  def check_gpu(node, user, ssh_key, use_ssh_config=False):
  """Check if a node has a GPU."""
  cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
- result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
+ result = run_remote(node,
+ cmd,
+ user,
+ ssh_key,
+ use_ssh_config=use_ssh_config,
+ silent=True)
  return result is not None


@@ -399,7 +362,7 @@ def setup_kubectl_ssh_tunnel(head_node,
  has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
  client_key_file)
  if has_cert_files:
- print(
+ logger.info(
  f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
  )

@@ -426,22 +389,22 @@ def setup_kubectl_ssh_tunnel(head_node,
  success_message(
  f'SSH tunnel configured through kubectl credential plugin on port {port}'
  )
- print(
+ logger.info(
  f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
  )
- print(
+ logger.info(
  f'{GREEN}This tunnel will be automatically established when needed.{NC}'
  )
- print(
+ logger.info(
  f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
  )

  return port


- def cleanup_kubectl_ssh_tunnel(context_name):
+ def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
  """Clean up the SSH tunnel for a specific context"""
- progress_message(f'Cleaning up SSH tunnel for context {context_name}...')
+ progress_message(f'Cleaning up SSH tunnel for `{cluster_name}`...')

  # Path to cleanup script
  cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
@@ -456,201 +419,148 @@ def cleanup_kubectl_ssh_tunnel(context_name):
  stderr=subprocess.DEVNULL,
  check=False)

- success_message(f'SSH tunnel for context {context_name} cleaned up')
+ success_message(f'SSH tunnel for `{cluster_name}` cleaned up.')
  else:
- print(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
+ logger.error(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')


- def main():
- args = parse_args()
+ def deploy_clusters(
+ infra: Optional[str],
+ ssh_node_pools_file: str = ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
+ kubeconfig_path: Optional[str] = None,
+ cleanup: bool = True):

- kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
- global_use_ssh_config = args.use_ssh_config
+ kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
+ kubeconfig_path = os.path.expanduser(kubeconfig_path)

  failed_clusters = []
  successful_clusters = []

- # Print cleanup mode marker if applicable
- if args.cleanup:
- print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
-
- # Check if using YAML configuration or command line arguments
- if args.ips_file:
- # Using command line arguments - legacy mode
- if args.ssh_key and not os.path.isfile(
- args.ssh_key) and not global_use_ssh_config:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(f'SSH key not found: {args.ssh_key}')
-
- if not os.path.isfile(args.ips_file):
- with ux_utils.print_exception_no_traceback():
- raise ValueError(f'IPs file not found: {args.ips_file}')
-
- with open(args.ips_file, 'r', encoding='utf-8') as f:
- hosts = [line.strip() for line in f if line.strip()]
-
- if not hosts:
- with ux_utils.print_exception_no_traceback():
- raise ValueError(
- 'Hosts file is empty or not formatted correctly.')
-
- head_node = hosts[0]
- worker_nodes = hosts[1:]
- ssh_user = args.user if not global_use_ssh_config else ''
- ssh_key = args.ssh_key if not global_use_ssh_config else ''
- context_name = args.context_name
- password = args.password
-
- # Check if hosts are in SSH config
- head_use_ssh_config = global_use_ssh_config or ssh_utils.check_host_in_ssh_config(
- head_node)
- worker_use_ssh_config = [
- global_use_ssh_config or ssh_utils.check_host_in_ssh_config(node)
- for node in worker_nodes
- ]
+ # Using YAML configuration
+ targets = ssh_utils.load_ssh_targets(ssh_node_pools_file)
+ clusters_config = ssh_utils.get_cluster_config(
+ targets, infra, file_path=ssh_node_pools_file)
+
+ # Print information about clusters being processed
+ num_clusters = len(clusters_config)
+ cluster_names = list(clusters_config.keys())
+ cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
+ logger.info(f'{colorama.Fore.CYAN}{cluster_info}{colorama.Style.RESET_ALL}')
+
+ # Process each cluster
+ for cluster_name, cluster_config in clusters_config.items():
+ try:
+ action = 'Cleaning up' if cleanup else 'Deploying'
+ force_update_status(f'{action} Node Pool: {cluster_name}')
+ hosts_info = ssh_utils.prepare_hosts_info(cluster_name,
+ cluster_config)
+
+ if not hosts_info:
+ logger.warning(
+ f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
+ )
+ continue

- # Single cluster deployment for legacy mode
- deploy_cluster(head_node, worker_nodes, ssh_user, ssh_key, context_name,
- password, head_use_ssh_config, worker_use_ssh_config,
- kubeconfig_path, args.cleanup)
- else:
- # Using YAML configuration
- targets = ssh_utils.load_ssh_targets(args.ssh_node_pools_file)
- clusters_config = ssh_utils.get_cluster_config(
- targets, args.infra, file_path=args.ssh_node_pools_file)
-
- # Print information about clusters being processed
- num_clusters = len(clusters_config)
- cluster_names = list(clusters_config.keys())
- cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
- print(f'SKYPILOT_CLUSTER_INFO: {cluster_info}')
-
- # Process each cluster
- for cluster_name, cluster_config in clusters_config.items():
- try:
- print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
- print(
- f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
- hosts_info = ssh_utils.prepare_hosts_info(
- cluster_name, cluster_config)
-
- if not hosts_info:
- print(
- f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
- )
- continue
-
- # Generate a unique context name for each cluster
- context_name = args.context_name
- if context_name == 'default':
- context_name = 'ssh-' + cluster_name
-
- # Check cluster history
- os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
- history_yaml_file = os.path.join(
- NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
-
- history = None
- if os.path.exists(history_yaml_file):
- print(
- f'{YELLOW}Loading history from {history_yaml_file}{NC}')
- with open(history_yaml_file, 'r', encoding='utf-8') as f:
- history = yaml.safe_load(f)
- else:
- print(f'{YELLOW}No history found for {context_name}.{NC}')
-
- history_workers_info = None
- history_worker_nodes = None
- history_use_ssh_config = None
- # Do not support changing anything besides hosts for now
- if history is not None:
- for key in ['user', 'identity_file', 'password']:
- if not args.cleanup and history.get(
- key) != cluster_config.get(key):
- raise ValueError(
- f'Cluster configuration has changed for field {key!r}. '
- f'Previous value: {history.get(key)}, '
- f'Current value: {cluster_config.get(key)}')
- history_hosts_info = ssh_utils.prepare_hosts_info(
- cluster_name, history)
- if not args.cleanup and history_hosts_info[0] != hosts_info[
- 0]:
+ context_name = f'ssh-{cluster_name}'
+
+ # Check cluster history
+ os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
+ history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
+ f'{context_name}-history.yaml')
+
+ history = None
+ if os.path.exists(history_yaml_file):
+ logger.debug(f'Loading history from {history_yaml_file}')
+ with open(history_yaml_file, 'r', encoding='utf-8') as f:
+ history = yaml.safe_load(f)
+ else:
+ logger.debug(f'No history found for {context_name}.')
+
+ history_workers_info = None
+ history_worker_nodes = None
+ history_use_ssh_config = None
+ # Do not support changing anything besides hosts for now
+ if history is not None:
+ for key in ['user', 'identity_file', 'password']:
+ if not cleanup and history.get(key) != cluster_config.get(
+ key):
  raise ValueError(
- f'Cluster configuration has changed for master node. '
- f'Previous value: {history_hosts_info[0]}, '
- f'Current value: {hosts_info[0]}')
- history_workers_info = history_hosts_info[1:] if len(
- history_hosts_info) > 1 else []
- history_worker_nodes = [
- h['ip'] for h in history_workers_info
- ]
- history_use_ssh_config = [
- h.get('use_ssh_config', False)
- for h in history_workers_info
- ]
-
- # Use the first host as the head node and the rest as worker nodes
- head_host = hosts_info[0]
- worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
-
- head_node = head_host['ip']
- worker_nodes = [h['ip'] for h in worker_hosts]
- ssh_user = head_host['user']
- ssh_key = head_host['identity_file']
- head_use_ssh_config = global_use_ssh_config or head_host.get(
- 'use_ssh_config', False)
- worker_use_ssh_config = [
- global_use_ssh_config or h.get('use_ssh_config', False)
- for h in worker_hosts
+ f'Cluster configuration has changed for field {key!r}. '
+ f'Previous value: {history.get(key)}, '
+ f'Current value: {cluster_config.get(key)}')
+ history_hosts_info = ssh_utils.prepare_hosts_info(
+ cluster_name, history)
+ if not cleanup and history_hosts_info[0] != hosts_info[0]:
+ raise ValueError(
+ f'Cluster configuration has changed for master node. '
+ f'Previous value: {history_hosts_info[0]}, '
+ f'Current value: {hosts_info[0]}')
+ history_workers_info = history_hosts_info[1:] if len(
+ history_hosts_info) > 1 else []
+ history_worker_nodes = [h['ip'] for h in history_workers_info]
+ history_use_ssh_config = [
+ h.get('use_ssh_config', False) for h in history_workers_info
  ]
- password = head_host['password']
-
- # Deploy this cluster
- unsuccessful_workers = deploy_cluster(
- head_node,
- worker_nodes,
- ssh_user,
- ssh_key,
- context_name,
- password,
- head_use_ssh_config,
- worker_use_ssh_config,
- kubeconfig_path,
- args.cleanup,
- worker_hosts=worker_hosts,
- history_worker_nodes=history_worker_nodes,
- history_workers_info=history_workers_info,
- history_use_ssh_config=history_use_ssh_config)
-
- if not args.cleanup:
- successful_hosts = []
- for host in cluster_config['hosts']:
- if isinstance(host, str):
- host_node = host
- else:
- host_node = host['ip']
- if host_node not in unsuccessful_workers:
- successful_hosts.append(host)
- cluster_config['hosts'] = successful_hosts
- with open(history_yaml_file, 'w', encoding='utf-8') as f:
- print(
- f'{YELLOW}Writing history to {history_yaml_file}{NC}'
- )
- yaml.dump(cluster_config, f)
-
- print(
- f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
- )
- successful_clusters.append(cluster_name)
- except Exception as e: # pylint: disable=broad-except
- reason = str(e)
- failed_clusters.append((cluster_name, reason))
- print(
- f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
- ) # Print for internal logging
+
+ # Use the first host as the head node and the rest as worker nodes
+ head_host = hosts_info[0]
+ worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+ head_node = head_host['ip']
+ worker_nodes = [h['ip'] for h in worker_hosts]
+ ssh_user = head_host['user']
+ ssh_key = head_host['identity_file']
+ head_use_ssh_config = head_host.get('use_ssh_config', False)
+ worker_use_ssh_config = [
+ h.get('use_ssh_config', False) for h in worker_hosts
+ ]
+ password = head_host['password']
+
+ # Deploy this cluster
+ unsuccessful_workers = deploy_cluster(
+ cluster_name,
+ head_node,
+ worker_nodes,
+ ssh_user,
+ ssh_key,
+ context_name,
+ password,
+ head_use_ssh_config,
+ worker_use_ssh_config,
+ kubeconfig_path,
+ cleanup,
+ worker_hosts=worker_hosts,
+ history_worker_nodes=history_worker_nodes,
+ history_workers_info=history_workers_info,
+ history_use_ssh_config=history_use_ssh_config)
+
+ if not cleanup:
+ successful_hosts = []
+ for host in cluster_config['hosts']:
+ if isinstance(host, str):
+ host_node = host
+ else:
+ host_node = host['ip']
+ if host_node not in unsuccessful_workers:
+ successful_hosts.append(host)
+ cluster_config['hosts'] = successful_hosts
+ with open(history_yaml_file, 'w', encoding='utf-8') as f:
+ logger.debug(f'Writing history to {history_yaml_file}')
+ yaml.dump(cluster_config, f)
+
+ action = 'cleanup' if cleanup else 'deployment'
+ logger.info(
+ f'{colorama.Fore.CYAN}Completed {action} for cluster: {cluster_name}{colorama.Style.RESET_ALL}'
+ )
+ successful_clusters.append(cluster_name)
+ except Exception as e: # pylint: disable=broad-except
+ reason = str(e)
+ failed_clusters.append((cluster_name, reason))
+ logger.debug(
+ f'Error deploying SSH Node Pool `{cluster_name}`: {reason}')

  if failed_clusters:
- action = 'clean' if args.cleanup else 'deploy'
+ action = 'clean' if cleanup else 'deploy'
  msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
  msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
  for cluster_name, reason in failed_clusters:
@@ -658,7 +568,8 @@ def main():
  raise RuntimeError(msg)


- def deploy_cluster(head_node,
+ def deploy_cluster(cluster_name,
+ head_node,
  worker_nodes,
  ssh_user,
  ssh_key,
@@ -691,15 +602,15 @@ def deploy_cluster(head_node,
  k3s_token = 'mytoken' # Any string can be used as the token

  # Pre-flight checks
- print(f'{YELLOW}Checking SSH connection to head node...{NC}')
- result = run_remote(
- head_node,
- f'echo \'SSH connection successful ({head_node})\'',
- ssh_user,
- ssh_key,
- use_ssh_config=head_use_ssh_config,
- # For SkySSHUpLineProcessor
- print_output=True)
+ logger.info(f'Checking SSH connection to head node ({head_node})...')
+ result = run_remote(head_node,
+ f'echo \'SSH connection successful ({head_node})\'',
+ ssh_user,
+ ssh_key,
+ use_ssh_config=head_use_ssh_config)
+ if result.startswith('SSH connection successful'):
+ success_message(f'SSH connection established to head node {head_node}.')
+
  if not cleanup and result is None:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
@@ -720,9 +631,9 @@ def deploy_cluster(head_node,
  history_worker_nodes, history_workers_info,
  history_use_ssh_config):
  if worker_hosts is not None and history_info not in worker_hosts:
- print(
- f'{YELLOW}Worker node {history_node} not found in YAML config. '
- f'Removing from history...{NC}')
+ logger.debug(
+ f'Worker node {history_node} not found in YAML config. '
+ 'Removing from history...')
  worker_nodes_to_cleanup.append(
  dict(
  node=history_node,
@@ -758,8 +669,6 @@ def deploy_cluster(head_node,
  use_ssh_config=use_ssh_config,
  ))

- print(f'{YELLOW}Starting cleanup...{NC}')
-
  # Clean up head node
  cleanup_server_node(head_node,
  ssh_user,
@@ -767,23 +676,20 @@ def deploy_cluster(head_node,
  askpass_block,
  use_ssh_config=head_use_ssh_config)
  # Clean up worker nodes
+ force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
  with cf.ThreadPoolExecutor() as executor:
  executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
  worker_nodes_to_cleanup)

  with cf.ThreadPoolExecutor() as executor:
-
- def run_cleanup_cmd(cmd):
- print('Cleaning up worker nodes:', cmd)
- run_command(cmd, shell=True)
-
- executor.map(run_cleanup_cmd, remove_worker_cmds)
+ executor.map(lambda cmd: run_command(cmd, shell=True),
+ remove_worker_cmds)

  if cleanup:

  # Remove the context from local kubeconfig if it exists
  if os.path.isfile(kubeconfig_path):
- progress_message(
+ logger.debug(
  f'Removing context {context_name!r} from local kubeconfig...')
  run_command(['kubectl', 'config', 'delete-context', context_name],
  shell=False)
@@ -806,7 +712,7 @@ def deploy_cluster(head_node,
  run_command(['kubectl', 'config', 'unset', 'current-context'],
  shell=False)

- success_message(
+ logger.debug(
  f'Context {context_name!r} removed from local kubeconfig.')

  for file in [history_yaml_file, cert_file_path, key_file_path]:
@@ -815,16 +721,12 @@ def deploy_cluster(head_node,

  # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
  # will restart the ssh tunnel if it's not running.
- cleanup_kubectl_ssh_tunnel(context_name)
-
- print(f'{GREEN}Cleanup completed successfully.{NC}')
-
- # Print completion marker for current cluster
- print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
+ cleanup_kubectl_ssh_tunnel(cluster_name, context_name)

+ success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
  return []

- print(f'{YELLOW}Checking TCP Forwarding Options...{NC}')
+ logger.debug('Checking TCP Forwarding Options...')
  cmd = (
  'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
  f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
@@ -833,15 +735,12 @@ def deploy_cluster(head_node,
  '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
  f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
  'fi')
- result = run_remote(
- head_node,
- shlex.quote(cmd),
- ssh_user,
- ssh_key,
- use_ssh_config=head_use_ssh_config,
- # For SkySSHUpLineProcessor
- print_output=True,
- use_shell=True)
+ result = run_remote(head_node,
+ shlex.quote(cmd),
+ ssh_user,
+ ssh_key,
+ use_ssh_config=head_use_ssh_config,
+ use_shell=True)
  if result is None:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
@@ -851,7 +750,7 @@ def deploy_cluster(head_node,
  # Get effective IP for master node if using SSH config - needed for workers to connect
  if head_use_ssh_config:
  effective_master_ip = get_effective_host_ip(head_node)
- print(
+ logger.info(
  f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
  )
  else:
@@ -860,7 +759,8 @@ def deploy_cluster(head_node,
  # Step 1: Install k3s on the head node
  # Check if head node has a GPU
  install_gpu = False
- progress_message(f'Deploying Kubernetes on head node ({head_node})...')
+ force_update_status(
+ f'Deploying SkyPilot runtime on head node ({head_node}).')
  cmd = f"""
  {askpass_block}
  curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
@@ -889,7 +789,8 @@ def deploy_cluster(head_node,
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
  f'Failed to deploy K3s on head node ({head_node}).')
- success_message(f'K3s deployed on head node ({head_node}).')
+ success_message(
+ f'SkyPilot runtime successfully deployed on head node ({head_node}).')

  # Check if head node has a GPU
  install_gpu = False
@@ -897,7 +798,7 @@ def deploy_cluster(head_node,
  ssh_user,
  ssh_key,
  use_ssh_config=head_use_ssh_config):
- print(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
+ logger.info(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
  install_gpu = True

  # Fetch the head node's internal IP (this will be passed to worker nodes)
@@ -910,21 +811,20 @@ def deploy_cluster(head_node,
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
  f'Please check the SSH configuration.')
- print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
+ logger.debug(f'Master node internal IP: {master_addr}')

  # Step 2: Install k3s on worker nodes and join them to the master node
  def deploy_worker(args):
  (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
  askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
- progress_message(f'Deploying Kubernetes on worker node ({node})...')

  # If using YAML config with specific worker info
  if worker_hosts and i < len(worker_hosts):
  if history_workers_info is not None and worker_hosts[
  i] in history_workers_info:
- print(
- f'{YELLOW}Worker node ({node}) already exists in history. '
- f'Skipping...{NC}')
+ logger.info(
+ f'{colorama.Style.DIM} SkyPilot runtime already deployed on worker node {node}. '
+ f'Skipping...{colorama.Style.RESET_ALL}')
  return node, True, False
  worker_user = worker_hosts[i]['user']
  worker_key = worker_hosts[i]['identity_file']
@@ -948,6 +848,8 @@ def deploy_cluster(head_node,
  unsuccessful_workers = []

  # Deploy workers in parallel using thread pool
+ force_update_status(
+ f'Deploying SkyPilot runtime on worker nodes [{cluster_name}]')
  with cf.ThreadPoolExecutor() as executor:
  futures = []
  for i, node in enumerate(worker_nodes):
@@ -964,7 +866,7 @@ def deploy_cluster(head_node,
  unsuccessful_workers.append(node)

  # Step 3: Configure local kubectl to connect to the cluster
- progress_message('Configuring local kubectl to connect to the cluster...')
+ force_update_status(f'Setting up SkyPilot configuration [{cluster_name}]')

  # Create temporary directory for kubeconfig operations
  with tempfile.TemporaryDirectory() as temp_dir:
@@ -1054,8 +956,8 @@ def deploy_cluster(head_node,
  has_end = '-----END CERTIFICATE-----' in cert_pem

  if not has_begin or not has_end:
- print(
- f'{YELLOW}Warning: Certificate data missing PEM markers, attempting to fix...{NC}'
+ logger.debug(
+ 'Warning: Certificate data missing PEM markers, attempting to fix...'
  )
  # Add PEM markers if missing
  if not has_begin:
@@ -1070,8 +972,8 @@ def deploy_cluster(head_node,

  # Verify the file was written correctly
  if os.path.getsize(cert_file_path) > 0:
- print(
- f'{GREEN}Successfully saved certificate data ({len(cert_pem)} bytes){NC}'
+ logger.debug(
+ f'Successfully saved certificate data ({len(cert_pem)} bytes)'
  )

  # Quick validation of PEM format
@@ -1086,13 +988,14 @@ def deploy_cluster(head_node,
  if not first_line.startswith(
  '-----BEGIN') or not last_line.startswith(
  '-----END'):
- print(
- f'{YELLOW}Warning: Certificate may not be in proper PEM format{NC}'
+ logger.debug(
+ 'Warning: Certificate may not be in proper PEM format'
  )
  else:
- print(f'{RED}Error: Certificate file is empty{NC}')
+ logger.error(
+ f'{RED}Error: Certificate file is empty{NC}')
  except Exception as e: # pylint: disable=broad-except
- print(
+ logger.error(
  f'{RED}Error processing certificate data: {e}{NC}')

  if client_key_data:
@@ -1134,8 +1037,8 @@ def deploy_cluster(head_node,
  ])

  if not has_begin or not has_end:
- print(
- f'{YELLOW}Warning: Key data missing PEM markers, attempting to fix...{NC}'
+ logger.debug(
+ 'Warning: Key data missing PEM markers, attempting to fix...'
  )
  # Add PEM markers if missing
  if not has_begin:
@@ -1154,8 +1057,8 @@ def deploy_cluster(head_node,

  # Verify the file was written correctly
  if os.path.getsize(key_file_path) > 0:
- print(
- f'{GREEN}Successfully saved key data ({len(key_pem)} bytes){NC}'
+ logger.debug(
+ f'Successfully saved key data ({len(key_pem)} bytes)'
  )

  # Quick validation of PEM format
@@ -1170,22 +1073,25 @@ def deploy_cluster(head_node,
  if not first_line.startswith(
  '-----BEGIN') or not last_line.startswith(
  '-----END'):
- print(
- f'{YELLOW}Warning: Key may not be in proper PEM format{NC}'
+ logger.debug(
+ 'Warning: Key may not be in proper PEM format'
  )
  else:
- print(f'{RED}Error: Key file is empty{NC}')
+ logger.error(f'{RED}Error: Key file is empty{NC}')
  except Exception as e: # pylint: disable=broad-except
- print(f'{RED}Error processing key data: {e}{NC}')
+ logger.error(f'{RED}Error processing key data: {e}{NC}')

  # First check if context name exists and delete it if it does
  # TODO(romilb): Should we throw an error here instead?
  run_command(['kubectl', 'config', 'delete-context', context_name],
- shell=False)
+ shell=False,
+ silent=True)
  run_command(['kubectl', 'config', 'delete-cluster', context_name],
- shell=False)
+ shell=False,
+ silent=True)
  run_command(['kubectl', 'config', 'delete-user', context_name],
- shell=False)
+ shell=False,
+ silent=True)

  # Merge the configurations using kubectl
  merged_config = os.path.join(temp_dir, 'merged_config')
@@ -1210,17 +1116,12 @@ def deploy_cluster(head_node,
  context_name,
  use_ssh_config=head_use_ssh_config)

- success_message(f'kubectl configured with new context \'{context_name}\'.')
-
- print(
- f'Cluster deployment completed. Kubeconfig saved to {kubeconfig_path}')
- print('You can now run \'kubectl get nodes\' to verify the setup.')
+ logger.debug(f'kubectl configured with new context \'{context_name}\'.')
+ success_message(f'SkyPilot runtime is up [{cluster_name}].')

  # Install GPU operator if a GPU was detected on any node
  if install_gpu:
- print(
- f'{YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...{NC}'
- )
+ force_update_status(f'Configuring NVIDIA GPUs [{cluster_name}]')
  cmd = f"""
  {askpass_block}
  curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
@@ -1240,7 +1141,7 @@ def deploy_cluster(head_node,
  while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
  echo 'Waiting for GPU operator...'
  sleep 5
- done
+ done
  echo 'GPU operator installed successfully.'
  """
  result = run_remote(head_node,
@@ -1249,51 +1150,28 @@ def deploy_cluster(head_node,
  ssh_key,
  use_ssh_config=head_use_ssh_config)
  if result is None:
- print(f'{RED}Failed to install GPU Operator.{NC}')
+ logger.error(f'{RED}Failed to install GPU Operator.{NC}')
  else:
  success_message('GPU Operator installed.')
  else:
- print(
- f'{YELLOW}No GPUs detected. Skipping GPU Operator installation.{NC}'
- )
-
- # Configure SkyPilot
- progress_message('Configuring SkyPilot...')
+ logger.debug('No GPUs detected. Skipping GPU Operator installation.')

  # The env var KUBECONFIG ensures sky check uses the right kubeconfig
  os.environ['KUBECONFIG'] = kubeconfig_path
- run_command(['sky', 'check', 'kubernetes'], shell=False)
+ run_command(['sky', 'check', 'ssh'], shell=False)

  success_message('SkyPilot configured successfully.')

- # Display final success message
- print(
- f'{GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}'
- )
- print(
- 'You can now interact with your Kubernetes cluster through SkyPilot: ')
- print(' • List available GPUs: sky show-gpus --cloud kubernetes')
- print(
- ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
- )
- print(
- ' • Connect to pod with VSCode: code --remote ssh-remote+devbox "/home"'
- )
- # Print completion marker for current cluster
- print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
-
  if unsuccessful_workers:
  quoted_unsuccessful_workers = [
  f'"{worker}"' for worker in unsuccessful_workers
  ]

- print(
+ logger.info(
  f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
  f'{", ".join(quoted_unsuccessful_workers)}. Please check '
  f'the logs for more details.{NC}')
+ else:
+ success_message(f'Node Pool `{cluster_name}` deployed successfully.')

  return unsuccessful_workers
-
-
- if __name__ == '__main__':
- main()
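
The removed `parse_args()`/`main()` command-line entry point is superseded by the importable `deploy_clusters()` function introduced earlier in this diff. The sketch below is illustrative only: the module path follows the rename to sky/utils/kubernetes/deploy_ssh_node_pools.py shown in the file list, the signature is taken from the added lines above, and how the SkyPilot CLI actually invokes this function is not part of this diff.

# Sketch only: calling the refactored entry point directly (assumed usage,
# not documented API). Argument values are illustrative.
from sky.utils.kubernetes import deploy_ssh_node_pools

# Deploy the Node Pools defined in the default ssh_node_pools.yaml file;
# passing infra=None mirrors omitting --infra in the removed CLI.
deploy_ssh_node_pools.deploy_clusters(infra=None, cleanup=False)

# Tear the same Node Pools back down.
deploy_ssh_node_pools.deploy_clusters(infra=None, cleanup=True)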