skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/kubernetes/deploy_ssh_node_pools.py (new file)
@@ -0,0 +1,1177 @@
"""SSH-based Kubernetes Cluster Deployment Script"""
# pylint: disable=line-too-long
import base64
import concurrent.futures as cf
import os
import random
import re
import shlex
import shutil
import subprocess
import sys
import tempfile
from typing import List, Optional, Set

import colorama
import yaml

from sky import sky_logging
from sky.utils import rich_utils
from sky.utils import ux_utils
from sky.utils.kubernetes import ssh_utils

# Colors for nicer UX
RED = '\033[0;31m'
GREEN = '\033[0;32m'
YELLOW = '\033[1;33m'
WARNING_YELLOW = '\x1b[33m'
NC = '\033[0m'  # No color
DIM = colorama.Style.DIM
CYAN = colorama.Fore.CYAN
RESET_ALL = colorama.Style.RESET_ALL

DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')

# Get the directory of this script
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

logger = sky_logging.init_logger(__name__)


def run_command(cmd, shell=False, silent=False):
    """Run a local command and return the output."""
    process = subprocess.run(cmd,
                             shell=shell,
                             capture_output=True,
                             text=True,
                             check=False)
    if process.returncode != 0:
        if not silent:
            logger.error(f'{RED}Error executing command: {cmd}{NC}\n'
                         f'STDOUT: {process.stdout}\n'
                         f'STDERR: {process.stderr}')
        return None
    return process.stdout.strip()


def get_effective_host_ip(hostname: str) -> str:
    """Get the effective IP for a hostname from SSH config."""
    try:
        result = subprocess.run(['ssh', '-G', hostname],
                                capture_output=True,
                                text=True,
                                check=False)
        if result.returncode == 0:
            for line in result.stdout.splitlines():
                if line.startswith('hostname '):
                    return line.split(' ', 1)[1].strip()
    except Exception:  # pylint: disable=broad-except
        pass
    return hostname  # Return the original hostname if lookup fails


def run_remote(node,
               cmd,
               user='',
               ssh_key='',
               connect_timeout=30,
               use_ssh_config=False,
               print_output=False,
               use_shell=False,
               silent=False):
    """Run a command on a remote machine via SSH.

    silent is used for gpu checking (will show error logs when no gpus are found)"""
    ssh_cmd: List[str]
    if use_ssh_config:
        # Use SSH config for connection parameters
        ssh_cmd = ['ssh', node, cmd]
    else:
        # Use explicit parameters
        ssh_cmd = [
            'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
            '-o', f'ConnectTimeout={connect_timeout}', '-o',
            'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
        ]

        if ssh_key:
            if not os.path.isfile(ssh_key):
                raise ValueError(f'SSH key not found: {ssh_key}')
            ssh_cmd.extend(['-i', ssh_key])

        ssh_cmd.append(f'{user}@{node}' if user else node)
        ssh_cmd.append(cmd)

    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
    process = subprocess.run(subprocess_cmd,
                             capture_output=True,
                             text=True,
                             check=False,
                             shell=use_shell)
    if process.returncode != 0:
        if not silent:
            logger.error(f'{RED}Error executing command {cmd} on {node}:{NC} '
                         f'{process.stderr}')
        return None
    if print_output:
        logger.info(process.stdout)
    return process.stdout.strip()


def create_askpass_script(password):
    """Create an askpass script block for sudo with password."""
    if not password:
        return ''

    return f"""
# Create temporary askpass script
ASKPASS_SCRIPT=$(mktemp)
trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
cat > $ASKPASS_SCRIPT << EOF
#!/bin/bash
echo {password}
EOF
chmod 700 $ASKPASS_SCRIPT
# Use askpass
export SUDO_ASKPASS=$ASKPASS_SCRIPT
"""


def progress_message(message):
    """Show a progress message."""
    logger.info(f'{YELLOW}➜ {message}{NC}')


def success_message(message):
    """Show a success message."""
    logger.info(f'{GREEN}✔ {message}{NC}')


def force_update_status(message):
    """Force update rich spinner status."""
    rich_utils.force_update_status(ux_utils.spinner_message(message))


def cleanup_server_node(node,
                        user,
                        ssh_key,
                        askpass_block,
                        use_ssh_config=False):
    """Uninstall k3s and clean up the state on a server node."""
    force_update_status(f'Cleaning up head node ({node})...')
    cmd = f"""
{askpass_block}
echo 'Uninstalling k3s...' &&
sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
"""
    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
    if result is None:
        logger.error(f'{RED}Failed to clean up head node ({node}).{NC}')
    else:
        success_message(f'Node {node} cleaned up successfully.')


def cleanup_agent_node(node,
                       user,
                       ssh_key,
                       askpass_block,
                       use_ssh_config=False):
    """Uninstall k3s and clean up the state on an agent node."""
    force_update_status(f'Cleaning up worker node ({node})...')
    cmd = f"""
{askpass_block}
echo 'Uninstalling k3s...' &&
sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
"""
    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
    if result is None:
        logger.error(f'{RED}Failed to clean up worker node ({node}).{NC}')
    else:
        success_message(f'Node {node} cleaned up successfully.')


def start_agent_node(node,
                     master_addr,
                     k3s_token,
                     user,
                     ssh_key,
                     askpass_block,
                     use_ssh_config=False):
    """Start a k3s agent node.
    Returns: if the start is successful, and if the node has a GPU."""
    logger.info(f'Deploying worker node ({node}).')
    cmd = f"""
{askpass_block}
curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
"""
    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
    if result is None:
        logger.error(
            f'{RED}✗ Failed to deploy K3s on worker node ({node}).{NC}')
        return node, False, False
    success_message(
        f'SkyPilot runtime successfully deployed on worker node ({node}).')
    # Check if worker node has a GPU
    if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
        logger.info(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
        return node, True, True
    return node, True, False


def check_gpu(node, user, ssh_key, use_ssh_config=False):
    """Check if a node has a GPU."""
    cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
    result = run_remote(node,
                        cmd,
                        user,
                        ssh_key,
                        use_ssh_config=use_ssh_config,
                        silent=True)
    return result is not None


def ensure_directory_exists(path):
    """Ensure the directory for the specified file path exists."""
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)


def get_used_localhost_ports() -> Set[int]:
    """Get SSH port forwardings already in use on localhost"""
    used_ports = set()

    # Get ports from netstat (works on macOS and Linux)
    try:
        if sys.platform == 'darwin':
            # macOS
            result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
                                    capture_output=True,
                                    text=True,
                                    check=False)
        else:
            # Linux and other Unix-like systems
            result = subprocess.run(['netstat', '-tln'],
                                    capture_output=True,
                                    text=True,
                                    check=False)

        if result.returncode == 0:
            # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
            for line in result.stdout.splitlines():
                if '127.0.0.1:' in line or 'localhost:' in line:
                    match = re.search(r':(64\d\d)\s', line)
                    if match:
                        port = int(match.group(1))
                        if 6400 <= port <= 6500:  # Only consider our range
                            used_ports.add(port)
    except (subprocess.SubprocessError, FileNotFoundError):
        # If netstat fails, try another approach
        pass

    # Also check ports from existing kubeconfig entries
    try:
        result = subprocess.run([
            'kubectl', 'config', 'view', '-o',
            'jsonpath=\'{.clusters[*].cluster.server}\''
        ],
                                capture_output=True,
                                text=True,
                                check=False)

        if result.returncode == 0:
            # Look for localhost URLs with ports
            for url in result.stdout.split():
                if 'localhost:' in url or '127.0.0.1:' in url:
                    match = re.search(r':(\d+)', url)
                    if match:
                        port = int(match.group(1))
                        if 6400 <= port <= 6500:  # Only consider our range
                            used_ports.add(port)
    except subprocess.SubprocessError:
        pass

    return used_ports


def get_available_port(start: int = 6443, end: int = 6499) -> int:
    """Get an available port in the given range that's not used by other tunnels"""
    used_ports = get_used_localhost_ports()

    # Try to use port 6443 first if available for the first cluster
    if start == 6443 and start not in used_ports:
        return start

    # Otherwise find any available port in the range
    available_ports = list(set(range(start, end + 1)) - used_ports)

    if not available_ports:
        # If all ports are used, pick a random one from our range
        # (we'll terminate any existing connection in the setup)
        return random.randint(start, end)

    # Sort to get deterministic allocation
    available_ports.sort()
    return available_ports[0]


def setup_kubectl_ssh_tunnel(head_node,
                             ssh_user,
                             ssh_key,
                             context_name,
                             use_ssh_config=False):
    """Set up kubeconfig exec credential plugin for SSH tunnel"""
    progress_message('Setting up SSH tunnel for Kubernetes API access...')

    # Get an available port for this cluster
    port = get_available_port()

    # Paths to scripts
    tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')

    # Make sure scripts are executable
    os.chmod(tunnel_script, 0o755)

    # Certificate files
    client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
                                    f'{context_name}-cert.pem')
    client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
                                   f'{context_name}-key.pem')

    # Update kubeconfig to use localhost with the selected port
    run_command([
        'kubectl', 'config', 'set-cluster', context_name,
        f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
    ])

    # Build the exec args list based on auth method
    exec_args = [
        '--exec-command', tunnel_script, '--exec-api-version',
        'client.authentication.k8s.io/v1beta1'
    ]

    # Set credential TTL to force frequent tunnel checks
    ttl_seconds = 30

    # Verify if we have extracted certificate data files
    has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
        client_key_file)
    if has_cert_files:
        logger.info(
            f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
        )

    if use_ssh_config:
        run_command(
            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
            [
                '--exec-arg=--context', f'--exec-arg={context_name}',
                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
                f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
                '--exec-arg=--host', f'--exec-arg={head_node}'
            ])
    else:
        run_command(['kubectl', 'config', 'set-credentials', context_name] +
                    exec_args + [
                        '--exec-arg=--context', f'--exec-arg={context_name}',
                        '--exec-arg=--port', f'--exec-arg={port}',
                        '--exec-arg=--ttl', f'--exec-arg={ttl_seconds}',
                        '--exec-arg=--host', f'--exec-arg={head_node}',
                        '--exec-arg=--user', f'--exec-arg={ssh_user}',
                        '--exec-arg=--ssh-key', f'--exec-arg={ssh_key}'
                    ])

    success_message(
        f'SSH tunnel configured through kubectl credential plugin on port {port}'
    )
    logger.info(
        f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
    )
    logger.info(
        f'{GREEN}This tunnel will be automatically established when needed.{NC}'
    )
    logger.info(
        f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
    )

    return port


def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
    """Clean up the SSH tunnel for a specific context"""
    progress_message(f'Cleaning up SSH tunnel for `{cluster_name}`...')

    # Path to cleanup script
    cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')

    # Make sure script is executable
    if os.path.exists(cleanup_script):
        os.chmod(cleanup_script, 0o755)

        # Run the cleanup script
        subprocess.run([cleanup_script, context_name],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=False)

        success_message(f'SSH tunnel for `{cluster_name}` cleaned up.')
    else:
        logger.error(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')


def deploy_clusters(
        infra: Optional[str],
        ssh_node_pools_file: str = ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
        kubeconfig_path: Optional[str] = None,
        cleanup: bool = True):

    kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
    kubeconfig_path = os.path.expanduser(kubeconfig_path)

    failed_clusters = []
    successful_clusters = []

    # Using YAML configuration
    targets = ssh_utils.load_ssh_targets(ssh_node_pools_file)
    clusters_config = ssh_utils.get_cluster_config(
        targets, infra, file_path=ssh_node_pools_file)

    # Print information about clusters being processed
    num_clusters = len(clusters_config)
    cluster_names = list(clusters_config.keys())
    cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{colorama.Style.RESET_ALL}')

    # Process each cluster
    for cluster_name, cluster_config in clusters_config.items():
        try:
            action = 'Cleaning up' if cleanup else 'Deploying'
            force_update_status(f'{action} Node Pool: {cluster_name}')
            hosts_info = ssh_utils.prepare_hosts_info(cluster_name,
                                                      cluster_config)

            if not hosts_info:
                logger.warning(
                    f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
                )
                continue

            context_name = f'ssh-{cluster_name}'

            # Check cluster history
            os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
            history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
                                             f'{context_name}-history.yaml')

            history = None
            if os.path.exists(history_yaml_file):
                logger.debug(f'Loading history from {history_yaml_file}')
                with open(history_yaml_file, 'r', encoding='utf-8') as f:
                    history = yaml.safe_load(f)
            else:
                logger.debug(f'No history found for {context_name}.')

            history_workers_info = None
            history_worker_nodes = None
            history_use_ssh_config = None
            # Do not support changing anything besides hosts for now
            if history is not None:
                for key in ['user', 'identity_file', 'password']:
                    if not cleanup and history.get(key) != cluster_config.get(
                            key):
                        raise ValueError(
                            f'Cluster configuration has changed for field {key!r}. '
                            f'Previous value: {history.get(key)}, '
                            f'Current value: {cluster_config.get(key)}')
                history_hosts_info = ssh_utils.prepare_hosts_info(
                    cluster_name, history)
                if not cleanup and history_hosts_info[0] != hosts_info[0]:
                    raise ValueError(
                        f'Cluster configuration has changed for master node. '
                        f'Previous value: {history_hosts_info[0]}, '
                        f'Current value: {hosts_info[0]}')
                history_workers_info = history_hosts_info[1:] if len(
                    history_hosts_info) > 1 else []
                history_worker_nodes = [h['ip'] for h in history_workers_info]
                history_use_ssh_config = [
                    h.get('use_ssh_config', False) for h in history_workers_info
                ]

            # Use the first host as the head node and the rest as worker nodes
            head_host = hosts_info[0]
            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []

            head_node = head_host['ip']
            worker_nodes = [h['ip'] for h in worker_hosts]
            ssh_user = head_host['user']
            ssh_key = head_host['identity_file']
            head_use_ssh_config = head_host.get('use_ssh_config', False)
            worker_use_ssh_config = [
                h.get('use_ssh_config', False) for h in worker_hosts
            ]
            password = head_host['password']

            # Deploy this cluster
            unsuccessful_workers = deploy_cluster(
                cluster_name,
                head_node,
                worker_nodes,
                ssh_user,
                ssh_key,
                context_name,
                password,
                head_use_ssh_config,
                worker_use_ssh_config,
                kubeconfig_path,
                cleanup,
                worker_hosts=worker_hosts,
                history_worker_nodes=history_worker_nodes,
                history_workers_info=history_workers_info,
                history_use_ssh_config=history_use_ssh_config)

            if not cleanup:
                successful_hosts = []
                for host in cluster_config['hosts']:
                    if isinstance(host, str):
                        host_node = host
                    else:
                        host_node = host['ip']
                    if host_node not in unsuccessful_workers:
                        successful_hosts.append(host)
                cluster_config['hosts'] = successful_hosts
                with open(history_yaml_file, 'w', encoding='utf-8') as f:
                    logger.debug(f'Writing history to {history_yaml_file}')
                    yaml.dump(cluster_config, f)

            action = 'cleanup' if cleanup else 'deployment'
            logger.info(
                f'{colorama.Fore.CYAN}Completed {action} for cluster: {cluster_name}{colorama.Style.RESET_ALL}'
            )
            successful_clusters.append(cluster_name)
        except Exception as e:  # pylint: disable=broad-except
            reason = str(e)
            failed_clusters.append((cluster_name, reason))
            logger.debug(
                f'Error deploying SSH Node Pool `{cluster_name}`: {reason}')

    if failed_clusters:
        action = 'clean' if cleanup else 'deploy'
        msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
        msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
        for cluster_name, reason in failed_clusters:
            msg += f'\n {cluster_name}: {reason}'
        raise RuntimeError(msg)


def deploy_cluster(cluster_name,
                   head_node,
                   worker_nodes,
                   ssh_user,
                   ssh_key,
                   context_name,
                   password,
                   head_use_ssh_config,
                   worker_use_ssh_config,
                   kubeconfig_path,
                   cleanup,
                   worker_hosts=None,
                   history_worker_nodes=None,
                   history_workers_info=None,
                   history_use_ssh_config=None) -> List[str]:
    """Deploy or clean up a single Kubernetes cluster.

    Returns: List of unsuccessful worker nodes.
    """
    history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
                                     f'{context_name}-history.yaml')
    cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
                                  f'{context_name}-cert.pem')
    key_file_path = os.path.join(NODE_POOLS_INFO_DIR, f'{context_name}-key.pem')
    tunnel_log_file_path = os.path.join(NODE_POOLS_INFO_DIR,
                                        f'{context_name}-tunnel.log')

    # Generate the askpass block if password is provided
    askpass_block = create_askpass_script(password)

    # Token for k3s
    k3s_token = 'mytoken'  # Any string can be used as the token

    # Pre-flight checks
    logger.info(f'Checking SSH connection to head node ({head_node})...')
    result = run_remote(head_node,
                        f'echo \'SSH connection successful ({head_node})\'',
                        ssh_user,
                        ssh_key,
                        use_ssh_config=head_use_ssh_config)
    if result.startswith('SSH connection successful'):
        success_message(f'SSH connection established to head node {head_node}.')

    if not cleanup and result is None:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Failed to SSH to head node ({head_node}). '
                f'Please check the SSH configuration and logs for more details.'
            )

    # Checking history
    history_exists = (history_worker_nodes is not None and
                      history_workers_info is not None and
                      history_use_ssh_config is not None)

    # Cleanup history worker nodes
    worker_nodes_to_cleanup = []
    remove_worker_cmds = []
    if history_exists:
        for history_node, history_info, use_ssh_config in zip(
                history_worker_nodes, history_workers_info,
                history_use_ssh_config):
            if worker_hosts is not None and history_info not in worker_hosts:
                logger.debug(
                    f'Worker node {history_node} not found in YAML config. '
                    'Removing from history...')
                worker_nodes_to_cleanup.append(
                    dict(
                        node=history_node,
                        user=ssh_user
                        if history_info is None else history_info['user'],
                        ssh_key=ssh_key if history_info is None else
                        history_info['identity_file'],
                        askpass_block=(askpass_block if history_info is None
                                       else create_askpass_script(
                                           history_info['password'])),
                        use_ssh_config=use_ssh_config,
                    ))
                remove_worker_cmds.append(
                    f'kubectl delete node -l skypilot-ip={history_node}')
    # If this is a create operation and there exists some stale log,
    # cleanup the log for a new file to store new logs.
    if not cleanup and os.path.exists(tunnel_log_file_path):
        os.remove(tunnel_log_file_path)

    # If --cleanup flag is set, uninstall k3s and exit
    if cleanup:
        # Pickup all nodes
        worker_nodes_to_cleanup.clear()
        for node, info, use_ssh_config in zip(worker_nodes, worker_hosts,
                                              worker_use_ssh_config):
            worker_nodes_to_cleanup.append(
                dict(
                    node=node,
                    user=ssh_user if info is None else info['user'],
                    ssh_key=ssh_key if info is None else info['identity_file'],
                    askpass_block=(askpass_block if info is None else
                                   create_askpass_script(info['password'])),
                    use_ssh_config=use_ssh_config,
                ))

        # Clean up head node
        cleanup_server_node(head_node,
                            ssh_user,
                            ssh_key,
                            askpass_block,
                            use_ssh_config=head_use_ssh_config)
        # Clean up worker nodes
        force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
        with cf.ThreadPoolExecutor() as executor:
            executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
                         worker_nodes_to_cleanup)

        with cf.ThreadPoolExecutor() as executor:
            executor.map(lambda cmd: run_command(cmd, shell=True),
                         remove_worker_cmds)

    if cleanup:

        # Remove the context from local kubeconfig if it exists
        if os.path.isfile(kubeconfig_path):
            logger.debug(
                f'Removing context {context_name!r} from local kubeconfig...')
            run_command(['kubectl', 'config', 'delete-context', context_name],
                        shell=False)
            run_command(['kubectl', 'config', 'delete-cluster', context_name],
                        shell=False)
            run_command(['kubectl', 'config', 'delete-user', context_name],
                        shell=False)

            # Update the current context to the first available context
            contexts = run_command([
                'kubectl', 'config', 'view', '-o',
                'jsonpath=\'{.contexts[0].name}\''
            ],
                                   shell=False)
            if contexts:
                run_command(['kubectl', 'config', 'use-context', contexts],
                            shell=False)
            else:
                # If no context is available, simply unset the current context
                run_command(['kubectl', 'config', 'unset', 'current-context'],
                            shell=False)

            logger.debug(
                f'Context {context_name!r} removed from local kubeconfig.')

        for file in [history_yaml_file, cert_file_path, key_file_path]:
            if os.path.exists(file):
                os.remove(file)

        # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
        # will restart the ssh tunnel if it's not running.
        cleanup_kubectl_ssh_tunnel(cluster_name, context_name)

        success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
        return []

729
+ logger.debug('Checking TCP Forwarding Options...')
730
+ cmd = (
731
+ 'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
732
+ f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
733
+ 'else '
734
+ 'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' ' # pylint: disable=anomalous-backslash-in-string
735
+ '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
736
+ f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
737
+ 'fi')
738
+ result = run_remote(head_node,
739
+ shlex.quote(cmd),
740
+ ssh_user,
741
+ ssh_key,
742
+ use_ssh_config=head_use_ssh_config,
743
+ use_shell=True)
744
+ if result is None:
745
+ with ux_utils.print_exception_no_traceback():
746
+ raise RuntimeError(
747
+ f'Failed to setup TCP forwarding on head node ({head_node}). '
748
+ f'Please check the SSH configuration.')
749
+
+ # Get effective IP for master node if using SSH config - needed for workers to connect
+ if head_use_ssh_config:
+ effective_master_ip = get_effective_host_ip(head_node)
+ logger.info(
+ f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
+ )
+ else:
+ effective_master_ip = head_node
+
+ # Step 1: Install k3s on the head node
+ # Check if head node has a GPU
+ install_gpu = False
+ force_update_status(
+ f'Deploying SkyPilot runtime on head node ({head_node}).')
+ cmd = f"""
+ {askpass_block}
+ curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
+ mkdir -p ~/.kube &&
+ sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
+ sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
+ for i in {{1..3}}; do
+ if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
+ break
+ else
+ echo 'Waiting for nodes to be ready...'
+ sleep 5
+ fi
+ done
+ if [ $i -eq 3 ]; then
+ echo 'Failed to wait for nodes to be ready after 3 attempts'
+ exit 1
+ fi
+ """
+ result = run_remote(head_node,
+ cmd,
+ ssh_user,
+ ssh_key,
+ use_ssh_config=head_use_ssh_config)
+ if result is None:
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(
+ f'Failed to deploy K3s on head node ({head_node}).')
+ success_message(
+ f'SkyPilot runtime successfully deployed on head node ({head_node}).')
+
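The installer script above already retries `kubectl wait` until the head node reports Ready. For reference, a local equivalent of that readiness loop might look like the following sketch (assuming the kubeconfig copied to `~/.kube/config`; `wait_for_ready_nodes` is an illustrative name):

import subprocess
import time

def wait_for_ready_nodes(kubeconfig: str, attempts: int = 3) -> bool:
    # Retry `kubectl wait` a few times, as the remote bootstrap script does.
    for _ in range(attempts):
        proc = subprocess.run(
            ['kubectl', 'wait', '--for=condition=ready', 'node', '--all',
             '--timeout=2m', '--kubeconfig', kubeconfig],
            check=False)
        if proc.returncode == 0:
            return True
        time.sleep(5)
    return False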
+ # Check if head node has a GPU
+ install_gpu = False
+ if check_gpu(head_node,
+ ssh_user,
+ ssh_key,
+ use_ssh_config=head_use_ssh_config):
+ logger.info(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
+ install_gpu = True
+
+ # Fetch the head node's internal IP (this will be passed to worker nodes)
+ master_addr = run_remote(head_node,
+ 'hostname -I | awk \'{print $1}\'',
+ ssh_user,
+ ssh_key,
+ use_ssh_config=head_use_ssh_config)
+ if master_addr is None:
+ with ux_utils.print_exception_no_traceback():
+ raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+ f'Please check the SSH configuration.')
+ logger.debug(f'Master node internal IP: {master_addr}')
+
+ # Step 2: Install k3s on worker nodes and join them to the master node
+ def deploy_worker(args):
+ (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
+ askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
+
+ # If using YAML config with specific worker info
+ if worker_hosts and i < len(worker_hosts):
+ if history_workers_info is not None and worker_hosts[
+ i] in history_workers_info:
+ logger.info(
+ f'{colorama.Style.DIM}✔ SkyPilot runtime already deployed on worker node {node}. '
+ f'Skipping...{colorama.Style.RESET_ALL}')
+ return node, True, False
+ worker_user = worker_hosts[i]['user']
+ worker_key = worker_hosts[i]['identity_file']
+ worker_password = worker_hosts[i]['password']
+ worker_askpass = create_askpass_script(worker_password)
+ worker_config = worker_use_ssh_config[i]
+ else:
+ worker_user = ssh_user
+ worker_key = ssh_key
+ worker_askpass = askpass_block
+ worker_config = worker_use_ssh_config[i]
+
+ return start_agent_node(node,
+ master_addr,
+ k3s_token,
+ worker_user,
+ worker_key,
+ worker_askpass,
+ use_ssh_config=worker_config)
+
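`start_agent_node` (defined elsewhere in this file) joins each worker to the head node using the master's internal address and the shared K3s token. Conceptually the join resembles the standard upstream k3s agent install; the sketch below only assembles such a command string and is an assumption about its shape, not the exact command SkyPilot runs:

def k3s_agent_join_cmd(master_addr: str, k3s_token: str, node_name: str) -> str:
    # Illustrative only: standard k3s agent join via the upstream installer.
    return ('curl -sfL https://get.k3s.io | '
            f'K3S_URL=https://{master_addr}:6443 '
            f'K3S_TOKEN={k3s_token} K3S_NODE_NAME={node_name} sh -')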
+ unsuccessful_workers = []
+
+ # Deploy workers in parallel using thread pool
+ force_update_status(
+ f'Deploying SkyPilot runtime on worker nodes [{cluster_name}]')
+ with cf.ThreadPoolExecutor() as executor:
+ futures = []
+ for i, node in enumerate(worker_nodes):
+ args = (i, node, worker_hosts, history_workers_info, ssh_user,
+ ssh_key, askpass_block, worker_use_ssh_config, master_addr,
+ k3s_token)
+ futures.append(executor.submit(deploy_worker, args))
+
+ # Check if worker node has a GPU
+ for future in cf.as_completed(futures):
+ node, suc, has_gpu = future.result()
+ install_gpu = install_gpu or has_gpu
+ if not suc:
+ unsuccessful_workers.append(node)
+
+ # Step 3: Configure local kubectl to connect to the cluster
+ force_update_status(f'Setting up SkyPilot configuration [{cluster_name}]')
+
+ # Create temporary directory for kubeconfig operations
+ with tempfile.TemporaryDirectory() as temp_dir:
+ temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
+
+ # Get the kubeconfig from remote server
+ if head_use_ssh_config:
+ scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
+ else:
+ scp_cmd = [
+ 'scp', '-o', 'StrictHostKeyChecking=no', '-o',
+ 'IdentitiesOnly=yes', '-i', ssh_key,
+ f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
+ ]
+ run_command(scp_cmd, shell=False)
+
+ # Create the directory for the kubeconfig file if it doesn't exist
+ ensure_directory_exists(kubeconfig_path)
+
+ # Create empty kubeconfig if it doesn't exist
+ if not os.path.isfile(kubeconfig_path):
+ open(kubeconfig_path, 'a', encoding='utf-8').close()
+
+ # Modify the temporary kubeconfig to update server address and context name
+ modified_config = os.path.join(temp_dir, 'modified_config')
+ with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
+ with open(modified_config, 'w', encoding='utf-8') as f_out:
+ in_cluster = False
+ in_user = False
+ client_cert_data = None
+ client_key_data = None
+
+ for line in f_in:
+ if 'clusters:' in line:
+ in_cluster = True
+ in_user = False
+ elif 'users:' in line:
+ in_cluster = False
+ in_user = True
+ elif 'contexts:' in line:
+ in_cluster = False
+ in_user = False
+
+ # Skip certificate authority data in cluster section
+ if in_cluster and 'certificate-authority-data:' in line:
+ continue
+ # Skip client certificate data in user section but extract it
+ elif in_user and 'client-certificate-data:' in line:
+ client_cert_data = line.split(':', 1)[1].strip()
+ continue
+ # Skip client key data in user section but extract it
+ elif in_user and 'client-key-data:' in line:
+ client_key_data = line.split(':', 1)[1].strip()
+ continue
+ elif in_cluster and 'server:' in line:
+ # Initially just set to the effective master IP
+ # (will be changed to localhost by setup_kubectl_ssh_tunnel later)
+ f_out.write(
+ f' server: https://{effective_master_ip}:6443\n')
+ f_out.write(' insecure-skip-tls-verify: true\n')
+ continue
+
+ # Replace default context names with user-provided context name
+ line = line.replace('name: default',
+ f'name: {context_name}')
+ line = line.replace('cluster: default',
+ f'cluster: {context_name}')
+ line = line.replace('user: default',
+ f'user: {context_name}')
+ line = line.replace('current-context: default',
+ f'current-context: {context_name}')
+
+ f_out.write(line)
+
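The loop above is a small line-oriented state machine: it tracks whether it is inside the `clusters:`, `users:`, or `contexts:` section, drops the embedded certificate blobs (they are written to separate files below), rewrites the `server:` entry to point at the head node with TLS verification skipped, and renames every `default` identifier to the chosen context name. A tiny sketch of the server rewrite with made-up values:

# Made-up values for illustration; the real loop streams the file line by line.
effective_master_ip = '10.0.0.4'
original = '    server: https://127.0.0.1:6443'
rewritten = (f'    server: https://{effective_master_ip}:6443\n'
             '    insecure-skip-tls-verify: true')
print(rewritten)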
+ # Save certificate data if available
+
+ if client_cert_data:
+ # Decode base64 data and save as PEM
+ try:
+ # Clean up the certificate data by removing whitespace
+ clean_cert_data = ''.join(client_cert_data.split())
+ cert_pem = base64.b64decode(clean_cert_data).decode(
+ 'utf-8')
+
+ # Check if the data already looks like a PEM file
+ has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
+ has_end = '-----END CERTIFICATE-----' in cert_pem
+
+ if not has_begin or not has_end:
+ logger.debug(
+ 'Warning: Certificate data missing PEM markers, attempting to fix...'
+ )
+ # Add PEM markers if missing
+ if not has_begin:
+ cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
+ if not has_end:
+ cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
+
+ # Write the certificate
+ with open(cert_file_path, 'w',
+ encoding='utf-8') as cert_file:
+ cert_file.write(cert_pem)
+
+ # Verify the file was written correctly
+ if os.path.getsize(cert_file_path) > 0:
+ logger.debug(
+ f'Successfully saved certificate data ({len(cert_pem)} bytes)'
+ )
+
+ # Quick validation of PEM format
+ with open(cert_file_path, 'r',
+ encoding='utf-8') as f:
+ content = f.readlines()
+ first_line = content[0].strip(
+ ) if content else ''
+ last_line = content[-1].strip(
+ ) if content else ''
+
+ if not first_line.startswith(
+ '-----BEGIN') or not last_line.startswith(
+ '-----END'):
+ logger.debug(
+ 'Warning: Certificate may not be in proper PEM format'
+ )
+ else:
+ logger.error(
+ f'{RED}Error: Certificate file is empty{NC}')
+ except Exception as e: # pylint: disable=broad-except
+ logger.error(
+ f'{RED}Error processing certificate data: {e}{NC}')
+
+ if client_key_data:
+ # Decode base64 data and save as PEM
+ try:
+ # Clean up the key data by removing whitespace
+ clean_key_data = ''.join(client_key_data.split())
+ key_pem = base64.b64decode(clean_key_data).decode(
+ 'utf-8')
+
+ # Check if the data already looks like a PEM file
+
+ # Check for EC key format
+ if 'EC PRIVATE KEY' in key_pem:
+ # Handle EC KEY format directly
+ match_ec = re.search(
+ r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
+ key_pem, re.DOTALL)
+ if match_ec:
+ # Extract and properly format EC key
+ key_content = match_ec.group(1).strip()
+ key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
+ else:
+ # Extract content and assume EC format
+ key_content = re.sub(r'-----BEGIN.*?-----', '',
+ key_pem)
+ key_content = re.sub(r'-----END.*?-----.*', '',
+ key_content).strip()
+ key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
+ else:
+ # Handle regular private key format
+ has_begin = any(marker in key_pem for marker in [
+ '-----BEGIN PRIVATE KEY-----',
+ '-----BEGIN RSA PRIVATE KEY-----'
+ ])
+ has_end = any(marker in key_pem for marker in [
+ '-----END PRIVATE KEY-----',
+ '-----END RSA PRIVATE KEY-----'
+ ])
+
+ if not has_begin or not has_end:
+ logger.debug(
+ 'Warning: Key data missing PEM markers, attempting to fix...'
+ )
+ # Add PEM markers if missing
+ if not has_begin:
+ key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
+ if not has_end:
+ key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
+ # Remove any trailing characters after END marker
+ key_pem = re.sub(
+ r'(-----END PRIVATE KEY-----).*', r'\1',
+ key_pem)
+
+ # Write the key
+ with open(key_file_path, 'w',
+ encoding='utf-8') as key_file:
+ key_file.write(key_pem)
+
+ # Verify the file was written correctly
+ if os.path.getsize(key_file_path) > 0:
+ logger.debug(
+ f'Successfully saved key data ({len(key_pem)} bytes)'
+ )
+
+ # Quick validation of PEM format
+ with open(key_file_path, 'r',
+ encoding='utf-8') as f:
+ content = f.readlines()
+ first_line = content[0].strip(
+ ) if content else ''
+ last_line = content[-1].strip(
+ ) if content else ''
+
+ if not first_line.startswith(
+ '-----BEGIN') or not last_line.startswith(
+ '-----END'):
+ logger.debug(
+ 'Warning: Key may not be in proper PEM format'
+ )
+ else:
+ logger.error(f'{RED}Error: Key file is empty{NC}')
+ except Exception as e: # pylint: disable=broad-except
+ logger.error(f'{RED}Error processing key data: {e}{NC}')
+
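Both branches above perform the same normalization: strip whitespace from the base64 blob extracted from the kubeconfig, decode it, and make sure the result carries PEM BEGIN/END markers before it is written to disk. A condensed sketch of that normalization (the `normalize_pem` name and the `marker` argument are illustrative):

import base64

def normalize_pem(b64_data: str, marker: str = 'CERTIFICATE') -> str:
    # Decode the base64 blob from the kubeconfig and ensure PEM markers exist.
    decoded = base64.b64decode(''.join(b64_data.split())).decode('utf-8')
    if f'-----BEGIN {marker}-----' not in decoded:
        decoded = f'-----BEGIN {marker}-----\n{decoded}'
    if f'-----END {marker}-----' not in decoded:
        decoded = f'{decoded}\n-----END {marker}-----'
    return decoded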
+ # First check if context name exists and delete it if it does
+ # TODO(romilb): Should we throw an error here instead?
+ run_command(['kubectl', 'config', 'delete-context', context_name],
+ shell=False,
+ silent=True)
+ run_command(['kubectl', 'config', 'delete-cluster', context_name],
+ shell=False,
+ silent=True)
+ run_command(['kubectl', 'config', 'delete-user', context_name],
+ shell=False,
+ silent=True)
+
+ # Merge the configurations using kubectl
+ merged_config = os.path.join(temp_dir, 'merged_config')
+ os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
+ with open(merged_config, 'w', encoding='utf-8') as merged_file:
+ kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
+ result = run_command(kubectl_cmd, shell=False)
+ if result:
+ merged_file.write(result)
+
+ # Replace the kubeconfig with the merged config
+ shutil.move(merged_config, kubeconfig_path)
+
+ # Set the new context as the current context
+ run_command(['kubectl', 'config', 'use-context', context_name],
+ shell=False)
+
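The merge relies on a standard kubectl behavior: when `KUBECONFIG` lists several files separated by colons, `kubectl config view --flatten` emits one self-contained config combining them. A small local sketch of the same merge (file paths are placeholders):

import os
import subprocess

def merge_kubeconfigs(existing: str, new: str, out_path: str) -> None:
    # Colon-separated KUBECONFIG plus `config view --flatten` merges the files.
    env = dict(os.environ, KUBECONFIG=f'{existing}:{new}')
    merged = subprocess.run(['kubectl', 'config', 'view', '--flatten'],
                            env=env, capture_output=True, text=True,
                            check=True).stdout
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(merged)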
+ # Always set up SSH tunnel since we assume only port 22 is accessible
+ setup_kubectl_ssh_tunnel(head_node,
+ ssh_user,
+ ssh_key,
+ context_name,
+ use_ssh_config=head_use_ssh_config)
+
+ logger.debug(f'kubectl configured with new context \'{context_name}\'.')
+ success_message(f'SkyPilot runtime is up [{cluster_name}].')
+
+ # Install GPU operator if a GPU was detected on any node
+ if install_gpu:
+ force_update_status(f'Configuring NVIDIA GPUs [{cluster_name}]')
+ cmd = f"""
+ {askpass_block}
+ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
+ chmod 700 get_helm.sh &&
+ ./get_helm.sh &&
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
+ kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
+ sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
+ helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
+ --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
+ --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
+ --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
+ --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
+ --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
+ --set 'toolkit.env[2].value=nvidia' &&
+ echo 'Waiting for GPU operator installation...' &&
+ while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
+ echo 'Waiting for GPU operator...'
+ sleep 5
+ done
+ echo 'GPU operator installed successfully.'
+ """
+ result = run_remote(head_node,
+ cmd,
+ ssh_user,
+ ssh_key,
+ use_ssh_config=head_use_ssh_config)
+ if result is None:
+ logger.error(f'{RED}Failed to install GPU Operator.{NC}')
+ else:
+ success_message('GPU Operator installed.')
+ else:
+ logger.debug('No GPUs detected. Skipping GPU Operator installation.')
+
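Once the GPU operator finishes, GPU capacity appears on the nodes as the `nvidia.com/gpu` resource. A quick way to confirm GPUs are schedulable is sketched below (illustrative; note the escaped dots required by kubectl's jsonpath):

import subprocess

def allocatable_gpus(kubeconfig: str) -> str:
    # Returns one 'node<TAB>gpu-count' line per node; an empty count means the
    # operator has not exposed GPUs on that node yet.
    jsonpath = (r'{range .items[*]}{.metadata.name}{"\t"}'
                r'{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}')
    return subprocess.run(
        ['kubectl', 'get', 'nodes', '--kubeconfig', kubeconfig,
         '-o', f'jsonpath={jsonpath}'],
        capture_output=True, text=True, check=False).stdout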
+ # The env var KUBECONFIG ensures sky check uses the right kubeconfig
+ os.environ['KUBECONFIG'] = kubeconfig_path
+ run_command(['sky', 'check', 'ssh'], shell=False)
+
+ success_message('SkyPilot configured successfully.')
+
+ if unsuccessful_workers:
+ quoted_unsuccessful_workers = [
+ f'"{worker}"' for worker in unsuccessful_workers
+ ]
+
+ logger.info(
+ f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
+ f'{", ".join(quoted_unsuccessful_workers)}. Please check '
+ f'the logs for more details.{NC}')
+ else:
+ success_message(f'Node Pool `{cluster_name}` deployed successfully.')
+
+ return unsuccessful_workers