skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -4,11 +4,16 @@ processes of proc_pid.
4
4
  """
5
5
  import argparse
6
6
  import os
7
+ import signal
7
8
  import sys
8
9
  import time
10
+ from typing import List, Optional
9
11
 
10
12
  import psutil
11
13
 
14
+ # Environment variable to enable kill_pg in subprocess daemon.
15
+ USE_KILL_PG_ENV_VAR = 'SKYPILOT_SUBPROCESS_DAEMON_KILL_PG'
16
+
12
17
 
13
18
  def daemonize():
14
19
  """Detaches the process from its parent process with double-forking.
@@ -38,8 +43,75 @@ def daemonize():
38
43
  # This process is now fully detached from the original parent and terminal
39
44
 
40
45
 
41
- if __name__ == '__main__':
46
+ def get_pgid_if_leader(pid) -> Optional[int]:
47
+ """Get the process group ID of the target process if it is the leader."""
48
+ try:
49
+ pgid = os.getpgid(pid)
50
+ # Only use process group if the target process is the leader. This is
51
+ # to avoid killing the entire process group while the target process is
52
+ # just a subprocess in the group.
53
+ if pgid == pid:
54
+ print(f'Process group {pgid} is the leader.')
55
+ return pgid
56
+ return None
57
+ except Exception: # pylint: disable=broad-except
58
+ # Process group is only available in UNIX.
59
+ return None
60
+
61
+
62
+ def kill_process_group(pgid: int) -> bool:
63
+ """Kill the target process group."""
64
+ try:
65
+ print(f'Terminating process group {pgid}...')
66
+ os.killpg(pgid, signal.SIGTERM)
67
+ except Exception: # pylint: disable=broad-except
68
+ return False
69
+
70
+ # Wait 30s for the process group to exit gracefully.
71
+ time.sleep(30)
72
+
73
+ try:
74
+ print(f'Force killing process group {pgid}...')
75
+ os.killpg(pgid, signal.SIGKILL)
76
+ except Exception: # pylint: disable=broad-except
77
+ pass
78
+
79
+ return True
80
+
81
+
82
+ def kill_process_tree(process: psutil.Process,
83
+ children: List[psutil.Process]) -> bool:
84
+ """Kill the process tree of the target process."""
85
+ if process is not None:
86
+ # Kill the target process first to avoid having more children, or fail
87
+ # the process due to the children being defunct.
88
+ children = [process] + children
89
+
90
+ if not children:
91
+ sys.exit()
92
+
93
+ for child in children:
94
+ try:
95
+ child.terminate()
96
+ except psutil.NoSuchProcess:
97
+ continue
98
+
99
+ # Wait 30s for the processes to exit gracefully.
100
+ time.sleep(30)
101
+
102
+ # SIGKILL if they're still running.
103
+ for child in children:
104
+ try:
105
+ child.kill()
106
+ except psutil.NoSuchProcess:
107
+ continue
108
+
109
+ return True
110
+
111
+
112
+ def main():
42
113
  daemonize()
114
+
43
115
  parser = argparse.ArgumentParser()
44
116
  parser.add_argument('--parent-pid', type=int, required=True)
45
117
  parser.add_argument('--proc-pid', type=int, required=True)
@@ -72,37 +144,41 @@ if __name__ == '__main__':
72
144
  except (psutil.NoSuchProcess, ValueError):
73
145
  pass
74
146
 
147
+ pgid: Optional[int] = None
148
+ if os.environ.get(USE_KILL_PG_ENV_VAR) == '1':
149
+ # Use kill_pg on UNIX system if allowed to reduce the resource usage.
150
+ # Note that both implementations might leave subprocesses uncancelled:
151
+ # - kill_process_tree(default): a subprocess is able to detach itself
152
+ # from the process tree using the same technique as daemonize(). Also,
153
+ # since we refresh the process tree per second, if the subprocess is
154
+ # launched between the [last_poll, parent_die] interval, the
155
+ # subprocess will not be captured and will not be killed.
156
+ # - kill_process_group: kill_pg will kill all the processes in the group
157
+ # but if a subprocess calls setpgid(0, 0) to detach itself from the
158
+ # process group (usually to daemonize itself), the subprocess will
159
+ # not be killed.
160
+ if process is not None:
161
+ pgid = get_pgid_if_leader(process.pid)
162
+
75
163
  if process is not None and parent_process is not None:
76
164
  # Wait for either parent or target process to exit
77
165
  while process.is_running() and parent_process.is_running():
78
- try:
79
- tmp_children = process.children(recursive=True)
80
- if tmp_children:
81
- children = tmp_children
82
- except psutil.NoSuchProcess:
83
- pass
166
+ if pgid is None:
167
+ # Refresh process tree for cleanup if process group is not
168
+ # available.
169
+ try:
170
+ tmp_children = process.children(recursive=True)
171
+ if tmp_children:
172
+ children = tmp_children
173
+ except psutil.NoSuchProcess:
174
+ pass
84
175
  time.sleep(1)
85
176
 
86
- if process is not None:
87
- # Kill the target process first to avoid having more children, or fail
88
- # the process due to the children being defunct.
89
- children = [process] + children
90
-
91
- if not children:
92
- sys.exit()
93
-
94
- for child in children:
95
- try:
96
- child.terminate()
97
- except psutil.NoSuchProcess:
98
- continue
177
+ if pgid is not None:
178
+ kill_process_group(pgid)
179
+ else:
180
+ kill_process_tree(process, children)
99
181
 
100
- # Wait 30s for the processes to exit gracefully.
101
- time.sleep(30)
102
182
 
103
- # SIGKILL if they're still running.
104
- for child in children:
105
- try:
106
- child.kill()
107
- except psutil.NoSuchProcess:
108
- continue
183
+ if __name__ == '__main__':
184
+ main()
sky/skypilot_config.py CHANGED
@@ -64,7 +64,6 @@ from sqlalchemy import orm
64
64
  from sqlalchemy.dialects import postgresql
65
65
  from sqlalchemy.dialects import sqlite
66
66
  from sqlalchemy.ext import declarative
67
- from sqlalchemy.pool import NullPool
68
67
 
69
68
  from sky import exceptions
70
69
  from sky import sky_logging
@@ -77,6 +76,7 @@ from sky.utils import schemas
77
76
  from sky.utils import ux_utils
78
77
  from sky.utils import yaml_utils
79
78
  from sky.utils.db import db_utils
79
+ from sky.utils.db import migration_utils
80
80
  from sky.utils.kubernetes import config_map_utils
81
81
 
82
82
  if typing.TYPE_CHECKING:
@@ -121,7 +121,8 @@ _PROJECT_CONFIG_PATH = '.sky.yaml'
121
121
 
122
122
  API_SERVER_CONFIG_KEY = 'api_server_config'
123
123
 
124
- _DB_USE_LOCK = threading.Lock()
124
+ _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
125
+ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
125
126
 
126
127
  Base = declarative.declarative_base()
127
128
 
@@ -415,10 +416,17 @@ def local_active_workspace_ctx(workspace: str) -> Iterator[None]:
415
416
  def get_active_workspace(force_user_workspace: bool = False) -> str:
416
417
  context_workspace = getattr(_active_workspace_context, 'workspace', None)
417
418
  if not force_user_workspace and context_workspace is not None:
418
- logger.debug(f'Get context workspace: {context_workspace}')
419
+ logger.debug(f'Got context workspace: {context_workspace}')
419
420
  return context_workspace
420
- return get_nested(keys=('active_workspace',),
421
- default_value=constants.SKYPILOT_DEFAULT_WORKSPACE)
421
+ active_workspace = get_nested(keys=('active_workspace',),
422
+ default_value=None)
423
+ if active_workspace is None:
424
+ logger.debug(f'No active workspace found, using default workspace: '
425
+ f'{constants.SKYPILOT_DEFAULT_WORKSPACE}')
426
+ active_workspace = constants.SKYPILOT_DEFAULT_WORKSPACE
427
+ else:
428
+ logger.debug(f'Got active workspace: {active_workspace}')
429
+ return active_workspace
422
430
 
423
431
 
424
432
  def set_nested(keys: Tuple[str, ...], value: Any) -> Dict[str, Any]:
@@ -474,7 +482,7 @@ def safe_reload_config() -> None:
474
482
  reload_config()
475
483
 
476
484
 
477
- def reload_config() -> None:
485
+ def reload_config(init_db: bool = False) -> None:
478
486
  internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
479
487
  if internal_config_path is not None:
480
488
  # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
@@ -486,7 +494,7 @@ def reload_config() -> None:
486
494
  return
487
495
 
488
496
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
489
- _reload_config_as_server()
497
+ _reload_config_as_server(init_db=init_db)
490
498
  else:
491
499
  _reload_config_as_client()
492
500
 
@@ -557,7 +565,43 @@ def _reload_config_from_internal_file(internal_config_path: str) -> None:
557
565
  _set_loaded_config_path(config_path)
558
566
 
559
567
 
560
- def _reload_config_as_server() -> None:
568
+ def _create_table(engine: sqlalchemy.engine.Engine):
569
+ """Initialize the config database with migrations."""
570
+ migration_utils.safe_alembic_upgrade(
571
+ engine, migration_utils.SKYPILOT_CONFIG_DB_NAME,
572
+ migration_utils.SKYPILOT_CONFIG_VERSION)
573
+
574
+
575
+ def _initialize_and_get_db() -> sqlalchemy.engine.Engine:
576
+ """Initialize and return the config database engine.
577
+
578
+ This function should only be called by the API Server during initialization.
579
+ Client-side code should never call this function.
580
+ """
581
+ assert os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None, (
582
+ 'initialize_and_get_db() can only be called by the API Server')
583
+
584
+ global _SQLALCHEMY_ENGINE
585
+
586
+ if _SQLALCHEMY_ENGINE is not None:
587
+ return _SQLALCHEMY_ENGINE
588
+
589
+ with _SQLALCHEMY_ENGINE_LOCK:
590
+ if _SQLALCHEMY_ENGINE is not None:
591
+ return _SQLALCHEMY_ENGINE
592
+
593
+ # We only store config in the DB when using Postgres,
594
+ # so no need to pass in db_name here.
595
+ engine = db_utils.get_engine(None)
596
+
597
+ # Run migrations if needed
598
+ _create_table(engine)
599
+
600
+ _SQLALCHEMY_ENGINE = engine
601
+ return _SQLALCHEMY_ENGINE
602
+
603
+
604
+ def _reload_config_as_server(init_db: bool = False) -> None:
561
605
  # Reset the global variables, to avoid using stale values.
562
606
  _set_loaded_config(config_utils.Config())
563
607
  _set_loaded_config_path(None)
@@ -573,37 +617,24 @@ def _reload_config_as_server() -> None:
573
617
  raise ValueError(
574
618
  'If db config is specified, no other config is allowed')
575
619
  logger.debug('retrieving config from database')
576
- with _DB_USE_LOCK:
577
- dispose_engine = False
578
- if db_utils.get_max_connections() == 0:
579
- dispose_engine = True
580
- sqlalchemy_engine = sqlalchemy.create_engine(db_url,
581
- poolclass=NullPool)
582
- else:
583
- sqlalchemy_engine = db_utils.get_engine('config')
584
- db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
585
- sqlalchemy_engine)
586
-
587
- def _get_config_yaml_from_db(
588
- key: str) -> Optional[config_utils.Config]:
589
- assert sqlalchemy_engine is not None
590
- with orm.Session(sqlalchemy_engine) as session:
591
- row = session.query(config_yaml_table).filter_by(
592
- key=key).first()
593
- if row:
594
- db_config = config_utils.Config(
595
- yaml_utils.safe_load(row.value))
596
- db_config.pop_nested(('db',), None)
597
- return db_config
598
- return None
599
-
600
- db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
601
- if db_config:
602
- server_config = overlay_skypilot_config(server_config,
603
- db_config)
604
- # Close the engine to avoid connection leaks
605
- if dispose_engine:
606
- sqlalchemy_engine.dispose()
620
+
621
+ if init_db:
622
+ _initialize_and_get_db()
623
+
624
+ def _get_config_yaml_from_db(key: str) -> Optional[config_utils.Config]:
625
+ assert _SQLALCHEMY_ENGINE is not None
626
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
627
+ row = session.query(config_yaml_table).filter_by(
628
+ key=key).first()
629
+ if row:
630
+ db_config = config_utils.Config(yaml_utils.safe_load(row.value))
631
+ db_config.pop_nested(('db',), None)
632
+ return db_config
633
+ return None
634
+
635
+ db_config = _get_config_yaml_from_db(API_SERVER_CONFIG_KEY)
636
+ if db_config:
637
+ server_config = overlay_skypilot_config(server_config, db_config)
607
638
  if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
608
639
  logger.debug(f'server config: \n'
609
640
  f'{yaml_utils.dump_yaml_str(dict(server_config))}')
@@ -659,7 +690,7 @@ def loaded_config_path_serialized() -> Optional[str]:
659
690
 
660
691
 
661
692
  # Load on import, synchronization is guaranteed by python interpreter.
662
- reload_config()
693
+ reload_config(init_db=True)
663
694
 
664
695
 
665
696
  def loaded() -> bool:
@@ -818,7 +849,8 @@ def _compose_cli_config(cli_config: Optional[List[str]]) -> config_utils.Config:
818
849
  except ValueError as e:
819
850
  raise ValueError(f'Invalid config override: {cli_config}. '
820
851
  f'Check if config file exists or if the dotlist '
821
- f'is formatted as: key1=value1,key2=value2') from e
852
+ f'is formatted as: key1=value1,key2=value2.\n'
853
+ f'Details: {e}') from e
822
854
  logger.debug('CLI overrides config syntax check passed.')
823
855
 
824
856
  return parsed_config
@@ -872,44 +904,32 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
872
904
  if new_db_url and new_db_url != existing_db_url:
873
905
  raise ValueError('Cannot change db url while server is running')
874
906
  if existing_db_url:
875
- with _DB_USE_LOCK:
876
- dispose_engine = False
877
- if db_utils.get_max_connections() == 0:
878
- dispose_engine = True
879
- sqlalchemy_engine = sqlalchemy.create_engine(
880
- existing_db_url, poolclass=NullPool)
881
- else:
882
- sqlalchemy_engine = db_utils.get_engine('config')
883
- db_utils.add_all_tables_to_db_sqlalchemy(
884
- Base.metadata, sqlalchemy_engine)
885
-
886
- def _set_config_yaml_to_db(key: str,
887
- config: config_utils.Config):
888
- assert sqlalchemy_engine is not None
889
- config_str = yaml_utils.dump_yaml_str(dict(config))
890
- with orm.Session(sqlalchemy_engine) as session:
891
- if (sqlalchemy_engine.dialect.name ==
892
- db_utils.SQLAlchemyDialect.SQLITE.value):
893
- insert_func = sqlite.insert
894
- elif (sqlalchemy_engine.dialect.name ==
895
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
896
- insert_func = postgresql.insert
897
- else:
898
- raise ValueError('Unsupported database dialect')
899
- insert_stmnt = insert_func(config_yaml_table).values(
900
- key=key, value=config_str)
901
- do_update_stmt = insert_stmnt.on_conflict_do_update(
902
- index_elements=[config_yaml_table.c.key],
903
- set_={config_yaml_table.c.value: config_str})
904
- session.execute(do_update_stmt)
905
- session.commit()
906
-
907
- logger.debug('saving api_server config to db')
908
- _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
909
- db_updated = True
910
- # Close the engine to avoid connection leaks
911
- if dispose_engine:
912
- sqlalchemy_engine.dispose()
907
+
908
+ def _set_config_yaml_to_db(key: str, config: config_utils.Config):
909
+ # reload_config(init_db=True) is called when this module is
910
+ # imported, so the database engine must already be initialized.
911
+ assert _SQLALCHEMY_ENGINE is not None
912
+ config_str = yaml_utils.dump_yaml_str(dict(config))
913
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
914
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
915
+ db_utils.SQLAlchemyDialect.SQLITE.value):
916
+ insert_func = sqlite.insert
917
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
918
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
919
+ insert_func = postgresql.insert
920
+ else:
921
+ raise ValueError('Unsupported database dialect')
922
+ insert_stmnt = insert_func(config_yaml_table).values(
923
+ key=key, value=config_str)
924
+ do_update_stmt = insert_stmnt.on_conflict_do_update(
925
+ index_elements=[config_yaml_table.c.key],
926
+ set_={config_yaml_table.c.value: config_str})
927
+ session.execute(do_update_stmt)
928
+ session.commit()
929
+
930
+ logger.debug('saving api_server config to db')
931
+ _set_config_yaml_to_db(API_SERVER_CONFIG_KEY, config)
932
+ db_updated = True
913
933
 
914
934
  if not db_updated:
915
935
  # save to the local file (PVC in Kubernetes, local file otherwise)
@@ -0,0 +1,12 @@
1
+ """Constants for SSH Node Pools"""
2
+ # pylint: disable=line-too-long
3
+ import os
4
+
5
+ DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
6
+ SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
7
+ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
8
+ NODE_POOLS_KEY_DIR = os.path.expanduser('~/.sky/ssh_keys')
9
+ DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
10
+
11
+ # TODO (kyuds): make this configurable?
12
+ K3S_TOKEN = 'mytoken' # Any string can be used as the token
@@ -1,10 +1,15 @@
1
1
  """SSH Node Pool management core functionality."""
2
2
  import os
3
3
  from pathlib import Path
4
- from typing import Any, Dict, List
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  import yaml
7
7
 
8
+ from sky import clouds
9
+ from sky.ssh_node_pools import constants
10
+ from sky.ssh_node_pools import deploy
11
+ from sky.usage import usage_lib
12
+ from sky.utils import common_utils
8
13
  from sky.utils import yaml_utils
9
14
 
10
15
 
@@ -12,8 +17,8 @@ class SSHNodePoolManager:
12
17
  """Manager for SSH Node Pool configurations."""
13
18
 
14
19
  def __init__(self):
15
- self.config_path = Path.home() / '.sky' / 'ssh_node_pools.yaml'
16
- self.keys_dir = Path.home() / '.sky' / 'ssh_keys'
20
+ self.config_path = Path(constants.DEFAULT_SSH_NODE_POOLS_PATH)
21
+ self.keys_dir = Path(constants.NODE_POOLS_KEY_DIR)
17
22
  self.keys_dir.mkdir(parents=True, exist_ok=True)
18
23
 
19
24
  def get_all_pools(self) -> Dict[str, Any]:
@@ -133,3 +138,35 @@ def list_ssh_keys() -> List[str]:
133
138
  """List available SSH keys."""
134
139
  manager = SSHNodePoolManager()
135
140
  return manager.list_ssh_keys()
141
+
142
+
143
+ @usage_lib.entrypoint
144
+ def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
145
+ """Deploys or tears down a Kubernetes cluster on SSH targets.
146
+
147
+ Args:
148
+ infra: Name of the cluster configuration in ssh_node_pools.yaml.
149
+ If None, the first cluster in the file is used.
150
+ cleanup: If True, clean up the cluster instead of deploying.
151
+ """
152
+ deploy.run(cleanup=cleanup, infra=infra)
153
+
154
+
155
+ @usage_lib.entrypoint
156
+ def ssh_status(context_name: str) -> Tuple[bool, str]:
157
+ """Check the status of an SSH Node Pool context.
158
+
159
+ Args:
160
+ context_name: The SSH context name (e.g., 'ssh-my-cluster')
161
+
162
+ Returns:
163
+ Tuple[bool, str]: (is_ready, reason)
164
+ - is_ready: True if the SSH Node Pool is ready, False otherwise
165
+ - reason: Explanation of the status
166
+ """
167
+ try:
168
+ is_ready, reason = clouds.SSH.check_single_context(context_name)
169
+ return is_ready, reason
170
+ except Exception as e: # pylint: disable=broad-except
171
+ return False, ('Failed to check SSH context: '
172
+ f'{common_utils.format_exception(e)}')
@@ -0,0 +1,4 @@
1
+ """Module for Deploying SSH Node Pools"""
2
+ from sky.ssh_node_pools.deploy.deploy import run
3
+
4
+ __all__ = ['run']