skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/utils/annotations.py CHANGED
@@ -1,13 +1,20 @@
1
1
  """Annotations for public APIs."""
2
2
 
3
3
  import functools
4
- from typing import Callable, Literal, TypeVar
4
+ import threading
5
+ import time
6
+ from typing import Callable, List, Literal, TypeVar
7
+ import weakref
5
8
 
9
+ import cachetools
6
10
  from typing_extensions import ParamSpec
7
11
 
8
12
  # Whether the current process is a SkyPilot API server process.
9
13
  is_on_api_server = True
10
- _FUNCTIONS_NEED_RELOAD_CACHE = []
14
+ _FUNCTIONS_NEED_RELOAD_CACHE_LOCK = threading.Lock()
15
+ # Caches can be thread-local, use weakref to avoid blocking the GC when the
16
+ # thread is destroyed.
17
+ _FUNCTIONS_NEED_RELOAD_CACHE: List[weakref.ReferenceType] = []
11
18
 
12
19
  T = TypeVar('T')
13
20
  P = ParamSpec('P')
@@ -29,6 +36,94 @@ def client_api(func: Callable[P, T]) -> Callable[P, T]:
29
36
  return wrapper
30
37
 
31
38
 
39
+ def _register_functions_need_reload_cache(func: Callable) -> Callable:
40
+ """Register a cached function that needs to be reloaded for a new request.
41
+
42
+ The function will be registered as a weak reference to avoid blocking GC.
43
+ """
44
+ assert hasattr(func, 'cache_clear'), f'{func.__name__} is not cacheable'
45
+ wrapped_fn = func
46
+ try:
47
+ func_ref = weakref.ref(func)
48
+ except TypeError:
49
+ # The function might not be weakrefable (e.g. functools.lru_cache),
50
+ # wrap it in this case.
51
+ @functools.wraps(func)
52
+ def wrapper(*args, **kwargs):
53
+ return func(*args, **kwargs)
54
+
55
+ wrapper.cache_clear = func.cache_clear # type: ignore[attr-defined]
56
+ func_ref = weakref.ref(wrapper)
57
+ wrapped_fn = wrapper
58
+ with _FUNCTIONS_NEED_RELOAD_CACHE_LOCK:
59
+ _FUNCTIONS_NEED_RELOAD_CACHE.append(func_ref)
60
+ return wrapped_fn
61
+
62
+
63
+ class ThreadLocalTTLCache(threading.local):
64
+ """Thread-local storage for the thread_local_ttl_cache decorator."""
65
+
66
+ def __init__(self, func, maxsize: int, ttl: int):
67
+ super().__init__()
68
+ self.func = func
69
+ self.maxsize = maxsize
70
+ self.ttl = ttl
71
+
72
+ def get_cache(self):
73
+ if not hasattr(self, 'cache'):
74
+ self.cache = ttl_cache(scope='request',
75
+ maxsize=self.maxsize,
76
+ ttl=self.ttl,
77
+ timer=time.time)(self.func)
78
+ return self.cache
79
+
80
+ def __del__(self):
81
+ if hasattr(self, 'cache'):
82
+ self.cache.cache_clear()
83
+ self.cache = None
84
+
85
+
86
+ def thread_local_ttl_cache(maxsize=32, ttl=60 * 55):
87
+ """Thread-local TTL cache decorator.
88
+
89
+ Args:
90
+ maxsize: Maximum size of the cache.
91
+ ttl: Time to live for the cache in seconds.
92
+ Default is 55 minutes, a bit less than 1 hour
93
+ default lifetime of an STS token.
94
+ """
95
+
96
+ def decorator(func):
97
+ # Create thread-local storage for the TTL cache
98
+ local_cache = ThreadLocalTTLCache(func, maxsize, ttl)
99
+
100
+ # We can't apply the ttl_cache here, because this runs at import time
101
+ # so we will always have the main thread's cache.
102
+
103
+ @functools.wraps(func)
104
+ def wrapper(*args, **kwargs):
105
+ # We are within the actual function call, which may be on a thread,
106
+ # so local_cache.get_cache() will return the correct thread-local cache,
107
+ # which we can now apply and immediately call.
108
+ return local_cache.get_cache()(*args, **kwargs)
109
+
110
+ def cache_info():
111
+ # Note that this will only give the cache info for the current
112
+ # thread's cache.
113
+ return local_cache.get_cache().cache_info()
114
+
115
+ def cache_clear():
116
+ # Note that this will only clear the cache for the current thread.
117
+ local_cache.get_cache().cache_clear()
118
+
119
+ wrapper.cache_info = cache_info # type: ignore[attr-defined]
120
+ wrapper.cache_clear = cache_clear # type: ignore[attr-defined]
121
+
122
+ return wrapper
123
+
124
+ return decorator
125
+
126
+
32
127
  def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
33
128
  **lru_cache_kwargs) -> Callable:
34
129
  """LRU cache decorator for functions.
@@ -50,13 +145,40 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
50
145
  else:
51
146
  cached_func = functools.lru_cache(*lru_cache_args,
52
147
  **lru_cache_kwargs)(func)
53
- _FUNCTIONS_NEED_RELOAD_CACHE.append(cached_func)
54
- return cached_func
148
+ return _register_functions_need_reload_cache(cached_func)
149
+
150
+ return decorator
151
+
152
+
153
+ def ttl_cache(scope: Literal['global', 'request'], *ttl_cache_args,
154
+ **ttl_cache_kwargs) -> Callable:
155
+ """TTLCache decorator for functions.
156
+
157
+ This decorator allows us to track which functions need to be reloaded for a
158
+ new request using the scope argument.
159
+ """
160
+
161
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
162
+ if scope == 'global':
163
+ return cachetools.cached(
164
+ cachetools.TTLCache(*ttl_cache_args, **ttl_cache_kwargs))(func)
165
+ else:
166
+ cached_func = cachetools.cached(
167
+ cachetools.TTLCache(*ttl_cache_args, **ttl_cache_kwargs))(func)
168
+ return _register_functions_need_reload_cache(cached_func)
55
169
 
56
170
  return decorator
57
171
 
58
172
 
59
173
  def clear_request_level_cache():
60
174
  """Clear the request-level cache."""
61
- for func in _FUNCTIONS_NEED_RELOAD_CACHE:
62
- func.cache_clear()
175
+ alive_entries = []
176
+ with _FUNCTIONS_NEED_RELOAD_CACHE_LOCK:
177
+ for entry in _FUNCTIONS_NEED_RELOAD_CACHE:
178
+ func = entry()
179
+ if func is None:
180
+ # Has been GC'ed, drop the reference.
181
+ continue
182
+ func.cache_clear()
183
+ alive_entries.append(entry)
184
+ _FUNCTIONS_NEED_RELOAD_CACHE[:] = alive_entries
@@ -0,0 +1,78 @@
1
+ """Asyncio utilities."""
2
+
3
+ import asyncio
4
+ import functools
5
+ from typing import Set
6
+
7
+ _background_tasks: Set[asyncio.Task] = set()
8
+
9
+
10
+ def shield(func):
11
+ """Shield the decorated async function from cancellation.
12
+
13
+ If the outer coroutine is cancelled, the inner decorated function
14
+ will be protected from cancellation by asyncio.shield(). And we will
15
+ maintain a reference to the inner task to avoid it getting GCed before
16
+ it is done.
17
+
18
+ For example, filelock.AsyncFileLock is not cancellation safe. The
19
+ following code:
20
+
21
+ async def fn_with_lock():
22
+ async with filelock.AsyncFileLock('lock'):
23
+ await asyncio.sleep(1)
24
+
25
+ is equivalent to:
26
+
27
+ # The lock may leak if the cancellation happens in
28
+ # lock.acquire() or lock.release()
29
+ async def fn_with_lock():
30
+ lock = filelock.AsyncFileLock('lock')
31
+ await lock.acquire()
32
+ try:
33
+ await asyncio.sleep(1)
34
+ finally:
35
+ await lock.release()
36
+
37
+ Shielding the function ensures that no cancellation will happen in the
38
+ function, thus the lock will be released properly:
39
+
40
+ @shield
41
+ async def fn_with_lock()
42
+
43
+ Note that the resource acquisition and release should usually be protected
44
+ in one @shield block but not separately, e.g.:
45
+
46
+ lock = filelock.AsyncFileLock('lock')
47
+
48
+ @shield
49
+ async def acquire():
50
+ await lock.acquire()
51
+
52
+ @shield
53
+ async def release():
54
+ await lock.release()
55
+
56
+ async def fn_with_lock():
57
+ await acquire()
58
+ try:
59
+ do_something()
60
+ finally:
61
+ await release()
62
+
63
+ The above code is not safe because if `fn_with_lock` is cancelled,
64
+ `acquire()` and `release()` will be executed in the background
65
+ concurrently and causes race conditions.
66
+ """
67
+
68
+ @functools.wraps(func)
69
+ async def async_wrapper(*args, **kwargs):
70
+ task = asyncio.create_task(func(*args, **kwargs))
71
+ try:
72
+ return await asyncio.shield(task)
73
+ except asyncio.CancelledError:
74
+ _background_tasks.add(task)
75
+ task.add_done_callback(lambda _: _background_tasks.discard(task))
76
+ raise
77
+
78
+ return async_wrapper
sky/utils/atomic.py CHANGED
@@ -1,4 +1,4 @@
1
- """Atomic structures and utilties."""
1
+ """Atomic structures and utilities."""
2
2
 
3
3
  import threading
4
4
 
@@ -0,0 +1,153 @@
1
+ """Utils for managing SkyPilot SSH key pairs."""
2
+
3
+ import functools
4
+ import os
5
+ from typing import Tuple
6
+
7
+ import filelock
8
+
9
+ from sky import global_user_state
10
+ from sky import sky_logging
11
+ from sky.utils import common_utils
12
+
13
+ logger = sky_logging.init_logger(__name__)
14
+
15
+ MAX_TRIALS = 64
16
+ # TODO(zhwu): Support user specified key pair.
17
+ # We intentionally do not store the ssh key pair in
18
+ # ~/.sky/api_server/clients, i.e. sky.server.common.API_SERVER_CLIENT_DIR,
19
+ # because the ssh key pair needs to persist across API server restarts, while
20
+ # the former dir is ephemeral.
21
+ _SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
22
+
23
+
24
def get_ssh_key_and_lock_path(user_hash: str) -> Tuple[str, str, str]:
    """Return the SSH key and lock file paths for the given user.

    As a side effect, the user's key directory is created (mode 0o700) if it
    does not exist yet.

    Args:
        user_hash: Hash identifying the user; substituted into
            ``_SSH_KEY_PATH_PREFIX``.

    Returns:
        A ``(private_key_path, public_key_path, lock_path)`` tuple. The paths
        are NOT user-expanded (they keep the prefix's leading ``~``); callers
        must apply ``os.path.expanduser`` before touching the filesystem.
    """
    key_dir = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
    # Only the directory creation needs the expanded form of the path.
    os.makedirs(os.path.expanduser(key_dir), mode=0o700, exist_ok=True)

    file_names = ('sky-key', 'sky-key.pub', '.__internal-sky-key.lock')
    private_key_path, public_key_path, lock_path = (
        os.path.join(key_dir, file_name) for file_name in file_names)
    return private_key_path, public_key_path, lock_path
34
+
35
+
36
def _generate_rsa_key_pair() -> Tuple[str, str]:
    """Generate a new 2048-bit RSA key pair.

    Returns:
        A ``(public_key, private_key)`` tuple of stripped UTF-8 strings.
        Note the order: the public key comes FIRST. The public key is
        serialized in OpenSSH format; the private key is an unencrypted PEM
        in the TraditionalOpenSSL format.
    """
    # Keep the cryptography imports local so that this expensive third-party
    # package is only loaded when a key actually has to be generated.
    # pylint: disable=import-outside-toplevel
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization
    from cryptography.hazmat.primitives.asymmetric import rsa

    rsa_key = rsa.generate_private_key(public_exponent=65537,
                                       key_size=2048,
                                       backend=default_backend())

    private_pem = rsa_key.private_bytes(
        encoding=serialization.Encoding.PEM,
        format=serialization.PrivateFormat.TraditionalOpenSSL,
        encryption_algorithm=serialization.NoEncryption())
    private_key = private_pem.decode('utf-8').strip()

    public_openssh = rsa_key.public_key().public_bytes(
        serialization.Encoding.OpenSSH, serialization.PublicFormat.OpenSSH)
    public_key = public_openssh.decode('utf-8').strip()

    return public_key, private_key
59
+
60
+
61
def _save_key_pair(private_key_path: str, public_key_path: str,
                   private_key: str, public_key: str) -> None:
    """Write the private and public keys to disk.

    The parent directory of ``private_key_path`` is created with mode 0o700
    if it is missing. The private key file is opened with creation mode
    0o600 and the public key file with 0o644.

    Args:
        private_key_path: Destination path of the private key.
        public_key_path: Destination path of the public key.
        private_key: Private key contents.
        public_key: Public key contents.
    """
    os.makedirs(os.path.dirname(private_key_path), mode=0o700, exist_ok=True)

    # (destination, contents, creation mode); the private key is written first.
    targets = (
        (private_key_path, private_key, 0o600),
        (public_key_path, public_key, 0o644),
    )
    for destination, contents, file_mode in targets:
        # The opener passes the creation mode through to os.open().
        opener = functools.partial(os.open, mode=file_mode)
        with open(destination, 'w', encoding='utf-8',
                  opener=opener) as key_file:
            key_file.write(contents)
79
+
80
+
81
def get_or_generate_keys() -> Tuple[str, str]:
    """Return the current user's SSH key paths, creating the keys if needed.

    If the private key file is missing, the key pair is loaded from
    ``global_user_state``; if no pair is stored there either, a new RSA pair
    is generated and stored before being written to disk. The check and the
    write happen under a file lock (10 second timeout).

    Returns:
        A ``(private_key_path, public_key_path)`` tuple of user-expanded
        paths.

    Raises:
        AssertionError: If the public key file does not exist once the
            private key is in place.
    """
    user_hash = common_utils.get_user_hash()
    private_key_path, public_key_path, lock_path = (
        os.path.expanduser(path)
        for path in get_ssh_key_and_lock_path(user_hash))

    # The lock file lives next to the keys; keep that directory private
    # (0o700).
    os.makedirs(os.path.dirname(lock_path), mode=0o700, exist_ok=True)
    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            public_key, private_key, found_in_db = (
                global_user_state.get_ssh_keys(user_hash))
            if not found_in_db:
                # _generate_rsa_key_pair() returns (public, private).
                public_key, private_key = _generate_rsa_key_pair()
                global_user_state.set_ssh_keys(user_hash, public_key,
                                               private_key)
            _save_key_pair(private_key_path, public_key_path, private_key,
                           public_key)
    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.')
    return private_key_path, public_key_path
109
+
110
+
111
def create_ssh_key_files_from_db(private_key_path: str) -> bool:
    """Materialize the SSH key files on disk from the database.

    The user hash is taken from ``private_key_path``, which must have the
    form ``<prefix>/<user_hash>/ssh/sky-key`` and must equal the
    user-expanded path produced by ``get_ssh_key_and_lock_path`` for that
    hash.

    Args:
        private_key_path: User-expanded path of the private key file.

    Returns:
        True if both key files exist when the function returns; False if the
        private key file is missing and no key pair is stored for the user.

    Raises:
        AssertionError: If the path does not have the expected layout, or if
            the public key file is missing although the private key exists.
    """
    # Expected layout: ~/.sky/clients/<user_hash>/ssh/sky-key
    path_parts = os.path.normpath(private_key_path).split(os.path.sep)
    assert path_parts[-1] == 'sky-key'
    assert path_parts[-2] == 'ssh'
    user_hash = path_parts[-3]

    canonical_private, canonical_public, canonical_lock = (
        get_ssh_key_and_lock_path(user_hash))
    expected_private_key_path = os.path.expanduser(canonical_private)
    assert private_key_path == expected_private_key_path, (
        f'Private key path {private_key_path} does not '
        'match the generated path '
        f'{expected_private_key_path}')
    private_key_path = os.path.expanduser(private_key_path)
    public_key_path = os.path.expanduser(canonical_public)
    lock_path = os.path.expanduser(canonical_lock)

    # Fast path: nothing to do (and no lock needed) when both files exist.
    if all(
            os.path.exists(key_file)
            for key_file in (private_key_path, public_key_path)):
        return True

    # The lock file lives next to the keys; keep that directory private
    # (0o700).
    os.makedirs(os.path.dirname(lock_path), mode=0o700, exist_ok=True)
    with filelock.FileLock(lock_path, timeout=10):
        if not os.path.exists(private_key_path):
            public_key, private_key, found_in_db = (
                global_user_state.get_ssh_keys(user_hash))
            if not found_in_db:
                logger.debug(f'SSH keys not found for user {user_hash}')
                return False
            _save_key_pair(private_key_path, public_key_path, private_key,
                           public_key)
    assert os.path.exists(public_key_path), (
        'Private key found, but associated public key '
        f'{public_key_path} does not exist.')
    return True
@@ -11,6 +11,7 @@ from sky.utils import common_utils
11
11
  from sky.utils import log_utils
12
12
  from sky.utils import resources_utils
13
13
  from sky.utils import status_lib
14
+ from sky.utils import ux_utils
14
15
 
15
16
  if typing.TYPE_CHECKING:
16
17
  from sky.provision.kubernetes import utils as kubernetes_utils
@@ -105,11 +106,9 @@ def show_status_table(cluster_records: List[responses.StatusResponse],
105
106
 
106
107
  if query_clusters:
107
108
  cluster_names = {record['name'] for record in cluster_records}
108
- not_found_clusters = [
109
- repr(cluster)
110
- for cluster in query_clusters
111
- if cluster not in cluster_names
112
- ]
109
+ not_found_clusters = ux_utils.get_non_matched_query(
110
+ query_clusters, cluster_names)
111
+ not_found_clusters = [repr(cluster) for cluster in not_found_clusters]
113
112
  if not_found_clusters:
114
113
  cluster_str = 'Cluster'
115
114
  if len(not_found_clusters) > 1:
@@ -283,8 +282,14 @@ def _get_resources(cluster_record: _ClusterRecord,
283
282
  if resources_str_full is not None:
284
283
  resources_str = resources_str_full
285
284
  if resources_str is None:
286
- resources_str = resources_utils.get_readable_resources_repr(
287
- handle, simplify=truncate)
285
+ resources_str_simple, resources_str_full = (
286
+ resources_utils.get_readable_resources_repr(
287
+ handle, simplified_only=truncate))
288
+ if truncate:
289
+ resources_str = resources_str_simple
290
+ else:
291
+ assert resources_str_full is not None
292
+ resources_str = resources_str_full
288
293
 
289
294
  return resources_str
290
295
  return '-'
@@ -144,6 +144,9 @@ class SSHConfigHelper(object):
144
144
  username = docker_user
145
145
 
146
146
  key_path = cls.generate_local_key_file(cluster_name, auth_config)
147
+ # Keep the unexpanded path for SSH config (with ~)
148
+ key_path_for_config = key_path
149
+ # Expand the path for internal operations that need absolute path
147
150
  key_path = os.path.expanduser(key_path)
148
151
  sky_autogen_comment = ('# Added by sky (use `sky stop/down '
149
152
  f'{cluster_name}` to remove)')
@@ -190,11 +193,29 @@ class SSHConfigHelper(object):
190
193
  proxy_command = auth_config.get('ssh_proxy_command', None)
191
194
 
192
195
  docker_proxy_command_generator = None
196
+ proxy_command_for_nodes = proxy_command
193
197
  if docker_user is not None:
194
- docker_proxy_command_generator = lambda ip, port: ' '.join(
195
- ['ssh'] + command_runner.ssh_options_list(
196
- key_path, ssh_control_name=None, port=port) +
197
- ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
198
+
199
+ def _docker_proxy_cmd(ip: str, port: int) -> str:
200
+ inner_proxy = proxy_command
201
+ inner_port = port or 22
202
+ if inner_proxy is not None:
203
+ inner_proxy = inner_proxy.replace('%h', ip)
204
+ inner_proxy = inner_proxy.replace('%p', str(inner_port))
205
+ return ' '.join(['ssh'] + command_runner.ssh_options_list(
206
+ key_path,
207
+ ssh_control_name=None,
208
+ ssh_proxy_command=inner_proxy,
209
+ port=inner_port,
210
+ # ProxyCommand (ssh -W) is a forwarding tunnel, not an
211
+ # interactive session. ControlMaster would cache these
212
+ # processes, causing them to hang and block subsequent
213
+ # connections. Each ProxyCommand should be ephemeral.
214
+ disable_control_master=True
215
+ ) + ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
216
+
217
+ docker_proxy_command_generator = _docker_proxy_cmd
218
+ proxy_command_for_nodes = None
198
219
 
199
220
  codegen = ''
200
221
  # Add the nodes to the codegen
@@ -208,8 +229,9 @@ class SSHConfigHelper(object):
208
229
  node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
209
230
  # TODO(romilb): Update port number when k8s supports multinode
210
231
  codegen += cls._get_generated_config(
211
- sky_autogen_comment, node_name, ip, username, key_path,
212
- proxy_command, port, docker_proxy_command) + '\n'
232
+ sky_autogen_comment, node_name, ip, username,
233
+ key_path_for_config, proxy_command_for_nodes, port,
234
+ docker_proxy_command) + '\n'
213
235
 
214
236
  cluster_config_path = os.path.expanduser(
215
237
  cls.ssh_cluster_path.format(cluster_name))