skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -3,6 +3,7 @@ import enum
3
3
  import hashlib
4
4
  import os
5
5
  import pathlib
6
+ import re
6
7
  import shlex
7
8
  import sys
8
9
  import time
@@ -13,6 +14,7 @@ from sky import exceptions
13
14
  from sky import sky_logging
14
15
  from sky.skylet import constants
15
16
  from sky.skylet import log_lib
17
+ from sky.utils import auth_utils
16
18
  from sky.utils import common_utils
17
19
  from sky.utils import context_utils
18
20
  from sky.utils import control_master_utils
@@ -22,6 +24,9 @@ from sky.utils import timeline
22
24
 
23
25
  logger = sky_logging.init_logger(__name__)
24
26
 
27
+ # Pattern to extract home directory from command output
28
+ _HOME_DIR_PATTERN = re.compile(r'SKYPILOT_HOME_DIR: ([^\s\n]+)')
29
+
25
30
  # Rsync options
26
31
  # TODO(zhwu): This will print a per-file progress bar (with -P),
27
32
  # shooting a lot of messages to the output. --info=progress2 is used
@@ -58,6 +63,22 @@ def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
58
63
  return path
59
64
 
60
65
 
66
+ def _is_skypilot_managed_key(key_path: str) -> bool:
67
+ """Check if SSH key follows SkyPilot's managed key format.
68
+
69
+ SkyPilot-managed keys follow the pattern: ~/.sky/clients/<hash>/ssh/sky-key
70
+ External keys (like ~/.ssh/id_rsa) do not follow this pattern.
71
+
72
+ Args:
73
+ key_path: Path to the SSH private key.
74
+
75
+ Returns:
76
+ True if the key follows SkyPilot's managed format, False otherwise.
77
+ """
78
+ parts = os.path.normpath(key_path).split(os.path.sep)
79
+ return len(parts) >= 2 and parts[-1] == 'sky-key' and parts[-2] == 'ssh'
80
+
81
+
61
82
  # Disable sudo for root user. This is useful when the command is running in a
62
83
  # docker container, i.e. image_id is a docker image.
63
84
  ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD = (
@@ -183,17 +204,25 @@ class CommandRunner:
183
204
  return '-'.join(str(x) for x in self.node)
184
205
 
185
206
  def _get_remote_home_dir(self) -> str:
186
- # Use `echo ~` to get the remote home directory, instead of pwd or
187
- # echo $HOME, because pwd can be `/` when the remote user is root
188
- # and $HOME is not always set.
189
- rc, remote_home_dir, stderr = self.run('echo ~',
190
- require_outputs=True,
191
- separate_stderr=True,
192
- stream_logs=False)
207
+ # Use pattern matching to extract home directory.
208
+ # Some container images print MOTD when login shells start, which can
209
+ # contaminate command output. We use a unique pattern to extract the
210
+ # actual home directory reliably.
211
+ rc, output, stderr = self.run('echo "SKYPILOT_HOME_DIR: $(echo ~)"',
212
+ require_outputs=True,
213
+ separate_stderr=True,
214
+ stream_logs=False)
193
215
  if rc != 0:
194
216
  raise ValueError('Failed to get remote home directory: '
195
- f'{remote_home_dir + stderr}')
196
- remote_home_dir = remote_home_dir.strip()
217
+ f'{output + stderr}')
218
+
219
+ # Extract home directory using pattern matching
220
+ home_dir_match = _HOME_DIR_PATTERN.search(output)
221
+ if home_dir_match:
222
+ remote_home_dir = home_dir_match.group(1)
223
+ else:
224
+ raise ValueError('Failed to find remote home directory identifier: '
225
+ f'{output + stderr}')
197
226
  return remote_home_dir
198
227
 
199
228
  def _get_command_to_run(
@@ -414,7 +443,6 @@ class CommandRunner:
414
443
  SkyPilot but we still want to get rid of some warning messages,
415
444
  such as SSH warnings.
416
445
 
417
-
418
446
  Returns:
419
447
  returncode
420
448
  or
@@ -469,15 +497,19 @@ class CommandRunner:
469
497
  """Close the cached connection to the remote machine."""
470
498
  pass
471
499
 
472
- def port_forward_command(self,
473
- port_forward: List[Tuple[int, int]],
474
- connect_timeout: int = 1) -> List[str]:
500
+ def port_forward_command(
501
+ self,
502
+ port_forward: List[Tuple[int, int]],
503
+ connect_timeout: int = 1,
504
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
475
505
  """Command for forwarding ports from localhost to the remote machine.
476
506
 
477
507
  Args:
478
508
  port_forward: A list of ports to forward from the localhost to the
479
509
  remote host.
480
510
  connect_timeout: The timeout for the connection.
511
+ ssh_mode: The mode to use for ssh.
512
+ See SshMode for more details.
481
513
  """
482
514
  raise NotImplementedError
483
515
 
@@ -587,16 +619,17 @@ class SSHCommandRunner(CommandRunner):
587
619
  self,
588
620
  node: Tuple[str, int],
589
621
  ssh_user: str,
590
- ssh_private_key: str,
622
+ ssh_private_key: Optional[str],
591
623
  ssh_control_name: Optional[str] = '__default__',
592
624
  ssh_proxy_command: Optional[str] = None,
593
625
  docker_user: Optional[str] = None,
594
626
  disable_control_master: Optional[bool] = False,
627
+ port_forward_execute_remote_command: Optional[bool] = False,
595
628
  ):
596
629
  """Initialize SSHCommandRunner.
597
630
 
598
631
  Example Usage:
599
- runner = SSHCommandRunner(ip, ssh_user, ssh_private_key)
632
+ runner = SSHCommandRunner((ip, port), ssh_user, ssh_private_key)
600
633
  runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
601
634
  runner.rsync(source, target, up=True)
602
635
 
@@ -618,6 +651,10 @@ class SSHCommandRunner(CommandRunner):
618
651
  disable_control_master: bool; specifies either or not the ssh
619
652
  command will utilize ControlMaster. We currently disable
620
653
  it for k8s instance.
654
+ port_forward_execute_remote_command: bool; specifies whether to
655
+ omit -N from the port forwarding command. This is useful if you
656
+ want to run a command on the remote machine to make sure the
657
+ SSH tunnel is established.
621
658
  """
622
659
  super().__init__(node)
623
660
  ip, port = node
@@ -629,39 +666,72 @@ class SSHCommandRunner(CommandRunner):
629
666
  self.disable_control_master = (
630
667
  disable_control_master or
631
668
  control_master_utils.should_disable_control_master())
669
+ # Ensure SSH key is available. For SkyPilot-managed keys, create from
670
+ # database. For external keys (e.g., Slurm clusters), verify existence.
671
+ if ssh_private_key is not None and _is_skypilot_managed_key(
672
+ ssh_private_key):
673
+ auth_utils.create_ssh_key_files_from_db(ssh_private_key)
674
+ elif ssh_private_key is not None:
675
+ # Externally managed key - just verify it exists
676
+ expanded_key_path = os.path.expanduser(ssh_private_key)
677
+ if not os.path.exists(expanded_key_path):
678
+ raise FileNotFoundError(
679
+ f'SSH private key not found: {expanded_key_path}')
632
680
  if docker_user is not None:
633
681
  assert port is None or port == 22, (
634
682
  f'port must be None or 22 for docker_user, got {port}.')
635
- # Already checked in resources
636
- assert ssh_proxy_command is None, (
637
- 'ssh_proxy_command is not supported when using docker.')
683
+ # When connecting via docker, the outer SSH hop points to the
684
+ # container's sshd (localhost). Preserve the user proxy for the
685
+ # inner hop that reaches the host VM, and clear the outer proxy to
686
+ # avoid forwarding localhost through the jump host.
687
+ inner_proxy_command = ssh_proxy_command
688
+ inner_proxy_port = port or 22
689
+ self._ssh_proxy_command = None
638
690
  self.ip = 'localhost'
639
691
  self.ssh_user = docker_user
640
692
  self.port = constants.DEFAULT_DOCKER_PORT
693
+ if inner_proxy_command is not None:
694
+ # Replace %h/%p placeholders with actual host values, since the
695
+ # final destination from the perspective of the user proxy is
696
+ # the host VM (ip, inner_proxy_port).
697
+ inner_proxy_command = inner_proxy_command.replace('%h', ip)
698
+ inner_proxy_command = inner_proxy_command.replace(
699
+ '%p', str(inner_proxy_port))
641
700
  self._docker_ssh_proxy_command = lambda ssh: ' '.join(
642
- ssh + ssh_options_list(ssh_private_key, None
643
- ) + ['-W', '%h:%p', f'{ssh_user}@{ip}'])
701
+ ssh + ssh_options_list(ssh_private_key,
702
+ None,
703
+ ssh_proxy_command=inner_proxy_command,
704
+ port=inner_proxy_port,
705
+ disable_control_master=self.
706
+ disable_control_master) +
707
+ ['-W', '%h:%p', f'{ssh_user}@{ip}'])
644
708
  else:
645
709
  self.ip = ip
646
710
  self.ssh_user = ssh_user
647
711
  self.port = port
648
712
  self._docker_ssh_proxy_command = None
713
+ self.port_forward_execute_remote_command = (
714
+ port_forward_execute_remote_command)
649
715
 
650
- def port_forward_command(self,
651
- port_forward: List[Tuple[int, int]],
652
- connect_timeout: int = 1) -> List[str]:
716
+ def port_forward_command(
717
+ self,
718
+ port_forward: List[Tuple[int, int]],
719
+ connect_timeout: int = 1,
720
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
653
721
  """Command for forwarding ports from localhost to the remote machine.
654
722
 
655
723
  Args:
656
724
  port_forward: A list of ports to forward from the local port to the
657
725
  remote port.
658
726
  connect_timeout: The timeout for the ssh connection.
727
+ ssh_mode: The mode to use for ssh.
728
+ See SshMode for more details.
659
729
 
660
730
  Returns:
661
731
  The command for forwarding ports from localhost to the remote
662
732
  machine.
663
733
  """
664
- return self.ssh_base_command(ssh_mode=SshMode.INTERACTIVE,
734
+ return self.ssh_base_command(ssh_mode=ssh_mode,
665
735
  port_forward=port_forward,
666
736
  connect_timeout=connect_timeout)
667
737
 
@@ -680,7 +750,11 @@ class SSHCommandRunner(CommandRunner):
680
750
  for local, remote in port_forward:
681
751
  logger.debug(
682
752
  f'Forwarding local port {local} to remote port {remote}.')
683
- ssh += ['-NL', f'{local}:localhost:{remote}']
753
+ if self.port_forward_execute_remote_command:
754
+ ssh += ['-L']
755
+ else:
756
+ ssh += ['-NL']
757
+ ssh += [f'{local}:localhost:{remote}']
684
758
  if self._docker_ssh_proxy_command is not None:
685
759
  docker_ssh_proxy_command = self._docker_ssh_proxy_command(ssh)
686
760
  else:
@@ -818,6 +892,7 @@ class SSHCommandRunner(CommandRunner):
818
892
  log_path: str = os.devnull,
819
893
  stream_logs: bool = True,
820
894
  max_retry: int = 1,
895
+ get_remote_home_dir: Callable[[], str] = lambda: '~',
821
896
  ) -> None:
822
897
  """Uses 'rsync' to sync 'source' to 'target'.
823
898
 
@@ -830,6 +905,8 @@ class SSHCommandRunner(CommandRunner):
830
905
  stream_logs: Stream logs to the stdout/stderr.
831
906
  max_retry: The maximum number of retries for the rsync command.
832
907
  This value should be non-negative.
908
+ get_remote_home_dir: A callable that returns the remote home
909
+ directory. Defaults to '~'.
833
910
 
834
911
  Raises:
835
912
  exceptions.CommandError: rsync command failed.
@@ -854,7 +931,8 @@ class SSHCommandRunner(CommandRunner):
854
931
  rsh_option=rsh_option,
855
932
  log_path=log_path,
856
933
  stream_logs=stream_logs,
857
- max_retry=max_retry)
934
+ max_retry=max_retry,
935
+ get_remote_home_dir=get_remote_home_dir)
858
936
 
859
937
 
860
938
  class KubernetesCommandRunner(CommandRunner):
@@ -894,9 +972,11 @@ class KubernetesCommandRunner(CommandRunner):
894
972
  else:
895
973
  return f'pod/{self.pod_name}'
896
974
 
897
- def port_forward_command(self,
898
- port_forward: List[Tuple[int, int]],
899
- connect_timeout: int = 1) -> List[str]:
975
+ def port_forward_command(
976
+ self,
977
+ port_forward: List[Tuple[int, int]],
978
+ connect_timeout: int = 1,
979
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
900
980
  """Command for forwarding ports from localhost to the remote machine.
901
981
 
902
982
  Args:
@@ -904,14 +984,25 @@ class KubernetesCommandRunner(CommandRunner):
904
984
  remote port. Currently, only one port is supported, i.e. the
905
985
  list should have only one element.
906
986
  connect_timeout: The timeout for the ssh connection.
987
+ ssh_mode: The mode to use for ssh.
988
+ See SshMode for more details.
907
989
  """
990
+ del ssh_mode # unused
908
991
  assert port_forward and len(port_forward) == 1, (
909
992
  'Only one port is supported for Kubernetes port-forward.')
910
993
  kubectl_args = [
911
994
  '--pod-running-timeout', f'{connect_timeout}s', '-n', self.namespace
912
995
  ]
996
+ # The same logic to either set `--context` to the k8s context where
997
+ # the sky cluster is hosted, or `--kubeconfig` to /dev/null for
998
+ # in-cluster k8s is used below in the `run()` method.
913
999
  if self.context:
914
1000
  kubectl_args += ['--context', self.context]
1001
+ # If context is None, it means the cluster is hosted on in-cluster k8s.
1002
+ # In this case, we need to set KUBECONFIG to /dev/null to avoid looking
1003
+ # for the cluster in whatever active context is set in the kubeconfig.
1004
+ else:
1005
+ kubectl_args += ['--kubeconfig', '/dev/null']
915
1006
  local_port, remote_port = port_forward[0]
916
1007
  local_port_str = f'{local_port}' if local_port is not None else ''
917
1008
 
@@ -967,7 +1058,6 @@ class KubernetesCommandRunner(CommandRunner):
967
1058
  SkyPilot but we still want to get rid of some warning messages,
968
1059
  such as SSH warnings.
969
1060
 
970
-
971
1061
  Returns:
972
1062
  returncode
973
1063
  or
@@ -1186,3 +1276,166 @@ class LocalProcessCommandRunner(CommandRunner):
1186
1276
  log_path=log_path,
1187
1277
  stream_logs=stream_logs,
1188
1278
  max_retry=max_retry)
1279
+
1280
+
1281
+ class SlurmCommandRunner(SSHCommandRunner):
1282
+ """Runner for Slurm commands.
1283
+
1284
+ SlurmCommandRunner sends commands over an SSH connection through the Slurm
1285
+ controller, to the virtual instances.
1286
+ """
1287
+
1288
+ def __init__(
1289
+ self,
1290
+ node: Tuple[str, int],
1291
+ ssh_user: str,
1292
+ ssh_private_key: Optional[str],
1293
+ *,
1294
+ sky_dir: str,
1295
+ skypilot_runtime_dir: str,
1296
+ job_id: str,
1297
+ slurm_node: str,
1298
+ **kwargs,
1299
+ ):
1300
+ """Initialize SlurmCommandRunner.
1301
+
1302
+ Example Usage:
1303
+ runner = SlurmCommandRunner(
1304
+ (ip, port),
1305
+ ssh_user,
1306
+ ssh_private_key,
1307
+ sky_dir=sky_dir,
1308
+ skypilot_runtime_dir=skypilot_runtime_dir,
1309
+ job_id=job_id,
1310
+ slurm_node=slurm_node)
1311
+ runner.run('ls -l', mode=SshMode.NON_INTERACTIVE)
1312
+ runner.rsync(source, target, up=True)
1313
+
1314
+ Args:
1315
+ node: (ip, port) The IP address and port of the remote machine
1316
+ (login node).
1317
+ ssh_user: SSH username.
1318
+ ssh_private_key: Path to SSH private key.
1319
+ sky_dir: The private directory for the SkyPilot cluster on the
1320
+ Slurm cluster.
1321
+ skypilot_runtime_dir: The directory for the SkyPilot runtime
1322
+ on the Slurm cluster.
1323
+ job_id: The Slurm job ID for this instance.
1324
+ slurm_node: The Slurm node hostname for this instance
1325
+ (compute node).
1326
+ **kwargs: Additional arguments forwarded to SSHCommandRunner
1327
+ (e.g., ssh_proxy_command).
1328
+ """
1329
+ super().__init__(node, ssh_user, ssh_private_key, **kwargs)
1330
+ self.sky_dir = sky_dir
1331
+ self.skypilot_runtime_dir = skypilot_runtime_dir
1332
+ self.job_id = job_id
1333
+ self.slurm_node = slurm_node
1334
+
1335
+ # Build a chained ProxyCommand that goes through the login node to reach
1336
+ # the compute node where the job is running.
1337
+
1338
+ # First, build SSH options to reach the login node, using the user's
1339
+ # existing proxy command if provided.
1340
+ proxy_ssh_options = ' '.join(
1341
+ ssh_options_list(self.ssh_private_key,
1342
+ None,
1343
+ ssh_proxy_command=self._ssh_proxy_command,
1344
+ port=self.port,
1345
+ disable_control_master=True))
1346
+ login_node_proxy_command = (f'ssh {proxy_ssh_options} '
1347
+ f'-W %h:%p {self.ssh_user}@{self.ip}')
1348
+
1349
+ # Update the proxy command to be the login node proxy, which will
1350
+ # be used by super().run() to reach the compute node.
1351
+ self._ssh_proxy_command = login_node_proxy_command
1352
+ # Update self.ip to target the compute node.
1353
+ self.ip = slurm_node
1354
+ # Assume the compute node's SSH port is 22.
1355
+ # TODO(kevin): Make this configurable if needed.
1356
+ self.port = 22
1357
+
1358
+ def rsync(
1359
+ self,
1360
+ source: str,
1361
+ target: str,
1362
+ *,
1363
+ up: bool,
1364
+ log_path: str = os.devnull,
1365
+ stream_logs: bool = True,
1366
+ max_retry: int = 1,
1367
+ ) -> None:
1368
+ """Rsyncs files directly to the Slurm compute node,
1369
+ by proxying through the Slurm login node.
1370
+
1371
+ For Slurm, files need to be accessible by compute nodes where jobs
1372
+ execute via srun. This means either it has to be on the compute node's
1373
+ local filesystem, or on a shared filesystem.
1374
+ """
1375
+ # TODO(kevin): We can probably optimize this to skip the proxying
1376
+ # if the target dir is in a shared filesystem, since it will
1377
+ # be accessible by the compute node.
1378
+
1379
+ # Build SSH options for rsync using the ProxyCommand set up in __init__
1380
+ # to reach the compute node through the login node.
1381
+ ssh_options = ' '.join(
1382
+ ssh_options_list(
1383
+ # Assume nothing and rely on default SSH behavior when -i is
1384
+ # not specified.
1385
+ None,
1386
+ None,
1387
+ ssh_proxy_command=self._ssh_proxy_command,
1388
+ disable_control_master=True))
1389
+ rsh_option = f'ssh {ssh_options}'
1390
+
1391
+ self._rsync(
1392
+ source,
1393
+ target,
1394
+ # Compute node
1395
+ node_destination=f'{self.ssh_user}@{self.slurm_node}',
1396
+ up=up,
1397
+ rsh_option=rsh_option,
1398
+ log_path=log_path,
1399
+ stream_logs=stream_logs,
1400
+ max_retry=max_retry,
1401
+ get_remote_home_dir=lambda: self.sky_dir)
1402
+
1403
+ @timeline.event
1404
+ @context_utils.cancellation_guard
1405
+ def run(self, cmd: Union[str, List[str]],
1406
+ **kwargs) -> Union[int, Tuple[int, str, str]]:
1407
+ """Run Slurm-supported user commands over an SSH connection.
1408
+
1409
+ Args:
1410
+ cmd: The Slurm-supported user command to run.
1411
+
1412
+ Returns:
1413
+ returncode
1414
+ or
1415
+ A tuple of (returncode, stdout, stderr).
1416
+ """
1417
+ # Override $HOME so that each SkyPilot cluster's state is isolated
1418
+ # from one another. We rely on the assumption that ~ is exclusively
1419
+ # used by a cluster, and in Slurm that is not the case, as $HOME
1420
+ # could be part of a shared filesystem.
1421
+ # And similarly for SKY_RUNTIME_DIR. See
1422
+ # constants.SKY_RUNTIME_DIR_ENV_VAR_KEY for more details.
1423
+ #
1424
+ # SSH directly to the compute node instead of using srun.
1425
+ # This avoids Slurm's proctrack/cgroup which kills all processes
1426
+ # when the job step ends (including child processes launched as
1427
+ # a separate process group), breaking background process spawning
1428
+ # (e.g., JobScheduler._run_job which uses launch_new_process_tree).
1429
+ # Note: proctrack/cgroup is enabled by default on Nebius'
1430
+ # Managed Soperator.
1431
+ cmd = (
1432
+ f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
1433
+ f'"{self.skypilot_runtime_dir}" && '
1434
+ # Set the uv cache directory to /tmp/uv_cache_$(id -u) to speed up
1435
+ # package installation while avoiding permission conflicts when
1436
+ # multiple users share the same host. Otherwise it defaults to
1437
+ # ~/.cache/uv.
1438
+ f'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u) && '
1439
+ f'cd {self.sky_dir} && export HOME=$(pwd) && {cmd}')
1440
+
1441
+ return super().run(cmd, **kwargs)
@@ -6,7 +6,7 @@ determine the return type based on the value of require_outputs.
6
6
  """
7
7
  import enum
8
8
  import typing
9
- from typing import Any, Iterable, List, Optional, Tuple, Union
9
+ from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
10
10
 
11
11
  from typing_extensions import Literal
12
12
 
@@ -36,9 +36,9 @@ def ssh_options_list(
36
36
 
37
37
 
38
38
  class SshMode(enum.Enum):
39
- NON_INTERACTIVE: int
40
- INTERACTIVE: int
41
- LOGIN: int
39
+ NON_INTERACTIVE = ...
40
+ INTERACTIVE = ...
41
+ LOGIN = ...
42
42
 
43
43
 
44
44
  class CommandRunner:
@@ -106,6 +106,13 @@ class CommandRunner:
106
106
  max_retry: int = ...) -> None:
107
107
  ...
108
108
 
109
+ def port_forward_command(
110
+ self,
111
+ port_forward: List[Tuple[int, int]],
112
+ connect_timeout: int = 1,
113
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
114
+ ...
115
+
109
116
  @classmethod
110
117
  def make_runner_list(cls: typing.Type[CommandRunner],
111
118
  node_list: Iterable[Tuple[Any, ...]],
@@ -123,19 +130,22 @@ class SSHCommandRunner(CommandRunner):
123
130
  ip: str
124
131
  port: int
125
132
  ssh_user: str
126
- ssh_private_key: str
133
+ ssh_private_key: Optional[str]
127
134
  ssh_control_name: Optional[str]
128
135
  docker_user: str
129
136
  disable_control_master: Optional[bool]
137
+ port_forward_execute_remote_command: Optional[bool]
130
138
 
131
139
  def __init__(
132
140
  self,
133
141
  node: Tuple[str, int],
134
142
  ssh_user: str,
135
- ssh_private_key: str,
143
+ ssh_private_key: Optional[str],
136
144
  ssh_control_name: Optional[str] = ...,
145
+ ssh_proxy_command: Optional[str] = ...,
137
146
  docker_user: Optional[str] = ...,
138
147
  disable_control_master: Optional[bool] = ...,
148
+ port_forward_execute_remote_command: Optional[bool] = ...,
139
149
  ) -> None:
140
150
  ...
141
151
 
@@ -190,6 +200,15 @@ class SSHCommandRunner(CommandRunner):
190
200
  **kwargs) -> Union[Tuple[int, str, str], int]:
191
201
  ...
192
202
 
203
+ def ssh_base_command(
204
+ self,
205
+ *,
206
+ ssh_mode: SshMode,
207
+ port_forward: Optional[List[Tuple[int, int]]],
208
+ connect_timeout: Optional[int],
209
+ ) -> List[str]:
210
+ ...
211
+
193
212
  def rsync(self,
194
213
  source: str,
195
214
  target: str,
@@ -197,7 +216,15 @@ class SSHCommandRunner(CommandRunner):
197
216
  up: bool,
198
217
  log_path: str = ...,
199
218
  stream_logs: bool = ...,
200
- max_retry: int = ...) -> None:
219
+ max_retry: int = ...,
220
+ get_remote_home_dir: Callable[[], str] = ...) -> None:
221
+ ...
222
+
223
+ def port_forward_command(
224
+ self,
225
+ port_forward: List[Tuple[int, int]],
226
+ connect_timeout: int = 1,
227
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
201
228
  ...
202
229
 
203
230
 
@@ -272,6 +299,35 @@ class KubernetesCommandRunner(CommandRunner):
272
299
  max_retry: int = ...) -> None:
273
300
  ...
274
301
 
302
+ def port_forward_command(
303
+ self,
304
+ port_forward: List[Tuple[int, int]],
305
+ connect_timeout: int = 1,
306
+ ssh_mode: SshMode = SshMode.INTERACTIVE) -> List[str]:
307
+ ...
308
+
309
+
310
+ class SlurmCommandRunner(SSHCommandRunner):
311
+ """Runner for Slurm commands."""
312
+ sky_dir: str
313
+ skypilot_runtime_dir: str
314
+ job_id: str
315
+ slurm_node: str
316
+
317
+ def __init__(
318
+ self,
319
+ node: Tuple[str, int],
320
+ ssh_user: str,
321
+ ssh_private_key: Optional[str],
322
+ *,
323
+ sky_dir: str,
324
+ skypilot_runtime_dir: str,
325
+ job_id: str,
326
+ slurm_node: str,
327
+ **kwargs,
328
+ ) -> None:
329
+ ...
330
+
275
331
 
276
332
  class LocalProcessCommandRunner(CommandRunner):
277
333
 
sky/utils/common.py CHANGED
@@ -31,7 +31,7 @@ JOB_CONTROLLER_NAME: str
31
31
  def refresh_server_id() -> None:
32
32
  """Refresh the server id.
33
33
 
34
- This function is used to ensure the server id is read from the authorative
34
+ This function is used to ensure the server id is read from the authoritative
35
35
  source.
36
36
  """
37
37
  global SERVER_ID
@@ -42,6 +42,8 @@ def refresh_server_id() -> None:
42
42
  JOB_CONTROLLER_NAME = f'{JOB_CONTROLLER_PREFIX}{SERVER_ID}'
43
43
 
44
44
 
45
+ # TODO(kevin): Remove this side effect and have callers call
46
+ # refresh_server_id() explicitly as needed.
45
47
  refresh_server_id()
46
48
 
47
49