skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/clouds/slurm.py ADDED
@@ -0,0 +1,578 @@
1
+ """Slurm."""
2
+
3
+ import typing
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from sky import catalog
7
+ from sky import clouds
8
+ from sky import sky_logging
9
+ from sky import skypilot_config
10
+ from sky.adaptors import slurm
11
+ from sky.provision.slurm import utils as slurm_utils
12
+ from sky.utils import annotations
13
+ from sky.utils import common_utils
14
+ from sky.utils import registry
15
+ from sky.utils import resources_utils
16
+
17
+ if typing.TYPE_CHECKING:
18
+ from sky import resources as resources_lib
19
+ from sky.utils import volume as volume_lib
20
+
21
+ logger = sky_logging.init_logger(__name__)
22
+
23
+ CREDENTIAL_PATH = slurm_utils.DEFAULT_SLURM_PATH
24
+
25
+
26
+ @registry.CLOUD_REGISTRY.register
27
+ class Slurm(clouds.Cloud):
28
+ """Slurm."""
29
+
30
+ _REPR = 'Slurm'
31
+ _CLOUD_UNSUPPORTED_FEATURES = {
32
+ clouds.CloudImplementationFeatures.AUTOSTOP: 'Slurm does not '
33
+ 'support autostop.',
34
+ clouds.CloudImplementationFeatures.STOP: 'Slurm does not support '
35
+ 'stopping instances.',
36
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are '
37
+ 'not supported in '
38
+ 'Slurm.',
39
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
40
+ 'Customized multiple network interfaces are not supported in '
41
+ 'Slurm.',
42
+ clouds.CloudImplementationFeatures.OPEN_PORTS: 'Opening ports is not '
43
+ 'supported in Slurm.',
44
+ clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
45
+ 'Running '
46
+ 'controllers is not '
47
+ 'well tested with '
48
+ 'Slurm.',
49
+ clouds.CloudImplementationFeatures.IMAGE_ID: 'Specifying image ID is '
50
+ 'not supported in Slurm.',
51
+ clouds.CloudImplementationFeatures.DOCKER_IMAGE: 'Docker image is not '
52
+ 'supported in Slurm.',
53
+ }
54
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 120
55
+ _regions: List[clouds.Region] = []
56
+ _INDENT_PREFIX = ' '
57
+
58
+ # Using the latest SkyPilot provisioner API to provision and check status.
59
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
60
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
61
+
62
@classmethod
def _unsupported_features_for_resources(
    cls,
    resources: 'resources_lib.Resources',
    region: Optional[str] = None,
) -> Dict[clouds.CloudImplementationFeatures, str]:
    """Return the features that Slurm does not support.

    The result does not depend on the arguments: the class-level
    ``_CLOUD_UNSUPPORTED_FEATURES`` mapping is returned as-is (the same
    dict object, not a copy).

    Args:
        resources: Requested resources; not consulted.
        region: Optional region name; not consulted.

    Returns:
        Mapping from each unsupported feature to a human-readable reason.
    """
    del resources, region  # Unused: the unsupported set is static.
    unsupported = cls._CLOUD_UNSUPPORTED_FEATURES
    return unsupported
71
+
72
@classmethod
def _max_cluster_name_length(cls) -> Optional[int]:
    """Return the maximum cluster name length for Slurm.

    Returns:
        The class constant ``_MAX_CLUSTER_NAME_LEN_LIMIT``.
    """
    name_limit = cls._MAX_CLUSTER_NAME_LEN_LIMIT
    return name_limit
75
+
76
@classmethod
def uses_ray(cls) -> bool:
    """Report whether this cloud uses Ray; always ``False`` for Slurm."""
    return False
79
+
80
@classmethod
def get_vcpus_mem_from_instance_type(
    cls,
    instance_type: str,
) -> Tuple[Optional[float], Optional[float]]:
    """Return the ``(cpus, memory)`` pair of a Slurm instance type.

    Args:
        instance_type: Instance type string accepted by
            ``slurm_utils.SlurmInstanceType.from_instance_type``.

    Returns:
        The ``cpus`` and ``memory`` attributes of the parsed instance type.
    """
    parsed = slurm_utils.SlurmInstanceType.from_instance_type(instance_type)
    vcpus, mem = parsed.cpus, parsed.memory
    return vcpus, mem
87
+
88
@classmethod
def zones_provision_loop(
    cls,
    *,
    region: str,
    num_nodes: int,
    instance_type: str,
    accelerators: Optional[Dict[str, int]] = None,
    use_spot: bool = False,
) -> Iterator[Optional[List[clouds.Zone]]]:
    """Yield partitions (zones) one at a time so callers can fail over.

    The candidate regions come from ``regions_with_offering`` restricted to
    ``region`` (with no zone filter) and are visited in the order returned.

    Args:
        region: Name of the region to provision in.
        num_nodes: Unused.
        instance_type: Instance type forwarded to ``regions_with_offering``.
        accelerators: Accelerators forwarded to ``regions_with_offering``.
        use_spot: Spot flag forwarded to ``regions_with_offering``.

    Yields:
        A single-element list ``[zone]`` for every zone of a candidate
        region, or ``None`` once for a candidate region without zones.
    """
    del num_nodes  # unused

    candidates = cls.regions_with_offering(instance_type,
                                           accelerators,
                                           use_spot,
                                           region=region,
                                           zone=None)

    for candidate in candidates:
        if not candidate.zones:
            # No partitions discovered, use default
            yield None
            continue
        # Yield one partition at a time for failover
        for partition_zone in candidate.zones:
            yield [partition_zone]
118
+
119
@classmethod
@annotations.lru_cache(scope='global', maxsize=1)
def _log_skipped_clusters_once(cls, skipped_clusters: Tuple[str,
                                                            ...]) -> None:
    """Warn about allowed clusters that were not found.

    The call is wrapped in an ``lru_cache`` with ``maxsize=1``; presumably
    this is what keeps the same warning from being repeated on every call
    (NOTE(review): confirm against ``annotations.lru_cache``).

    The result of ``existing_allowed_clusters`` is deliberately not cached
    itself, as the config may update the allowed clusters.

    Args:
        skipped_clusters: Names of the clusters being ignored. Nothing is
            logged when the tuple is empty.
    """
    if not skipped_clusters:
        return
    missing = set(skipped_clusters)
    logger.warning(f'Slurm clusters {missing!r} specified in '
                   '"allowed_clusters" not found in ~/.slurm/config. '
                   'Ignoring these clusters.')
133
+
134
@classmethod
def existing_allowed_clusters(cls, silent: bool = False) -> List[str]:
    """Get the allowed Slurm clusters that actually exist.

    Returns clusters based on the following logic:
    1. If 'allowed_clusters' is set to 'all', return all clusters reported
       by ``slurm_utils.get_all_slurm_cluster_names()``.
    2. If specific clusters are listed in 'allowed_clusters', return only
       those that were also discovered, in the configured order.
    3. If no configuration is specified, return all discovered clusters
       (default behavior).

    When all discovered clusters are returned (cases 1 and 3) they keep
    their discovery order, so the result is deterministic across processes
    instead of depending on set iteration order.

    Args:
        silent: If True, do not log a warning about configured clusters
            that were not discovered.

    Returns:
        The names of the allowed clusters that exist; an empty list when
        no cluster is discovered at all.
    """
    discovered = slurm_utils.get_all_slurm_cluster_names()
    if not discovered:
        return []

    # The set is only for O(1) membership tests. The de-duplicated list
    # keeps discovery order so the default/'all' result is reproducible.
    known = set(discovered)
    ordered_known = list(dict.fromkeys(discovered))

    # Workspace-level allowed_clusters should take precedence over
    # the global allowed_clusters.
    allowed = skypilot_config.get_workspace_cloud('slurm').get(
        'allowed_clusters', None)
    if allowed is None:
        allowed = skypilot_config.get_effective_region_config(
            cloud='slurm',
            region=None,
            keys=('allowed_clusters',),
            default_value=None)

    # Both "no configuration" and the explicit 'all' keyword mean every
    # discovered cluster is allowed.
    if allowed is None or allowed == 'all':
        allowed = ordered_known

    present = [name for name in allowed if name in known]
    absent = [name for name in allowed if name not in known]

    if not silent:
        cls._log_skipped_clusters_once(tuple(sorted(absent)))

    return present
183
+
184
@classmethod
def regions_with_offering(
    cls,
    instance_type: Optional[str],
    accelerators: Optional[Dict[str, int]],
    use_spot: bool,
    region: Optional[str],
    zone: Optional[str],
    resources: Optional['resources_lib.Resources'] = None
) -> List[clouds.Region]:
    """List clusters (regions) and partitions (zones) that can be used.

    Every allowed, existing cluster becomes a region, and its partitions
    are attached as zones. When ``instance_type`` is given, only the
    partitions for which ``slurm_utils.check_instance_fits`` succeeds are
    kept, and clusters left without any fitting partition are dropped.

    Args:
        instance_type: Instance type to fit, or None to skip the fit check.
        accelerators: Unused.
        use_spot: Unused.
        region: If set, only the cluster with this name is considered.
        zone: If set, only the partition with this name is considered.
        resources: Unused.

    Returns:
        The matching regions. Without ``instance_type`` this can include
        regions that have no zones (e.g. when fetching partitions failed).
    """
    del accelerators, use_spot, resources  # unused

    candidates: List[clouds.Region] = []
    for cluster_name in cls.existing_allowed_clusters():
        # Filter by region if specified
        if region is not None and cluster_name != region:
            continue

        # Fetch partitions for this cluster and attach as zones
        try:
            partition_names = slurm_utils.get_partitions(cluster_name)
            if zone is not None:
                # Filter by zone (partition) if specified
                partition_names = [
                    name for name in partition_names if name == zone
                ]
            cluster_zones = [clouds.Zone(name) for name in partition_names]
        except Exception as e:  # pylint: disable=broad-except
            logger.debug(f'Failed to get partitions for {cluster_name}: {e}')
            cluster_zones = []

        candidate = clouds.Region(cluster_name)
        if cluster_zones:
            candidate.set_zones(cluster_zones)
        candidates.append(candidate)

    # Check if requested instance type will fit in the cluster.
    if instance_type is None:
        return candidates

    fitting_regions = []
    for candidate in candidates:
        cluster_name = candidate.name

        # Check each partition (zone) in the cluster
        partition_names = [z.name for z in (candidate.zones or [])]
        fitting_zones = []

        # TODO(kevin): Batch this check to reduce number of roundtrips.
        for partition in partition_names:
            fits, reason = slurm_utils.check_instance_fits(
                cluster_name, instance_type, partition)
            if not fits:
                logger.debug(
                    f'Instance type {instance_type} does not fit in '
                    f'{cluster_name}/{partition}: {reason}')
                continue
            if partition:
                fitting_zones.append(clouds.Zone(partition))

        if fitting_zones:
            candidate.set_zones(fitting_zones)
            fitting_regions.append(candidate)

    return fitting_regions
248
+
249
def instance_type_to_hourly_cost(self,
                                 instance_type: str,
                                 use_spot: bool,
                                 region: Optional[str] = None,
                                 zone: Optional[str] = None) -> float:
    """Return the hourly cost of an instance type.

    For now, we assume zero cost for Slurm clusters, so every argument is
    ignored.

    Returns:
        Always ``0.0``.
    """
    del instance_type, use_spot, region, zone  # unused
    return 0.0
256
+
257
def accelerators_to_hourly_cost(self,
                                accelerators: Dict[str, int],
                                use_spot: bool,
                                region: Optional[str] = None,
                                zone: Optional[str] = None) -> float:
    """Return the hourly cost of the accelerators, in dollars/hour.

    All arguments are ignored for Slurm.

    Returns:
        Always ``0.0``.
    """
    del accelerators, use_spot, region, zone  # unused
    hourly_cost = 0.0
    return hourly_cost
265
+
266
def get_egress_cost(self, num_gigabytes: float) -> float:
    """Return the egress cost; always ``0.0`` regardless of volume."""
    del num_gigabytes  # unused
    return 0.0
268
+
269
def __repr__(self) -> str:
    """Return the display name stored in the ``_REPR`` class attribute."""
    display_name = self._REPR
    return display_name
271
+
272
def is_same_cloud(self, other: clouds.Cloud) -> bool:
    """Return True if ``other`` is also a ``Slurm`` cloud instance."""
    same_cloud_type = isinstance(other, Slurm)
    return same_cloud_type
275
+
276
@classmethod
def get_default_instance_type(cls,
                              cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[
                                  resources_utils.DiskTier] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Return the default instance type for Slurm.

    All arguments are forwarded unchanged to
    ``catalog.get_default_instance_type`` with ``clouds='slurm'``.

    Returns:
        Whatever the catalog lookup returns.
    """
    lookup_kwargs = dict(
        cpus=cpus,
        memory=memory,
        disk_tier=disk_tier,
        region=region,
        zone=zone,
        clouds='slurm',
    )
    return catalog.get_default_instance_type(**lookup_kwargs)
291
+
292
@classmethod
def get_accelerators_from_instance_type(
        cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Extract the accelerator requirement encoded in ``instance_type``.

    Returns:
        A single-entry ``{accelerator_type: accelerator_count}`` dict, or
        None when the parsed instance type lacks either the accelerator
        count or the accelerator type.
    """
    parsed = slurm_utils.SlurmInstanceType.from_instance_type(instance_type)
    if parsed.accelerator_count is None or parsed.accelerator_type is None:
        return None
    return {parsed.accelerator_type: parsed.accelerator_count}
300
+
301
@classmethod
def get_zone_shell_cmd(cls) -> Optional[str]:
    """Return the shell command for discovering the zone; none for Slurm."""
    zone_shell_cmd = None
    return zone_shell_cmd
304
+
305
def make_deploy_resources_variables(
    self,
    resources: 'resources_lib.Resources',
    cluster_name: 'resources_utils.ClusterName',
    region: Optional['clouds.Region'],
    zones: Optional[List['clouds.Zone']],
    num_nodes: int,
    dryrun: bool = False,
    volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
) -> Dict[str, Optional[str]]:
    """Assemble the deployment variables for a Slurm launch.

    The region is used as the Slurm cluster name and the first zone, when
    one is given, as the partition.

    Args:
        resources: Resources to deploy; must be launchable.
        cluster_name: Unused.
        region: Target cluster. Falls back to 'localcluster' when None.
        zones: Optional partitions; only the first entry is used. When
            empty or None, the cluster's default partition is looked up.
        num_nodes: Not read by this method.
        dryrun: Unused.
        volume_mounts: Unused.

    Returns:
        A dict of instance sizing, Slurm cluster/partition and SSH
        connection variables.
    """
    del cluster_name, dryrun, volume_mounts  # Unused.
    cluster = 'localcluster' if region is None else region.name
    assert cluster is not None, 'No available Slurm cluster found.'

    # An explicitly requested zone names the partition; otherwise ask the
    # cluster for its default one.
    if zones:
        partition = zones[0].name
    else:
        partition = slurm_utils.get_cluster_default_partition(cluster)

    # The cluster name doubles as the host alias in the Slurm SSH config.
    host_options = slurm_utils.get_slurm_ssh_config().lookup(cluster)

    resources = resources.assert_launchable()
    custom_resources = resources_utils.make_ray_custom_resources_str(
        self.get_accelerators_from_instance_type(resources.instance_type))

    # CPU, memory and accelerator sizing all come from the instance type
    # itself, since resources.cpus/memory are None unless set explicitly.
    parsed = slurm_utils.SlurmInstanceType.from_instance_type(
        resources.instance_type)
    num_cpus = parsed.cpus
    memory = parsed.memory
    # Accelerators are optional: default to a zero count and no type.
    num_accelerators = parsed.accelerator_count or 0
    accelerator_name = parsed.accelerator_type or None

    return {
        'instance_type': resources.instance_type,
        'custom_resources': custom_resources,
        'cpus': str(num_cpus),
        'memory': str(memory),
        'accelerator_count': str(num_accelerators),
        'accelerator_type': accelerator_name,
        'slurm_cluster': cluster,
        'slurm_partition': partition,
        # TODO(jwj): Pass SSH config in a smarter way
        'ssh_hostname': host_options['hostname'],
        'ssh_port': str(host_options.get('port', 22)),
        'ssh_user': host_options['user'],
        'slurm_proxy_command': host_options.get('proxycommand', None),
        # TODO(jwj): Solve naming collision with 'ssh_private_key'.
        # Please refer to slurm-ray.yml.j2 'ssh' and 'auth' sections.
        'slurm_private_key': host_options['identityfile'][0],
    }
368
+
369
def _get_feasible_launchable_resources(
    self, resources: 'resources_lib.Resources'
) -> 'resources_utils.FeasibleResources':
    """Compute the feasible launchable resources for ``resources``.

    When an instance type is already set, it is returned (with accelerators
    cleared) if at least one cluster offers it. Otherwise an instance type
    is derived from the catalog default, extended with the single requested
    accelerator if any, and returned if at least one cluster offers it.

    Returns:
        A ``FeasibleResources`` holding zero or one candidate.
    """

    def _feasible(candidates):
        # Only the candidate list varies between the return sites.
        return resources_utils.FeasibleResources(candidates, [], None)

    def _is_offered(instance_type):
        # True when at least one cluster/partition offers the type.
        offering_regions = self.regions_with_offering(
            instance_type,
            accelerators=None,
            use_spot=resources.use_spot,
            region=resources.region,
            zone=resources.zone)
        return bool(offering_regions)

    if resources.instance_type is not None:
        assert resources.is_launchable(), resources
        if not _is_offered(resources.instance_type):
            return _feasible([])
        # Return a single resource without region set. The optimizer will
        # call make_launchables_for_valid_region_zones(), which creates one
        # resource per region/cluster.
        return _feasible([resources.copy(accelerators=None)])

    # Currently, handle a filter on accelerators only.
    requested_accelerators = resources.accelerators

    default_type = Slurm.get_default_instance_type(
        cpus=resources.cpus,
        memory=resources.memory,
        disk_tier=resources.disk_tier,
        region=resources.region,
        zone=resources.zone)
    if default_type is None:
        return _feasible([])

    chosen_type = default_type
    if requested_accelerators is not None:
        assert len(requested_accelerators) == 1, resources
        acc_type, acc_count = list(requested_accelerators.items())[0]

        # No GPU-specific CPU/memory defaults are applied: the GPU instance
        # type reuses the sizing of the default instance type.
        base = slurm_utils.SlurmInstanceType.from_instance_type(default_type)
        chosen_type = slurm_utils.SlurmInstanceType.from_resources(
            base.cpus, base.memory, acc_count, acc_type).name

    # The chosen type must be available in at least one Slurm cluster.
    if not _is_offered(chosen_type):
        return _feasible([])

    candidate = resources.copy(
        cloud=Slurm(),
        instance_type=chosen_type,
        accelerators=None,
    )
    return _feasible([candidate])
451
+
452
@classmethod
def _check_compute_credentials(
        cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
    """Check whether the user can reach at least one Slurm cluster.

    The Slurm SSH config is loaded first, then every allowed cluster is
    probed by opening a ``SlurmClient`` and calling ``info()``.

    Returns:
        A ``(success, detail)`` tuple. ``detail`` is an error string when
        the config cannot be loaded or lists no usable cluster. Otherwise
        it maps each cluster name to 'enabled' or 'disabled. <reason>',
        and ``success`` is True if at least one cluster is enabled.
    """
    try:
        ssh_config = slurm_utils.get_slurm_ssh_config()
    except FileNotFoundError:
        return (
            False,
            f'Slurm configuration file {slurm_utils.DEFAULT_SLURM_PATH} '
            'does not exist.\n'
            f'{cls._INDENT_PREFIX}For more info: '
            'https://docs.skypilot.co/en/latest/getting-started/'
            'installation.html#slurm-installation')
    except Exception as e:  # pylint: disable=broad-except
        return (False, 'Failed to load SSH configuration from '
                f'{slurm_utils.DEFAULT_SLURM_PATH}: '
                f'{common_utils.format_exception(e)}.')

    existing_allowed_clusters = cls.existing_allowed_clusters()
    if not existing_allowed_clusters:
        # Report the same config path as the messages above rather than a
        # hard-coded location.
        return (False, 'No SLURM clusters found in '
                f'{slurm_utils.DEFAULT_SLURM_PATH}. '
                'Please configure at least one SLURM cluster.')

    # Check credentials for each cluster and return a cluster -> status map.
    ctx2text = {}
    success = False
    for cluster in existing_allowed_clusters:
        try:
            # Retrieve the config options for this SlurmctldHost alias.
            # Done inside the try so that a broken entry only disables
            # this cluster instead of aborting the whole check.
            ssh_config_dict = ssh_config.lookup(cluster)
            client = slurm.SlurmClient(
                ssh_config_dict['hostname'],
                int(ssh_config_dict.get('port', 22)),
                ssh_config_dict['user'],
                ssh_config_dict['identityfile'][0],
                ssh_proxy_command=ssh_config_dict.get('proxycommand', None))
            info = client.info()
            logger.debug(f'Slurm cluster {cluster} sinfo: {info}')
            ctx2text[cluster] = 'enabled'
            success = True
        except Exception as e:  # pylint: disable=broad-except
            error_msg = (f'Credential check failed: '
                         f'{common_utils.format_exception(e)}')
            ctx2text[cluster] = f'disabled. {error_msg}'

    return success, ctx2text
500
+
501
def get_credential_file_mounts(self) -> Dict[str, str]:
    """Return the credential files to mount; currently none.

    TODO: return a dictionary of credential file paths once this is
    implemented.
    """
    credential_mounts: Dict[str, str] = {}
    return credential_mounts
508
+
509
@classmethod
def get_current_user_identity(cls) -> Optional[List[str]]:
    """Return the current user identity; not implemented, so None.

    NOTE: this is used for very advanced SkyPilot functionality and can be
    implemented later if desired.
    """
    user_identity = None
    return user_identity
514
+
515
def instance_type_exists(self, instance_type: str) -> bool:
    """Return the catalog's answer on whether ``instance_type`` exists."""
    exists_in_catalog = catalog.instance_type_exists(instance_type, 'slurm')
    return exists_in_catalog
517
+
518
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
    """Check that the requested cluster and partition exist.

    For Slurm, the region is a cluster name and the zone is a partition
    name.

    Args:
        region: Slurm cluster name.
        zone: Slurm partition name (optional).

    Returns:
        The ``(region, zone)`` tuple, unchanged, when both are valid.

    Raises:
        ValueError: If the cluster is unknown, if a partition is given
            without a cluster, or if the partition is not found in the
            cluster.
    """
    known_clusters = slurm_utils.get_all_slurm_cluster_names()
    if region and region not in known_clusters:
        raise ValueError(
            f'Cluster {region} not found in Slurm config. Slurm only '
            'supports cluster names as regions. Available '
            f'clusters: {known_clusters}')

    # Without a partition there is nothing more to validate.
    if zone is None:
        return region, zone

    if region is None:
        raise ValueError(
            'Cannot specify partition (zone) without specifying '
            'cluster (region) for Slurm.')

    cluster_partitions = slurm_utils.get_partitions(region)
    if zone not in cluster_partitions:
        raise ValueError(
            f'Partition {zone!r} not found in cluster {region!r}. '
            f'Available partitions: {cluster_partitions}')

    return region, zone
552
+
553
def accelerator_in_region_or_zone(self,
                                  accelerator: str,
                                  acc_count: int,
                                  region: Optional[str] = None,
                                  zone: Optional[str] = None) -> bool:
    """Tell whether the catalog lists the accelerator for ``region``.

    Args:
        accelerator: Accelerator name.
        acc_count: Number of accelerators requested.
        region: Region to match by name; None accepts any region.
        zone: Unused for now.

    Returns:
        False when the catalog returns no regions for the accelerator;
        otherwise True if ``region`` is None or matches one of them.
    """
    del zone  # unused for now
    offering_regions = catalog.get_region_zones_for_accelerators(
        accelerator, acc_count, use_spot=False, clouds='slurm')
    if not offering_regions:
        return False
    return region is None or any(
        candidate.name == region for candidate in offering_regions)
568
+
569
@classmethod
def expand_infras(cls) -> List[str]:
    """List the allowed Slurm clusters as infra strings.

    Each entry has the form '<canonical cloud name>/<cluster-name>',
    built from ``cls.canonical_name()``.
    """
    return [
        f'{cls.canonical_name()}/{cluster}'
        for cluster in cls.existing_allowed_clusters(silent=True)
    ]
sky/clouds/ssh.py CHANGED
@@ -9,6 +9,7 @@ from sky import skypilot_config
9
9
  from sky.adaptors import kubernetes as kubernetes_adaptor
10
10
  from sky.clouds import kubernetes
11
11
  from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.ssh_node_pools import constants as ssh_constants
12
13
  from sky.utils import annotations
13
14
  from sky.utils import common_utils
14
15
  from sky.utils import registry
@@ -20,7 +21,7 @@ if typing.TYPE_CHECKING:
20
21
 
21
22
  logger = sky_logging.init_logger(__name__)
22
23
 
23
- SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
24
+ SSH_NODE_POOLS_PATH = ssh_constants.DEFAULT_SSH_NODE_POOLS_PATH
24
25
 
25
26
 
26
27
  @registry.CLOUD_REGISTRY.register()
@@ -44,10 +45,12 @@ class SSH(kubernetes.Kubernetes):
44
45
 
45
46
  @classmethod
46
47
  def _unsupported_features_for_resources(
47
- cls, resources: 'resources_lib.Resources'
48
+ cls,
49
+ resources: 'resources_lib.Resources',
50
+ region: Optional[str] = None,
48
51
  ) -> Dict[kubernetes.clouds.CloudImplementationFeatures, str]:
49
52
  # Inherit all Kubernetes unsupported features
50
- return super()._unsupported_features_for_resources(resources)
53
+ return super()._unsupported_features_for_resources(resources, region)
51
54
 
52
55
  @classmethod
53
56
  def get_ssh_node_pool_contexts(cls) -> List[str]: