skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/slurm/utils.py
@@ -0,0 +1,583 @@
+"""Slurm utilities for SkyPilot."""
+import math
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from paramiko.config import SSHConfig
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import slurm
+from sky.utils import annotations
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# TODO(jwj): Choose commonly used default values.
+DEFAULT_SLURM_PATH = '~/.slurm/config'
+DEFAULT_CLUSTER_NAME = 'localcluster'
+DEFAULT_PARTITION = 'dev'
+
+
+def get_slurm_ssh_config() -> SSHConfig:
+    """Get the Slurm SSH config."""
+    slurm_config_path = os.path.expanduser(DEFAULT_SLURM_PATH)
+    slurm_config = SSHConfig.from_path(slurm_config_path)
+    return slurm_config
+
+
+class SlurmInstanceType:
+    """Class to represent the "Instance Type" in a Slurm cluster.
+
+    Since Slurm does not have a notion of instances, we generate
+    virtual instance types that represent the resources requested by a
+    Slurm worker node.
+
+    This name captures the following resource requests:
+        - CPU
+        - Memory
+        - Accelerators
+
+    The name format is "{n}CPU--{k}GB" where n is the number of vCPUs and
+    k is the amount of memory in GB. Accelerators can be specified by
+    appending "--{type}:{a}" where type is the accelerator type and a
+    is the number of accelerators.
+    CPU and memory can be specified as floats. Accelerator count must be int.
+
+    Examples:
+        - 4CPU--16GB
+        - 0.5CPU--1.5GB
+        - 4CPU--16GB--V100:1
+    """
+
+    def __init__(self,
+                 cpus: float,
+                 memory: float,
+                 accelerator_count: Optional[int] = None,
+                 accelerator_type: Optional[str] = None):
+        self.cpus = cpus
+        self.memory = memory
+        self.accelerator_count = accelerator_count
+        self.accelerator_type = accelerator_type
+
+    @property
+    def name(self) -> str:
+        """Returns the name of the instance."""
+        assert self.cpus is not None
+        assert self.memory is not None
+        name = (f'{common_utils.format_float(self.cpus)}CPU--'
+                f'{common_utils.format_float(self.memory)}GB')
+        if self.accelerator_count is not None:
+            # Replace spaces with underscores in accelerator type to make it a
+            # valid logical instance type name.
+            assert self.accelerator_type is not None, self.accelerator_count
+            acc_name = self.accelerator_type.replace(' ', '_')
+            name += f'--{acc_name}:{self.accelerator_count}'
+        return name
+
+    @staticmethod
+    def is_valid_instance_type(name: str) -> bool:
+        """Returns whether the given name is a valid instance type."""
+        pattern = re.compile(
+            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
+        return bool(pattern.match(name))
+
+    @classmethod
+    def _parse_instance_type(
+            cls,
+            name: str) -> Tuple[float, float, Optional[int], Optional[str]]:
+        """Parses and returns resources from the given InstanceType name.
+
+        Returns:
+            cpus | float: Number of CPUs
+            memory | float: Amount of memory in GB
+            accelerator_count | float: Number of accelerators
+            accelerator_type | str: Type of accelerator
+        """
+        pattern = re.compile(
+            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
+        )
+        match = pattern.match(name)
+        if match is not None:
+            cpus = float(match.group('cpus'))
+            memory = float(match.group('memory'))
+            accelerator_count = match.group('accelerator_count')
+            accelerator_type = match.group('accelerator_type')
+            if accelerator_count is not None:
+                accelerator_count = int(accelerator_count)
+                # This is to revert the accelerator types with spaces back to
+                # the original format.
+                accelerator_type = str(accelerator_type).replace(' ', '_')
+            else:
+                accelerator_count = None
+                accelerator_type = None
+            return cpus, memory, accelerator_count, accelerator_type
+        else:
+            raise ValueError(f'Invalid instance name: {name}')
+
+    @classmethod
+    def from_instance_type(cls, name: str) -> 'SlurmInstanceType':
+        """Returns an instance name object from the given name."""
+        if not cls.is_valid_instance_type(name):
+            raise ValueError(f'Invalid instance name: {name}')
+        cpus, memory, accelerator_count, accelerator_type = \
+            cls._parse_instance_type(name)
+        return cls(cpus=cpus,
+                   memory=memory,
+                   accelerator_count=accelerator_count,
+                   accelerator_type=accelerator_type)
+
+    @classmethod
+    def from_resources(cls,
+                       cpus: float,
+                       memory: float,
+                       accelerator_count: Union[float, int] = 0,
+                       accelerator_type: str = '') -> 'SlurmInstanceType':
+        """Returns an instance name object from the given resources.
+
+        If accelerator_count is not an int, it will be rounded up since GPU
+        requests in Slurm must be int.
+
+        NOTE: Should we take MIG management into account? See
+        https://slurm.schedmd.com/gres.html#MIG_Management.
+        """
+        name = f'{cpus}CPU--{memory}GB'
+        # Round up accelerator_count if it is not an int.
+        accelerator_count = math.ceil(accelerator_count)
+        if accelerator_count > 0:
+            name += f'--{accelerator_type}:{accelerator_count}'
+        return cls(cpus=cpus,
+                   memory=memory,
+                   accelerator_count=accelerator_count,
+                   accelerator_type=accelerator_type)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return (f'SlurmInstanceType(cpus={self.cpus!r}, '
+                f'memory={self.memory!r}, '
+                f'accelerator_count={self.accelerator_count!r}, '
+                f'accelerator_type={self.accelerator_type!r})')
+
+
+def instance_id(job_id: str, node: str) -> str:
+    """Generates the SkyPilot-defined instance ID for Slurm.
+
+    A (job id, node) pair is unique within a Slurm cluster.
+    """
+    return f'job{job_id}-{node}'
+
+
+def get_cluster_name_from_config(provider_config: Dict[str, Any]) -> str:
+    """Return the cluster name from the provider config.
+
+    The concept of cluster can be mapped to a cloud region.
+    """
+    return provider_config.get('cluster', DEFAULT_CLUSTER_NAME)
+
+
+def get_partition_from_config(provider_config: Dict[str, Any]) -> str:
+    """Return the partition from the provider config.
+
+    The concept of partition can be mapped to a cloud zone.
+    """
+    return provider_config.get('partition', DEFAULT_PARTITION)
+
+
+@annotations.lru_cache(scope='request')
+def get_cluster_default_partition(cluster_name: str) -> str:
+    """Get the default partition for a Slurm cluster.
+
+    Queries the Slurm cluster for the partition marked with an asterisk (*)
+    in sinfo output. Falls back to DEFAULT_PARTITION if the query fails or
+    no default partition is found.
+
+    Args:
+        cluster_name: Name of the Slurm cluster.
+
+    Returns:
+        The default partition name for the cluster.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+        ssh_config_dict = ssh_config.lookup(cluster_name)
+    except Exception as e:
+        raise ValueError(
+            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
+            f'{common_utils.format_exception(e)}') from e
+
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+    )
+
+    default_partition = client.get_default_partition()
+    if default_partition is None:
+        # TODO(kevin): Have a way to specify default partition in
+        # ~/.sky/config.yaml if needed, in case a Slurm cluster
+        # really does not have a default partition.
+        raise ValueError('No default partition found for cluster '
+                         f'{cluster_name}.')
+    return default_partition
+
+
+def get_all_slurm_cluster_names() -> List[str]:
+    """Get all Slurm cluster names available in the environment.
+
+    Returns:
+        List[str]: The list of Slurm cluster names if available,
+        an empty list otherwise.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+    except FileNotFoundError:
+        return []
+    except Exception as e:
+        raise ValueError(
+            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
+            f'{common_utils.format_exception(e)}') from e
+
+    cluster_names = []
+    for cluster in ssh_config.get_hostnames():
+        if cluster == '*':
+            continue
+
+        cluster_names.append(cluster)
+
+    return cluster_names
+
+
+def _check_cpu_mem_fits(
+        candidate_instance_type: SlurmInstanceType,
+        node_list: List[slurm.NodeInfo]) -> Tuple[bool, Optional[str]]:
+    """Checks if instance fits on candidate nodes based on CPU and memory.
+
+    We check capacity (not allocatable) because availability can change
+    during scheduling, and we want to let the Slurm scheduler handle that.
+    """
+    # We log max CPU and memory found on the GPU nodes for debugging.
+    max_cpu = 0
+    max_mem_gb = 0.0
+
+    for node_info in node_list:
+        node_cpus = node_info.cpus
+        node_mem_gb = node_info.memory_gb
+
+        if node_cpus > max_cpu:
+            max_cpu = node_cpus
+            max_mem_gb = node_mem_gb
+
+        if (node_cpus >= candidate_instance_type.cpus and
+                node_mem_gb >= candidate_instance_type.memory):
+            return True, None
+
+    return False, (f'Max found: {max_cpu} CPUs, '
+                   f'{common_utils.format_float(max_mem_gb)}G memory')
+
+
+def check_instance_fits(
+        cluster: str,
+        instance_type: str,
+        partition: Optional[str] = None) -> Tuple[bool, Optional[str]]:
+    """Check if the given instance type fits in the given cluster/partition.
+
+    Args:
+        cluster: Name of the Slurm cluster.
+        instance_type: The instance type to check.
+        partition: Optional partition name. If None, checks all partitions.
+
+    Returns:
+        Tuple of (fits, reason) where fits is True if available.
+    """
+    # Get Slurm node list in the given cluster (region).
+    try:
+        ssh_config = get_slurm_ssh_config()
+    except FileNotFoundError:
+        return (False, f'Could not query Slurm cluster {cluster} '
+                f'because the Slurm configuration file '
+                f'{DEFAULT_SLURM_PATH} does not exist.')
+    except Exception as e:  # pylint: disable=broad-except
+        return (False, f'Could not query Slurm cluster {cluster} '
+                f'because Slurm SSH configuration at {DEFAULT_SLURM_PATH} '
+                f'could not be loaded: {common_utils.format_exception(e)}.')
+    ssh_config_dict = ssh_config.lookup(cluster)
+
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+    )
+
+    nodes = client.info_nodes()
+    default_partition = get_cluster_default_partition(cluster)
+
+    def is_default_partition(node_partition: str) -> bool:
+        # info_nodes does not strip the '*' from the default partition name.
+        # But non-default partition names can also end with '*',
+        # so we need to check whether the partition name without the '*'
+        # is the same as the default partition name.
+        return (node_partition.endswith('*') and
+                node_partition[:-1] == default_partition)
+
+    partition_suffix = ''
+    if partition is not None:
+        filtered = []
+        for node_info in nodes:
+            node_partition = node_info.partition
+            if is_default_partition(node_partition):
+                # Strip '*' from default partition name.
+                node_partition = node_partition[:-1]
+            if node_partition == partition:
+                filtered.append(node_info)
+        nodes = filtered
+        partition_suffix = f' in partition {partition}'
+
+    slurm_instance_type = SlurmInstanceType.from_instance_type(instance_type)
+    acc_count = (slurm_instance_type.accelerator_count
+                 if slurm_instance_type.accelerator_count is not None else 0)
+    acc_type = slurm_instance_type.accelerator_type
+    candidate_nodes = nodes
+    not_fit_reason_prefix = (
+        f'No nodes found with enough '
+        f'CPU (> {slurm_instance_type.cpus} CPUs) and/or '
+        f'memory (> {slurm_instance_type.memory} G){partition_suffix}. ')
+    if acc_type is not None:
+        assert acc_count is not None, (acc_type, acc_count)
+
+        gpu_nodes = []
+        # GRES string format: 'gpu:acc_type:acc_count(optional_extra_info)'
+        # Examples:
+        # - gpu:nvidia_h100_80gb_hbm3:8(S:0-1)
+        # - gpu:a10g:8
+        # - gpu:l4:1
+        gres_pattern = re.compile(r'^gpu:([^:]+):(\d+)')
+        for node_info in nodes:
+            gres_str = node_info.gres
+            # Extract the GPU type and count from the GRES string
+            match = gres_pattern.match(gres_str)
+            if not match:
+                continue

+            node_acc_type = match.group(1).lower()
+            node_acc_count = int(match.group(2))
+
+            # TODO(jwj): Handle status check.
+
+            # Check if the node has the requested GPU type and at least the
+            # requested count
+            if (node_acc_type == acc_type.lower() and
+                    node_acc_count >= acc_count):
+                gpu_nodes.append(node_info)
+        if len(gpu_nodes) == 0:
+            return (False,
+                    f'No GPU nodes found with at least {acc_type}:{acc_count} '
+                    f'on the cluster.')
+
+        candidate_nodes = gpu_nodes
+        not_fit_reason_prefix = (
+            f'GPU nodes with {acc_type}{partition_suffix} do not have '
+            f'enough CPU (> {slurm_instance_type.cpus} CPUs) and/or '
+            f'memory (> {slurm_instance_type.memory} G). ')
+
+    # Check if CPU and memory requirements are met on at least one
+    # candidate node.
+    fits, reason = _check_cpu_mem_fits(slurm_instance_type, candidate_nodes)
+    if not fits and reason is not None:
+        reason = not_fit_reason_prefix + reason
+    return fits, reason
+
+
+def _get_slurm_node_info_list(
+        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Gathers detailed information about each node in the Slurm cluster.
+
+    Raises:
+        FileNotFoundError: If the Slurm configuration file does not exist.
+        ValueError: If no Slurm cluster name is found in the Slurm
+            configuration file.
+    """
+    # 1. Get node state and GRES using sinfo
+
+    # can raise FileNotFoundError if config file does not exist.
+    slurm_config = get_slurm_ssh_config()
+    if slurm_cluster_name is None:
+        slurm_cluster_names = get_all_slurm_cluster_names()
+        if slurm_cluster_names:
+            slurm_cluster_name = slurm_cluster_names[0]
+        if slurm_cluster_name is None:
+            raise ValueError(
+                f'No Slurm cluster name found in the {DEFAULT_SLURM_PATH} '
+                f'configuration.')
+    slurm_config_dict = slurm_config.lookup(slurm_cluster_name)
+    logger.debug(f'Slurm config dict: {slurm_config_dict}')
+    slurm_client = slurm.SlurmClient(
+        slurm_config_dict['hostname'],
+        int(slurm_config_dict.get('port', 22)),
+        slurm_config_dict['user'],
+        slurm_config_dict['identityfile'][0],
+        ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+    )
+    node_infos = slurm_client.info_nodes()
+
+    if not node_infos:
+        logger.warning(
+            f'`sinfo -N` returned no output on cluster {slurm_cluster_name}. '
+            f'No nodes found?')
+        return []
+
+    # 2. Process each node, aggregating partitions per node
+    slurm_nodes_info: Dict[str, Dict[str, Any]] = {}
+    gres_gpu_pattern = re.compile(r'((gpu)(?::([^:]+))?:(\d+))')
+
+    for node_info in node_infos:
+        node_name = node_info.node
+        state = node_info.state
+        gres_str = node_info.gres
+        partition = node_info.partition
+
+        if node_name in slurm_nodes_info:
+            slurm_nodes_info[node_name]['partitions'].append(partition)
+            continue
+
+        # Extract GPU info from GRES
+        gres_match = gres_gpu_pattern.search(gres_str)
+
+        total_gpus = 0
+        gpu_type_from_sinfo = None  # Default to None for CPU-only nodes
+        if gres_match:
+            try:
+                total_gpus = int(gres_match.group(4))
+                if gres_match.group(3):
+                    gpu_type_from_sinfo = gres_match.group(3).upper()
+                # If total_gpus > 0 but no type, default to 'GPU'
+                elif total_gpus > 0:
+                    gpu_type_from_sinfo = 'GPU'
+            except ValueError:
+                logger.warning(
+                    f'Could not parse GPU count from GRES for {node_name}.')
+
+        # Get allocated GPUs via squeue
+        allocated_gpus = 0
+        # TODO(zhwu): move to enum
+        if state in ('alloc', 'mix', 'drain', 'drng', 'drained', 'resv',
+                     'comp'):
+            try:
+                node_jobs = slurm_client.get_node_jobs(node_name)
+                if node_jobs:
+                    job_gres_pattern = re.compile(r'gpu(?::[^:]+)*:(\d+)')
+                    for job_line in node_jobs:
+                        gres_job_match = job_gres_pattern.search(job_line)
+                        if gres_job_match:
+                            allocated_gpus += int(gres_job_match.group(1))
+            except Exception as e:  # pylint: disable=broad-except
+                if state == 'alloc':
+                    # We can infer allocated GPUs only if the node is
+                    # in 'alloc' state.
+                    allocated_gpus = total_gpus
+                else:
+                    # Otherwise, just raise the error.
+                    raise e
+        elif state == 'idle':
+            allocated_gpus = 0
+
+        free_gpus = total_gpus - allocated_gpus if state not in ('down',
+                                                                 'drain',
+                                                                 'drng',
+                                                                 'maint') else 0
+        free_gpus = max(0, free_gpus)
+
+        # Get CPU/Mem info via scontrol
+        vcpu_total = 0
+        mem_gb = 0.0
+        try:
+            node_details = slurm_client.node_details(node_name)
+            vcpu_total = int(node_details.get('CPUTot', '0'))
+            mem_gb = float(node_details.get('RealMemory', '0')) / 1024.0
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(
+                f'Failed to get CPU/memory info for {node_name}: {e}')
+
+        slurm_nodes_info[node_name] = {
+            'node_name': node_name,
+            'slurm_cluster_name': slurm_cluster_name,
+            'partitions': [partition],
+            'node_state': state,
+            'gpu_type': gpu_type_from_sinfo,
+            'total_gpus': total_gpus,
+            'free_gpus': free_gpus,
+            'vcpu_count': vcpu_total,
+            'memory_gb': round(mem_gb, 2),
+        }
+
+    for node_info in slurm_nodes_info.values():
+        partitions = node_info.pop('partitions')
+        node_info['partition'] = ','.join(str(p) for p in partitions)
+
+    return list(slurm_nodes_info.values())
+
+
+def slurm_node_info(
+        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Gets detailed information for each node in the Slurm cluster.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries, each containing node info.
+    """
+    try:
+        node_list = _get_slurm_node_info_list(
+            slurm_cluster_name=slurm_cluster_name)
+    except (RuntimeError, exceptions.NotSupportedError) as e:
+        logger.debug(f'Could not retrieve Slurm node info: {e}')
+        return []
+    return node_list
+
+
+def is_inside_slurm_job() -> bool:
+    return os.environ.get('SLURM_JOB_ID') is not None
+
+
+def get_partitions(cluster_name: str) -> List[str]:
+    """Get unique partition names available in a Slurm cluster.
+
+    Args:
+        cluster_name: Name of the Slurm cluster.
+
+    Returns:
+        List of unique partition names available in the cluster.
+        The default partition appears first,
+        and the rest are sorted alphabetically.
+    """
+    try:
+        slurm_config = SSHConfig.from_path(
+            os.path.expanduser(DEFAULT_SLURM_PATH))
+        slurm_config_dict = slurm_config.lookup(cluster_name)
+
+        client = slurm.SlurmClient(
+            slurm_config_dict['hostname'],
+            int(slurm_config_dict.get('port', 22)),
+            slurm_config_dict['user'],
+            slurm_config_dict['identityfile'][0],
+            ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+        )
+
+        partitions_info = client.get_partitions_info()
+        default_partitions = []
+        other_partitions = []
+        for partition in partitions_info:
+            if partition.is_default:
+                default_partitions.append(partition.name)
+            else:
+                other_partitions.append(partition.name)
+        return default_partitions + sorted(other_partitions)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(
+            f'Failed to get partitions for cluster {cluster_name}: {e}')
+        # Fall back to default partition if query fails
+        return [DEFAULT_PARTITION]
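
To make the virtual instance-type naming above concrete, here is a small usage sketch. It assumes the new module is importable as sky.provision.slurm.utils and that common_utils.format_float renders whole numbers without a trailing decimal point:

    from sky.provision.slurm import utils as slurm_utils

    # Fractional GPU counts are rounded up, since GPU requests in Slurm
    # must be integers: ceil(0.5) == 1.
    inst = slurm_utils.SlurmInstanceType.from_resources(
        cpus=4, memory=16, accelerator_count=0.5, accelerator_type='V100')
    print(inst.name)  # 4CPU--16GB--V100:1

    # Round-trip: parse a name back into its resource components.
    parsed = slurm_utils.SlurmInstanceType.from_instance_type('0.5CPU--1.5GB')
    print(parsed.cpus, parsed.memory, parsed.accelerator_count)  # 0.5 1.5 None
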
sky/provision/vast/instance.py
@@ -39,14 +39,15 @@ def _filter_instances(cluster_name_on_cloud: str,
 
 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']
 
     created_instance_ids = []
@@ -88,6 +89,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                                       resumed_instance_ids=[],
                                       created_instance_ids=[])
 
+    secure_only = config.provider_config.get('secure_only', False)
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -98,7 +100,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 disk_size=config.node_config['DiskSize'],
                 preemptible=config.node_config['Preemptible'],
                 image_name=config.node_config['ImageId'],
-                ports=config.ports_to_open_on_launch)
+                ports=config.ports_to_open_on_launch,
+                secure_only=secure_only,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
             raise
@@ -220,9 +224,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     # "running", "frozen", "stopped", "unknown", "loading"
sky/provision/vast/utils.py
@@ -34,8 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
 
 
 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, ports: Optional[List[int]],
-           preemptible: bool) -> str:
+           image_name: str, ports: Optional[List[int]], preemptible: bool,
+           secure_only: bool) -> str:
     """Launches an instance with the given parameters.
 
     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -87,7 +87,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
 
-    query = ' '.join([
+    query = [
         'chunked=true',
         'georegion=true',
         f'geolocation="{region[-2:]}"',
@@ -95,13 +95,17 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
         f'num_gpus={num_gpus}',
         f'gpu_name="{gpu_name}"',
         f'cpu_ram>="{cpu_ram}"',
-    ])
+    ]
+    if secure_only:
+        query.append('datacenter=true')
+    query_str = ' '.join(query)
 
-    instance_list = vast.vast().search_offers(query=query)
+    instance_list = vast.vast().search_offers(query=query_str)
 
     if isinstance(instance_list, int) or len(instance_list) == 0:
         raise RuntimeError('Failed to create instances, could not find an '
-                           f'offer that satisfies the requirements "{query}".')
+                           'offer that satisfies the requirements '
+                           f'"{query_str}".')
 
     instance_touse = instance_list[0]
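
For reference, a sketch of the search query the rewritten launch() assembles when secure_only is set. The GPU, region, and RAM values are made-up, and only the query clauses visible in this hunk are shown:

    # Made-up values standing in for the f-strings in launch() above.
    query = [
        'chunked=true',
        'georegion=true',
        'geolocation="US"',
        'num_gpus=1',
        'gpu_name="RTX 4090"',
        'cpu_ram>="16"',
    ]
    secure_only = True
    if secure_only:
        # The only behavioral change: restrict the search to datacenter
        # (secure) offers.
        query.append('datacenter=true')
    print(' '.join(query))
    # chunked=true georegion=true geolocation="US" num_gpus=1
    # gpu_name="RTX 4090" cpu_ram>="16" datacenter=true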