skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,184 @@
1
+ """Seeweb service catalog.
2
+
3
+ This module loads the service catalog file and can be used to
4
+ query instance types and pricing information for Seeweb.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
+ import pandas as pd
11
+
12
+ from sky.catalog import common
13
+ from sky.utils import resources_utils
14
+ from sky.utils import ux_utils
15
+
16
+ if typing.TYPE_CHECKING:
17
+ from sky.clouds import cloud
18
+
19
# Passed to read_catalog() below as ``pull_frequency_hours``.
_PULL_FREQUENCY_HOURS = 8

# Seeweb VM catalog; this statement runs when the module is imported.
_df = common.read_catalog(
    'seeweb/vms.csv',
    pull_frequency_hours=_PULL_FREQUENCY_HOURS,
)
22
+
23
+
24
def instance_type_exists(instance_type: str) -> bool:
    """Return whether ``instance_type`` is present in the Seeweb catalog.

    Delegates to ``common.instance_type_exists_impl`` with the module-level
    catalog dataframe.
    """
    return common.instance_type_exists_impl(_df, instance_type)
27
+
28
+
29
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate a region/zone pair against the Seeweb catalog.

    Raises:
        ValueError: if ``zone`` is given (raised without a traceback), since
            Seeweb does not support zones.
    """
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Seeweb does not support zones.')

    # ``zone`` is always None at this point.
    return common.validate_region_zone_impl('Seeweb', _df, region, zone)
38
+
39
+
40
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Return the hourly cost of ``instance_type`` from the Seeweb catalog.

    The lookup itself is performed by ``common.get_hourly_cost_impl``.

    Raises:
        ValueError: if ``zone`` is given (raised without a traceback), since
            Seeweb does not support zones.
    """
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Seeweb does not support zones.')

    # ``zone`` is always None at this point.
    return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
                                       zone)
52
+
53
+
54
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Return the ``(vcpus, memory)`` pair recorded for ``instance_type``.

    Delegates to ``common.get_vcpus_mem_from_instance_type_impl`` with the
    Seeweb catalog dataframe.
    """
    return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
58
+
59
+
60
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[
                                  resources_utils.DiskTier] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Pick an instance type for the requested CPU / memory constraints.

    ``disk_tier`` is accepted for signature compatibility but ignored; the
    selection is delegated to ``common.get_instance_type_for_cpus_mem_impl``.
    """
    del disk_tier  # unused
    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory,
                                                      region, zone)
70
+
71
+
72
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, int]]:
    """Return the accelerators attached to a Seeweb instance type.

    The first catalog row whose ``InstanceType`` equals ``instance_type`` is
    consulted, and its ``AcceleratorName`` / ``AcceleratorCount`` columns are
    read.

    Args:
        instance_type: Instance type name to look up in the catalog.

    Returns:
        ``{accelerator_name: count}``, or ``None`` when the instance type is
        not in the catalog, the name or count is missing/empty, or the count
        cannot be interpreted as a finite number. Whole-number counts are
        returned as ``int``; fractional counts are returned as ``float``.
    """
    rows = _df[_df['InstanceType'] == instance_type]
    if rows.empty:
        return None

    # Only the first matching row is used; rows for the same instance type
    # are expected to carry the same accelerator info.
    first = rows.iloc[0]
    acc_name = first['AcceleratorName']
    acc_count = first['AcceleratorCount']

    # No accelerator recorded for this instance type.
    if pd.isna(acc_name) or pd.isna(
            acc_count) or acc_name == '' or acc_count == '':
        return None

    # Normalise through float first so that textual counts (e.g. '2') are
    # handled like numeric ones: comparing int('2') with the string '2' is
    # always False, which used to turn whole textual counts into floats.
    # int() raises OverflowError for infinities, which is treated as an
    # unparsable count rather than propagated.
    try:
        as_float = float(acc_count)
        as_int = int(as_float)
    except (ValueError, TypeError, OverflowError):
        return None

    count = as_int if as_int == as_float else as_float
    return {acc_name: count}
101
+
102
+
103
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Find Seeweb instance types offering the requested accelerators.

    The search is delegated to
    ``common.get_instance_type_for_accelerator_impl`` and its result is
    returned unchanged.

    Raises:
        ValueError: if ``zone`` is given (raised without a traceback), since
            Seeweb does not support zones.
    """
    zone_requested = zone is not None
    if zone_requested:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Seeweb does not support zones.')

    # ``zone`` is always None at this point.
    return common.get_instance_type_for_accelerator_impl(
        df=_df,
        acc_name=acc_name,
        acc_count=acc_count,
        cpus=cpus,
        memory=memory,
        use_spot=use_spot,
        region=region,
        zone=zone,
    )
126
+
127
+
128
def regions() -> List['cloud.Region']:
    """Return the regions in the Seeweb catalog (non-spot lookup)."""
    return common.get_region_zones(_df, use_spot=False)
131
+
132
+
133
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool = False
                                      ) -> List['cloud.Region']:
    """Return the regions offering ``instance_type``, preferred ones first.

    Regions named in the priority list are placed at the front (in priority
    order); every other region follows in the order produced by
    ``common.get_region_zones``. An unknown instance type yields ``[]``.
    """
    matching_rows = _df[_df['InstanceType'] == instance_type]
    if matching_rows.empty:
        return []

    available = common.get_region_zones(matching_rows, use_spot)

    # Default region: Frosinone (it-fr2)
    # Other regions: Milano (it-mi2), Lugano (ch-lug1), Bulgaria (bg-sof1)
    preferred_names = ['it-fr2']

    # For each preferred name, keep only the first region carrying it.
    front = []
    for wanted in preferred_names:
        hit = next((r for r in available if r.name == wanted), None)
        if hit is not None:
            front.append(hit)

    # Everything not named in the priority list keeps its relative order.
    rest = [r for r in available if r.name not in preferred_names]
    return front + rest
165
+
166
+
167
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Lists accelerators offered in Seeweb.

    Catalog rows whose ``Region`` is null or blank are excluded before the
    listing is delegated to ``common.list_accelerators_impl``.
    """
    # Rows with an empty or null region indicate unavailability.
    with_region = _df.dropna(subset=['Region'])
    non_blank = with_region['Region'].str.strip() != ''
    available = with_region[non_blank]

    return common.list_accelerators_impl('Seeweb', available, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions, require_price)
@@ -0,0 +1,165 @@
1
+ """ Shadeform | Catalog
2
+
3
+ This module loads pricing and instance information from the Shadeform API
4
+ and can be used to query instance types and pricing information for Shadeform.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ import pandas as pd
11
+
12
+ from sky.catalog import common
13
+
14
+ if typing.TYPE_CHECKING:
15
+ from sky.clouds import cloud
16
+
17
# The catalog is not read at import time; _get_df() fills this in on first
# use and returns the stored dataframe afterwards.
_df = None
19
+
20
+
21
def _get_df():
    """Return the Shadeform catalog dataframe, loading it on first use.

    On the first call the static catalog ``shadeform/vms.csv`` is read via
    ``common.read_catalog``; rows without an ``InstanceType`` (and, when the
    column exists, without an ``AcceleratorName``) are dropped and
    accelerator names are stripped of surrounding whitespace. If the catalog
    file is missing, an empty dataframe with the expected columns is used
    instead. The result is stored in the module-level ``_df`` and returned
    as-is on later calls.
    """
    global _df
    if _df is not None:
        return _df

    # For now, we fall back to a minimal static catalog. In a full
    # implementation, this would call the Shadeform API to dynamically fetch
    # the latest instance types and pricing.
    try:
        catalog = common.read_catalog('shadeform/vms.csv')
    except FileNotFoundError:
        # If no static catalog exists, create an empty one.
        _df = pd.DataFrame(columns=[
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])
        return _df

    catalog = catalog[catalog['InstanceType'].notna()]
    if 'AcceleratorName' in catalog.columns:
        catalog = catalog[catalog['AcceleratorName'].notna()]
        cleaned_names = catalog['AcceleratorName'].astype(str).str.strip()
        catalog = catalog.assign(AcceleratorName=cleaned_names)
    _df = catalog.reset_index(drop=True)
    return _df
45
+
46
+
47
def _is_not_found_error(err: ValueError) -> bool:
    """Return True if the error text reads as "not found" / "not supported".

    The check is a case-insensitive substring match on ``str(err)``.
    """
    text = str(err).lower()
    return any(marker in text for marker in ('not found', 'not supported'))
50
+
51
+
52
def _call_or_default(func, default):
    """Call ``func()`` and return its result, or ``default`` on a lookup miss.

    A ``ValueError`` that ``_is_not_found_error`` recognises is swallowed and
    ``default`` is returned; any other ``ValueError`` is re-raised, and other
    exception types propagate untouched.
    """
    try:
        outcome = func()
    except ValueError as exc:
        if not _is_not_found_error(exc):
            raise
        return default
    return outcome
59
+
60
+
61
def instance_type_exists(instance_type: str) -> bool:
    """Return whether ``instance_type`` is present in the Shadeform catalog."""
    catalog = _get_df()
    return common.instance_type_exists_impl(catalog, instance_type)
64
+
65
+
66
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate a region/zone pair against the Shadeform catalog.

    Delegates to ``common.validate_region_zone_impl`` and returns its result.
    """
    catalog = _get_df()
    return common.validate_region_zone_impl('shadeform', catalog, region,
                                            zone)
72
+
73
+
74
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Return the hourly cost of ``instance_type`` from the Shadeform catalog.

    Raises:
        ValueError: if ``use_spot`` is set; Shadeform doesn't support spot
            instances currently.
    """
    if use_spot:
        raise ValueError('Spot instances are not supported on Shadeform')

    catalog = _get_df()
    return common.get_hourly_cost_impl(catalog, instance_type, use_spot,
                                       region, zone)
85
+
86
+
87
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(vcpus, memory)`` for ``instance_type``.

    ``(None, None)`` is returned when the underlying lookup raises a
    "not found" / "not supported" ``ValueError``.
    """

    def _lookup():
        return common.get_vcpus_mem_from_instance_type_impl(
            _get_df(), instance_type)

    return _call_or_default(_lookup, (None, None))
93
+
94
+
95
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Pick an instance type for the requested CPU / memory constraints.

    ``disk_tier`` is ignored. ``None`` is returned when the underlying lookup
    raises a "not found" / "not supported" ``ValueError``.
    """
    del disk_tier  # Shadeform doesn't support custom disk tiers yet

    def _lookup():
        return common.get_instance_type_for_cpus_mem_impl(
            _get_df(), cpus, memory, region, zone)

    return _call_or_default(_lookup, None)
105
+
106
+
107
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Return the accelerators recorded for ``instance_type``.

    ``None`` is returned when the underlying lookup raises a "not found" /
    "not supported" ``ValueError``.
    """

    def _lookup():
        return common.get_accelerators_from_instance_type_impl(
            _get_df(), instance_type)

    return _call_or_default(_lookup, None)
113
+
114
+
115
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Find Shadeform instance types offering the requested accelerators.

    Returns the pair produced by
    ``common.get_instance_type_for_accelerator_impl``. For spot requests the
    result is ``(None, [message])``; when the lookup raises a "not found" /
    "not supported" ``ValueError`` the result is ``(None, [])``.
    """
    if use_spot:
        # Spot is not supported: no instance types, plus an explanation.
        return None, ['Spot instances are not supported on Shadeform']

    def _lookup():
        return common.get_instance_type_for_accelerator_impl(
            df=_get_df(),
            acc_name=acc_name,
            acc_count=acc_count,
            cpus=cpus,
            memory=memory,
            use_spot=use_spot,
            region=region,
            zone=zone,
        )

    return _call_or_default(_lookup, (None, []))
138
+
139
+
140
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """Return the regions in which ``instance_type`` is listed.

    Spot requests always yield ``[]``. ``[]`` is also returned when the
    underlying lookup raises a "not found" / "not supported" ``ValueError``.
    """
    if use_spot:
        return []  # No spot support

    catalog = _get_df()
    matching_rows = catalog[catalog['InstanceType'] == instance_type]

    def _lookup():
        return common.get_region_zones(matching_rows, use_spot)

    return _call_or_default(_lookup, [])
150
+
151
+
152
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Returns all instance types in Shadeform offering GPUs.

    ``require_price`` is accepted but not forwarded to
    ``common.list_accelerators_impl``.
    """
    del require_price  # Unused.
    catalog = _get_df()
    return common.list_accelerators_impl('Shadeform', catalog, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions)
@@ -0,0 +1,243 @@
1
+ """Slurm Catalog."""
2
+
3
+ import collections
4
+ import re
5
+ from typing import Dict, List, Optional, Set, Tuple
6
+
7
+ from sky import check as sky_check
8
+ from sky import clouds as sky_clouds
9
+ from sky import sky_logging
10
+ from sky.catalog import common
11
+ from sky.clouds import cloud
12
+ from sky.provision.slurm import utils as slurm_utils
13
+ from sky.utils import resources_utils
14
+
15
+ logger = sky_logging.init_logger(__name__)
16
+
17
+ _DEFAULT_NUM_VCPUS = 2
18
+ _DEFAULT_MEMORY_CPU_RATIO = 1
19
+
20
+
21
def instance_type_exists(instance_type: str) -> bool:
    """Return whether ``instance_type`` is a valid Slurm instance type.

    Validation is delegated to
    ``slurm_utils.SlurmInstanceType.is_valid_instance_type``.
    """
    instance_type_cls = slurm_utils.SlurmInstanceType
    return instance_type_cls.is_valid_instance_type(instance_type)
24
+
25
+
26
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[
                                  resources_utils.DiskTier] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Build the name of a virtual Slurm instance type for the request.

    Args:
        cpus: CPU request such as ``'4'`` or ``'4+'``; ``None`` selects
            ``_DEFAULT_NUM_VCPUS``.
        memory: Memory request. A trailing ``'+'`` is dropped, a trailing
            ``'x'`` makes the value a multiplier of the CPU count, and
            ``None`` selects CPU count times ``_DEFAULT_MEMORY_CPU_RATIO``.
        disk_tier: Ignored.
        region: Ignored.
        zone: Ignored.

    Returns:
        The ``name`` of the ``SlurmInstanceType`` built from the resolved
        CPU and memory values.
    """
    # Delete unused parameters.
    del disk_tier, region, zone

    # Slurm provisions resources via --cpus-per-task and --mem.
    if cpus is None:
        num_cpus = _DEFAULT_NUM_VCPUS
    else:
        num_cpus = float(cpus.strip('+'))

    if memory is None:
        mem_size = num_cpus * _DEFAULT_MEMORY_CPU_RATIO
    elif memory.endswith('+'):
        mem_size = float(memory[:-1])
    elif memory.endswith('x'):
        # 'Nx' means N times the number of CPUs.
        mem_size = float(memory[:-1]) * num_cpus
    else:
        mem_size = float(memory)

    return slurm_utils.SlurmInstanceType(num_cpus, mem_size).name
50
+
51
+
52
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """List accelerators in Slurm clusters.

    Forwards every argument to ``list_accelerators_realtime`` and keeps only
    the first element of its result: the dictionary mapping GPU type to a
    list of InstanceTypeInfo objects.
    """
    accelerators, _, _ = list_accelerators_realtime(
        gpus_only=gpus_only,
        name_filter=name_filter,
        region_filter=region_filter,
        quantity_filter=quantity_filter,
        case_sensitive=case_sensitive,
        all_regions=all_regions,
        require_price=require_price,
    )
    return accelerators
67
+
68
+
69
def _slurm_gpu_count_options(node_total_gpus: int) -> List[int]:
    """Return the GPU counts to advertise for a node with this many GPUs.

    Produces the powers of two not exceeding ``node_total_gpus``, followed by
    the total itself when it is not a power of two (e.g. 12 gives
    1, 2, 4, 8, 12). A total that is not positive gives an empty list.
    """
    counts: List[int] = []
    if node_total_gpus > 0:
        count = 1
        while count <= node_total_gpus:
            counts.append(count)
            count *= 2
        # Add the actual total if the doubling sequence did not land on it.
        if count // 2 != node_total_gpus:
            counts.append(node_total_gpus)
    return counts


def _slurm_no_gpu_nodes_message(name_filter: Optional[str],
                                quantity_filter: Optional[int],
                                scope_label: str,
                                scope_value: Optional[str]) -> str:
    """Build the 'no matching GPU nodes' message, listing the set filters."""
    err_msg = 'No matching GPU nodes found in the Slurm cluster'
    filters_applied = []
    if name_filter:
        filters_applied.append(f'gpu_name={name_filter!r}')
    if quantity_filter:
        filters_applied.append(f'quantity>={quantity_filter}')
    if scope_value:
        filters_applied.append(f'{scope_label}={scope_value!r}')
    if filters_applied:
        err_msg += f' with filters ({", ".join(filters_applied)})'
    err_msg += '.'
    return err_msg


def list_accelerators_realtime(
    gpus_only: bool = True,
    name_filter: Optional[str] = None,
    region_filter: Optional[str] = None,
    quantity_filter: Optional[int] = None,
    case_sensitive: bool = True,
    all_regions: bool = False,
    require_price: bool = False,
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
                                                                          int]]:
    """Fetches real-time accelerator information from a Slurm cluster.

    Node information comes from ``slurm_utils.slurm_node_info``.

    Args:
        gpus_only: Unused in Slurm context.
        name_filter: Regex applied with ``re.match`` (i.e. anchored at the
            start) to each node's GPU type (e.g., 'V100', 'gpu').
        region_filter: Name of the Slurm cluster to query. When None, the
            first cluster returned by
            ``slurm_utils.get_all_slurm_cluster_names`` is used.
        quantity_filter: Minimum total number of accelerators a node must
            have to be counted.
        case_sensitive: Whether name_filter is case-sensitive.
        all_regions: Unused in Slurm context.
        require_price: Unused in Slurm context.

    Returns:
        A tuple of three dictionaries:
        - qtys_map: Maps GPU type to a list of InstanceTypeInfo objects, one
            per advertised GPU count, sorted by ``accelerator_count``.
        - total_capacity: Maps GPU type to total count across matched nodes.
        - total_available: Maps GPU type to total free count across matched
            nodes.
        All three are empty when Slurm is not among the enabled clouds or no
        Slurm cluster is configured.

    Raises:
        ValueError: If the cluster reports no nodes, or no node passes the
            filters.
    """
    del gpus_only, all_regions, require_price

    enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
        cloud.CloudCapability.COMPUTE)
    if not sky_clouds.cloud_in_iterable(sky_clouds.Slurm(), enabled_clouds):
        return {}, {}, {}

    if region_filter is None:
        # Get the first available cluster as default
        all_clusters = slurm_utils.get_all_slurm_cluster_names()
        if not all_clusters:
            return {}, {}, {}
        slurm_cluster = all_clusters[0]
    else:
        slurm_cluster = region_filter

    # Currently only reported in the error message below; see the TODO in
    # the loop for why nodes are not filtered by partition.
    partition_filter = slurm_utils.get_cluster_default_partition(slurm_cluster)

    slurm_nodes_info = slurm_utils.slurm_node_info(
        slurm_cluster_name=slurm_cluster)

    if not slurm_nodes_info:
        err_msg = _slurm_no_gpu_nodes_message(name_filter, quantity_filter,
                                              'cluster', region_filter)
        # Log as error as it indicates no usable resources found
        logger.error(err_msg)
        raise ValueError(err_msg)

    # Compile the name filter once instead of once per node.
    name_pattern = None
    if name_filter:
        regex_flags = 0 if case_sensitive else re.IGNORECASE
        name_pattern = re.compile(name_filter, regex_flags)

    # Aggregate results into the required format
    qtys_map: Dict[str,
                   Set[common.InstanceTypeInfo]] = collections.defaultdict(set)
    total_capacity: Dict[str, int] = collections.defaultdict(int)
    total_available: Dict[str, int] = collections.defaultdict(int)

    for node_info in slurm_nodes_info:
        gpu_type = node_info['gpu_type']
        node_total_gpus = node_info['total_gpus']
        node_free_gpus = node_info['free_gpus']
        partition = node_info['partition']

        # Apply name filter to the determined GPU type
        if name_pattern is not None and not name_pattern.match(gpu_type):
            continue

        # Apply quantity filter (total GPUs on node must meet this)
        if quantity_filter and node_total_gpus < quantity_filter:
            continue

        # Apply partition filter if specified
        # TODO(zhwu): when a node is in multiple partitions, the partition
        # mapping from node to partition does not work.
        # if partition_filter and partition != partition_filter:
        #     continue

        # Create InstanceTypeInfo objects for various GPU counts.
        # Similar to Kubernetes, generate powers of 2 up to node_total_gpus,
        # plus the actual total when it is not itself a power of 2.
        for count in _slurm_gpu_count_options(node_total_gpus):
            qtys_map[gpu_type].add(
                common.InstanceTypeInfo(
                    instance_type=None,  # Slurm doesn't have instance types
                    accelerator_name=gpu_type,
                    accelerator_count=count,
                    cpu_count=node_info['vcpu_count'],
                    memory=node_info['memory_gb'],
                    price=0.0,  # Slurm doesn't have price info
                    region=partition,  # Use partition as region
                    cloud='slurm',  # Specify cloud as 'slurm'
                    device_memory=0.0,  # No GPU memory info from Slurm
                    spot_price=0.0,  # Slurm doesn't have spot pricing
                ))

        # Map of GPU type -> total count across all matched nodes
        total_capacity[gpu_type] += node_total_gpus

        # Map of GPU type -> total *free* count across all matched nodes
        total_available[gpu_type] += node_free_gpus

    # Check if any GPUs were found after applying filters
    if not total_capacity:
        err_msg = _slurm_no_gpu_nodes_message(name_filter, quantity_filter,
                                              'partition', partition_filter)
        logger.error(err_msg)
        raise ValueError(err_msg)

    # Convert sets of InstanceTypeInfo to sorted lists
    final_qtys_map = {
        gpu: sorted(instances, key=lambda x: x.accelerator_count)
        for gpu, instances in qtys_map.items()
    }

    logger.debug(f'Aggregated Slurm GPU Info: '
                 f'qtys={final_qtys_map}, '
                 f'capacity={dict(total_capacity)}, '
                 f'available={dict(total_available)}')

    return final_qtys_map, dict(total_capacity), dict(total_available)
237
+
238
+
239
def validate_region_zone(
    region_name: Optional[str],
    zone_name: Optional[str],
) -> Tuple[Optional[str], Optional[str]]:
    """Return the given region and zone unchanged.

    No checks are performed here; the inputs are handed back as a
    ``(region_name, zone_name)`` tuple.
    """
    region_and_zone = (region_name, zone_name)
    return region_and_zone