skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/catalog/shadeform_catalog.py CHANGED
@@ -7,12 +7,15 @@ and can be used to query instance types and pricing information for Shadeform.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
-import pandas as pd
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 # We'll use dynamic fetching, so no static CSV file to load
 _df = None
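
The change above defers the pandas import: merely importing the catalog module no longer pays pandas' load time, because `pd` becomes a proxy that resolves on first attribute access. A minimal sketch of the lazy-import idea (a simplified stand-in, not the actual `sky.adaptors.common.LazyImport` implementation):

import importlib
import types
from typing import Optional


class LazyImport:
    """Defers importing a module until an attribute is first accessed."""

    def __init__(self, module_name: str):
        self._module_name = module_name
        self._module: Optional[types.ModuleType] = None

    def __getattr__(self, name: str):
        # Import lazily on first use, then delegate attribute lookups.
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, name)


pd = LazyImport('pandas')
# pandas is only actually imported when e.g. pd.DataFrame is first touched.
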
sky/catalog/slurm_catalog.py ADDED
@@ -0,0 +1,236 @@
+"""Slurm Catalog."""
+
+import collections
+import re
+from typing import Dict, List, Optional, Set, Tuple
+
+from sky import check as sky_check
+from sky import clouds as sky_clouds
+from sky import sky_logging
+from sky.catalog import common
+from sky.clouds import cloud
+from sky.provision.slurm import utils as slurm_utils
+from sky.utils import resources_utils
+
+logger = sky_logging.init_logger(__name__)
+
+_DEFAULT_NUM_VCPUS = 2
+_DEFAULT_MEMORY_CPU_RATIO = 1
+
+
+def instance_type_exists(instance_type: str) -> bool:
+    """Check if the given instance type is valid for Slurm."""
+    return slurm_utils.SlurmInstanceType.is_valid_instance_type(instance_type)
+
+
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
+    # Delete unused parameters.
+    del disk_tier, region, zone
+
+    # Slurm provisions resources via --cpus-per-task and --mem.
+    instance_cpus = float(
+        cpus.strip('+')) if cpus is not None else _DEFAULT_NUM_VCPUS
+    if memory is not None:
+        if memory.endswith('+'):
+            instance_mem = float(memory[:-1])
+        elif memory.endswith('x'):
+            instance_mem = float(memory[:-1]) * instance_cpus
+        else:
+            instance_mem = float(memory)
+    else:
+        instance_mem = instance_cpus * _DEFAULT_MEMORY_CPU_RATIO
+    virtual_instance_type = slurm_utils.SlurmInstanceType(
+        instance_cpus, instance_mem).name
+    return virtual_instance_type
+
+
+def list_accelerators(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """List accelerators in Slurm clusters.
+
+    Returns a dictionary mapping GPU type to a list of InstanceTypeInfo objects.
+    """
+    return list_accelerators_realtime(gpus_only, name_filter, region_filter,
+                                      quantity_filter, case_sensitive,
+                                      all_regions, require_price)[0]
+
+
+def list_accelerators_realtime(
+    gpus_only: bool = True,
+    name_filter: Optional[str] = None,
+    region_filter: Optional[str] = None,
+    quantity_filter: Optional[int] = None,
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = False,
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """Fetches real-time accelerator information from the Slurm cluster.
+
+    Uses the `get_slurm_node_info_list` helper function.
+
+    Args:
+        gpus_only: If True, only return GPU accelerators.
+        name_filter: Regex filter for accelerator names (e.g., 'V100', 'gpu').
+        region_filter: Optional filter for Slurm partitions.
+        quantity_filter: Minimum number of accelerators required per node.
+        case_sensitive: Whether name_filter is case-sensitive.
+        all_regions: Unused in Slurm context.
+        require_price: Unused in Slurm context.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Maps GPU type to set of InstanceTypeInfo objects for unique
+          counts found per node.
+        - total_capacity: Maps GPU type to total count across all nodes.
+        - total_available: Maps GPU type to total free count across all nodes.
+    """
+    del gpus_only, all_regions, require_price
+
+    enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        cloud.CloudCapability.COMPUTE)
+    if not sky_clouds.cloud_in_iterable(sky_clouds.Slurm(), enabled_clouds):
+        return {}, {}, {}
+
+    if region_filter is None:
+        # Get the first available cluster as default
+        all_clusters = slurm_utils.get_all_slurm_cluster_names()
+        if not all_clusters:
+            return {}, {}, {}
+        slurm_cluster = all_clusters[0]
+    else:
+        slurm_cluster = region_filter
+
+    slurm_nodes_info = slurm_utils.slurm_node_info(
+        slurm_cluster_name=slurm_cluster)
+
+    if not slurm_nodes_info:
+        # Customize error message based on filters
+        err_msg = 'No matching GPU nodes found in the Slurm cluster'
+        filters_applied = []
+        if name_filter:
+            filters_applied.append(f'gpu_name={name_filter!r}')
+        if quantity_filter:
+            filters_applied.append(f'quantity>={quantity_filter}')
+        if filters_applied:
+            err_msg += f' with filters ({", ".join(filters_applied)})'
+        err_msg += '.'
+        logger.error(
+            err_msg)  # Log as error as it indicates no usable resources found
+        raise ValueError(err_msg)
+
+    # Aggregate results into the required format
+    qtys_map: Dict[str,
+                   Set[common.InstanceTypeInfo]] = collections.defaultdict(set)
+    total_capacity: Dict[str, int] = collections.defaultdict(int)
+    total_available: Dict[str, int] = collections.defaultdict(int)
+
+    for node_info in slurm_nodes_info:
+        gpu_type = node_info['gpu_type']
+        node_total_gpus = node_info['total_gpus']
+        node_free_gpus = node_info['free_gpus']
+        partition = node_info['partition']
+
+        # Apply name filter to the determined GPU type
+        regex_flags = 0 if case_sensitive else re.IGNORECASE
+        if name_filter and not re.match(
+                name_filter, gpu_type, flags=regex_flags):
+            continue
+
+        # Apply quantity filter (total GPUs on node must meet this)
+        if quantity_filter and node_total_gpus < quantity_filter:
+            continue
+
+        # Apply partition filter if specified
+        # TODO(zhwu): when a node is in multiple partitions, the partition
+        # mapping from node to partition does not work.
+        # if partition_filter and partition != partition_filter:
+        #     continue
+
+        # Create InstanceTypeInfo objects for various GPU counts
+        # Similar to Kubernetes, generate powers of 2 up to node_total_gpus
+        if node_total_gpus > 0:
+            count = 1
+            while count <= node_total_gpus:
+                instance_info = common.InstanceTypeInfo(
+                    instance_type=None,  # Slurm doesn't have instance types
+                    accelerator_name=gpu_type,
+                    accelerator_count=count,
+                    cpu_count=node_info['vcpu_count'],
+                    memory=node_info['memory_gb'],
+                    price=0.0,  # Slurm doesn't have price info
+                    region=partition,  # Use partition as region
+                    cloud='slurm',  # Specify cloud as 'slurm'
+                    device_memory=0.0,  # No GPU memory info from Slurm
+                    spot_price=0.0,  # Slurm doesn't have spot pricing
+                )
+                qtys_map[gpu_type].add(instance_info)
+                count *= 2
+
+            # Add the actual total if it's not already included
+            # (e.g., if node has 12 GPUs, include counts 1, 2, 4, 8, 12)
+            if count // 2 != node_total_gpus:
+                instance_info = common.InstanceTypeInfo(
+                    instance_type=None,
+                    accelerator_name=gpu_type,
+                    accelerator_count=node_total_gpus,
+                    cpu_count=node_info['vcpu_count'],
+                    memory=node_info['memory_gb'],
+                    price=0.0,
+                    region=partition,
+                    cloud='slurm',
+                    device_memory=0.0,
+                    spot_price=0.0,
+                )
+                qtys_map[gpu_type].add(instance_info)
+
+        # Map of GPU type -> total count across all matched nodes
+        total_capacity[gpu_type] += node_total_gpus
+
+        # Map of GPU type -> total *free* count across all matched nodes
+        total_available[gpu_type] += node_free_gpus
+
+    # Check if any GPUs were found after applying filters
+    if not total_capacity:
+        err_msg = 'No matching GPU nodes found in the Slurm cluster'
+        filters_applied = []
+        if name_filter:
+            filters_applied.append(f'gpu_name={name_filter!r}')
+        if quantity_filter:
+            filters_applied.append(f'quantity>={quantity_filter}')
+        if filters_applied:
+            err_msg += f' with filters ({", ".join(filters_applied)})'
+        err_msg += '.'
+        logger.error(err_msg)
+        raise ValueError(err_msg)
+
+    # Convert sets of InstanceTypeInfo to sorted lists
+    final_qtys_map = {
+        gpu: sorted(list(instances), key=lambda x: x.accelerator_count)
+        for gpu, instances in qtys_map.items()
+    }
+
+    logger.debug(f'Aggregated Slurm GPU Info: '
+                 f'qtys={final_qtys_map}, '
+                 f'capacity={dict(total_capacity)}, '
+                 f'available={dict(total_available)}')
+
+    return final_qtys_map, dict(total_capacity), dict(total_available)
+
+
+def validate_region_zone(
+    region_name: Optional[str],
+    zone_name: Optional[str],
+) -> Tuple[Optional[str], Optional[str]]:
+    return (region_name, zone_name)
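
Two details of the new catalog are worth spelling out. First, `get_default_instance_type` reads the memory string with the suffix conventions visible in the code: a trailing '+' marks a minimum in GB, and a trailing 'x' multiplies by the vCPU count. Second, as the comments note, the accelerator loop advertises per-node GPU counts the way the Kubernetes catalog does: powers of two up to the node's total, plus the exact total when it is not itself a power of two. A self-contained sketch of that enumeration (hypothetical helper, not shipped in the wheel):

from typing import List


def gpu_count_options(node_total_gpus: int) -> List[int]:
    """Powers of two up to the node total, plus the total itself."""
    counts: List[int] = []
    count = 1
    while count <= node_total_gpus:
        counts.append(count)
        count *= 2
    # Mirrors the `count // 2 != node_total_gpus` check above: include the
    # actual total when it is not already the last power of two.
    if counts and counts[-1] != node_total_gpus:
        counts.append(node_total_gpus)
    return counts


assert gpu_count_options(12) == [1, 2, 4, 8, 12]
assert gpu_count_options(8) == [1, 2, 4, 8]
assert gpu_count_options(0) == []
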
sky/catalog/vast_catalog.py CHANGED
@@ -7,7 +7,10 @@ query instance types and pricing information for Vast.ai.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
+import pandas as pd
+
 from sky.catalog import common
+from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -16,6 +19,17 @@ if typing.TYPE_CHECKING:
 _df = common.read_catalog('vast/vms.csv')
 
 
+def _apply_datacenter_filter(df: pd.DataFrame,
+                             datacenter_only: bool) -> pd.DataFrame:
+    """Filter dataframe by hosting_type if datacenter_only is True.
+
+    hosting_type: 0 = Consumer hosted, 1 = Datacenter hosted
+    """
+    if not datacenter_only or 'HostingType' not in df.columns:
+        return df
+    return df[df['HostingType'] >= 1]
+
+
 def instance_type_exists(instance_type: str) -> bool:
     return common.instance_type_exists_impl(_df, instance_type)
 
@@ -48,13 +62,16 @@ def get_vcpus_mem_from_instance_type(
 
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
                               region: Optional[str] = None,
-                              zone: Optional[str] = None) -> Optional[str]:
+                              zone: Optional[str] = None,
+                              datacenter_only: bool = False) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_cpus_mem_impl(df, cpus, memory, region,
                                                       zone)
 
 
@@ -70,12 +87,19 @@ def get_instance_type_for_accelerator(
         memory: Optional[str] = None,
         use_spot: bool = False,
         region: Optional[str] = None,
-        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
-    """Returns a list of instance types that have the given accelerator."""
+        zone: Optional[str] = None,
+        datacenter_only: bool = False) -> Tuple[Optional[List[str]], List[str]]:
+    """Returns a list of instance types that have the given accelerator.
+
+    Args:
+        datacenter_only: If True, only return instances hosted in datacenters
+            (hosting_type >= 1).
+    """
     if zone is not None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Vast does not support zones.')
-    return common.get_instance_type_for_accelerator_impl(df=_df,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_accelerator_impl(df=df,
                                                          acc_name=acc_name,
                                                          acc_count=acc_count,
                                                          cpus=cpus,
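
To make the new `datacenter_only` flag concrete, here is how the `HostingType` filter behaves on a toy catalog frame (the rows and names below are made up; only the 0 = consumer hosted / 1 = datacenter hosted semantics come from the docstring above):

import pandas as pd

catalog = pd.DataFrame({
    'InstanceType': ['consumer-rtx4090', 'dc-h100'],  # hypothetical names
    'HostingType': [0, 1],
})

# datacenter_only=False leaves the frame untouched; datacenter_only=True
# keeps only datacenter-hosted rows, as in _apply_datacenter_filter above.
filtered = catalog[catalog['HostingType'] >= 1]
print(filtered['InstanceType'].tolist())  # ['dc-h100']
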
sky/check.py CHANGED
@@ -528,8 +528,9 @@ def _print_checked_cloud(
     # `dict` reasons for K8s and SSH will be printed in detail in
     # _format_enabled_cloud. Skip here unless the cloud is disabled.
     if not isinstance(reason, str):
-        if not ok and isinstance(cloud_tuple[1],
-                                 (sky_clouds.SSH, sky_clouds.Kubernetes)):
+        if not ok and isinstance(
+                cloud_tuple[1],
+                (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
             if reason is not None:
                 reason_str = _format_context_details(cloud_tuple[1],
                                                      show_details=True,
@@ -555,7 +556,9 @@
     capability_string = f'[{", ".join(enabled_capabilities)}]'
     if verbose and cloud is not cloudflare and cloud is not coreweave:
         activated_account = cloud.get_active_user_identity_str()
-    if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
+    if isinstance(
+            cloud_tuple[1],
+            (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
        detail_string = _format_context_details(cloud_tuple[1],
                                                show_details=True,
                                                ctx2text=ctx2text)
@@ -586,6 +589,9 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
     if isinstance(cloud_type, sky_clouds.SSH):
         # Get the cluster names by reading from the node pools file
         contexts = sky_clouds.SSH.get_ssh_node_pool_contexts()
+    elif isinstance(cloud_type, sky_clouds.Slurm):
+        # Get the cluster names from SLURM config
+        contexts = sky_clouds.Slurm.existing_allowed_clusters()
     else:
         assert isinstance(cloud_type, sky_clouds.Kubernetes)
         contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
@@ -650,15 +656,19 @@
                         'configuration.'))
             else:
                 # Default case - not set up
-                text_suffix = (': ' + _red_color('disabled. ') +
-                               _dim_color('Reason: Not set up. Use '
-                                          '`sky ssh up --infra '
-                                          f'{context.lstrip("ssh-")}` '
-                                          'to set up.'))
+                text_suffix = (': ' + _red_color('disabled. ') + _dim_color(
+                    'Reason: Not set up. Use '
+                    '`sky ssh up --infra '
+                    f'{common_utils.removeprefix(context, "ssh-")}` '
+                    'to set up.'))
         contexts_formatted.append(
             f'\n {symbol}{cleaned_context}{text_suffix}')
-    identity_str = ('SSH Node Pools' if isinstance(cloud_type, sky_clouds.SSH)
-                    else 'Allowed contexts')
+    if isinstance(cloud_type, sky_clouds.SSH):
+        identity_str = 'SSH Node Pools'
+    elif isinstance(cloud_type, sky_clouds.Slurm):
+        identity_str = 'Allowed clusters'
+    else:
+        identity_str = 'Allowed contexts'
     return f'\n {identity_str}:{"".join(contexts_formatted)}'
 
 
@@ -677,7 +687,11 @@ def _format_enabled_cloud(cloud_name: str,
     cloud_and_capabilities = f'{cloud_name} [{", ".join(capabilities)}]'
     title = _green_color(cloud_and_capabilities)
 
-    if cloud_name in [repr(sky_clouds.Kubernetes()), repr(sky_clouds.SSH())]:
+    if cloud_name in [
+            repr(sky_clouds.Kubernetes()),
+            repr(sky_clouds.SSH()),
+            repr(sky_clouds.Slurm())
+    ]:
         return (f'{title}' + _format_context_details(
             cloud_name, show_details=False, ctx2text=ctx2text))
     return _green_color(cloud_and_capabilities)
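
Besides the Slurm additions, the `_format_context_details` hunk above carries a behavioral fix: `str.lstrip("ssh-")` treats its argument as a character set rather than a prefix, so any leading 's', 'h', or '-' was stripped and pool names beginning with those letters were mangled. The replacement removes only the literal prefix (assuming `common_utils.removeprefix` mirrors Python 3.9's `str.removeprefix`):

# lstrip strips a *character set*; removeprefix strips an exact prefix.
context = 'ssh-shared-pool'  # hypothetical node pool context name
print(context.lstrip('ssh-'))        # 'ared-pool': leading 's', 'h', '-' all eaten
print(context.removeprefix('ssh-'))  # 'shared-pool' (str.removeprefix, Python 3.9+)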