skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (136)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +478 -0
  4. sky/backends/backend_utils.py +45 -4
  5. sky/backends/cloud_vm_ray_backend.py +32 -33
  6. sky/backends/task_codegen.py +340 -2
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/kubernetes_catalog.py +12 -4
  9. sky/catalog/slurm_catalog.py +243 -0
  10. sky/check.py +14 -3
  11. sky/client/cli/command.py +329 -22
  12. sky/client/sdk.py +56 -2
  13. sky/clouds/__init__.py +2 -0
  14. sky/clouds/cloud.py +7 -0
  15. sky/clouds/slurm.py +578 -0
  16. sky/clouds/ssh.py +2 -1
  17. sky/clouds/vast.py +10 -0
  18. sky/core.py +128 -36
  19. sky/dashboard/out/404.html +1 -1
  20. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
  35. sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
  38. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
  42. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  43. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  44. sky/dashboard/out/clusters/[cluster].html +1 -1
  45. sky/dashboard/out/clusters.html +1 -1
  46. sky/dashboard/out/config.html +1 -1
  47. sky/dashboard/out/index.html +1 -1
  48. sky/dashboard/out/infra/[context].html +1 -1
  49. sky/dashboard/out/infra.html +1 -1
  50. sky/dashboard/out/jobs/[job].html +1 -1
  51. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  52. sky/dashboard/out/jobs.html +1 -1
  53. sky/dashboard/out/plugins/[...slug].html +1 -0
  54. sky/dashboard/out/users.html +1 -1
  55. sky/dashboard/out/volumes.html +1 -1
  56. sky/dashboard/out/workspace/new.html +1 -1
  57. sky/dashboard/out/workspaces/[name].html +1 -1
  58. sky/dashboard/out/workspaces.html +1 -1
  59. sky/data/mounting_utils.py +16 -2
  60. sky/global_user_state.py +3 -3
  61. sky/models.py +2 -0
  62. sky/optimizer.py +6 -5
  63. sky/provision/__init__.py +1 -0
  64. sky/provision/common.py +20 -0
  65. sky/provision/docker_utils.py +15 -2
  66. sky/provision/kubernetes/utils.py +42 -6
  67. sky/provision/provisioner.py +15 -6
  68. sky/provision/slurm/__init__.py +12 -0
  69. sky/provision/slurm/config.py +13 -0
  70. sky/provision/slurm/instance.py +572 -0
  71. sky/provision/slurm/utils.py +583 -0
  72. sky/provision/vast/instance.py +4 -1
  73. sky/provision/vast/utils.py +10 -6
  74. sky/serve/server/impl.py +1 -1
  75. sky/server/constants.py +1 -1
  76. sky/server/plugins.py +222 -0
  77. sky/server/requests/executor.py +5 -2
  78. sky/server/requests/payloads.py +12 -1
  79. sky/server/requests/request_names.py +2 -0
  80. sky/server/requests/requests.py +5 -1
  81. sky/server/requests/serializers/encoders.py +17 -0
  82. sky/server/requests/serializers/return_value_serializers.py +60 -0
  83. sky/server/server.py +78 -8
  84. sky/server/server_utils.py +30 -0
  85. sky/setup_files/dependencies.py +2 -0
  86. sky/skylet/attempt_skylet.py +13 -3
  87. sky/skylet/constants.py +34 -9
  88. sky/skylet/events.py +10 -4
  89. sky/skylet/executor/__init__.py +1 -0
  90. sky/skylet/executor/slurm.py +189 -0
  91. sky/skylet/job_lib.py +2 -1
  92. sky/skylet/log_lib.py +22 -6
  93. sky/skylet/log_lib.pyi +8 -6
  94. sky/skylet/skylet.py +5 -1
  95. sky/skylet/subprocess_daemon.py +2 -1
  96. sky/ssh_node_pools/constants.py +12 -0
  97. sky/ssh_node_pools/core.py +40 -3
  98. sky/ssh_node_pools/deploy/__init__.py +4 -0
  99. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  100. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  101. sky/ssh_node_pools/deploy/utils.py +173 -0
  102. sky/ssh_node_pools/server.py +11 -13
  103. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  104. sky/templates/kubernetes-ray.yml.j2 +8 -0
  105. sky/templates/slurm-ray.yml.j2 +85 -0
  106. sky/templates/vast-ray.yml.j2 +1 -0
  107. sky/users/model.conf +1 -1
  108. sky/users/permission.py +24 -1
  109. sky/users/rbac.py +31 -3
  110. sky/utils/annotations.py +108 -8
  111. sky/utils/command_runner.py +197 -5
  112. sky/utils/command_runner.pyi +27 -4
  113. sky/utils/common_utils.py +18 -3
  114. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  115. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  116. sky/utils/schemas.py +31 -0
  117. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
  118. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
  119. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  121. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  126. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  127. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  128. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  129. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  130. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  131. /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
  132. /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
  133. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  134. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  135. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
  136. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/catalog/slurm_catalog.py ADDED
@@ -0,0 +1,243 @@
+"""Slurm Catalog."""
+
+import collections
+import re
+from typing import Dict, List, Optional, Set, Tuple
+
+from sky import check as sky_check
+from sky import clouds as sky_clouds
+from sky import sky_logging
+from sky.catalog import common
+from sky.clouds import cloud
+from sky.provision.slurm import utils as slurm_utils
+from sky.utils import resources_utils
+
+logger = sky_logging.init_logger(__name__)
+
+_DEFAULT_NUM_VCPUS = 2
+_DEFAULT_MEMORY_CPU_RATIO = 1
+
+
+def instance_type_exists(instance_type: str) -> bool:
+    """Check if the given instance type is valid for Slurm."""
+    return slurm_utils.SlurmInstanceType.is_valid_instance_type(instance_type)
+
+
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
+    # Delete unused parameters.
+    del disk_tier, region, zone
+
+    # Slurm provisions resources via --cpus-per-task and --mem.
+    instance_cpus = float(
+        cpus.strip('+')) if cpus is not None else _DEFAULT_NUM_VCPUS
+    if memory is not None:
+        if memory.endswith('+'):
+            instance_mem = float(memory[:-1])
+        elif memory.endswith('x'):
+            instance_mem = float(memory[:-1]) * instance_cpus
+        else:
+            instance_mem = float(memory)
+    else:
+        instance_mem = instance_cpus * _DEFAULT_MEMORY_CPU_RATIO
+    virtual_instance_type = slurm_utils.SlurmInstanceType(
+        instance_cpus, instance_mem).name
+    return virtual_instance_type
+
+
+def list_accelerators(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """List accelerators in Slurm clusters.
+
+    Returns a dictionary mapping GPU type to a list of InstanceTypeInfo objects.
+    """
+    return list_accelerators_realtime(gpus_only, name_filter, region_filter,
+                                      quantity_filter, case_sensitive,
+                                      all_regions, require_price)[0]
+
+
+def list_accelerators_realtime(
+    gpus_only: bool = True,
+    name_filter: Optional[str] = None,
+    region_filter: Optional[str] = None,
+    quantity_filter: Optional[int] = None,
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = False,
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """Fetches real-time accelerator information from the Slurm cluster.
+
+    Uses the `get_slurm_node_info_list` helper function.
+
+    Args:
+        gpus_only: If True, only return GPU accelerators.
+        name_filter: Regex filter for accelerator names (e.g., 'V100', 'gpu').
+        region_filter: Optional filter for Slurm partitions.
+        quantity_filter: Minimum number of accelerators required per node.
+        case_sensitive: Whether name_filter is case-sensitive.
+        all_regions: Unused in Slurm context.
+        require_price: Unused in Slurm context.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Maps GPU type to set of InstanceTypeInfo objects for unique
+          counts found per node.
+        - total_capacity: Maps GPU type to total count across all nodes.
+        - total_available: Maps GPU type to total free count across all nodes.
+    """
+    del gpus_only, all_regions, require_price
+
+    enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        cloud.CloudCapability.COMPUTE)
+    if not sky_clouds.cloud_in_iterable(sky_clouds.Slurm(), enabled_clouds):
+        return {}, {}, {}
+
+    if region_filter is None:
+        # Get the first available cluster as default
+        all_clusters = slurm_utils.get_all_slurm_cluster_names()
+        if not all_clusters:
+            return {}, {}, {}
+        slurm_cluster = all_clusters[0]
+    else:
+        slurm_cluster = region_filter
+
+    partition_filter = slurm_utils.get_cluster_default_partition(slurm_cluster)
+
+    # Call the helper function to get node info
+    slurm_nodes_info = slurm_utils.slurm_node_info(
+        slurm_cluster_name=slurm_cluster)
+
+    if not slurm_nodes_info:
+        # Customize error message based on filters
+        err_msg = 'No matching GPU nodes found in the Slurm cluster'
+        filters_applied = []
+        if name_filter:
+            filters_applied.append(f'gpu_name={name_filter!r}')
+        if quantity_filter:
+            filters_applied.append(f'quantity>={quantity_filter}')
+        if region_filter:
+            filters_applied.append(f'cluster={region_filter!r}')
+        if filters_applied:
+            err_msg += f' with filters ({", ".join(filters_applied)})'
+        err_msg += '.'
+        logger.error(
+            err_msg)  # Log as error as it indicates no usable resources found
+        raise ValueError(err_msg)
+
+    # Aggregate results into the required format
+    qtys_map: Dict[str,
+                   Set[common.InstanceTypeInfo]] = collections.defaultdict(set)
+    total_capacity: Dict[str, int] = collections.defaultdict(int)
+    total_available: Dict[str, int] = collections.defaultdict(int)
+
+    for node_info in slurm_nodes_info:
+        gpu_type = node_info['gpu_type']
+        node_total_gpus = node_info['total_gpus']
+        node_free_gpus = node_info['free_gpus']
+        partition = node_info['partition']
+
+        # Apply name filter to the determined GPU type
+        regex_flags = 0 if case_sensitive else re.IGNORECASE
+        if name_filter and not re.match(
+                name_filter, gpu_type, flags=regex_flags):
+            continue
+
+        # Apply quantity filter (total GPUs on node must meet this)
+        if quantity_filter and node_total_gpus < quantity_filter:
+            continue
+
+        # Apply partition filter if specified
+        # TODO(zhwu): when a node is in multiple partitions, the partition
+        # mapping from node to partition does not work.
+        # if partition_filter and partition != partition_filter:
+        #     continue
+
+        # Create InstanceTypeInfo objects for various GPU counts
+        # Similar to Kubernetes, generate powers of 2 up to node_total_gpus
+        if node_total_gpus > 0:
+            count = 1
+            while count <= node_total_gpus:
+                instance_info = common.InstanceTypeInfo(
+                    instance_type=None,  # Slurm doesn't have instance types
+                    accelerator_name=gpu_type,
+                    accelerator_count=count,
+                    cpu_count=node_info['vcpu_count'],
+                    memory=node_info['memory_gb'],
+                    price=0.0,  # Slurm doesn't have price info
+                    region=partition,  # Use partition as region
+                    cloud='slurm',  # Specify cloud as 'slurm'
+                    device_memory=0.0,  # No GPU memory info from Slurm
+                    spot_price=0.0,  # Slurm doesn't have spot pricing
+                )
+                qtys_map[gpu_type].add(instance_info)
+                count *= 2
+
+            # Add the actual total if it's not already included
+            # (e.g., if node has 12 GPUs, include counts 1, 2, 4, 8, 12)
+            if count // 2 != node_total_gpus:
+                instance_info = common.InstanceTypeInfo(
+                    instance_type=None,
+                    accelerator_name=gpu_type,
+                    accelerator_count=node_total_gpus,
+                    cpu_count=node_info['vcpu_count'],
+                    memory=node_info['memory_gb'],
+                    price=0.0,
+                    region=partition,
+                    cloud='slurm',
+                    device_memory=0.0,
+                    spot_price=0.0,
+                )
+                qtys_map[gpu_type].add(instance_info)
+
+        # Map of GPU type -> total count across all matched nodes
+        total_capacity[gpu_type] += node_total_gpus
+
+        # Map of GPU type -> total *free* count across all matched nodes
+        total_available[gpu_type] += node_free_gpus
+
+    # Check if any GPUs were found after applying filters
+    if not total_capacity:
+        err_msg = 'No matching GPU nodes found in the Slurm cluster'
+        filters_applied = []
+        if name_filter:
+            filters_applied.append(f'gpu_name={name_filter!r}')
+        if quantity_filter:
+            filters_applied.append(f'quantity>={quantity_filter}')
+        if partition_filter:
+            filters_applied.append(f'partition={partition_filter!r}')
+        if filters_applied:
+            err_msg += f' with filters ({", ".join(filters_applied)})'
+        err_msg += '.'
+        logger.error(err_msg)
+        raise ValueError(err_msg)
+
+    # Convert sets of InstanceTypeInfo to sorted lists
+    final_qtys_map = {
+        gpu: sorted(list(instances), key=lambda x: x.accelerator_count)
+        for gpu, instances in qtys_map.items()
+    }
+
+    logger.debug(f'Aggregated Slurm GPU Info: '
+                 f'qtys={final_qtys_map}, '
+                 f'capacity={dict(total_capacity)}, '
+                 f'available={dict(total_available)}')
+
+    return final_qtys_map, dict(total_capacity), dict(total_available)
+
+
+def validate_region_zone(
+    region_name: Optional[str],
+    zone_name: Optional[str],
+) -> Tuple[Optional[str], Optional[str]]:
+    return (region_name, zone_name)
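
Note: the aggregation loop in the new catalog module above advertises each GPU type at power-of-two counts up to a node's total, then appends the exact total when it is not itself a power of two (similar to the Kubernetes catalog). A minimal standalone sketch of just that enumeration follows; the helper name is illustrative and not part of the package.

```python
# Illustrative sketch of the count enumeration in list_accelerators_realtime();
# `candidate_gpu_counts` is a made-up name, not a function in the package.
from typing import List


def candidate_gpu_counts(node_total_gpus: int) -> List[int]:
    """Powers of two up to the node total, plus the total itself."""
    counts: List[int] = []
    count = 1
    while count <= node_total_gpus:
        counts.append(count)
        count *= 2
    # Mirrors the `count // 2 != node_total_gpus` check in the diff above.
    if counts and counts[-1] != node_total_gpus:
        counts.append(node_total_gpus)
    return counts


print(candidate_gpu_counts(12))  # [1, 2, 4, 8, 12]
print(candidate_gpu_counts(8))   # [1, 2, 4, 8]
print(candidate_gpu_counts(0))   # []
```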
sky/check.py CHANGED
@@ -586,6 +586,9 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
     if isinstance(cloud_type, sky_clouds.SSH):
         # Get the cluster names by reading from the node pools file
         contexts = sky_clouds.SSH.get_ssh_node_pool_contexts()
+    elif isinstance(cloud_type, sky_clouds.Slurm):
+        # Get the cluster names from SLURM config
+        contexts = sky_clouds.Slurm.existing_allowed_clusters()
     else:
         assert isinstance(cloud_type, sky_clouds.Kubernetes)
         contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
@@ -657,8 +660,12 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
                 'to set up.'))
         contexts_formatted.append(
             f'\n {symbol}{cleaned_context}{text_suffix}')
-    identity_str = ('SSH Node Pools' if isinstance(cloud_type, sky_clouds.SSH)
-                    else 'Allowed contexts')
+    if isinstance(cloud_type, sky_clouds.SSH):
+        identity_str = 'SSH Node Pools'
+    elif isinstance(cloud_type, sky_clouds.Slurm):
+        identity_str = 'Allowed clusters'
+    else:
+        identity_str = 'Allowed contexts'
     return f'\n {identity_str}:{"".join(contexts_formatted)}'
 
 
@@ -677,7 +684,11 @@ def _format_enabled_cloud(cloud_name: str,
     cloud_and_capabilities = f'{cloud_name} [{", ".join(capabilities)}]'
     title = _green_color(cloud_and_capabilities)
 
-    if cloud_name in [repr(sky_clouds.Kubernetes()), repr(sky_clouds.SSH())]:
+    if cloud_name in [
+            repr(sky_clouds.Kubernetes()),
+            repr(sky_clouds.SSH()),
+            repr(sky_clouds.Slurm())
+    ]:
         return (f'{title}' + _format_context_details(
             cloud_name, show_details=False, ctx2text=ctx2text))
     return _green_color(cloud_and_capabilities)
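
Note: the sky/check.py hunks route Slurm through the same context-listing path as Kubernetes and SSH, labelling its entries "Allowed clusters" in `sky check` output. The sketch below roughly assembles the strings these hunks build; the cluster names, status symbol, and exact spacing are hypothetical, and ANSI coloring from `_green_color` is omitted.

```python
# Rough illustration of the `sky check` entry built for an enabled Slurm cloud.
cloud_name = 'Slurm'
capabilities = ['compute']
allowed_clusters = ['hpc-cluster-a', 'hpc-cluster-b']  # hypothetical clusters

title = f'{cloud_name} [{", ".join(capabilities)}]'
identity_str = 'Allowed clusters'  # the new Slurm branch in the diff above
contexts_formatted = [f'\n    \u2713 {name}' for name in allowed_clusters]
print(title + f'\n  {identity_str}:' + ''.join(contexts_formatted))
# Slurm [compute]
#   Allowed clusters:
#     ✓ hpc-cluster-a
#     ✓ hpc-cluster-b
```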