skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py
@@ -144,6 +144,7 @@ DEFAULT_NAMESPACE = 'default'
 DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'
 
 MEMORY_SIZE_UNITS = {
+    'm': 0.001,
     'B': 1,
     'K': 2**10,
     'M': 2**20,
@@ -1205,15 +1206,24 @@ class V1NodeAddress:
     address: str
 
 
+@dataclasses.dataclass
+class V1NodeCondition:
+    """Represents a Kubernetes node condition."""
+    type: str
+    status: str
+
+
 @dataclasses.dataclass
 class V1NodeStatus:
     allocatable: Dict[str, str]
     capacity: Dict[str, str]
     addresses: List[V1NodeAddress]
+    conditions: List[V1NodeCondition]
 
 
 @dataclasses.dataclass
 class V1Node:
+    """Represents a Kubernetes node."""
     metadata: V1ObjectMeta
     status: V1NodeStatus
 
@@ -1231,8 +1241,24 @@ class V1Node:
                     V1NodeAddress(type=addr['type'],
                                   address=addr['address'])
                     for addr in data['status'].get('addresses', [])
+                ],
+                conditions=[
+                    V1NodeCondition(type=cond['type'],
+                                    status=cond['status'])
+                    for cond in data['status'].get('conditions', [])
                 ]))
 
+    def is_ready(self) -> bool:
+        """Check if the node is ready based on its conditions.
+
+        A node is considered ready if it has a 'Ready' condition with
+        status 'True'.
+        """
+        for condition in self.status.conditions:
+            if condition.type == 'Ready':
+                return condition.status == 'True'
+        return False
+
 
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
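
The new is_ready() helper mirrors how kubectl reports node readiness: scan status.conditions for a 'Ready' entry and treat the node as ready only when its status string is 'True'. A minimal self-contained sketch of the same check, using a hypothetical Condition dataclass rather than the package's V1NodeCondition:

from dataclasses import dataclass
from typing import List


@dataclass
class Condition:
    type: str
    status: str


def node_is_ready(conditions: List[Condition]) -> bool:
    # Ready iff a 'Ready' condition exists and its status is the string 'True'.
    for cond in conditions:
        if cond.type == 'Ready':
            return cond.status == 'True'
    return False


# A node reporting memory pressure can still be Ready and schedulable.
conditions = [Condition('MemoryPressure', 'False'), Condition('Ready', 'True')]
assert node_is_ready(conditions)
assert not node_is_ready([Condition('Ready', 'Unknown')])
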
@@ -1306,12 +1332,20 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def get_allocated_gpu_qty_by_node(
+def get_allocated_resources_by_node(
     *,
     context: Optional[str] = None,
-) -> Dict[str, int]:
-    """Gets allocated GPU quantity by each node by fetching pods in
+) -> Tuple[Dict[str, int], Dict[str, Tuple[float, float]]]:
+    """Gets allocated GPU, CPU, and memory by each node by fetching pods in
     all namespaces in kubernetes cluster indicated by context.
+
+    This function combines GPU and CPU/memory allocation tracking into a single
+    API call for better performance.
+
+    Returns:
+        Tuple of (allocated_gpu_qty_by_node, allocated_cpu_memory_by_node):
+        - allocated_gpu_qty_by_node: Dict mapping node name to allocated GPU count
+        - allocated_cpu_memory_by_node: Dict mapping node name to (allocated_cpu, allocated_memory_gb) tuple
     """
     if context is None:
         context = get_current_kube_config_context_name()
@@ -1330,29 +1364,67 @@ def get_allocated_gpu_qty_by_node(
                                          field_selector=field_selector)
     try:
         allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        allocated_cpu_memory_by_node: Dict[str, Tuple[
+            float, float]] = collections.defaultdict(lambda: (0.0, 0.0))
         for item_dict in ijson.items(response,
                                      'items.item',
                                      buf_size=IJSON_BUFFER_SIZE):
             pod = V1Pod.from_dict(item_dict)
             if should_exclude_pod_from_gpu_allocation(pod):
                 logger.debug(
-                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'Excluding pod {pod.metadata.name} from resource count '
                     f'calculations on node {pod.spec.node_name}')
                 continue
-            # Iterate over all the containers in the pod and sum the
-            # GPU requests
+            if not pod.spec.node_name:
+                continue
+
+            # Iterate over all the containers in the pod and sum the resources
             pod_allocated_qty = 0
+            pod_allocated_cpu = 0.0
+            pod_allocated_memory_gb = 0.0
             for container in pod.spec.containers:
                 if container.resources.requests:
+                    requests = container.resources.requests
+                    # Parse GPU
                     pod_allocated_qty += get_node_accelerator_count(
-                        context, container.resources.requests)
-            if pod_allocated_qty > 0 and pod.spec.node_name:
+                        context, requests)
+                    # Parse CPU
+                    if 'cpu' in requests:
+                        pod_allocated_cpu += parse_cpu_or_gpu_resource_to_float(
+                            requests['cpu'])
+                    # Parse memory
+                    if 'memory' in requests:
+                        pod_allocated_memory_gb += parse_memory_resource(
+                            requests['memory'], unit='G')
+
+            if pod_allocated_qty > 0:
                 allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
-        return allocated_qty_by_node
+            if pod_allocated_cpu > 0 or pod_allocated_memory_gb > 0:
+                current_cpu, current_memory = allocated_cpu_memory_by_node[
+                    pod.spec.node_name]
+                allocated_cpu_memory_by_node[pod.spec.node_name] = (
+                    current_cpu + pod_allocated_cpu,
+                    current_memory + pod_allocated_memory_gb)
+        return allocated_qty_by_node, allocated_cpu_memory_by_node
     finally:
         response.release_conn()
 
 
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
+
+    Note: For better performance when you also need CPU/memory allocation,
+    use get_allocated_resources_by_node() instead.
+    """
+    allocated_qty_by_node, _ = get_allocated_resources_by_node(context=context)
+    return allocated_qty_by_node
+
+
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.
@@ -1451,11 +1523,12 @@ def check_instance_fits(context: Optional[str],
         return False, str(e)
     # Get the set of nodes that have the GPU type
     gpu_nodes = [
-        node for node in nodes if gpu_label_key in node.metadata.labels and
+        node for node in nodes
+        if node.is_ready() and gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] in gpu_label_values
     ]
     if not gpu_nodes:
-        return False, f'No GPU nodes found with {acc_type} on the cluster'
+        return False, f'No ready GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -1479,7 +1552,9 @@ def check_instance_fits(context: Optional[str],
             f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
             f'memory (> {k8s_instance_type.memory} G). ')
     else:
-        candidate_nodes = nodes
+        candidate_nodes = [node for node in nodes if node.is_ready()]
+        if not candidate_nodes:
+            return False, 'No ready nodes found in the cluster.'
         not_fit_reason_prefix = (f'No nodes found with enough '
                                  f'CPU (> {k8s_instance_type.cpus} CPUs) '
                                  'and/or memory '
@@ -2161,6 +2236,13 @@ def get_current_kube_config_context_name() -> Optional[str]:
         _, current_context = kubernetes.list_kube_config_contexts()
         return current_context['name']
     except k8s.config.config_exception.ConfigException:
+        # If kubeconfig is not available, check if running in-cluster and
+        # return the in-cluster context name. This is needed when kubeconfig
+        # is not uploaded to the pod (e.g., remote_identity: SERVICE_ACCOUNT)
+        # but we still need to know the context name for operations like
+        # port mode detection.
+        if is_incluster_config_available():
+            return kubernetes.in_cluster_context_name()
         return None
 
 
@@ -2285,7 +2367,7 @@ def parse_memory_resource(resource_qty_str: str,
     try:
         bytes_value = int(resource_str)
     except ValueError:
-        memory_size = re.sub(r'([KMGTPB]+)', r' \1', resource_str)
+        memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
         number, unit_index = [item.strip() for item in memory_size.split()]
         unit_index = unit_index[0]
         bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]
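
Adding 'm' to both the unit table and the splitting regex lets the parser accept Kubernetes "milli" quantities, such as memory values like 128974848000m that some API responses report. A worked sketch of the same parsing steps with an abbreviated, assumed unit table:

import re

# Abbreviated version of MEMORY_SIZE_UNITS from the diff: 'm' is 1/1000 byte.
MEMORY_SIZE_UNITS = {'m': 0.001, 'B': 1, 'K': 2**10, 'M': 2**20, 'G': 2**30}


def to_bytes(quantity: str) -> float:
    # '512M' -> '512 M'; '128974848000m' -> '128974848000 m'
    spaced = re.sub(r'([KMGTPBm]+)', r' \1', quantity)
    number, unit = [item.strip() for item in spaced.split()]
    return float(number) * MEMORY_SIZE_UNITS[unit[0]]


assert to_bytes('512M') == 512 * 2**20
print(to_bytes('128974848000m'))  # ~128974848.0 bytes, i.e. 123 MiB
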
@@ -3033,16 +3115,32 @@ def get_kubernetes_node_info(
             has_accelerator_nodes = True
             break
 
-    # Get the allocated GPU quantity by each node
+    # Get the allocated resources (GPU, CPU, memory) by each node in a single call
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-    error_on_get_allocated_gpu_qty_by_node = False
+    allocated_cpu_memory_by_node: Dict[str, Tuple[float, float]] = {}
+    error_on_get_allocated_resources = False
+    # Get resource allocation. For GPU allocation, only call if there are GPU nodes
+    # (same as master branch). For CPU/memory, we always need it for all nodes.
     if has_accelerator_nodes:
+        # When there are GPU nodes, get both GPU and CPU/memory in one call
        try:
-            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+            allocated_qty_by_node, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
                 context=context)
        except kubernetes.api_exception() as e:
            if e.status == 403:
-                error_on_get_allocated_gpu_qty_by_node = True
+                error_on_get_allocated_resources = True
+                pass
+            else:
+                raise
+    else:
+        # When there are no GPU nodes, we still need CPU/memory allocation
+        # This is an extra API call compared to master branch
+        try:
+            _, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_resources = True
                pass
            else:
                raise
@@ -3078,16 +3176,56 @@ def get_kubernetes_node_info(
 
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+
+        # Parse CPU and memory from node capacity
+        cpu_count = None
+        memory_gb = None
+        try:
+            if 'cpu' in node.status.capacity:
+                cpu_count = float(
+                    parse_cpu_or_gpu_resource(node.status.capacity['cpu']))
+            if 'memory' in node.status.capacity:
+                memory_gb = parse_memory_resource(
+                    node.status.capacity['memory'], unit='G')
+        except (KeyError, ValueError) as e:
+            # If parsing fails, log but continue
+            logger.debug(f'Failed to parse CPU/memory for node '
+                         f'{node.metadata.name}: {e}')
+
+        # Calculate free CPU and memory
+        cpu_free = None
+        memory_free_gb = None
+        if cpu_count is not None or memory_gb is not None:
+            if not error_on_get_allocated_resources:
+                allocated_cpu, allocated_memory = allocated_cpu_memory_by_node.get(
+                    node.metadata.name, (0.0, 0.0))
+                if cpu_count is not None:
+                    cpu_free = max(0.0, cpu_count - allocated_cpu)
+                if memory_gb is not None:
+                    memory_free_gb = max(0.0, memory_gb - allocated_memory)
+            # If we can't get allocation info, set free to None (unknown)
+
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
         if accelerator_count == 0:
             node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
                 name=node.metadata.name,
                 accelerator_type=accelerator_name,
                 total={'accelerator_count': 0},
                 free={'accelerators_available': 0},
-                ip_address=node_ip)
+                ip_address=node_ip,
+                cpu_count=cpu_count,
+                memory_gb=memory_gb,
+                cpu_free=cpu_free,
+                memory_free_gb=memory_free_gb,
+                is_ready=node_is_ready)
             continue
 
-        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
+        if not node_is_ready:
+            # If node is not ready, report 0 available GPUs
+            accelerators_available = 0
+        elif not has_accelerator_nodes or error_on_get_allocated_resources:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]
@@ -3105,7 +3243,12 @@
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
-            ip_address=node_ip)
+            ip_address=node_ip,
+            cpu_count=cpu_count,
+            memory_gb=memory_gb,
+            cpu_free=cpu_free,
+            memory_free_gb=memory_free_gb,
+            is_ready=node_is_ready)
         hint = ''
         if has_multi_host_tpu:
             hint = ('(Note: Multi-host TPUs are detected and excluded from the '
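
The free CPU/memory figures reported per node are simply capacity minus the summed pod requests, floored at zero, and left as None when the allocation listing failed (for example on a 403). A simplified sketch of that calculation with hypothetical numbers:

from typing import Optional, Tuple


def free_resources(
    capacity_cpu: Optional[float],
    capacity_memory_gb: Optional[float],
    allocated: Tuple[float, float],
    allocation_known: bool,
) -> Tuple[Optional[float], Optional[float]]:
    """Free = capacity - allocated, floored at 0; unknown stays None."""
    if not allocation_known:
        return None, None
    allocated_cpu, allocated_memory = allocated
    cpu_free = (None if capacity_cpu is None else
                max(0.0, capacity_cpu - allocated_cpu))
    memory_free = (None if capacity_memory_gb is None else
                   max(0.0, capacity_memory_gb - allocated_memory))
    return cpu_free, memory_free


# A 16-CPU / 64 GB node with 12.5 CPUs and 48 GB already requested by pods:
assert free_resources(16.0, 64.0, (12.5, 48.0), True) == (3.5, 16.0)
# Allocation unknown (e.g. the pod listing returned 403):
assert free_resources(16.0, 64.0, (0.0, 0.0), False) == (None, None)
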
sky/provision/kubernetes/volume.py
@@ -45,7 +45,9 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
             continue
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue
@@ -65,7 +67,8 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     if storage_class_name is not None:
         try:
             kubernetes.storage_api(context).read_storage_class(
-                name=storage_class_name)
+                name=storage_class_name,
+                _request_timeout=kubernetes.API_TIMEOUT)
         except kubernetes.api_exception() as e:
             raise config_lib.KubernetesError(
                 f'Check storage class {storage_class_name} error: {e}')
@@ -82,7 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             context).delete_namespaced_persistent_volume_claim(
                 name=pvc_name,
                 namespace=namespace,
-                _request_timeout=config_lib.DELETION_TIMEOUT),
+                _request_timeout=kubernetes.API_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
     logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
@@ -119,7 +122,9 @@ def _get_volume_usedby(
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     pods = kubernetes.core_api(context).list_namespaced_pod(
-        namespace=namespace, field_selector=field_selector)
+        namespace=namespace,
+        field_selector=field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT)
     for pod in pods.items:
         if pod.spec.volumes is None:
             continue
@@ -164,8 +169,21 @@ def get_volume_usedby(
 
 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of context to namespace to volume name to pods
+                     using the volume. These may include pods not created by
+                     SkyPilot.
+        usedby_clusters: Dictionary of context to namespace to volume name to
+                         clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+                             fetch.
+    """
     field_selector = ','.join([
         f'status.phase!={phase}'
         for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
@@ -173,26 +191,39 @@ def get_all_volumes_usedby(
     label_selector = 'parent=skypilot'
     context_to_namespaces: Dict[str, Set[str]] = {}
     pvc_names = set()
+    original_volume_names: Dict[str, Dict[str, List[str]]] = {}
     for config in configs:
         context, namespace = _get_context_namespace(config)
-        if context not in context_to_namespaces:
-            context_to_namespaces[context] = set()
-        context_to_namespaces[context].add(namespace)
+        context_to_namespaces.setdefault(context, set()).add(namespace)
+        original_volume_names.setdefault(context,
                                         {}).setdefault(namespace,
                                                        []).append(config.name)
         pvc_names.add(config.name_on_cloud)
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
     used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    failed_volume_names: Set[str] = set()
     for context, namespaces in context_to_namespaces.items():
         used_by_pods[context] = {}
         used_by_clusters[context] = {}
         for namespace in namespaces:
             used_by_pods[context][namespace] = {}
             used_by_clusters[context][namespace] = {}
-            pods = kubernetes.core_api(context).list_namespaced_pod(
-                namespace=namespace,
-                field_selector=field_selector,
-                label_selector=label_selector)
+            try:
+                pods = kubernetes.core_api(context).list_namespaced_pod(
+                    namespace=namespace,
+                    field_selector=field_selector,
+                    label_selector=label_selector,
+                    _request_timeout=kubernetes.API_TIMEOUT)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get pods in namespace {namespace} '
+                             f'in context {context}: {e}')
+                # Mark all volumes in this namespace as failed
+                for original_volume_name in original_volume_names[context][
+                        namespace]:
+                    failed_volume_names.add(original_volume_name)
+                continue
             for pod in pods.items:
                 if pod.spec.volumes is None:
                     continue
@@ -217,7 +248,7 @@
                 used_by_clusters[context][namespace][cluster_name] = []
             used_by_clusters[context][namespace][cluster_name].append(
                 cluster_name)
-    return used_by_pods, used_by_clusters
+    return used_by_pods, used_by_clusters, failed_volume_names
 
 
 def map_all_volumes_usedby(
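
The three return values described in the docstring above nest as context, then namespace, then volume name, mapping to a list of pod (or cluster) names, plus a flat set of volume names whose lookup failed. An illustrative sketch of consuming that shape (all names below are hypothetical):

from typing import Dict, List, Set

used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {
    'my-context': {'default': {'data-vol': ['train-pod-0', 'train-pod-1']}},
}
used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {
    'my-context': {'default': {'data-vol': ['my-cluster']}},
}
failed_volume_names: Set[str] = {'scratch-vol'}  # pod listing failed here

for context, namespaces in used_by_pods.items():
    for namespace, volumes in namespaces.items():
        for volume, pods in volumes.items():
            print(f'{context}/{namespace}/{volume}: used by pods {pods}')
if failed_volume_names:
    print(f'Usage unknown for: {sorted(failed_volume_names)}')
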
@@ -292,7 +323,9 @@ def create_persistent_volume_claim(
     try:
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         if config is not None:
             _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')
@@ -305,8 +338,10 @@
             raise ValueError(
                 f'PVC {pvc_name} does not exist while use_existing is True.')
     pvc = kubernetes.core_api(
-        context).create_namespaced_persistent_volume_claim(namespace=namespace,
-                                                           body=pvc_spec)
+        context).create_namespaced_persistent_volume_claim(
+            namespace=namespace,
+            body=pvc_spec,
+            _request_timeout=kubernetes.API_TIMEOUT)
     logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
     if config is not None:
         _populate_config_from_pvc(config, pvc)
sky/provision/provisioner.py
@@ -157,9 +157,9 @@ def bulk_provision(
         logger.debug(f'SkyPilot version: {sky.__version__}; '
                      f'commit: {sky.__commit__}')
         logger.debug(_TITLE.format('Provisioning'))
-        logger.debug(
-            'Provision config:\n'
-            f'{json.dumps(dataclasses.asdict(bootstrap_config), indent=2)}')
+        redacted_config = bootstrap_config.get_redacted_config()
+        logger.debug('Provision config:\n'
+                     f'{json.dumps(redacted_config, indent=2)}')
         return _bulk_provision(cloud, region, cluster_name,
                                bootstrap_config)
     except exceptions.NoClusterLaunchedError:
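
The provision config is now redacted before being debug-logged. get_redacted_config() itself is not shown in this diff; the sketch below is only a generic illustration of the idea (recursively masking keys that look sensitive), with hypothetical key and class names:

import dataclasses
import json
from typing import Any, Dict

SENSITIVE_KEYS = {'authentication_config', 'credentials', 'token'}  # assumed


def redact(obj: Any) -> Any:
    """Recursively replace values stored under sensitive-looking keys."""
    if isinstance(obj, dict):
        return {
            key: '<redacted>' if key in SENSITIVE_KEYS else redact(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [redact(value) for value in obj]
    return obj


@dataclasses.dataclass
class FakeProvisionConfig:  # stand-in for the real bootstrap config
    region: str
    credentials: Dict[str, str]


config = FakeProvisionConfig('us-east-1', {'token': 'abc123'})
print(json.dumps(redact(dataclasses.asdict(config)), indent=2))
# {"region": "us-east-1", "credentials": "<redacted>"}
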
@@ -493,7 +493,8 @@ def _post_provision_setup(
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
     is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
-    if not is_k8s_cloud:
+    is_slurm_cloud = cloud_name.lower() == 'slurm'
+    if not is_k8s_cloud and not is_slurm_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)
@@ -635,10 +636,15 @@ def _post_provision_setup(
         status.update(
             runtime_preparation_str.format(step=3, step_name='runtime'))
 
+        skip_ray_setup = False
         ray_port = constants.SKY_REMOTE_RAY_PORT
         head_ray_needs_restart = True
         ray_cluster_healthy = False
-        if (not provision_record.is_instance_just_booted(
+        if (launched_resources.cloud is not None and
+                not launched_resources.cloud.uses_ray()):
+            skip_ray_setup = True
+            logger.debug('Skip Ray cluster setup as cloud does not use Ray.')
+        elif (not provision_record.is_instance_just_booted(
                 head_instance.instance_id)):
             # Check if head node Ray is alive
             (ray_port, ray_cluster_healthy,
@@ -663,7 +669,9 @@
                             'async setup to complete...')
                 time.sleep(1)
 
-        if head_ray_needs_restart:
+        if skip_ray_setup:
+            logger.debug('Skip Ray cluster setup on the head node.')
+        elif head_ray_needs_restart:
             logger.debug('Starting Ray on the entire cluster.')
             instance_setup.start_ray_on_head_node(
                 cluster_name.name_on_cloud,
@@ -686,7 +694,9 @@
         # We don't need to restart ray on worker nodes if the ray cluster is
         # already healthy, i.e. the head node has expected number of nodes
         # connected to the ray cluster.
-        if cluster_info.num_instances > 1 and not ray_cluster_healthy:
+        if skip_ray_setup:
+            logger.debug('Skip Ray cluster setup on the worker nodes.')
+        elif cluster_info.num_instances > 1 and not ray_cluster_healthy:
             instance_setup.start_ray_on_worker_nodes(
                 cluster_name.name_on_cloud,
                 no_restart=not head_ray_needs_restart,
sky/provision/runpod/instance.py
@@ -1,5 +1,6 @@
 """RunPod instance provisioning."""
 import time
+import traceback
 from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
@@ -116,7 +117,8 @@ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                 volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
-            logger.warning(f'run_instances error: {e}')
+            logger.warning(f'run_instances error: {e}\n'
+                           f'Full traceback:\n{traceback.format_exc()}')
             raise
         logger.info(f'Launched instance {instance_id}.')
         created_instance_ids.append(instance_id)
sky/provision/runpod/utils.py
@@ -80,7 +80,11 @@ def _construct_docker_login_template_name(cluster_name: str) -> str:
 
 
 def retry(func):
-    """Decorator to retry a function."""
+    """Decorator to retry a function.
+
+    Only retries on transient errors. Does not retry on authorization errors
+    (Unauthorized, Forbidden) as these are not recoverable.
+    """
 
     def wrapper(*args, **kwargs):
         """Wrapper for retrying a function."""
@@ -89,6 +93,14 @@ def retry(func):
             try:
                 return func(*args, **kwargs)
             except runpod.runpod.error.QueryError as e:
+                error_msg = str(e).lower()
+                # Don't retry on authorization errors - these won't recover
+                auth_keywords = ['unauthorized', 'forbidden', '401', '403']
+                if any(keyword in error_msg for keyword in auth_keywords):
+                    logger.error(f'RunPod authorization error (not retrying): '
+                                 f'{common_utils.format_exception(e)}')
+                    raise
+                cnt += 1
                 if cnt >= 3:
                     raise
                 logger.warning('Retrying for exception: '
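
The change above stops the retry loop from spinning on credential problems: anything that looks like an authorization failure is raised immediately, while other QueryErrors still get a few attempts. A generic, self-contained sketch of the same pattern (RuntimeError stands in for the provider-specific exception):

import functools
import time

AUTH_KEYWORDS = ('unauthorized', 'forbidden', '401', '403')


def retry_unless_auth_error(func, attempts: int = 3, delay: float = 1.0):
    """Retry transient failures; surface authorization errors immediately."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for attempt in range(1, attempts + 1):
            try:
                return func(*args, **kwargs)
            except RuntimeError as e:  # stand-in for runpod's QueryError
                if any(k in str(e).lower() for k in AUTH_KEYWORDS):
                    raise  # bad credentials will not fix themselves
                if attempt == attempts:
                    raise
                time.sleep(delay)

    return wrapper
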
sky/provision/runpod/volume.py
@@ -1,5 +1,5 @@
 """RunPod network volume provisioning."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 from sky import global_user_state
 from sky import models
@@ -194,15 +194,31 @@ def get_volume_usedby(
 
 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
-    used_by_results = [get_volume_usedby(config) for config in configs]
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of volume name to pods using the volume.
+        usedby_clusters: Dictionary of volume name to clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+                             fetch.
+    """
     used_by_pods, used_by_clusters = {}, {}
-    for i in range(len(configs)):
-        config = configs[i]
-        used_by_pods[config.name_on_cloud] = used_by_results[i][0]
-        used_by_clusters[config.name_on_cloud] = used_by_results[i][1]
-    return used_by_pods, used_by_clusters
+    failed_volume_names = set()
+    for config in configs:
+        try:
+            usedby_pods, usedby_clusters = get_volume_usedby(config)
+            used_by_pods[config.name_on_cloud] = usedby_pods
+            used_by_clusters[config.name_on_cloud] = usedby_clusters
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to get usedby info for RunPod volume '
+                         f'{config.name}: {e}')
+            failed_volume_names.add(config.name)
+            continue
+    return used_by_pods, used_by_clusters, failed_volume_names
 
 
 def map_all_volumes_usedby(
sky/provision/slurm/__init__.py
@@ -0,0 +1,12 @@
+"""Slurm provisioner for SkyPilot."""
+
+from sky.provision.slurm.config import bootstrap_instances
+from sky.provision.slurm.instance import cleanup_ports
+from sky.provision.slurm.instance import get_cluster_info
+from sky.provision.slurm.instance import get_command_runners
+from sky.provision.slurm.instance import open_ports
+from sky.provision.slurm.instance import query_instances
+from sky.provision.slurm.instance import run_instances
+from sky.provision.slurm.instance import stop_instances
+from sky.provision.slurm.instance import terminate_instances
+from sky.provision.slurm.instance import wait_instances
sky/provision/slurm/config.py
@@ -0,0 +1,13 @@
+"""Slurm-specific configuration for the provisioner."""
+import logging
+
+from sky.provision import common
+
+logger = logging.getLogger(__name__)
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    del region, cluster_name  # unused
+    return config