skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -80,9 +80,8 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_STATUS_FETCH_MAX_RETRIES = 3
-_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
 _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60
 
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
@@ -329,13 +328,21 @@ def ha_recovery_for_consolidation_mode() -> None:
 
 
 async def get_job_status(
-        backend: 'backends.CloudVmRayBackend', cluster_name: str,
-        job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
+    backend: 'backends.CloudVmRayBackend', cluster_name: str,
+    job_id: Optional[int]
+) -> Tuple[Optional['job_lib.JobStatus'], Optional[str]]:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
+
+    Returns:
+        job_status: The status of the job.
+        transient_error_reason: None if successful or fatal error; otherwise,
+            the detailed reason for the transient error.
     """
+    # TODO(zhwu, cooperc): Make this get job status aware of cluster status, so
+    # that it can exit retry early if the cluster is down.
     # TODO(luca) make this async
     handle = await context_utils.to_thread(
         global_user_state.get_handle_from_cluster_name, cluster_name)
@@ -343,85 +350,68 @@ async def get_job_status(
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
-        return None
+        return None, None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
-    for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
-        try:
-            logger.info('=== Checking the job status... ===')
-            statuses = await asyncio.wait_for(
-                context_utils.to_thread(backend.get_job_status,
-                                        handle,
-                                        job_ids=job_ids,
-                                        stream_logs=False),
-                timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
-            status = list(statuses.values())[0]
-            if status is None:
-                logger.info('No job found.')
+    try:
+        logger.info('=== Checking the job status... ===')
+        statuses = await asyncio.wait_for(
+            context_utils.to_thread(backend.get_job_status,
+                                    handle,
+                                    job_ids=job_ids,
+                                    stream_logs=False),
+            timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+        status = list(statuses.values())[0]
+        if status is None:
+            logger.info('No job found.')
+        else:
+            logger.info(f'Job status: {status}')
+        logger.info('=' * 34)
+        return status, None
+    except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+            ValueError, TypeError, asyncio.TimeoutError) as e:
+        # Note: Each of these exceptions has some additional conditions to
+        # limit how we handle it and whether or not we catch it.
+        potential_transient_error_reason = None
+        if isinstance(e, exceptions.CommandError):
+            returncode = e.returncode
+            potential_transient_error_reason = (f'Returncode: {returncode}. '
+                                                f'{e.detailed_reason}')
+        elif isinstance(e, grpc.RpcError):
+            potential_transient_error_reason = e.details()
+        elif isinstance(e, grpc.FutureTimeoutError):
+            potential_transient_error_reason = 'grpc timeout'
+        elif isinstance(e, asyncio.TimeoutError):
+            potential_transient_error_reason = (
+                'Job status check timed out after '
+                f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+        # TODO(cooperc): Gracefully handle these exceptions in the backend.
+        elif isinstance(e, ValueError):
+            # If the cluster yaml is deleted in the middle of getting the
+            # SSH credentials, we could see this. See
+            # sky/global_user_state.py get_cluster_yaml_dict.
+            if re.search(r'Cluster yaml .* not found', str(e)):
+                potential_transient_error_reason = 'Cluster yaml was deleted'
             else:
-                logger.info(f'Job status: {status}')
-            logger.info('=' * 34)
-            return status
-        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
-                ValueError, TypeError, asyncio.TimeoutError) as e:
-            # Note: Each of these exceptions has some additional conditions to
-            # limit how we handle it and whether or not we catch it.
-            # Retry on k8s transient network errors. This is useful when using
-            # coreweave which may have transient network issue sometimes.
-            is_transient_error = False
-            detailed_reason = None
-            if isinstance(e, exceptions.CommandError):
-                detailed_reason = e.detailed_reason
-                if (detailed_reason is not None and
-                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
-                    is_transient_error = True
-            elif isinstance(e, grpc.RpcError):
-                detailed_reason = e.details()
-                if e.code() in [
-                        grpc.StatusCode.UNAVAILABLE,
-                        grpc.StatusCode.DEADLINE_EXCEEDED
-                ]:
-                    is_transient_error = True
-            elif isinstance(e, grpc.FutureTimeoutError):
-                detailed_reason = 'Timeout'
-            elif isinstance(e, asyncio.TimeoutError):
-                detailed_reason = ('Job status check timed out after '
-                                   f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
-            # TODO(cooperc): Gracefully handle these exceptions in the backend.
-            elif isinstance(e, ValueError):
-                # If the cluster yaml is deleted in the middle of getting the
-                # SSH credentials, we could see this. See
-                # sky/global_user_state.py get_cluster_yaml_dict.
-                if re.search(r'Cluster yaml .* not found', str(e)):
-                    detailed_reason = 'Cluster yaml was deleted'
-                else:
-                    raise
-            elif isinstance(e, TypeError):
-                # We will grab the SSH credentials from the cluster yaml, but if
-                # handle.cluster_yaml is None, we will just return an empty dict
-                # for the credentials. See
-                # backend_utils.ssh_credential_from_yaml. Then, the credentials
-                # are passed as kwargs to SSHCommandRunner.__init__ - see
-                # cloud_vm_ray_backend.get_command_runners. So we can hit this
-                # TypeError if the cluster yaml is removed from the handle right
-                # when we pull it before the cluster is fully deleted.
-                error_msg_to_check = (
-                    'SSHCommandRunner.__init__() missing 2 required positional '
-                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
-                if str(e) == error_msg_to_check:
-                    detailed_reason = 'SSH credentials were already cleaned up'
-                else:
-                    raise
-            if is_transient_error:
-                logger.info('Failed to connect to the cluster. Retrying '
-                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-                logger.info('=' * 34)
-                await asyncio.sleep(1)
+                raise
+        elif isinstance(e, TypeError):
+            # We will grab the SSH credentials from the cluster yaml, but if
+            # handle.cluster_yaml is None, we will just return an empty dict
+            # for the credentials. See
+            # backend_utils.ssh_credential_from_yaml. Then, the credentials
+            # are passed as kwargs to SSHCommandRunner.__init__ - see
+            # cloud_vm_ray_backend.get_command_runners. So we can hit this
+            # TypeError if the cluster yaml is removed from the handle right
+            # when we pull it before the cluster is fully deleted.
+            error_msg_to_check = (
+                'SSHCommandRunner.__init__() missing 2 required positional '
+                'arguments: \'ssh_user\' and \'ssh_private_key\'')
+            if str(e) == error_msg_to_check:
+                potential_transient_error_reason = ('SSH credentials were '
+                                                    'already cleaned up')
             else:
-                logger.info(f'Failed to get job status: {detailed_reason}')
-                logger.info('=' * 34)
-                return None
-    return None
+                raise
+        return None, potential_transient_error_reason
 
 
 def controller_process_alive(record: managed_job_state.ControllerPidRecord,
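With this hunk, get_job_status() no longer retries transient failures internally; it reports them via the second element of its return value and leaves retry policy to the caller, presumably bounded by the new JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS. A minimal caller sketch under that assumption (the helper name _poll_job_status is illustrative, not part of this diff):

    import asyncio
    import time

    async def _poll_job_status(backend, cluster_name, job_id):
        # Retry transient failures until the total time budget is spent.
        deadline = time.time() + JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS
        while True:
            status, reason = await get_job_status(backend, cluster_name, job_id)
            if reason is None or time.time() >= deadline:
                # Success, a fatal error, or budget exhausted: report what we have.
                return status
            logger.info(f'Transient error while fetching job status: {reason}')
            await asyncio.sleep(1)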
@@ -1570,6 +1560,7 @@ def get_managed_job_queue(
                 handle.launched_resources.region,
                 handle.launched_resources.zone).formatted_str()
             job['accelerators'] = handle.launched_resources.accelerators
+            job['labels'] = handle.launched_resources.labels
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'
@@ -1578,6 +1569,7 @@ def get_managed_job_queue(
             job['region'] = '-'
             job['zone'] = '-'
             job['infra'] = '-'
+            job['labels'] = None
 
         if not fields or 'details' in fields:
             # Add details about schedule state / backoff.
@@ -1821,7 +1813,8 @@ def format_job_table(
         for replica in replica_info:
             used_by = replica.get('used_by')
             if used_by is not None:
-                job_to_worker[used_by] = replica.get('replica_id')
+                for job_id in used_by:
+                    job_to_worker[job_id] = replica.get('replica_id')
         return job_to_worker
 
     # Create mapping from job_id to worker replica_id
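The format_job_table() hunk above assumes a pool replica's used_by field now holds a list of job ids rather than a single id, so the worker mapping is built per job id. A small self-contained illustration with made-up ids:

    replica_info = [
        {'replica_id': 1, 'used_by': [101, 102]},  # one worker shared by two jobs
        {'replica_id': 2, 'used_by': None},        # idle worker
    ]
    job_to_worker = {}
    for replica in replica_info:
        used_by = replica.get('used_by')
        if used_by is not None:
            for job_id in used_by:
                job_to_worker[job_id] = replica.get('replica_id')
    assert job_to_worker == {101: 1, 102: 1}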
sky/models.py CHANGED
@@ -68,6 +68,15 @@ class KubernetesNodeInfo:
     free: Dict[str, int]
     # IP address of the node (external IP preferred, fallback to internal IP)
     ip_address: Optional[str] = None
+    # CPU count (total CPUs available on the node)
+    cpu_count: Optional[float] = None
+    # Memory in GB (total memory available on the node)
+    memory_gb: Optional[float] = None
+    # Free CPU count (free CPUs available on the node after pod allocations)
+    cpu_free: Optional[float] = None
+    # Free memory in GB (free memory available on the node after pod
+    # allocations)
+    memory_free_gb: Optional[float] = None
     # Whether the node is ready (all conditions are satisfied)
     is_ready: bool = True
 
sky/optimizer.py CHANGED
@@ -20,6 +20,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import log_utils
 from sky.utils import registry
@@ -1290,7 +1291,7 @@ def _check_specified_regions(task: task_lib.Task) -> None:
             msg = f'Task{task_name} requires '
             if region not in existing_contexts:
                 if is_ssh:
-                    infra_str = f'SSH/{region.lstrip("ssh-")}'
+                    infra_str = f'SSH/{common_utils.removeprefix(region, "ssh-")}'
                 else:
                     infra_str = f'Kubernetes/{region}'
                 logger.warning(f'{infra_str} is not enabled.')
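The second hunk fixes a real bug: str.lstrip() removes any leading characters drawn from the given set, not a literal prefix, so SSH node pool names beginning with 's', 'h', or '-' were mangled. A quick illustration (common_utils.removeprefix is assumed to behave like Python 3.9's str.removeprefix):

    region = 'ssh-hosts'
    region.lstrip('ssh-')      # -> 'osts': strips any of the characters 's', 'h', '-'
    # removeprefix-style behavior, which the new code presumably relies on:
    region[len('ssh-'):] if region.startswith('ssh-') else region   # -> 'hosts'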
sky/provision/__init__.py CHANGED
@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
 import functools
 import inspect
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
 from sky import models
 from sky import sky_logging
@@ -152,16 +152,18 @@ def get_volume_usedby(
 @_route_to_cloud_impl
 def get_all_volumes_usedby(
     provider_name: str, configs: List[models.VolumeConfig]
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Get the usedby of a volume.
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Get the usedby of all volumes.
+
+    Args:
+        provider_name: Name of the provider.
+        configs: List of VolumeConfig objects.
 
     Returns:
-        usedby_pods: List of dictionaries, each containing the config keys for
-                     a volume and a key containing pods using the volume.
-                     These may include pods not created by SkyPilot.
-        usedby_clusters: List of dictionaries, each containing the config keys
-                         for a volume and a key containing clusters using
-                         the volume.
+        usedby_pods: Dict of usedby pods.
+        usedby_clusters: Dict of usedby clusters.
+        failed_volume_names: Set of volume names whose usedby info
+            failed to fetch.
     """
     raise NotImplementedError
 
sky/provision/kubernetes/utils.py CHANGED
@@ -144,6 +144,7 @@ DEFAULT_NAMESPACE = 'default'
 DEFAULT_SERVICE_ACCOUNT_NAME = 'skypilot-service-account'
 
 MEMORY_SIZE_UNITS = {
+    'm': 0.001,
     'B': 1,
     'K': 2**10,
     'M': 2**20,
@@ -1331,12 +1332,20 @@ class V1Pod:
 
 
 @_retry_on_error(resource_type='pod')
-def get_allocated_gpu_qty_by_node(
+def get_allocated_resources_by_node(
     *,
     context: Optional[str] = None,
-) -> Dict[str, int]:
-    """Gets allocated GPU quantity by each node by fetching pods in
+) -> Tuple[Dict[str, int], Dict[str, Tuple[float, float]]]:
+    """Gets allocated GPU, CPU, and memory by each node by fetching pods in
     all namespaces in kubernetes cluster indicated by context.
+
+    This function combines GPU and CPU/memory allocation tracking into a single
+    API call for better performance.
+
+    Returns:
+        Tuple of (allocated_gpu_qty_by_node, allocated_cpu_memory_by_node):
+        - allocated_gpu_qty_by_node: Dict mapping node name to allocated GPU count
+        - allocated_cpu_memory_by_node: Dict mapping node name to (allocated_cpu, allocated_memory_gb) tuple
     """
     if context is None:
         context = get_current_kube_config_context_name()
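The docstring above describes the new two-dict return value, and (as the next hunk shows) the old per-GPU helper remains as a thin wrapper around it. A short usage sketch; the context name is illustrative:

    allocated_gpus, allocated_cpu_mem = get_allocated_resources_by_node(
        context='my-context')
    for node_name, (cpus, mem_gb) in allocated_cpu_mem.items():
        gpus = allocated_gpus.get(node_name, 0)
        print(f'{node_name}: {gpus} GPUs, {cpus:.1f} CPUs, {mem_gb:.1f} GB requested')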
@@ -1355,29 +1364,67 @@ def get_allocated_gpu_qty_by_node(
         field_selector=field_selector)
     try:
         allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        allocated_cpu_memory_by_node: Dict[str, Tuple[
+            float, float]] = collections.defaultdict(lambda: (0.0, 0.0))
         for item_dict in ijson.items(response,
                                      'items.item',
                                      buf_size=IJSON_BUFFER_SIZE):
             pod = V1Pod.from_dict(item_dict)
             if should_exclude_pod_from_gpu_allocation(pod):
                 logger.debug(
-                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'Excluding pod {pod.metadata.name} from resource count '
                     f'calculations on node {pod.spec.node_name}')
                 continue
-            # Iterate over all the containers in the pod and sum the
-            # GPU requests
+            if not pod.spec.node_name:
+                continue
+
+            # Iterate over all the containers in the pod and sum the resources
             pod_allocated_qty = 0
+            pod_allocated_cpu = 0.0
+            pod_allocated_memory_gb = 0.0
             for container in pod.spec.containers:
                 if container.resources.requests:
+                    requests = container.resources.requests
+                    # Parse GPU
                     pod_allocated_qty += get_node_accelerator_count(
-                        context, container.resources.requests)
-            if pod_allocated_qty > 0 and pod.spec.node_name:
+                        context, requests)
+                    # Parse CPU
+                    if 'cpu' in requests:
+                        pod_allocated_cpu += parse_cpu_or_gpu_resource_to_float(
+                            requests['cpu'])
+                    # Parse memory
+                    if 'memory' in requests:
+                        pod_allocated_memory_gb += parse_memory_resource(
+                            requests['memory'], unit='G')
+
+            if pod_allocated_qty > 0:
                 allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
-        return allocated_qty_by_node
+            if pod_allocated_cpu > 0 or pod_allocated_memory_gb > 0:
+                current_cpu, current_memory = allocated_cpu_memory_by_node[
+                    pod.spec.node_name]
+                allocated_cpu_memory_by_node[pod.spec.node_name] = (
+                    current_cpu + pod_allocated_cpu,
+                    current_memory + pod_allocated_memory_gb)
+        return allocated_qty_by_node, allocated_cpu_memory_by_node
     finally:
         response.release_conn()
 
 
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
+
+    Note: For better performance when you also need CPU/memory allocation,
+    use get_allocated_resources_by_node() instead.
+    """
+    allocated_qty_by_node, _ = get_allocated_resources_by_node(context=context)
+    return allocated_qty_by_node
+
+
 def check_instance_fits(context: Optional[str],
                         instance: str) -> Tuple[bool, Optional[str]]:
     """Checks if the instance fits on the Kubernetes cluster.
@@ -2189,6 +2236,13 @@ def get_current_kube_config_context_name() -> Optional[str]:
         _, current_context = kubernetes.list_kube_config_contexts()
         return current_context['name']
     except k8s.config.config_exception.ConfigException:
+        # If kubeconfig is not available, check if running in-cluster and
+        # return the in-cluster context name. This is needed when kubeconfig
+        # is not uploaded to the pod (e.g., remote_identity: SERVICE_ACCOUNT)
+        # but we still need to know the context name for operations like
+        # port mode detection.
+        if is_incluster_config_available():
+            return kubernetes.in_cluster_context_name()
         return None
 
 
@@ -2313,7 +2367,7 @@ def parse_memory_resource(resource_qty_str: str,
     try:
         bytes_value = int(resource_str)
     except ValueError:
-        memory_size = re.sub(r'([KMGTPB]+)', r' \1', resource_str)
+        memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
         number, unit_index = [item.strip() for item in memory_size.split()]
         unit_index = unit_index[0]
        bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]
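The widened regex and the new 'm' entry in MEMORY_SIZE_UNITS cover Kubernetes quantities expressed in milli-units (memory reported with an 'm' suffix), which previously failed the unit lookup. A standalone sketch of the parsing arithmetic:

    import re

    MEMORY_SIZE_UNITS = {'m': 0.001, 'B': 1, 'K': 2**10, 'M': 2**20, 'G': 2**30}
    resource_str = '128974848000m'            # a form Kubernetes can report
    memory_size = re.sub(r'([KMGTPBm]+)', r' \1', resource_str)
    number, unit = memory_size.split()
    bytes_value = float(number) * MEMORY_SIZE_UNITS[unit[0]]
    print(bytes_value)                        # ~1.29e8 bytes, i.e. about 123 MiB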
@@ -3061,16 +3115,32 @@ def get_kubernetes_node_info(
             has_accelerator_nodes = True
             break
 
-    # Get the allocated GPU quantity by each node
+    # Get the allocated resources (GPU, CPU, memory) by each node in a single call
     allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
-    error_on_get_allocated_gpu_qty_by_node = False
+    allocated_cpu_memory_by_node: Dict[str, Tuple[float, float]] = {}
+    error_on_get_allocated_resources = False
+    # Get resource allocation. For GPU allocation, only call if there are GPU nodes
+    # (same as master branch). For CPU/memory, we always need it for all nodes.
     if has_accelerator_nodes:
+        # When there are GPU nodes, get both GPU and CPU/memory in one call
         try:
-            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+            allocated_qty_by_node, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
                 context=context)
         except kubernetes.api_exception() as e:
             if e.status == 403:
-                error_on_get_allocated_gpu_qty_by_node = True
+                error_on_get_allocated_resources = True
+                pass
+            else:
+                raise
+    else:
+        # When there are no GPU nodes, we still need CPU/memory allocation
+        # This is an extra API call compared to master branch
+        try:
+            _, allocated_cpu_memory_by_node = get_allocated_resources_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_resources = True
                 pass
             else:
                 raise
@@ -3106,6 +3176,35 @@ def get_kubernetes_node_info(
 
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+
+        # Parse CPU and memory from node capacity
+        cpu_count = None
+        memory_gb = None
+        try:
+            if 'cpu' in node.status.capacity:
+                cpu_count = float(
+                    parse_cpu_or_gpu_resource(node.status.capacity['cpu']))
+            if 'memory' in node.status.capacity:
+                memory_gb = parse_memory_resource(
+                    node.status.capacity['memory'], unit='G')
+        except (KeyError, ValueError) as e:
+            # If parsing fails, log but continue
+            logger.debug(f'Failed to parse CPU/memory for node '
+                         f'{node.metadata.name}: {e}')
+
+        # Calculate free CPU and memory
+        cpu_free = None
+        memory_free_gb = None
+        if cpu_count is not None or memory_gb is not None:
+            if not error_on_get_allocated_resources:
+                allocated_cpu, allocated_memory = allocated_cpu_memory_by_node.get(
+                    node.metadata.name, (0.0, 0.0))
+                if cpu_count is not None:
+                    cpu_free = max(0.0, cpu_count - allocated_cpu)
+                if memory_gb is not None:
+                    memory_free_gb = max(0.0, memory_gb - allocated_memory)
+            # If we can't get allocation info, set free to None (unknown)
+
         # Check if node is ready
         node_is_ready = node.is_ready()
 
@@ -3116,13 +3215,17 @@
                 total={'accelerator_count': 0},
                 free={'accelerators_available': 0},
                 ip_address=node_ip,
+                cpu_count=cpu_count,
+                memory_gb=memory_gb,
+                cpu_free=cpu_free,
+                memory_free_gb=memory_free_gb,
                 is_ready=node_is_ready)
             continue
 
         if not node_is_ready:
             # If node is not ready, report 0 available GPUs
             accelerators_available = 0
-        elif not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
+        elif not has_accelerator_nodes or error_on_get_allocated_resources:
             accelerators_available = -1
         else:
             allocated_qty = allocated_qty_by_node[node.metadata.name]
@@ -3141,6 +3244,10 @@
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
             ip_address=node_ip,
+            cpu_count=cpu_count,
+            memory_gb=memory_gb,
+            cpu_free=cpu_free,
+            memory_free_gb=memory_free_gb,
             is_ready=node_is_ready)
     hint = ''
     if has_multi_host_tpu:
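The hunks above populate the new cpu_count, memory_gb, cpu_free, and memory_free_gb fields added to KubernetesNodeInfo in sky/models.py (capacity from the node, free values computed as capacity minus pod requests). A hedged consumer sketch, assuming node_infos maps node names to KubernetesNodeInfo objects:

    from typing import Dict

    def summarize_nodes(node_infos: Dict[str, 'KubernetesNodeInfo']) -> None:
        for name, info in node_infos.items():
            if info.cpu_count is None or info.memory_gb is None:
                print(f'{name}: CPU/memory capacity unknown')
                continue
            # cpu_free / memory_free_gb stay None when pod allocations could not
            # be fetched (e.g. a 403 from the Kubernetes API), so guard first.
            cpu_free = '?' if info.cpu_free is None else f'{info.cpu_free:.1f}'
            mem_free = '?' if info.memory_free_gb is None else f'{info.memory_free_gb:.1f}'
            print(f'{name}: {cpu_free}/{info.cpu_count:.0f} CPUs free, '
                  f'{mem_free}/{info.memory_gb:.0f} GB free')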
sky/provision/kubernetes/volume.py CHANGED
@@ -45,7 +45,9 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
             continue
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         access_mode = pvc.spec.access_modes[0]
         if access_mode not in once_modes:
             continue
@@ -65,7 +67,8 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     if storage_class_name is not None:
         try:
             kubernetes.storage_api(context).read_storage_class(
-                name=storage_class_name)
+                name=storage_class_name,
+                _request_timeout=kubernetes.API_TIMEOUT)
         except kubernetes.api_exception() as e:
             raise config_lib.KubernetesError(
                 f'Check storage class {storage_class_name} error: {e}')
@@ -82,7 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
         context).delete_namespaced_persistent_volume_claim(
             name=pvc_name,
             namespace=namespace,
-            _request_timeout=config_lib.DELETION_TIMEOUT),
+            _request_timeout=kubernetes.API_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
     logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
@@ -119,7 +122,9 @@ def _get_volume_usedby(
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     pods = kubernetes.core_api(context).list_namespaced_pod(
-        namespace=namespace, field_selector=field_selector)
+        namespace=namespace,
+        field_selector=field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT)
     for pod in pods.items:
         if pod.spec.volumes is None:
             continue
@@ -164,8 +169,21 @@ def get_volume_usedby(
 
 def get_all_volumes_usedby(
     configs: List[models.VolumeConfig],
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Gets the usedby resources of all volumes."""
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Gets the usedby resources of all volumes.
+
+    Args:
+        configs: List of VolumeConfig objects.
+
+    Returns:
+        usedby_pods: Dictionary of context to namespace to volume name to pods
+                     using the volume. These may include pods not created by
+                     SkyPilot.
+        usedby_clusters: Dictionary of context to namespace to volume name to
+                         clusters using the volume.
+        failed_volume_names: Set of volume names whose usedby info failed to
+            fetch.
+    """
     field_selector = ','.join([
         f'status.phase!={phase}'
         for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
@@ -173,26 +191,39 @@ def get_all_volumes_usedby(
     label_selector = 'parent=skypilot'
     context_to_namespaces: Dict[str, Set[str]] = {}
     pvc_names = set()
+    original_volume_names: Dict[str, Dict[str, List[str]]] = {}
     for config in configs:
         context, namespace = _get_context_namespace(config)
-        if context not in context_to_namespaces:
-            context_to_namespaces[context] = set()
-        context_to_namespaces[context].add(namespace)
+        context_to_namespaces.setdefault(context, set()).add(namespace)
+        original_volume_names.setdefault(context,
+                                         {}).setdefault(namespace,
+                                                        []).append(config.name)
         pvc_names.add(config.name_on_cloud)
     cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
     # Get all pods in the namespace
     used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
     used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
+    failed_volume_names: Set[str] = set()
     for context, namespaces in context_to_namespaces.items():
         used_by_pods[context] = {}
         used_by_clusters[context] = {}
         for namespace in namespaces:
             used_by_pods[context][namespace] = {}
             used_by_clusters[context][namespace] = {}
-            pods = kubernetes.core_api(context).list_namespaced_pod(
-                namespace=namespace,
-                field_selector=field_selector,
-                label_selector=label_selector)
+            try:
+                pods = kubernetes.core_api(context).list_namespaced_pod(
+                    namespace=namespace,
+                    field_selector=field_selector,
+                    label_selector=label_selector,
+                    _request_timeout=kubernetes.API_TIMEOUT)
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get pods in namespace {namespace} '
+                             f'in context {context}: {e}')
+                # Mark all volumes in this namespace as failed
+                for original_volume_name in original_volume_names[context][
+                        namespace]:
+                    failed_volume_names.add(original_volume_name)
+                continue
             for pod in pods.items:
                 if pod.spec.volumes is None:
                     continue
@@ -217,7 +248,7 @@ def get_all_volumes_usedby(
                     used_by_clusters[context][namespace][cluster_name] = []
                 used_by_clusters[context][namespace][cluster_name].append(
                     cluster_name)
-    return used_by_pods, used_by_clusters
+    return used_by_pods, used_by_clusters, failed_volume_names
 
 
 def map_all_volumes_usedby(
@@ -292,7 +323,9 @@ def create_persistent_volume_claim(
     try:
         pvc = kubernetes.core_api(
             context).read_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace)
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=kubernetes.API_TIMEOUT)
         if config is not None:
             _populate_config_from_pvc(config, pvc)
         logger.debug(f'PVC {pvc_name} already exists')
@@ -305,8 +338,10 @@
             raise ValueError(
                 f'PVC {pvc_name} does not exist while use_existing is True.')
     pvc = kubernetes.core_api(
-        context).create_namespaced_persistent_volume_claim(namespace=namespace,
-                                                           body=pvc_spec)
+        context).create_namespaced_persistent_volume_claim(
+            namespace=namespace,
+            body=pvc_spec,
+            _request_timeout=kubernetes.API_TIMEOUT)
     logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
     if config is not None:
         _populate_config_from_pvc(config, pvc)
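Callers of get_all_volumes_usedby() now unpack three values and can report volumes whose usage could not be determined instead of failing the whole listing. A hedged sketch (configs is a list of models.VolumeConfig, as in the signature above):

    used_by_pods, used_by_clusters, failed = get_all_volumes_usedby(configs)
    for config in configs:
        if config.name in failed:
            # Usage is unknown because the pod listing for this volume's
            # namespace/context failed (e.g. an API timeout).
            print(f'{config.name}: usedby unknown')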
sky/provision/provisioner.py CHANGED
@@ -493,7 +493,8 @@ def _post_provision_setup(
     # commands and rsync on the pods. SSH will still be ready after a while
     # for the users to SSH into the pod.
     is_k8s_cloud = cloud_name.lower() in ['kubernetes', 'ssh']
-    if not is_k8s_cloud:
+    is_slurm_cloud = cloud_name.lower() == 'slurm'
+    if not is_k8s_cloud and not is_slurm_cloud:
         logger.debug(
             f'\nWaiting for SSH to be available for {cluster_name!r} ...')
         wait_for_ssh(cluster_info, ssh_credentials)