skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -80,9 +80,8 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
-_JOB_STATUS_FETCH_MAX_RETRIES = 3
-_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
 _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60
 
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
@@ -329,13 +328,21 @@ def ha_recovery_for_consolidation_mode() -> None:
 
 
 async def get_job_status(
-        backend: 'backends.CloudVmRayBackend', cluster_name: str,
-        job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
+    backend: 'backends.CloudVmRayBackend', cluster_name: str,
+    job_id: Optional[int]
+) -> Tuple[Optional['job_lib.JobStatus'], Optional[str]]:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
+
+    Returns:
+        job_status: The status of the job.
+        transient_error_reason: None if successful or fatal error; otherwise,
+            the detailed reason for the transient error.
     """
+    # TODO(zhwu, cooperc): Make this get job status aware of cluster status, so
+    # that it can exit retry early if the cluster is down.
     # TODO(luca) make this async
     handle = await context_utils.to_thread(
         global_user_state.get_handle_from_cluster_name, cluster_name)
@@ -343,85 +350,68 @@ async def get_job_status(
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
-        return None
+        return None, None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
     job_ids = None if job_id is None else [job_id]
-    for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
-        try:
-            logger.info('=== Checking the job status... ===')
-            statuses = await asyncio.wait_for(
-                context_utils.to_thread(backend.get_job_status,
-                                        handle,
-                                        job_ids=job_ids,
-                                        stream_logs=False),
-                timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
-            status = list(statuses.values())[0]
-            if status is None:
-                logger.info('No job found.')
+    try:
+        logger.info('=== Checking the job status... ===')
+        statuses = await asyncio.wait_for(
+            context_utils.to_thread(backend.get_job_status,
+                                    handle,
+                                    job_ids=job_ids,
+                                    stream_logs=False),
+            timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+        status = list(statuses.values())[0]
+        if status is None:
+            logger.info('No job found.')
+        else:
+            logger.info(f'Job status: {status}')
+        logger.info('=' * 34)
+        return status, None
+    except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+            ValueError, TypeError, asyncio.TimeoutError) as e:
+        # Note: Each of these exceptions has some additional conditions to
+        # limit how we handle it and whether or not we catch it.
+        potential_transient_error_reason = None
+        if isinstance(e, exceptions.CommandError):
+            returncode = e.returncode
+            potential_transient_error_reason = (f'Returncode: {returncode}. '
+                                                f'{e.detailed_reason}')
+        elif isinstance(e, grpc.RpcError):
+            potential_transient_error_reason = e.details()
+        elif isinstance(e, grpc.FutureTimeoutError):
+            potential_transient_error_reason = 'grpc timeout'
+        elif isinstance(e, asyncio.TimeoutError):
+            potential_transient_error_reason = (
+                'Job status check timed out after '
+                f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+        # TODO(cooperc): Gracefully handle these exceptions in the backend.
+        elif isinstance(e, ValueError):
+            # If the cluster yaml is deleted in the middle of getting the
+            # SSH credentials, we could see this. See
+            # sky/global_user_state.py get_cluster_yaml_dict.
+            if re.search(r'Cluster yaml .* not found', str(e)):
+                potential_transient_error_reason = 'Cluster yaml was deleted'
             else:
-                logger.info(f'Job status: {status}')
-                logger.info('=' * 34)
-                return status
-        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
-                ValueError, TypeError, asyncio.TimeoutError) as e:
-            # Note: Each of these exceptions has some additional conditions to
-            # limit how we handle it and whether or not we catch it.
-            # Retry on k8s transient network errors. This is useful when using
-            # coreweave which may have transient network issue sometimes.
-            is_transient_error = False
-            detailed_reason = None
-            if isinstance(e, exceptions.CommandError):
-                detailed_reason = e.detailed_reason
-                if (detailed_reason is not None and
-                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
-                    is_transient_error = True
-            elif isinstance(e, grpc.RpcError):
-                detailed_reason = e.details()
-                if e.code() in [
-                        grpc.StatusCode.UNAVAILABLE,
-                        grpc.StatusCode.DEADLINE_EXCEEDED
-                ]:
-                    is_transient_error = True
-            elif isinstance(e, grpc.FutureTimeoutError):
-                detailed_reason = 'Timeout'
-            elif isinstance(e, asyncio.TimeoutError):
-                detailed_reason = ('Job status check timed out after '
-                                   f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
-            # TODO(cooperc): Gracefully handle these exceptions in the backend.
-            elif isinstance(e, ValueError):
-                # If the cluster yaml is deleted in the middle of getting the
-                # SSH credentials, we could see this. See
-                # sky/global_user_state.py get_cluster_yaml_dict.
-                if re.search(r'Cluster yaml .* not found', str(e)):
-                    detailed_reason = 'Cluster yaml was deleted'
-                else:
-                    raise
-            elif isinstance(e, TypeError):
-                # We will grab the SSH credentials from the cluster yaml, but if
-                # handle.cluster_yaml is None, we will just return an empty dict
-                # for the credentials. See
-                # backend_utils.ssh_credential_from_yaml. Then, the credentials
-                # are passed as kwargs to SSHCommandRunner.__init__ - see
-                # cloud_vm_ray_backend.get_command_runners. So we can hit this
-                # TypeError if the cluster yaml is removed from the handle right
-                # when we pull it before the cluster is fully deleted.
-                error_msg_to_check = (
-                    'SSHCommandRunner.__init__() missing 2 required positional '
-                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
-                if str(e) == error_msg_to_check:
-                    detailed_reason = 'SSH credentials were already cleaned up'
-                else:
-                    raise
-            if is_transient_error:
-                logger.info('Failed to connect to the cluster. Retrying '
-                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
-                logger.info('=' * 34)
-                await asyncio.sleep(1)
+                raise
+        elif isinstance(e, TypeError):
+            # We will grab the SSH credentials from the cluster yaml, but if
+            # handle.cluster_yaml is None, we will just return an empty dict
+            # for the credentials. See
+            # backend_utils.ssh_credential_from_yaml. Then, the credentials
+            # are passed as kwargs to SSHCommandRunner.__init__ - see
+            # cloud_vm_ray_backend.get_command_runners. So we can hit this
+            # TypeError if the cluster yaml is removed from the handle right
+            # when we pull it before the cluster is fully deleted.
            error_msg_to_check = (
                'SSHCommandRunner.__init__() missing 2 required positional '
                'arguments: \'ssh_user\' and \'ssh_private_key\'')
            if str(e) == error_msg_to_check:
                potential_transient_error_reason = ('SSH credentials were '
                                                    'already cleaned up')
            else:
-                logger.info(f'Failed to get job status: {detailed_reason}')
-                logger.info('=' * 34)
-                return None
-    return None
+                raise
+    return None, potential_transient_error_reason
 
 
 def controller_process_alive(record: managed_job_state.ControllerPidRecord,
@@ -1570,6 +1560,7 @@ def get_managed_job_queue(
                 handle.launched_resources.region,
                 handle.launched_resources.zone).formatted_str()
             job['accelerators'] = handle.launched_resources.accelerators
+            job['labels'] = handle.launched_resources.labels
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'
@@ -1578,6 +1569,7 @@ def get_managed_job_queue(
             job['region'] = '-'
             job['zone'] = '-'
             job['infra'] = '-'
+            job['labels'] = None
 
         if not fields or 'details' in fields:
             # Add details about schedule state / backoff.
@@ -1821,7 +1813,8 @@ def format_job_table(
         for replica in replica_info:
             used_by = replica.get('used_by')
             if used_by is not None:
-                job_to_worker[used_by] = replica.get('replica_id')
+                for job_id in used_by:
+                    job_to_worker[job_id] = replica.get('replica_id')
         return job_to_worker
 
     # Create mapping from job_id to worker replica_id
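
Note: the retry loop that previously lived inside get_job_status (bounded by _JOB_STATUS_FETCH_MAX_RETRIES) is gone; a single attempt now returns a (status, transient_error_reason) pair and leaves retry policy to the caller. Below is a minimal sketch of how a caller could budget retries against the new JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS constant. The poll_job_status helper and its fetch argument are illustrative only, not part of this diff.

import asyncio
import time
from typing import Awaitable, Callable, Optional, Tuple

JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS = 60  # mirrors the constant added above

async def poll_job_status(
    fetch: Callable[[], Awaitable[Tuple[Optional[str], Optional[str]]]]
) -> Optional[str]:
    """Retry `fetch` until it succeeds, fails fatally, or the budget runs out."""
    start = time.time()
    while True:
        status, transient_reason = await fetch()
        if transient_reason is None:
            # Either a real status or a fatal condition; stop retrying.
            return status
        if time.time() - start > JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS:
            print(f'Giving up after repeated transient errors: {transient_reason}')
            return None
        await asyncio.sleep(1)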
sky/models.py CHANGED
@@ -68,6 +68,17 @@ class KubernetesNodeInfo:
     free: Dict[str, int]
     # IP address of the node (external IP preferred, fallback to internal IP)
     ip_address: Optional[str] = None
+    # CPU count (total CPUs available on the node)
+    cpu_count: Optional[float] = None
+    # Memory in GB (total memory available on the node)
+    memory_gb: Optional[float] = None
+    # Free CPU count (free CPUs available on the node after pod allocations)
+    cpu_free: Optional[float] = None
+    # Free memory in GB (free memory available on the node after pod
+    # allocations)
+    memory_free_gb: Optional[float] = None
+    # Whether the node is ready (all conditions are satisfied)
+    is_ready: bool = True
 
 
 @dataclasses.dataclass
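
For orientation, a small self-contained sketch of how the new optional capacity fields could be consumed. NodeCapacity and cpu_utilization are illustrative stand-ins, not SkyPilot APIs, and mirror only the fields added above.

import dataclasses
from typing import Optional

@dataclasses.dataclass
class NodeCapacity:
    """Stand-in mirroring only the fields added to KubernetesNodeInfo."""
    cpu_count: Optional[float] = None
    memory_gb: Optional[float] = None
    cpu_free: Optional[float] = None
    memory_free_gb: Optional[float] = None
    is_ready: bool = True

def cpu_utilization(node: NodeCapacity) -> Optional[float]:
    """Fraction of CPUs in use, or None if capacity was not reported."""
    if not node.cpu_count or node.cpu_free is None:
        return None
    return 1.0 - node.cpu_free / node.cpu_count

node = NodeCapacity(cpu_count=16, memory_gb=64, cpu_free=4, memory_free_gb=20)
print(f'{cpu_utilization(node):.0%} CPU used, ready={node.is_ready}')  # 75% CPU used, ready=True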
sky/optimizer.py CHANGED
@@ -20,6 +20,7 @@ from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as sky_cloud
 from sky.usage import usage_lib
 from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import env_options
 from sky.utils import log_utils
 from sky.utils import registry
@@ -781,7 +782,7 @@ class Optimizer:
     def _instance_type_str(resources: 'resources_lib.Resources') -> str:
         instance_type = resources.instance_type
         assert instance_type is not None, 'Instance type must be specified'
-        if isinstance(resources.cloud, clouds.Kubernetes):
+        if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
             instance_type = '-'
         if resources.use_spot:
             instance_type = ''
@@ -865,11 +866,12 @@ class Optimizer:
                 'use_spot': resources.use_spot
             }
 
-            # Handle special case for Kubernetes and SSH clouds
-            if isinstance(resources.cloud, clouds.Kubernetes):
+            # Handle special case for Kubernetes, SSH, and SLURM clouds
+            if isinstance(resources.cloud, (clouds.Kubernetes, clouds.Slurm)):
                 # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
-                # context name, i.e. different Kubernetes clusters. We add
-                # region to the key to show all the Kubernetes clusters in the
+                # context name, i.e. different Kubernetes clusters.
+                # Region for SLURM is the cluster name.
+                # We add region to the key to show all the clusters in the
                 # optimizer table for better UX.
 
                 if resources.cloud.__class__.__name__ == 'SSH':
@@ -1289,7 +1291,7 @@ def _check_specified_regions(task: task_lib.Task) -> None:
             msg = f'Task{task_name} requires '
         if region not in existing_contexts:
             if is_ssh:
-                infra_str = f'SSH/{region.lstrip("ssh-")}'
+                infra_str = f'SSH/{common_utils.removeprefix(region, "ssh-")}'
             else:
                 infra_str = f'Kubernetes/{region}'
             logger.warning(f'{infra_str} is not enabled.')
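
The last hunk fixes a real bug: str.lstrip('ssh-') strips any leading character in the set {'s', 'h', '-'}, not the literal prefix, so context names beginning with those letters were mangled. A short illustration follows; the stand-alone removeprefix helper is an assumption about what common_utils.removeprefix does, based only on the call site shown above.

region = 'ssh-hongkong-cluster'

print(region.lstrip('ssh-'))        # 'ongkong-cluster'  (over-stripped: leading 'h' is in the char set)
print(region.removeprefix('ssh-'))  # 'hongkong-cluster' (Python 3.9+, the intended result)

def removeprefix(s: str, prefix: str) -> str:
    """Backport-style helper, assumed to match common_utils.removeprefix."""
    return s[len(prefix):] if s.startswith(prefix) else s

print(removeprefix(region, 'ssh-'))  # 'hongkong-cluster'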
sky/provision/__init__.py CHANGED
@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
 import functools
 import inspect
 import typing
-from typing import Any, Dict, List, Optional, Tuple, Type
+from typing import Any, Dict, List, Optional, Set, Tuple, Type
 
 from sky import models
 from sky import sky_logging
@@ -29,6 +29,7 @@ from sky.provision import runpod
 from sky.provision import scp
 from sky.provision import seeweb
 from sky.provision import shadeform
+from sky.provision import slurm
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
@@ -151,16 +152,18 @@ def get_volume_usedby(
 @_route_to_cloud_impl
 def get_all_volumes_usedby(
     provider_name: str, configs: List[models.VolumeConfig]
-) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-    """Get the usedby of a volume.
+) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
+    """Get the usedby of all volumes.
+
+    Args:
+        provider_name: Name of the provider.
+        configs: List of VolumeConfig objects.
 
     Returns:
-        usedby_pods: List of dictionaries, each containing the config keys for
-                     a volume and a key containing pods using the volume.
-                     These may include pods not created by SkyPilot.
-        usedby_clusters: List of dictionaries, each containing the config keys
-                         for a volume and a key containing clusters using
-                         the volume.
+        usedby_pods: Dict of usedby pods.
+        usedby_clusters: Dict of usedby clusters.
+        failed_volume_names: Set of volume names whose usedby info
+            failed to fetch.
     """
     raise NotImplementedError
 
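
get_all_volumes_usedby now returns a third element so a single failed per-volume lookup no longer has to abort the whole call. A toy stand-in showing the shape of the widened contract; _lookup and get_all_volumes_usedby_stub are invented for illustration, and the real implementations live in the per-cloud provision modules.

from typing import Any, Dict, List, Set, Tuple

def _lookup(name: str) -> Tuple[List[str], List[str]]:
    """Toy per-volume lookup that fails for one volume."""
    if name == 'vol-b':
        raise ConnectionError('API server unreachable')
    return [f'{name}-pod'], [f'{name}-cluster']

def get_all_volumes_usedby_stub(
        names: List[str]) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    usedby_pods: Dict[str, Any] = {}
    usedby_clusters: Dict[str, Any] = {}
    failed: Set[str] = set()
    for name in names:
        try:
            usedby_pods[name], usedby_clusters[name] = _lookup(name)
        except ConnectionError:
            # Record the failure instead of raising; callers decide how to
            # treat volumes whose usedby info is unknown.
            failed.add(name)
    return usedby_pods, usedby_clusters, failed

print(get_all_volumes_usedby_stub(['vol-a', 'vol-b']))
# ({'vol-a': ['vol-a-pod']}, {'vol-a': ['vol-a-cluster']}, {'vol-b'})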
sky/provision/common.py CHANGED
@@ -6,6 +6,7 @@ import os
 from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
+from sky.utils import config_utils
 from sky.utils import env_options
 from sky.utils import resources_utils
 
@@ -36,6 +37,13 @@ class StopFailoverError(Exception):
     """
 
 
+# These fields are sensitive and should be redacted from the config for logging
+# purposes.
+SENSITIVE_FIELDS = [
+    ('docker_config', 'docker_login_config', 'password'),
+]
+
+
 @dataclasses.dataclass
 class ProvisionConfig:
     """Configuration for provisioning."""
@@ -56,6 +64,18 @@ class ProvisionConfig:
     # Optional ports to open on launch of the cluster.
     ports_to_open_on_launch: Optional[List[int]]
 
+    def get_redacted_config(self) -> Dict[str, Any]:
+        """Get the redacted config."""
+        config = dataclasses.asdict(self)
+
+        config_copy = config_utils.Config(config)
+
+        for field_list in SENSITIVE_FIELDS:
+            val = config_copy.get_nested(field_list, default_value=None)
+            if val is not None:
+                config_copy.set_nested(field_list, '<redacted>')
+        return dict(**config_copy)
+
 
 # -------------------- output data model -------------------- #
 
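
The redaction walk above relies on the SkyPilot-internal config_utils.Config helper. As a rough, dependency-free sketch of the same idea over a plain nested dict (the redact function below is illustrative, not the code in the diff):

import copy
from typing import Any, Dict

SENSITIVE_FIELDS = [
    ('docker_config', 'docker_login_config', 'password'),
]

def redact(config: Dict[str, Any]) -> Dict[str, Any]:
    """Replace each sensitive nested key with '<redacted>' if it is present."""
    redacted = copy.deepcopy(config)
    for path in SENSITIVE_FIELDS:
        node = redacted
        for key in path[:-1]:
            node = node.get(key) or {}
        if path[-1] in node:
            node[path[-1]] = '<redacted>'
    return redacted

cfg = {'docker_config': {'docker_login_config': {'username': 'me',
                                                 'password': 'hunter2'}}}
print(redact(cfg)['docker_config']['docker_login_config'])
# {'username': 'me', 'password': '<redacted>'}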
sky/provision/docker_utils.py CHANGED
@@ -176,6 +176,17 @@ def _with_interactive(cmd):
     return ['bash', '--login', '-c', '-i', shlex.quote(force_interactive)]
 
 
+def _redact_docker_password(cmd: str) -> str:
+    parts = shlex.split(cmd)
+    for i, part in enumerate(parts):
+        if part.startswith('--password'):
+            if part.startswith('--password='):
+                parts[i] = '--password=<redacted>'
+            elif i + 1 < len(parts):
+                parts[i + 1] = '<redacted>'
+    return ' '.join(parts)
+
+
 # SkyPilot: New class to initialize docker containers on a remote node.
 # Adopted from ray.autoscaler._private.command_runner.DockerCommandRunner.
 class DockerInitializer:
@@ -219,7 +230,9 @@ class DockerInitializer:
         cmd = (f'flock {flock_args} /tmp/{flock_name} '
                f'-c {shlex.quote(cmd)}')
 
-        logger.debug(f'+ {cmd}')
+        # Redact the password in the login command.
+        redacted_cmd = _redact_docker_password(cmd)
+        logger.debug(f'+ {redacted_cmd}')
         start = time.time()
         while True:
             rc, stdout, stderr = self.runner.run(
@@ -251,7 +264,7 @@ class DockerInitializer:
                 break
             subprocess_utils.handle_returncode(
                 rc,
-                cmd,
+                redacted_cmd,
                 error_msg='Failed to run docker setup commands.',
                 stderr=stdout + stderr,
                 # Print out the error message if the command failed.
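
For reference, the new helper is self-contained enough to run on its own; the docker login command below is a made-up example showing both flag spellings it handles.

import shlex

# Copy of the helper added above, runnable stand-alone.
def _redact_docker_password(cmd: str) -> str:
    parts = shlex.split(cmd)
    for i, part in enumerate(parts):
        if part.startswith('--password'):
            if part.startswith('--password='):
                parts[i] = '--password=<redacted>'
            elif i + 1 < len(parts):
                parts[i + 1] = '<redacted>'
    return ' '.join(parts)

print(_redact_docker_password(
    'docker login --username me --password s3cret registry.example.com'))
# docker login --username me --password <redacted> registry.example.com
print(_redact_docker_password('docker login --password=s3cret registry.example.com'))
# docker login --password=<redacted> registry.example.com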