skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py
CHANGED
@@ -189,6 +189,7 @@ def _get_cluster_records_and_set_ssh_config(
         # can still exist in the record, and we check for credentials to avoid
         # updating the SSH config for non-existent clusters.
         credentials = record['credentials']
+        ips = handle.cached_external_ips
         if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
            # Replace the proxy command to proxy through the SkyPilot API
            # server with websocket.

@@ -215,12 +216,28 @@ def _get_cluster_records_and_set_ssh_config(
                f'\"{escaped_executable_path} '
                f'{escaped_websocket_proxy_path} '
                f'{server_common.get_server_url()} '
-               f'{handle.cluster_name}
+               f'{handle.cluster_name} '
+               f'kubernetes-pod-ssh-proxy\"')
+           credentials['ssh_proxy_command'] = proxy_command
+       elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+           # Replace the proxy command to proxy through the SkyPilot API
+           # server with websocket.
+           escaped_executable_path = shlex.quote(sys.executable)
+           escaped_websocket_proxy_path = shlex.quote(
+               f'{directory_utils.get_sky_dir()}/templates/websocket_proxy.py')
+           # %w is a placeholder for the node index, substituted per-node
+           # in cluster_utils.SSHConfigHelper.add_cluster().
+           proxy_command = (f'{escaped_executable_path} '
+                            f'{escaped_websocket_proxy_path} '
+                            f'{server_common.get_server_url()} '
+                            f'{handle.cluster_name} '
+                            f'slurm-job-ssh-proxy %w')
            credentials['ssh_proxy_command'] = proxy_command

        cluster_utils.SSHConfigHelper.add_cluster(
            handle.cluster_name,
-           handle.
+           handle.cluster_name_on_cloud,
+           ips,
            credentials,
            handle.cached_external_ssh_ports,
            handle.docker_user,

@@ -832,7 +849,19 @@ class _NaturalOrderGroup(click.Group):
    """

    def list_commands(self, ctx):  # pylint: disable=unused-argument
-
+       # Preserve definition order but hide aliases (same command object) and
+       # commands explicitly marked as hidden.
+       seen_commands = set()
+       names = []
+       for name, command in self.commands.items():
+           if getattr(command, 'hidden', False):
+               continue
+           command_id = id(command)
+           if command_id in seen_commands:
+               continue
+           seen_commands.add(command_id)
+           names.append(name)
+       return names

    @usage_lib.entrypoint('sky.cli', fallback=True)
    def invoke(self, ctx):

@@ -3424,7 +3453,12 @@ def _down_or_stop_clusters(
            click.echo(f' {name} ({first})')

    if failures:
-
+       failure_str = 'Cluster(s) failed. See details above.'
+       if down:
+           failure_str += (
+               ' If you want to ignore the errors and remove the '
+               'cluster(s) from the status table, use `sky down --purge`.')
+       click.echo(failure_str)


 @cli.command(cls=_DocumentedCodeCommand)

@@ -3535,6 +3569,10 @@ def show_gpus(
    maximum quantities of the GPU available on a single node and the real-time
    availability of the GPU across all nodes in the Kubernetes cluster.

+   If ``--cloud slurm`` is specified, it will show the maximum quantities of
+   the GPU available on a single node and the real-time availability of the
+   GPU across all nodes in the Slurm cluster.
+
    Definitions of certain fields:

    * ``DEVICE_MEM``: Memory of a single device; does not depend on the device

@@ -3590,6 +3628,8 @@ def show_gpus(
    cloud_is_kubernetes = isinstance(
        cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
    cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+   cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
    # TODO(romilb): We should move this to the backend.
    kubernetes_autoscaling = skypilot_config.get_effective_region_config(
        cloud='kubernetes',

@@ -3598,6 +3638,7 @@ def show_gpus(
        default_value=None) is not None
    kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
    ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+   slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
    query_k8s_realtime_gpu = (kubernetes_is_enabled and
                              (cloud_name is None or cloud_is_kubernetes))
    query_ssh_realtime_gpu = (ssh_is_enabled and

@@ -3657,8 +3698,9 @@ def show_gpus(
            raise ValueError(full_err_msg)
        no_permissions_str = '<no permissions>'
        realtime_gpu_infos = []
+       # Stores per-GPU totals as [ready_capacity, available, not_ready].
        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
-           lambda: [0, 0])
+           lambda: [0, 0, 0])
        all_nodes_info = []

        # display an aggregated table for all contexts

@@ -3669,6 +3711,33 @@ def show_gpus(

        num_filtered_contexts = 0

+       def _count_not_ready_gpus(
+           nodes_info: Optional['models.KubernetesNodesInfo']
+       ) -> Dict[str, int]:
+           """Return counts of GPUs on not ready nodes keyed by GPU type."""
+           not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+           if nodes_info is None:
+               return not_ready_counts
+
+           node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+           for node_info in node_info_dict.values():
+               accelerator_type = getattr(node_info, 'accelerator_type', None)
+               if not accelerator_type:
+                   continue
+
+               total_info = getattr(node_info, 'total', {})
+               accelerator_count = 0
+               if isinstance(total_info, dict):
+                   accelerator_count = int(
+                       total_info.get('accelerator_count', 0))
+               if accelerator_count <= 0:
+                   continue
+
+               node_is_ready = getattr(node_info, 'is_ready', True)
+               if not node_is_ready:
+                   not_ready_counts[accelerator_type] += accelerator_count
+           return not_ready_counts
+
        if realtime_gpu_availability_lists:
            for (ctx, availability_list) in realtime_gpu_availability_lists:
                if not _filter_ctx(ctx):

@@ -3678,6 +3747,12 @@ def show_gpus(
                else:
                    display_ctx = ctx
                num_filtered_contexts += 1
+               # Collect node info for this context before building tables so
+               # we can exclude GPUs on not ready nodes from the totals.
+               nodes_info = sdk.stream_and_get(
+                   sdk.kubernetes_node_info(context=ctx))
+               context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
                realtime_gpu_table = log_utils.create_table(
                    ['GPU', qty_header, 'UTILIZATION'])
                for realtime_gpu_availability in sorted(availability_list):

@@ -3686,24 +3761,116 @@ def show_gpus(
                    available_qty = (gpu_availability.available
                                     if gpu_availability.available != -1 else
                                     no_permissions_str)
+                   # Exclude GPUs on not ready nodes from capacity counts.
+                   not_ready_count = min(
+                       context_not_ready_counts.get(gpu_availability.gpu, 0),
+                       gpu_availability.capacity)
+                   # Ensure capacity is never below the reported available
+                   # quantity (if available is unknown, treat as 0 for totals).
+                   available_for_totals = max(
+                       gpu_availability.available
+                       if gpu_availability.available != -1 else 0, 0)
+                   effective_capacity = max(
+                       gpu_availability.capacity - not_ready_count,
+                       available_for_totals)
+                   utilization = (
+                       f'{available_qty} of {effective_capacity} free')
+                   if not_ready_count > 0:
+                       utilization += f' ({not_ready_count} not ready)'
                    realtime_gpu_table.add_row([
                        gpu_availability.gpu,
                        _list_to_str(gpu_availability.counts),
-
+                       utilization,
                    ])
                    gpu = gpu_availability.gpu
-                   capacity = gpu_availability.capacity
                    # we want total, so skip permission denied.
-
-
-                   total_gpu_info[gpu][
-                   total_gpu_info[gpu][
+                   if effective_capacity > 0 or not_ready_count > 0:
+                       total_gpu_info[gpu][0] += effective_capacity
+                       total_gpu_info[gpu][1] += available_for_totals
+                       total_gpu_info[gpu][2] += not_ready_count
                realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
-               # Collect node info for this context
-               nodes_info = sdk.stream_and_get(
-                   sdk.kubernetes_node_info(context=ctx))
                all_nodes_info.append((display_ctx, nodes_info))
        if num_filtered_contexts > 1:
+           total_realtime_gpu_table = log_utils.create_table(
+               ['GPU', 'UTILIZATION'])
+           for gpu, stats in total_gpu_info.items():
+               not_ready = stats[2]
+               utilization = f'{stats[1]} of {stats[0]} free'
+               if not_ready > 0:
+                   utilization += f' ({not_ready} not ready)'
+               total_realtime_gpu_table.add_row([gpu, utilization])
+       else:
+           total_realtime_gpu_table = None
+
+       return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+   def _get_slurm_realtime_gpu_tables(
+       name_filter: Optional[str] = None,
+       quantity_filter: Optional[int] = None
+   ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+              Optional['prettytable.PrettyTable']]:
+       """Get Slurm GPU availability tables.
+
+       Args:
+           name_filter: Filter GPUs by name.
+           quantity_filter: Filter GPUs by quantity.
+
+       Returns:
+           A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+       """
+       if quantity_filter:
+           qty_header = 'QTY_FILTER'
+       else:
+           qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+       realtime_gpu_availability_lists = sdk.stream_and_get(
+           sdk.realtime_slurm_gpu_availability(
+               name_filter=name_filter, quantity_filter=quantity_filter))
+       if not realtime_gpu_availability_lists:
+           err_msg = 'No GPUs found in any Slurm partition. '
+           debug_msg = 'To further debug, run: sky check slurm '
+           if name_filter is not None:
+               gpu_info_msg = f' {name_filter!r}'
+               if quantity_filter is not None:
+                   gpu_info_msg += (' with requested quantity'
+                                    f' {quantity_filter}')
+               err_msg = (f'Resources{gpu_info_msg} not found '
+                          'in any Slurm partition. ')
+               debug_msg = ('To show available accelerators on Slurm,'
+                            ' run: sky show-gpus --cloud slurm ')
+           raise ValueError(err_msg + debug_msg)
+
+       realtime_gpu_infos = []
+       total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+           lambda: [0, 0])
+
+       for (slurm_cluster,
+            availability_list) in realtime_gpu_availability_lists:
+           realtime_gpu_table = log_utils.create_table(
+               ['GPU', qty_header, 'UTILIZATION'])
+           for realtime_gpu_availability in sorted(availability_list):
+               gpu_availability = models.RealtimeGpuAvailability(
+                   *realtime_gpu_availability)
+               # Use the counts directly from the backend, which are already
+               # generated in powers of 2 (plus any actual maximums)
+               requestable_quantities = gpu_availability.counts
+               realtime_gpu_table.add_row([
+                   gpu_availability.gpu,
+                   _list_to_str(requestable_quantities),
+                   (f'{gpu_availability.available} of '
+                    f'{gpu_availability.capacity} free'),
+               ])
+               gpu = gpu_availability.gpu
+               capacity = gpu_availability.capacity
+               available = gpu_availability.available
+               if capacity > 0:
+                   total_gpu_info[gpu][0] += capacity
+                   total_gpu_info[gpu][1] += available
+           realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+       # display an aggregated table for all partitions
+       # if there are more than one partitions with GPUs
+       if len(realtime_gpu_infos) > 1:
            total_realtime_gpu_table = log_utils.create_table(
                ['GPU', 'UTILIZATION'])
            for gpu, stats in total_gpu_info.items():

@@ -3712,14 +3879,16 @@ def show_gpus(
        else:
            total_realtime_gpu_table = None

-       return realtime_gpu_infos, total_realtime_gpu_table
+       return realtime_gpu_infos, total_realtime_gpu_table

    def _format_kubernetes_node_info_combined(
        contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
        cloud_str: str = 'Kubernetes',
        context_title_str: str = 'CONTEXT') -> str:
-       node_table = log_utils.create_table(
-
+       node_table = log_utils.create_table([
+           context_title_str, 'NODE', 'vCPU', 'Memory (GB)', 'GPU',
+           'GPU UTILIZATION'
+       ])

        no_permissions_str = '<no permissions>'
        hints = []

@@ -3736,10 +3905,56 @@ def show_gpus(
                acc_type = node_info.accelerator_type
                if acc_type is None:
                    acc_type = '-'
+
+               # Format CPU and memory: "X of Y free" or just "Y" if
+               # free is unknown
+               cpu_str = '-'
+               if node_info.cpu_count is not None:
+                   cpu_total_str = common_utils.format_float(
+                       node_info.cpu_count, precision=0)
+
+                   # Check if we have free CPU info (use hasattr to
+                   # check if field exists, then access directly)
+                   cpu_free = None
+                   if hasattr(node_info, 'cpu_free'):
+                       cpu_free = node_info.cpu_free
+                   if cpu_free is not None:
+                       cpu_free_str = common_utils.format_float(cpu_free,
+                                                                precision=0)
+                       cpu_str = f'{cpu_free_str} of {cpu_total_str} free'
+                   else:
+                       cpu_str = cpu_total_str
+
+               memory_str = '-'
+               if node_info.memory_gb is not None:
+                   memory_total_str = common_utils.format_float(
+                       node_info.memory_gb, precision=0)
+
+                   # Check if we have free memory info (use hasattr
+                   # to check if field exists, then access directly)
+                   memory_free_gb = None
+                   if hasattr(node_info, 'memory_free_gb'):
+                       memory_free_gb = node_info.memory_free_gb
+                   if memory_free_gb is not None:
+                       memory_free_str = common_utils.format_float(
+                           memory_free_gb, precision=0)
+                       memory_str = (
+                           f'{memory_free_str} of {memory_total_str} free')
+                   else:
+                       memory_str = memory_total_str
+
+               utilization_str = (
+                   f'{available} of '
+                   f'{node_info.total["accelerator_count"]} free')
+               # Check if node is ready (defaults to True for backward
+               # compatibility with older server versions)
+               node_is_ready = getattr(node_info, 'is_ready', True)
+               if not node_is_ready:
+                   utilization_str += ' (Node NotReady)'
+
                node_table.add_row([
-                   context_name, node_name, acc_type,
-
-                   'free'
+                   context_name, node_name, cpu_str, memory_str, acc_type,
+                   utilization_str
                ])

        k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')

@@ -3751,6 +3966,42 @@ def show_gpus(
                f'{colorama.Style.RESET_ALL}\n'
                f'{node_table.get_string()}')

+   def _format_slurm_node_info(slurm_cluster_names: List[str]) -> str:
+       node_table = log_utils.create_table([
+           'CLUSTER',
+           'NODE',
+           'PARTITION',
+           'STATE',
+           'GPU',
+           'UTILIZATION',
+       ])
+
+       request_ids = [(cluster_name,
+                       sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+                      for cluster_name in slurm_cluster_names]
+
+       for cluster_name, request_id in request_ids:
+           nodes_info = sdk.stream_and_get(request_id)
+
+           for node_info in nodes_info:
+               node_table.add_row([
+                   cluster_name,
+                   node_info.get('node_name'),
+                   node_info.get('partition', '-'),
+                   node_info.get('node_state'),
+                   node_info.get('gpu_type') or '',
+                   (f'{node_info.get("free_gpus", 0)} of '
+                    f'{node_info.get("total_gpus", 0)} free'),
+               ])
+
+       slurm_per_node_msg = 'Slurm per node accelerator availability'
+       # Optional: Add hint message if needed, similar to k8s
+
+       return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+               f'{slurm_per_node_msg}'
+               f'{colorama.Style.RESET_ALL}\n'
+               f'{node_table.get_string()}')
+
    def _format_kubernetes_realtime_gpu(
        total_table: Optional['prettytable.PrettyTable'],
        k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],

@@ -3880,6 +4131,29 @@ def show_gpus(
            return True, print_section_titles
        return False, print_section_titles

+   def _format_slurm_realtime_gpu(
+           total_table, slurm_realtime_infos,
+           show_node_info: bool) -> Generator[str, None, None]:
+       # print total table
+       yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+              'Slurm GPUs'
+              f'{colorama.Style.RESET_ALL}\n')
+       if total_table is not None:
+           yield from total_table.get_string()
+           yield '\n'
+
+       # print individual infos.
+       for (partition, slurm_realtime_table) in slurm_realtime_infos:
+           partition_str = f'Slurm Cluster: {partition}'
+           yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                  f'{partition_str}'
+                  f'{colorama.Style.RESET_ALL}\n')
+           yield from slurm_realtime_table.get_string()
+           yield '\n'
+       if show_node_info:
+           cluster_names = [cluster for cluster, _ in slurm_realtime_infos]
+           yield _format_slurm_node_info(cluster_names)
+
    def _output() -> Generator[str, None, None]:
        gpu_table = log_utils.create_table(
            ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])

@@ -3897,10 +4171,12 @@ def show_gpus(
        if cloud_name is None:
            clouds_to_list = [
                c for c in constants.ALL_CLOUDS
-               if c != 'kubernetes' and c != 'ssh'
+               if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
            ]

        k8s_messages = ''
+       slurm_messages = ''
+       k8s_printed = False
        if accelerator_str is None:
            # Collect k8s related messages in k8s_messages and print them at end
            print_section_titles = False

@@ -3912,6 +4188,7 @@ def show_gpus(
                    yield '\n\n'
                stop_iter_one, print_section_titles_one, k8s_messages_one = (
                    yield from _possibly_show_k8s_like_realtime(is_ssh))
+               k8s_printed = True
                stop_iter = stop_iter or stop_iter_one
                print_section_titles = (print_section_titles or
                                        print_section_titles_one)

@@ -3919,11 +4196,45 @@ def show_gpus(
                prev_print_section_titles = print_section_titles_one
            if stop_iter:
                return
+           # If cloud is slurm, we want to show real-time capacity
+           if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+               try:
+                   # If --cloud slurm is not specified, we want to catch
+                   # the case where no GPUs are available on the cluster and
+                   # print the warning at the end.
+                   slurm_realtime_infos, total_table = (
+                       _get_slurm_realtime_gpu_tables())
+               except ValueError as e:
+                   if not cloud_is_slurm:
+                       # Make it a note if cloud is not slurm
+                       slurm_messages += 'Note: '
+                   slurm_messages += str(e)
+               else:
+                   print_section_titles = True
+                   if k8s_printed:
+                       yield '\n'
+
+                   yield from _format_slurm_realtime_gpu(total_table,
+                                                         slurm_realtime_infos,
+                                                         show_node_info=True)
+
+           if cloud_is_slurm:
+               # Do not show clouds if --cloud slurm is specified
+               if not slurm_is_enabled:
+                   yield ('Slurm is not enabled. To fix, run: '
+                          'sky check slurm ')
+               yield slurm_messages
+               return

            # For show_all, show the k8s message at the start since output is
            # long and the user may not scroll to the end.
-           if show_all and k8s_messages:
-
+           if show_all and (k8s_messages or slurm_messages):
+               if k8s_messages:
+                   yield k8s_messages
+               if slurm_messages:
+                   if k8s_messages:
+                       yield '\n'
+                   yield slurm_messages
                yield '\n\n'

            list_accelerator_counts_result = sdk.stream_and_get(

@@ -3971,9 +4282,10 @@ def show_gpus(
            else:
                yield ('\n\nHint: use -a/--all to see all accelerators '
                       '(including non-common ones) and pricing.')
-           if k8s_messages:
+           if k8s_messages or slurm_messages:
                yield '\n'
                yield k8s_messages
+               yield slurm_messages
            return
        else:
            # Parse accelerator string

@@ -4013,6 +4325,31 @@ def show_gpus(
            if stop_iter:
                return

+       # Handle Slurm filtering by name and quantity
+       if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+               not show_all):
+           # Print section title if not showing all and instead a specific
+           # accelerator is requested
+           print_section_titles = True
+           try:
+               slurm_realtime_infos, total_table = (
+                   _get_slurm_realtime_gpu_tables(name_filter=name,
+                                                  quantity_filter=quantity))
+
+               yield from _format_slurm_realtime_gpu(total_table,
+                                                     slurm_realtime_infos,
+                                                     show_node_info=False)
+           except ValueError as e:
+               # In the case of a specific accelerator, show the error message
+               # immediately (e.g., "Resources A10G not found ...")
+               yield str(e)
+               yield slurm_messages
+           if cloud_is_slurm:
+               # Do not show clouds if --cloud slurm is specified
+               if not slurm_is_enabled:
+                   yield ('Slurm is not enabled. To fix, run: '
+                          'sky check slurm ')
+               return
        # For clouds other than Kubernetes, get the accelerator details
        # Case-sensitive
        list_accelerators_result = sdk.stream_and_get(

@@ -4398,6 +4735,13 @@ def volumes_ls(verbose: bool):
              is_flag=True,
              required=False,
              help='Delete all volumes.')
+@click.option('--purge',
+              '-p',
+              default=False,
+              is_flag=True,
+              required=False,
+              help=('Forcibly delete the volume from the volumes table even '
+                    'if the deletion API fails.'))
 @click.option('--yes',
              '-y',
              default=False,

@@ -4406,7 +4750,12 @@ def volumes_ls(verbose: bool):
              help='Skip confirmation prompt.')
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def volumes_delete(
+def volumes_delete(
+       names: List[str],
+       all: bool,  # pylint: disable=redefined-builtin
+       purge: bool,
+       yes: bool,
+       async_call: bool):
    """Delete volumes.

    Examples:

@@ -4421,6 +4770,9 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
    \b
    # Delete all volumes.
    sky volumes delete -a
+   \b
+   # Forcibly delete a volume.
+   sky volumes delete pvc1 -p
    """
    if sum([bool(names), all]) != 1:
        raise click.UsageError('Either --all or a name must be specified.')

@@ -4447,8 +4799,8 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
        show_default=True)

    try:
-       _async_call_or_wait(volumes_sdk.delete(names
-                           'sky.volumes.delete')
+       _async_call_or_wait(volumes_sdk.delete(names, purge=purge),
+                           async_call, 'sky.volumes.delete')
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f'{colorama.Fore.RED}Error deleting volumes {names}: '
                     f'{str(e)}{colorama.Style.RESET_ALL}')

@@ -5120,9 +5472,14 @@ def jobs_pool_apply(
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
 @click.argument('pool_names', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              'show_all',
+              is_flag=True,
+              default=False,
+              help='Show all workers.')
 @usage_lib.entrypoint
-
-def jobs_pool_status(verbose: bool, pool_names: List[str]):
+def jobs_pool_status(verbose: bool, pool_names: List[str], show_all: bool):
    """Show statuses of pools.

    Show detailed statuses of one or more pools. If POOL_NAME is not

@@ -5135,7 +5492,7 @@ def jobs_pool_status(verbose: bool, pool_names: List[str]):
    pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
    _, msg = _handle_services_request(pool_status_request_id,
                                      service_names=pool_names_to_query,
-                                     show_all=verbose,
+                                     show_all=verbose or show_all,
                                      show_endpoint=False,
                                      pool=True,
                                      is_called_by_user=True)

@@ -6438,9 +6795,11 @@ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
        if not verbose:
            r_id = common_utils.truncate_long_string(r_id, 36)
        req_status = requests.RequestStatus(request.status)
-
+       user_display = status_utils.get_user_display_name(
+           request.user_name or '-', request.user_id)
+       row = [r_id, user_display, request.name]
        if verbose:
-           row.append(request.cluster_name)
+           row.append(request.cluster_name or '-')
        row.extend([
            log_utils.readable_time_duration(request.created_at),
            req_status.colored_str()