skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,689 @@
|
|
|
1
|
+
"""Slurm utilities for SkyPilot."""
|
|
2
|
+
import json
|
|
3
|
+
import math
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import shlex
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
from paramiko.config import SSHConfig
|
|
11
|
+
|
|
12
|
+
from sky import exceptions
|
|
13
|
+
from sky import sky_logging
|
|
14
|
+
from sky.adaptors import slurm
|
|
15
|
+
from sky.skylet import constants
|
|
16
|
+
from sky.utils import annotations
|
|
17
|
+
from sky.utils import common_utils
|
|
18
|
+
from sky.utils.db import kv_cache
|
|
19
|
+
|
|
20
|
+
logger = sky_logging.init_logger(__name__)
|
|
21
|
+
|
|
22
|
+
# Location of the SSH-config-format file describing the Slurm clusters.
# get_slurm_ssh_config() expands '~' and parses it with paramiko's SSHConfig.
DEFAULT_SLURM_PATH = '~/.slurm/config'
|
|
23
|
+
# NOTE(review): not referenced in the visible part of this module; presumably
# the name of a marker file identifying a SkyPilot Slurm cluster - confirm.
SLURM_MARKER_FILE = '.sky_slurm_cluster'
|
|
24
|
+
|
|
25
|
+
# Regex pattern for parsing GPU GRES strings.
# Format: 'gpu[:acc_type]:acc_count(optional_extra_info)'
# Examples: 'gpu:8', 'gpu:H100:8', 'gpu:nvidia_h100_80gb_hbm3:8(S:0-1)'
# The type may not contain ',' because a GRES string can list several
# comma-separated resources; without that exclusion 'gpu:2,mps:200' would be
# parsed as type '2,mps' with count 200 instead of an untyped count of 2.
_GRES_GPU_PATTERN = re.compile(r'\bgpu:(?:(?P<type>[^:,(]+):)?(?P<count>\d+)',
                               re.IGNORECASE)

# Lifetime, in seconds, of a cached node listing (30 minutes).
_SLURM_NODES_INFO_CACHE_TTL = 30 * 60


def get_gpu_type_and_count(gres_str: str) -> Tuple[Optional[str], int]:
    """Parses GPU type and count from a GRES string.

    Only the first GPU entry found in the string is considered.

    Args:
        gres_str: GRES string, e.g. 'gpu:H100:8' or 'gpu:2,mps:200'.

    Returns:
        A tuple of (GPU type, GPU count). The type is None when the entry has
        no type component (e.g. 'gpu:8'). If no GPU is found, returns
        (None, 0).
    """
    match = _GRES_GPU_PATTERN.search(gres_str)
    if match is None:
        return None, 0
    return match.group('type'), int(match.group('count'))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# SSH host key filename for sshd.
|
|
47
|
+
SLURM_SSHD_HOST_KEY_FILENAME = 'skypilot_host_key'
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_slurm_ssh_config() -> SSHConfig:
    """Load the Slurm SSH config.

    Returns:
        The SSHConfig parsed from DEFAULT_SLURM_PATH (with '~' expanded).
    """
    return SSHConfig.from_path(os.path.expanduser(DEFAULT_SLURM_PATH))
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@annotations.lru_cache(scope='request')
def _get_slurm_nodes_info(cluster: str) -> List[slurm.NodeInfo]:
    """Return the node listing of a Slurm cluster, using kv_cache if possible.

    A cached listing (stored as JSON under a per-cluster key) is returned when
    present. Otherwise the cluster is queried through a SlurmClient built from
    the cluster's entry in the Slurm SSH config, and the result is written
    back to the cache on a best-effort basis.

    Args:
        cluster: Name of the Slurm cluster, used as the SSH config host alias.

    Returns:
        The list of slurm.NodeInfo for the cluster.
    """
    cache_key = f'slurm:nodes_info:{cluster}'
    cached_payload = kv_cache.get_cache_entry(cache_key)
    if cached_payload is not None:
        logger.debug(f'Slurm nodes info found in cache ({cache_key})')
        return [
            slurm.NodeInfo(**fields) for fields in json.loads(cached_payload)
        ]

    host_entry = get_slurm_ssh_config().lookup(cluster)
    client = slurm.SlurmClient(
        host_entry['hostname'],
        int(host_entry.get('port', 22)),
        host_entry['user'],
        host_entry['identityfile'][0],
        ssh_proxy_command=host_entry.get('proxycommand', None),
        ssh_proxy_jump=host_entry.get('proxyjump', None),
    )
    nodes_info = client.info_nodes()

    try:
        # The set of nodes rarely changes, so keep the listing around for
        # _SLURM_NODES_INFO_CACHE_TTL seconds.
        serialized = json.dumps([node._asdict() for node in nodes_info])
        expires_at = time.time() + _SLURM_NODES_INFO_CACHE_TTL
        kv_cache.add_or_update_cache_entry(cache_key, serialized, expires_at)
    except Exception as e:  # pylint: disable=broad-except
        # Caching is an optimization only; a failure here must not fail the
        # lookup itself.
        logger.debug(f'Failed to cache slurm nodes info for {cluster}: '
                     f'{common_utils.format_exception(e)}')

    return nodes_info
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class SlurmInstanceType:
    """Class to represent the "Instance Type" in a Slurm cluster.

    Since Slurm does not have a notion of instances, we generate
    virtual instance types that represent the resources requested by a
    Slurm worker node.

    This name captures the following resource requests:
        - CPU
        - Memory
        - Accelerators

    The name format is "{n}CPU--{k}GB" where n is the number of vCPUs and
    k is the amount of memory in GB. Accelerators can be specified by
    appending "--{type}:{a}" where type is the accelerator type and a
    is the number of accelerators.
    CPU and memory can be specified as floats. Accelerator count must be int.

    Examples:
        - 4CPU--16GB
        - 0.5CPU--1.5GB
        - 4CPU--16GB--V100:1
    """

    # Single definition of the name grammar, shared by validation and parsing
    # so that the two can never disagree.
    _NAME_PATTERN = re.compile(
        r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB'
        r'(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$')

    def __init__(self,
                 cpus: float,
                 memory: float,
                 accelerator_count: Optional[int] = None,
                 accelerator_type: Optional[str] = None):
        """Stores the requested resources.

        Args:
            cpus: Number of CPUs.
            memory: Amount of memory in GB.
            accelerator_count: Number of accelerators; None or 0 means none.
            accelerator_type: Accelerator type; required by `name` when
                accelerator_count is non-zero.
        """
        self.cpus = cpus
        self.memory = memory
        self.accelerator_count = accelerator_count
        self.accelerator_type = accelerator_type

    @property
    def name(self) -> str:
        """Returns the name of the instance."""
        assert self.cpus is not None
        assert self.memory is not None
        name = (f'{common_utils.format_float(self.cpus)}CPU--'
                f'{common_utils.format_float(self.memory)}GB')
        # Treat a count of 0 (the from_resources default) like None: adding
        # an accelerator suffix such as '--:0' would produce a name that
        # is_valid_instance_type rejects.
        if self.accelerator_count:
            # Replace spaces with underscores in accelerator type to make it a
            # valid logical instance type name.
            assert self.accelerator_type is not None, self.accelerator_count
            acc_name = self.accelerator_type.replace(' ', '_')
            name += f'--{acc_name}:{self.accelerator_count}'
        return name

    @staticmethod
    def is_valid_instance_type(name: str) -> bool:
        """Returns whether the given name is a valid instance type."""
        return SlurmInstanceType._NAME_PATTERN.match(name) is not None

    @classmethod
    def _parse_instance_type(
            cls,
            name: str) -> Tuple[float, float, Optional[int], Optional[str]]:
        """Parses and returns resources from the given InstanceType name.

        Returns:
            cpus | float: Number of CPUs
            memory | float: Amount of memory in GB
            accelerator_count | Optional[int]: Number of accelerators, or None
                if the name has no accelerator suffix
            accelerator_type | Optional[str]: Type of accelerator, or None if
                the name has no accelerator suffix

        Raises:
            ValueError: If the name does not follow the instance type format.
        """
        match = cls._NAME_PATTERN.match(name)
        if match is None:
            raise ValueError(f'Invalid instance name: {name}')
        cpus = float(match.group('cpus'))
        memory = float(match.group('memory'))
        accelerator_count = match.group('accelerator_count')
        if accelerator_count is None:
            return cpus, memory, None, None
        # The type is returned exactly as written in the name. The grammar
        # does not allow spaces, so underscores (including those substituted
        # for spaces by `name`) are kept as-is.
        return (cpus, memory, int(accelerator_count),
                match.group('accelerator_type'))

    @classmethod
    def from_instance_type(cls, name: str) -> 'SlurmInstanceType':
        """Returns an instance name object from the given name.

        Raises:
            ValueError: If the name is not a valid instance type.
        """
        cpus, memory, accelerator_count, accelerator_type = (
            cls._parse_instance_type(name))
        return cls(cpus=cpus,
                   memory=memory,
                   accelerator_count=accelerator_count,
                   accelerator_type=accelerator_type)

    @classmethod
    def from_resources(cls,
                       cpus: float,
                       memory: float,
                       accelerator_count: Union[float, int] = 0,
                       accelerator_type: str = '') -> 'SlurmInstanceType':
        """Returns an instance name object from the given resources.

        If accelerator_count is not an int, it will be rounded up since GPU
        requests in Slurm must be int.

        NOTE: Should we take MIG management into account? See
        https://slurm.schedmd.com/gres.html#MIG_Management.
        """
        # Round up accelerator_count if it is not an int.
        accelerator_count = math.ceil(accelerator_count)
        return cls(cpus=cpus,
                   memory=memory,
                   accelerator_count=accelerator_count,
                   accelerator_type=accelerator_type)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return (f'SlurmInstanceType(cpus={self.cpus!r}, '
                f'memory={self.memory!r}, '
                f'accelerator_count={self.accelerator_count!r}, '
                f'accelerator_type={self.accelerator_type!r})')
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def instance_id(job_id: str, node: str) -> str:
    """Build the SkyPilot instance ID for a node of a Slurm job.

    The ID has the form 'job<job_id>-<node>'; the (job id, node) pair is
    unique within a Slurm cluster.
    """
    return 'job{}-{}'.format(job_id, node)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def get_partition_from_config(provider_config: Dict[str, Any]) -> str:
    """Look up the Slurm partition in the provider config.

    A partition plays the role that a zone plays for a cloud.

    Args:
        provider_config: Provider config; read for its 'partition' key.

    Returns:
        The value stored under 'partition'.

    Raises:
        ValueError: If 'partition' is missing or set to None.
    """
    partition = provider_config.get('partition')
    if partition is not None:
        return partition
    raise ValueError('Partition not specified in provider config.')
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
@annotations.lru_cache(scope='request')
def get_cluster_default_partition(cluster_name: str) -> Optional[str]:
    """Return the default partition of a Slurm cluster.

    The cluster is asked for the partition that sinfo marks with an
    asterisk (*).

    Args:
        cluster_name: Name of the Slurm cluster, used as the SSH config host
            alias.

    Returns:
        The default partition name, or None if no partition is marked as
        default.

    Raises:
        ValueError: If the Slurm SSH config cannot be loaded or looked up.
    """
    try:
        host_entry = get_slurm_ssh_config().lookup(cluster_name)
    except Exception as e:
        raise ValueError(
            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
            f'{common_utils.format_exception(e)}') from e

    return slurm.SlurmClient(
        host_entry['hostname'],
        int(host_entry.get('port', 22)),
        host_entry['user'],
        host_entry['identityfile'][0],
        ssh_proxy_command=host_entry.get('proxycommand', None),
        ssh_proxy_jump=host_entry.get('proxyjump', None),
    ).get_default_partition()
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def get_all_slurm_cluster_names() -> List[str]:
    """List the Slurm clusters declared in the Slurm SSH config.

    Every host name in the config is treated as a cluster name, except the
    wildcard entry '*'.

    Returns:
        List[str]: The cluster names, or an empty list if the config file
            does not exist.

    Raises:
        ValueError: If the config file exists but cannot be loaded.
    """
    try:
        ssh_config = get_slurm_ssh_config()
    except FileNotFoundError:
        return []
    except Exception as e:
        raise ValueError(
            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
            f'{common_utils.format_exception(e)}') from e

    return [host for host in ssh_config.get_hostnames() if host != '*']
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _check_cpu_mem_fits(
        candidate_instance_type: SlurmInstanceType,
        node_list: List[slurm.NodeInfo]) -> Tuple[bool, Optional[str]]:
    """Tells whether any candidate node can hold the instance's CPU and memory.

    Node capacity (not what is currently allocatable) is compared, because
    availability can change while scheduling and is left to the Slurm
    scheduler.

    Args:
        candidate_instance_type: The instance type whose cpus/memory must fit.
        node_list: Nodes to check.

    Returns:
        (True, None) if some node has at least the requested CPUs and memory;
        otherwise (False, reason), where reason reports the highest CPU count
        seen and the memory of that same node.
    """
    required_cpus = candidate_instance_type.cpus
    required_mem_gb = candidate_instance_type.memory
    if any(node.cpus >= required_cpus and node.memory_gb >= required_mem_gb
           for node in node_list):
        return True, None

    # For debugging, report the first node with the highest CPU count together
    # with that node's memory.
    largest_cpus = 0
    largest_mem_gb = 0.0
    for node in node_list:
        if node.cpus > largest_cpus:
            largest_cpus, largest_mem_gb = node.cpus, node.memory_gb

    return False, (f'Max found: {largest_cpus} CPUs, '
                   f'{common_utils.format_float(largest_mem_gb)}G memory')
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def check_instance_fits(
        cluster: str,
        instance_type: str,
        partition: Optional[str] = None) -> Tuple[bool, Optional[str]]:
    """Check if the given instance type fits in the given cluster/partition.

    Args:
        cluster: Name of the Slurm cluster.
        instance_type: The instance type to check.
        partition: Optional partition name. If None, checks all partitions.

    Returns:
        Tuple of (fits, reason) where fits is True if available. When fits is
        False, reason explains why no node qualifies.

    Raises:
        ValueError: If instance_type is not a valid instance type name, or if
            a partition is given and the cluster's default partition cannot
            be determined because the SSH config fails to load.
    """
    # Get Slurm node list in the given cluster (region).
    try:
        nodes = _get_slurm_nodes_info(cluster)
    except FileNotFoundError:
        return (False, f'Could not query Slurm cluster {cluster} '
                f'because the Slurm configuration file '
                f'{DEFAULT_SLURM_PATH} does not exist.')
    except Exception as e:  # pylint: disable=broad-except
        return (False, f'Could not query Slurm cluster {cluster} '
                f'because Slurm SSH configuration at {DEFAULT_SLURM_PATH} '
                f'could not be loaded: {common_utils.format_exception(e)}.')

    partition_suffix = ''
    if partition is not None:
        # The default partition is only needed to normalize partition names
        # for filtering, so the cluster is not queried for it when no
        # partition filter is requested.
        default_partition = get_cluster_default_partition(cluster)

        def normalize_partition(node_partition: str) -> str:
            # info_nodes does not strip the '*' from the default partition
            # name. But non-default partition names can also end with '*',
            # so the '*' is only stripped when the remainder is the default
            # partition name.
            if (default_partition is not None and
                    node_partition.endswith('*') and
                    node_partition[:-1] == default_partition):
                return node_partition[:-1]
            return node_partition

        nodes = [
            node_info for node_info in nodes
            if normalize_partition(node_info.partition) == partition
        ]
        partition_suffix = f' in partition {partition}'

    slurm_instance_type = SlurmInstanceType.from_instance_type(instance_type)
    acc_count = (slurm_instance_type.accelerator_count
                 if slurm_instance_type.accelerator_count is not None else 0)
    acc_type = slurm_instance_type.accelerator_type
    candidate_nodes = nodes
    not_fit_reason_prefix = (
        f'No nodes found with enough '
        f'CPU (> {slurm_instance_type.cpus} CPUs) and/or '
        f'memory (> {slurm_instance_type.memory} G){partition_suffix}. ')
    if acc_type is not None:
        gpu_nodes = []
        for node_info in nodes:
            # Extract the GPU type and count from the GRES string
            node_acc_type, node_acc_count = get_gpu_type_and_count(
                node_info.gres)
            if node_acc_type is None:
                continue

            # TODO(jwj): Handle status check.

            # Check if the node has the requested GPU type and at least the
            # requested count
            if (node_acc_type.lower() == acc_type.lower() and
                    node_acc_count >= acc_count):
                gpu_nodes.append(node_info)
        if not gpu_nodes:
            return (False,
                    f'No GPU nodes found with at least {acc_type}:{acc_count} '
                    f'on the cluster.')

        candidate_nodes = gpu_nodes
        not_fit_reason_prefix = (
            f'GPU nodes with {acc_type}{partition_suffix} do not have '
            f'enough CPU (> {slurm_instance_type.cpus} CPUs) and/or '
            f'memory (> {slurm_instance_type.memory} G). ')

    # Check if CPU and memory requirements are met on at least one
    # candidate node.
    fits, reason = _check_cpu_mem_fits(slurm_instance_type, candidate_nodes)
    if not fits and reason is not None:
        reason = not_fit_reason_prefix + reason
    return fits, reason
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
# GRES names are highly unlikely to change within a cluster.
# TODO(kevin): Cache using sky/utils/db/kv_cache.py too.
@annotations.lru_cache(scope='global', maxsize=10)
def get_gres_gpu_type(cluster: str, requested_gpu_type: str) -> str:
    """Resolve a requested GPU type to the spelling used in the cluster's GRES.

    The cluster's nodes are listed and the first GPU type that matches the
    requested type case-insensitively is returned with the cluster's casing.

    Args:
        cluster: Name of the Slurm cluster.
        requested_gpu_type: The GPU type requested by the user.

    Returns:
        The GPU type as it appears in a node's GRES string. If no node matches
        or the cluster cannot be queried, the requested type in lowercase.
    """
    try:
        host_entry = get_slurm_ssh_config().lookup(cluster)
        client = slurm.SlurmClient(
            host_entry['hostname'],
            int(host_entry.get('port', 22)),
            host_entry['user'],
            host_entry['identityfile'][0],
            ssh_proxy_command=host_entry.get('proxycommand', None),
            ssh_proxy_jump=host_entry.get('proxyjump', None),
        )

        for node in client.info_nodes():
            gres_type = get_gpu_type_and_count(node.gres)[0]
            if (gres_type is not None and
                    gres_type.lower() == requested_gpu_type.lower()):
                return gres_type
    except Exception as e:  # pylint: disable=broad-except
        logger.warning(
            'Failed to determine the exact GPU GRES type from the Slurm '
            f'cluster {cluster!r}. Falling back to '
            f'{requested_gpu_type.lower()!r}. This may cause issues if the '
            f'casing is incorrect. Error: {common_utils.format_exception(e)}')

    # Lowercase is the fallback because GRES names have mostly been lowercase
    # in the clusters seen so far.
    return requested_gpu_type.lower()
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def _get_slurm_node_info_list(
        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
    """Collect one info record per node of a Slurm cluster.

    When no cluster name is given, the first name returned by
    ``get_all_slurm_cluster_names()`` is used. A node that is listed several
    times (once per partition) yields a single record whose ``partition``
    field is the comma-joined list of its partitions.

    Args:
        slurm_cluster_name: Cluster to query; ``None`` selects the first
            configured cluster.

    Returns:
        A list of dicts with the keys ``node_name``, ``slurm_cluster_name``,
        ``node_state``, ``gpu_type``, ``total_gpus``, ``free_gpus``,
        ``vcpu_count``, ``memory_gb`` and ``partition``. Empty if the cluster
        reports no nodes.

    Raises:
        FileNotFoundError: If the Slurm configuration file does not exist.
        ValueError: If no Slurm cluster name is found in the Slurm
            configuration file.
    """
    # States in which running jobs may hold GPUs on the node.
    # TODO(zhwu): move to enum
    states_with_jobs = ('alloc', 'mix', 'drain', 'drng', 'drained', 'resv',
                        'comp')
    # States for which no GPU is reported as free, whatever is allocated.
    states_without_free_gpus = ('down', 'drain', 'drng', 'maint')

    # Can raise FileNotFoundError if the config file does not exist.
    ssh_config = get_slurm_ssh_config()
    if slurm_cluster_name is None:
        configured_clusters = get_all_slurm_cluster_names()
        if configured_clusters:
            slurm_cluster_name = configured_clusters[0]
    if slurm_cluster_name is None:
        raise ValueError(
            f'No Slurm cluster name found in the {DEFAULT_SLURM_PATH} '
            f'configuration.')

    host_cfg = ssh_config.lookup(slurm_cluster_name)
    logger.debug(f'Slurm config dict: {host_cfg}')
    client = slurm.SlurmClient(
        host_cfg['hostname'],
        int(host_cfg.get('port', 22)),
        host_cfg['user'],
        host_cfg['identityfile'][0],
        ssh_proxy_command=host_cfg.get('proxycommand', None),
        ssh_proxy_jump=host_cfg.get('proxyjump', None),
    )

    # 1. Node state and GRES as reported by the client.
    sinfo_rows = client.info_nodes()
    if not sinfo_rows:
        logger.warning(
            f'`sinfo -N` returned no output on cluster {slurm_cluster_name}. '
            f'No nodes found?')
        return []

    # 2. Build one record per node, aggregating partitions per node.
    jobs_gres_by_node = client.get_all_jobs_gres()
    records: Dict[str, Dict[str, Any]] = {}
    for row in sinfo_rows:
        name = row.node
        node_state = row.state
        gres = row.gres
        partition = row.partition

        if name in records:
            # Repeated node: only the extra partition is new information.
            records[name]['partitions'].append(partition)
            continue

        # GPU type and total count come from the node's GRES string.
        gpu_type, gpu_total = get_gpu_type_and_count(gres)
        if gpu_total > 0:
            gpu_type = 'GPU' if gpu_type is None else gpu_type.upper()

        # GPUs held by jobs currently on the node.
        gpu_allocated = 0
        if node_state in states_with_jobs:
            job_gres_lines = jobs_gres_by_node.get(name, [])
            if job_gres_lines:
                gpu_allocated = sum(
                    job_gpu_count for _, job_gpu_count in map(
                        get_gpu_type_and_count, job_gres_lines))
            elif node_state == 'alloc':
                # If no GRES info found but node is fully allocated,
                # assume all GPUs are in use.
                gpu_allocated = gpu_total

        if node_state in states_without_free_gpus:
            gpu_free = 0
        else:
            # Clamp in case jobs report more GPUs than the node advertises.
            gpu_free = max(0, gpu_total - gpu_allocated)

        records[name] = {
            'node_name': name,
            'slurm_cluster_name': slurm_cluster_name,
            'partitions': [partition],
            'node_state': node_state,
            'gpu_type': gpu_type,
            'total_gpus': gpu_total,
            'free_gpus': gpu_free,
            'vcpu_count': row.cpus,
            'memory_gb': round(row.memory_gb, 2),
        }

    # Collapse the temporary partition list into a single display string.
    for record in records.values():
        record['partition'] = ','.join(
            str(p) for p in record.pop('partitions'))

    return list(records.values())
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def slurm_node_info(
        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
    """Return per-node information for a Slurm cluster.

    Thin wrapper around ``_get_slurm_node_info_list`` that turns
    ``RuntimeError`` and ``exceptions.NotSupportedError`` into an empty
    result; other exceptions propagate to the caller.

    Args:
        slurm_cluster_name: Cluster to query, passed through unchanged.

    Returns:
        List[Dict[str, Any]]: One dictionary of node info per node, or an
        empty list if the information could not be retrieved.
    """
    try:
        return _get_slurm_node_info_list(slurm_cluster_name=slurm_cluster_name)
    except (RuntimeError, exceptions.NotSupportedError) as e:
        logger.debug(f'Could not retrieve Slurm node info: {e}')
        return []
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def is_inside_slurm_cluster() -> bool:
    """Return whether the Slurm marker file exists in the home directory."""
    # The marker is looked up under the current home directory. When run by
    # the skylet on a compute node, the HOME environment variable is set to
    # the cluster's sky home directory by the SlurmCommandRunner.
    home_dir = os.path.expanduser('~')
    return os.path.exists(os.path.join(home_dir, SLURM_MARKER_FILE))
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
@annotations.lru_cache(scope='request')
def get_partitions(cluster_name: str) -> List[str]:
    """List the partition names of a Slurm cluster, default partition first.

    Args:
        cluster_name: Name of the Slurm cluster.

    Returns:
        Partition names reported by the cluster: partitions flagged as
        default come first (in reported order), followed by the remaining
        partitions sorted alphabetically.

    Raises:
        ValueError: If anything fails while loading the SSH config or
            querying the cluster; the original exception is chained.
    """
    try:
        config_path = os.path.expanduser(DEFAULT_SLURM_PATH)
        host_cfg = SSHConfig.from_path(config_path).lookup(cluster_name)

        slurm_client = slurm.SlurmClient(
            host_cfg['hostname'],
            int(host_cfg.get('port', 22)),
            host_cfg['user'],
            host_cfg['identityfile'][0],
            ssh_proxy_command=host_cfg.get('proxycommand', None),
            ssh_proxy_jump=host_cfg.get('proxyjump', None),
        )

        defaults: List[str] = []
        others: List[str] = []
        for info in slurm_client.get_partitions_info():
            bucket = defaults if info.is_default else others
            bucket.append(info.name)
        return defaults + sorted(others)
    except Exception as e:  # pylint: disable=broad-except
        raise ValueError(
            f'Failed to get partitions for cluster '
            f'{cluster_name}: {common_utils.format_exception(e)}') from e
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def srun_sshd_command(
    job_id: str,
    target_node: str,
    unix_user: str,
) -> str:
    """Build the srun command that launches ``sshd -i`` inside a Slurm job.

    This is used by the API server to proxy SSH connections to Slurm jobs
    via sshd running in inetd mode within srun.

    Args:
        job_id: The Slurm job ID
        target_node: The target compute node hostname
        unix_user: The Unix user for the job

    Returns:
        The full command as a single shell-quoted string (built with
        ``shlex.join``).
    """
    # We use ~username to ensure we use the real home of the user ssh'ing in,
    # because we override the home directory in SlurmCommandRunner.run.
    ssh_dir = f'~{unix_user}/.ssh'

    srun_args = [
        'srun',
        '--quiet',
        '--unbuffered',
        '--overlap',
        '--jobid',
        job_id,
        '-w',
        target_node,
    ]
    sshd_args = [
        '/usr/sbin/sshd',
        '-i',  # Uses stdin/stdout
        '-e',  # Writes errors to stderr
        '-f',  # Use /dev/null to avoid reading system sshd_config
        '/dev/null',
        '-h',
        f'{ssh_dir}/{SLURM_SSHD_HOST_KEY_FILENAME}',
    ]
    sshd_options = [
        f'AuthorizedKeysFile={ssh_dir}/authorized_keys',
        'PasswordAuthentication=no',
        'PubkeyAuthentication=yes',
        # If UsePAM is enabled, we will not be able to run sshd(8)
        # as a non-root user.
        # See https://man7.org/linux/man-pages/man5/sshd_config.5.html
        'UsePAM=no',
        f'AcceptEnv={constants.SKY_CLUSTER_NAME_ENV_VAR_KEY}',
    ]
    # Each option is passed to sshd as its own `-o <option>` pair.
    for option in sshd_options:
        sshd_args.extend(['-o', option])

    return shlex.join(srun_args + sshd_args)
|