skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/clouds/slurm.py
ADDED
@@ -0,0 +1,610 @@
+"""Slurm."""
+
+import typing
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+from sky import catalog
+from sky import clouds
+from sky import sky_logging
+from sky import skypilot_config
+from sky.adaptors import slurm
+from sky.provision.slurm import utils as slurm_utils
+from sky.skylet import constants
+from sky.utils import annotations
+from sky.utils import common_utils
+from sky.utils import registry
+from sky.utils import resources_utils
+
+if typing.TYPE_CHECKING:
+    from sky import resources as resources_lib
+    from sky.utils import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+CREDENTIAL_PATH = slurm_utils.DEFAULT_SLURM_PATH
+
+
+@registry.CLOUD_REGISTRY.register
+class Slurm(clouds.Cloud):
+    """Slurm."""
+
+    _REPR = 'Slurm'
+    _CLOUD_UNSUPPORTED_FEATURES = {
+        clouds.CloudImplementationFeatures.AUTOSTOP: 'Slurm does not '
+                                                     'support autostop.',
+        clouds.CloudImplementationFeatures.STOP: 'Slurm does not support '
+                                                 'stopping instances.',
+        clouds.CloudImplementationFeatures.SPOT_INSTANCE: 'Spot instances are '
+                                                          'not supported in '
+                                                          'Slurm.',
+        clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
+            'Customized multiple network interfaces are not supported in '
+            'Slurm.',
+        clouds.CloudImplementationFeatures.OPEN_PORTS: 'Opening ports is not '
+                                                       'supported in Slurm.',
+        clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
+            'Running '
+            'controllers is not '
+            'well tested with '
+            'Slurm.',
+        clouds.CloudImplementationFeatures.IMAGE_ID: 'Specifying image ID is '
+                                                     'not supported in Slurm.',
+        clouds.CloudImplementationFeatures.DOCKER_IMAGE: 'Docker image is not '
+                                                         'supported in Slurm.',
+    }
+    _MAX_CLUSTER_NAME_LEN_LIMIT = 120
+    _regions: List[clouds.Region] = []
+    _INDENT_PREFIX = ' '
+
+    # Same as Kubernetes.
+    _DEFAULT_NUM_VCPUS_WITH_GPU = 4
+    _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4
+
+    # Using the latest SkyPilot provisioner API to provision and check status.
+    PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
+    STATUS_VERSION = clouds.StatusVersion.SKYPILOT
+
+    _SSH_CONFIG_KEY_MAPPING = {
+        'identityfile': 'IdentityFile',
+        'user': 'User',
+        'hostname': 'HostName',
+    }
+
+    @classmethod
+    def _unsupported_features_for_resources(
+        cls,
+        resources: 'resources_lib.Resources',
+        region: Optional[str] = None,
+    ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        del region  # unused
+        # logger.critical('[BYPASS] Check Slurm's unsupported features...')
+        return cls._CLOUD_UNSUPPORTED_FEATURES
+
+    @classmethod
+    def _max_cluster_name_length(cls) -> Optional[int]:
+        return cls._MAX_CLUSTER_NAME_LEN_LIMIT
+
+    @classmethod
+    def uses_ray(cls) -> bool:
+        return False
+
+    @classmethod
+    def get_vcpus_mem_from_instance_type(
+        cls,
+        instance_type: str,
+    ) -> Tuple[Optional[float], Optional[float]]:
+        inst = slurm_utils.SlurmInstanceType.from_instance_type(instance_type)
+        return inst.cpus, inst.memory
+
+    @classmethod
+    def zones_provision_loop(
+        cls,
+        *,
+        region: str,
+        num_nodes: int,
+        instance_type: str,
+        accelerators: Optional[Dict[str, int]] = None,
+        use_spot: bool = False,
+    ) -> Iterator[Optional[List[clouds.Zone]]]:
+        """Iterate over partitions (zones) for provisioning with failover.
+
+        Yields one partition at a time for failover retry logic.
+        """
+        del num_nodes  # unused
+
+        regions = cls.regions_with_offering(instance_type,
+                                            accelerators,
+                                            use_spot,
+                                            region=region,
+                                            zone=None)
+
+        for r in regions:
+            if r.zones:
+                # Yield one partition at a time for failover
+                for zone in r.zones:
+                    yield [zone]
+            else:
+                # No partitions discovered, use default
+                yield None
+
+    @classmethod
+    @annotations.lru_cache(scope='global', maxsize=1)
+    def _log_skipped_clusters_once(cls, skipped_clusters: Tuple[str,
+                                                                ...]) -> None:
+        """Log skipped clusters for only once.
+
+        We don't directly cache the result of existing_allowed_clusters
+        as the config may update the allowed clusters.
+        """
+        if skipped_clusters:
+            logger.warning(
+                f'Slurm clusters {set(skipped_clusters)!r} specified in '
+                '"allowed_clusters" not found in ~/.slurm/config. '
+                'Ignoring these clusters.')
+
+    @classmethod
+    def existing_allowed_clusters(cls, silent: bool = False) -> List[str]:
+        """Get existing allowed clusters.
+
+        Returns clusters based on the following logic:
+        1. If 'allowed_clusters' is set to 'all' in ~/.sky/config.yaml,
+           return all clusters from ~/.slurm/config
+        2. If specific clusters are listed in 'allowed_clusters',
+           return only those that exist in ~/.slurm/config
+        3. If no configuration is specified, return all clusters
+           from ~/.slurm/config (default behavior)
+        """
+        all_clusters = slurm_utils.get_all_slurm_cluster_names()
+        if len(all_clusters) == 0:
+            return []
+
+        all_clusters = set(all_clusters)
+
+        # Workspace-level allowed_clusters should take precedence over
+        # the global allowed_clusters.
+        allowed_clusters = skypilot_config.get_workspace_cloud('slurm').get(
+            'allowed_clusters', None)
+        if allowed_clusters is None:
+            allowed_clusters = skypilot_config.get_effective_region_config(
+                cloud='slurm',
+                region=None,
+                keys=('allowed_clusters',),
+                default_value=None)
+
+        allow_all_clusters = allowed_clusters == 'all'
+        if allow_all_clusters:
+            allowed_clusters = list(all_clusters)
+
+        if allowed_clusters is None:
+            # Default to all clusters if no configuration is specified
+            allowed_clusters = list(all_clusters)
+
+        existing_clusters = []
+        skipped_clusters = []
+        for cluster in allowed_clusters:
+            if cluster in all_clusters:
+                existing_clusters.append(cluster)
+            else:
+                skipped_clusters.append(cluster)
+
+        if not silent:
+            cls._log_skipped_clusters_once(tuple(sorted(skipped_clusters)))
+
+        return existing_clusters
+
+    @classmethod
+    def regions_with_offering(
+        cls,
+        instance_type: Optional[str],
+        accelerators: Optional[Dict[str, int]],
+        use_spot: bool,
+        region: Optional[str],
+        zone: Optional[str],
+        resources: Optional['resources_lib.Resources'] = None
+    ) -> List[clouds.Region]:
+        del accelerators, use_spot, resources  # unused
+        existing_clusters = cls.existing_allowed_clusters()
+
+        regions: List[clouds.Region] = []
+        for cluster in existing_clusters:
+            # Filter by region if specified
+            if region is not None and cluster != region:
+                continue
+
+            # Fetch partitions for this cluster and attach as zones
+            try:
+                partitions = slurm_utils.get_partitions(cluster)
+                if zone is not None:
+                    # Filter by zone (partition) if specified
+                    partitions = [p for p in partitions if p == zone]
+                zones = [clouds.Zone(p) for p in partitions]
+            except Exception as e:  # pylint: disable=broad-except
+                logger.debug(f'Failed to get partitions for {cluster}: {e}')
+                zones = []
+
+            r = clouds.Region(cluster)
+            if zones:
+                r.set_zones(zones)
+            regions.append(r)
+
+        # Check if requested instance type will fit in the cluster.
+        if instance_type is None:
+            return regions
+
+        regions_to_return = []
+        for r in regions:
+            cluster = r.name
+
+            # Check each partition (zone) in the cluster
+            partitions_to_check = [z.name for z in r.zones] if r.zones else []
+            valid_zones = []
+
+            # TODO(kevin): Batch this check to reduce number of roundtrips.
+            for partition in partitions_to_check:
+                fits, reason = slurm_utils.check_instance_fits(
+                    cluster, instance_type, partition)
+                if fits:
+                    if partition:
+                        valid_zones.append(clouds.Zone(partition))
+                else:
+                    logger.debug(
+                        f'Instance type {instance_type} does not fit in '
+                        f'{cluster}/{partition}: {reason}')
+
+            if valid_zones:
+                r.set_zones(valid_zones)
+                regions_to_return.append(r)
+
+        return regions_to_return
+
+    def instance_type_to_hourly_cost(self,
+                                     instance_type: str,
+                                     use_spot: bool,
+                                     region: Optional[str] = None,
+                                     zone: Optional[str] = None) -> float:
+        """For now, we assume zero cost for Slurm clusters."""
+        return 0.0
+
+    def accelerators_to_hourly_cost(self,
+                                    accelerators: Dict[str, int],
+                                    use_spot: bool,
+                                    region: Optional[str] = None,
+                                    zone: Optional[str] = None) -> float:
+        """Returns the hourly cost of the accelerators, in dollars/hour."""
+        del accelerators, use_spot, region, zone  # unused
+        return 0.0
+
+    def get_egress_cost(self, num_gigabytes: float) -> float:
+        return 0.0
+
+    def __repr__(self):
+        return self._REPR
+
+    def is_same_cloud(self, other: clouds.Cloud) -> bool:
+        # Returns true if the two clouds are the same cloud type.
+        return isinstance(other, Slurm)
+
+    @classmethod
+    def get_default_instance_type(cls,
+                                  cpus: Optional[str] = None,
+                                  memory: Optional[str] = None,
+                                  disk_tier: Optional[
+                                      resources_utils.DiskTier] = None,
+                                  region: Optional[str] = None,
+                                  zone: Optional[str] = None) -> Optional[str]:
+        """Returns the default instance type for Slurm."""
+        return catalog.get_default_instance_type(cpus=cpus,
+                                                 memory=memory,
+                                                 disk_tier=disk_tier,
+                                                 region=region,
+                                                 zone=zone,
+                                                 clouds='slurm')
+
+    @classmethod
+    def get_accelerators_from_instance_type(
+            cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
+        inst = slurm_utils.SlurmInstanceType.from_instance_type(instance_type)
+        return {
+            inst.accelerator_type: inst.accelerator_count
+        } if (inst.accelerator_count is not None and
+              inst.accelerator_type is not None) else None
+
+    @classmethod
+    def get_zone_shell_cmd(cls) -> Optional[str]:
+        return None
+
+    def make_deploy_resources_variables(
+        self,
+        resources: 'resources_lib.Resources',
+        cluster_name: 'resources_utils.ClusterName',
+        region: Optional['clouds.Region'],
+        zones: Optional[List['clouds.Zone']],
+        num_nodes: int,
+        dryrun: bool = False,
+        volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+    ) -> Dict[str, Optional[str]]:
+        del cluster_name, dryrun, volume_mounts  # Unused.
+        if region is not None:
+            cluster = region.name
+        else:
+            cluster = 'localcluster'
+        assert cluster is not None, 'No available Slurm cluster found.'
+
+        # Use zone as partition if specified, otherwise default
+        if zones and len(zones) > 0:
+            partition = zones[0].name
+        else:
+            partitions = slurm_utils.get_partitions(cluster)
+            if not partitions:
+                raise ValueError(f'No partitions found for cluster {cluster}.')
+            # get_partitions returns the default partition first, then sorted
+            # alphabetically, so this also handles the case where the cluster
+            # does not have a default partition.
+            partition = partitions[0]
+
+        # cluster is our target slurmctld host.
+        ssh_config = slurm_utils.get_slurm_ssh_config()
+        ssh_config_dict = ssh_config.lookup(cluster)
+
+        resources = resources.assert_launchable()
+        acc_dict = self.get_accelerators_from_instance_type(
+            resources.instance_type)
+        custom_resources = resources_utils.make_ray_custom_resources_str(
+            acc_dict)
+
+        # resources.memory and cpus are none if they are not explicitly set.
+        # we fetch the default values for the instance type in that case.
+        s = slurm_utils.SlurmInstanceType.from_instance_type(
+            resources.instance_type)
+        cpus = s.cpus
+        mem = s.memory
+        # Optionally populate accelerator information.
+        acc_count = s.accelerator_count if s.accelerator_count else 0
+        acc_type = s.accelerator_type if s.accelerator_type else None
+        # Resolve the actual GPU type as it appears in the cluster's GRES.
+        # Slurm GRES types are case-sensitive.
+        if acc_type:
+            acc_type = slurm_utils.get_gres_gpu_type(cluster, acc_type)
+
+        deploy_vars = {
+            'instance_type': resources.instance_type,
+            'custom_resources': custom_resources,
+            'cpus': str(cpus),
+            'memory': str(mem),
+            'accelerator_count': str(acc_count),
+            'accelerator_type': acc_type,
+            'slurm_cluster': cluster,
+            'slurm_partition': partition,
+            # TODO(jwj): Pass SSH config in a smarter way
+            'ssh_hostname': ssh_config_dict['hostname'],
+            'ssh_port': str(ssh_config_dict.get('port', 22)),
+            'ssh_user': ssh_config_dict['user'],
+            'slurm_proxy_command': ssh_config_dict.get('proxycommand', None),
+            'slurm_proxy_jump': ssh_config_dict.get('proxyjump', None),
+            # TODO(jwj): Solve naming collision with 'ssh_private_key'.
+            # Please refer to slurm-ray.yml.j2 'ssh' and 'auth' sections.
+            'slurm_private_key': ssh_config_dict['identityfile'][0],
+            'slurm_sshd_host_key_filename':
+                (slurm_utils.SLURM_SSHD_HOST_KEY_FILENAME),
+            'slurm_cluster_name_env_var':
+                (constants.SKY_CLUSTER_NAME_ENV_VAR_KEY),
+        }
+
+        return deploy_vars
+
+    def _get_feasible_launchable_resources(
+        self, resources: 'resources_lib.Resources'
+    ) -> 'resources_utils.FeasibleResources':
+        """Returns a list of feasible resources for the given resources."""
+        if resources.instance_type is not None:
+            assert resources.is_launchable(), resources
+            # Check if the instance type is available in at least one cluster
+            available_regions = self.regions_with_offering(
+                resources.instance_type,
+                accelerators=None,
+                use_spot=resources.use_spot,
+                region=resources.region,
+                zone=resources.zone)
+            if not available_regions:
+                return resources_utils.FeasibleResources([], [], None)
+
+            # Return a single resource without region set.
+            # The optimizer will call make_launchables_for_valid_region_zones()
+            # which will create one resource per region/cluster.
+            resources = resources.copy(accelerators=None)
+            return resources_utils.FeasibleResources([resources], [], None)
+
+        def _make(instance_list):
+            resource_list = []
+            for instance_type in instance_list:
+                r = resources.copy(
+                    cloud=Slurm(),
+                    instance_type=instance_type,
+                    accelerators=None,
+                )
+                resource_list.append(r)
+            return resource_list
+
+        # Currently, handle a filter on accelerators only.
+        accelerators = resources.accelerators
+
+        default_instance_type = Slurm.get_default_instance_type(
+            cpus=resources.cpus,
+            memory=resources.memory,
+            disk_tier=resources.disk_tier,
+            region=resources.region,
+            zone=resources.zone)
+        if default_instance_type is None:
+            return resources_utils.FeasibleResources([], [], None)
+
+        if accelerators is None:
+            chosen_instance_type = default_instance_type
+        else:
+            assert len(accelerators) == 1, resources
+
+            # Build GPU-enabled instance type.
+            acc_type, acc_count = list(accelerators.items())[0]
+
+            slurm_instance_type = (slurm_utils.SlurmInstanceType.
+                                   from_instance_type(default_instance_type))
+
+            gpu_task_cpus = slurm_instance_type.cpus
+            if resources.cpus is None:
+                gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
+            # Special handling to bump up memory multiplier for GPU instances
+            gpu_task_memory = (float(resources.memory.strip('+')) if
+                               resources.memory is not None else gpu_task_cpus *
+                               self._DEFAULT_MEMORY_CPU_RATIO_WITH_GPU)
+
+            chosen_instance_type = (
+                slurm_utils.SlurmInstanceType.from_resources(
+                    gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name)
+
+        # Check the availability of the specified instance type in all
+        # Slurm clusters.
+        available_regions = self.regions_with_offering(
+            chosen_instance_type,
+            accelerators=None,
+            use_spot=resources.use_spot,
+            region=resources.region,
+            zone=resources.zone)
+        if not available_regions:
+            return resources_utils.FeasibleResources([], [], None)
+
+        return resources_utils.FeasibleResources(_make([chosen_instance_type]),
+                                                 [], None)
+
+    @classmethod
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
+        """Checks if the user has access credentials to the Slurm cluster."""
+        try:
+            ssh_config = slurm_utils.get_slurm_ssh_config()
+        except FileNotFoundError:
+            return (
+                False,
+                f'Slurm configuration file {slurm_utils.DEFAULT_SLURM_PATH} '
+                'does not exist.\n'
+                f'{cls._INDENT_PREFIX}For more info: '
+                'https://docs.skypilot.co/en/latest/getting-started/'
+                'installation.html#slurm-installation')
+        except Exception as e:  # pylint: disable=broad-except
+            return (False, 'Failed to load SSH configuration from '
+                    f'{slurm_utils.DEFAULT_SLURM_PATH}: '
+                    f'{common_utils.format_exception(e)}.')
+        existing_allowed_clusters = cls.existing_allowed_clusters()
+
+        if not existing_allowed_clusters:
+            return (False, 'No Slurm clusters found in ~/.slurm/config. '
+                    'Please configure at least one Slurm cluster.')
+
+        # Check credentials for each cluster and return ctx2text mapping
+        ctx2text = {}
+        success = False
+        for cluster in existing_allowed_clusters:
+            # Retrieve the config options for a given SlurmctldHost name alias.
+            ssh_config_dict = ssh_config.lookup(cluster)
+            try:
+                client = slurm.SlurmClient(
+                    ssh_config_dict['hostname'],
+                    int(ssh_config_dict.get('port', 22)),
+                    ssh_config_dict['user'],
+                    ssh_config_dict['identityfile'][0],
+                    ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+                    ssh_proxy_jump=ssh_config_dict.get('proxyjump', None))
+                info = client.info()
+                logger.debug(f'Slurm cluster {cluster} sinfo: {info}')
+                ctx2text[cluster] = 'enabled'
+                success = True
+            except KeyError as e:
+                key = e.args[0]
+                ctx2text[cluster] = (
+                    f'disabled. '
+                    f'{cls._SSH_CONFIG_KEY_MAPPING.get(key, key.capitalize())} '
+                    'is missing, please check your ~/.slurm/config '
+                    'and try again.')
+            except Exception as e:  # pylint: disable=broad-except
+                error_msg = (f'Credential check failed: '
+                             f'{common_utils.format_exception(e)}')
+                ctx2text[cluster] = f'disabled. {error_msg}'
+
+        return success, ctx2text
+
+    def get_credential_file_mounts(self) -> Dict[str, str]:
+        ########
+        # TODO #
+        ########
+        # Return dictionary of credential file paths. This may look
+        # something like:
+        return {}
+
+    @classmethod
+    def get_current_user_identity(cls) -> Optional[List[str]]:
+        # NOTE: used for very advanced SkyPilot functionality
+        # Can implement later if desired
+        return None
+
+    def instance_type_exists(self, instance_type: str) -> bool:
+        return catalog.instance_type_exists(instance_type, 'slurm')
+
+    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+        """Validate region (cluster) and zone (partition).
+
+        Args:
+            region: Slurm cluster name.
+            zone: Slurm partition name (optional).
+
+        Returns:
+            Tuple of (region, zone) if valid.
+
+        Raises:
+            ValueError: If cluster or partition not found.
+        """
+        all_clusters = slurm_utils.get_all_slurm_cluster_names()
+        if region and region not in all_clusters:
+            raise ValueError(
+                f'Cluster {region} not found in Slurm config. Slurm only '
+                'supports cluster names as regions. Available '
+                f'clusters: {all_clusters}')
+
+        # Validate partition (zone) if specified
+        if zone is not None:
+            if region is None:
+                raise ValueError(
+                    'Cannot specify partition (zone) without specifying '
+                    'cluster (region) for Slurm.')
+
+            partitions = slurm_utils.get_partitions(region)
+            if zone not in partitions:
+                raise ValueError(
+                    f'Partition {zone!r} not found in cluster {region!r}. '
+                    f'Available partitions: {partitions}')
+
+        return region, zone
+
+    def accelerator_in_region_or_zone(self,
+                                      accelerator: str,
+                                      acc_count: int,
+                                      region: Optional[str] = None,
+                                      zone: Optional[str] = None) -> bool:
+        del zone  # unused for now
+        regions = catalog.get_region_zones_for_accelerators(accelerator,
+                                                            acc_count,
+                                                            use_spot=False,
+                                                            clouds='slurm')
+        if not regions:
+            return False
+        if region is None:
+            return True
+        return any(r.name == region for r in regions)
+
+    @classmethod
+    def expand_infras(cls) -> List[str]:
+        """Returns a list of enabled Slurm clusters.
+
+        Each is returned as 'Slurm/cluster-name'.
+        """
+        infras = []
+        for cluster in cls.existing_allowed_clusters(silent=True):
+            infras.append(f'{cls.canonical_name()}/{cluster}')
+        return infras
sky/clouds/ssh.py
CHANGED
@@ -9,6 +9,7 @@ from sky import skypilot_config
 from sky.adaptors import kubernetes as kubernetes_adaptor
 from sky.clouds import kubernetes
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.ssh_node_pools import constants as ssh_constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import registry
@@ -20,7 +21,7 @@ if typing.TYPE_CHECKING:
 
 logger = sky_logging.init_logger(__name__)
 
-SSH_NODE_POOLS_PATH =
+SSH_NODE_POOLS_PATH = ssh_constants.DEFAULT_SSH_NODE_POOLS_PATH
 
 
 @registry.CLOUD_REGISTRY.register()
@@ -254,7 +255,7 @@ class SSH(kubernetes.Kubernetes):
     @classmethod
     def expand_infras(cls) -> List[str]:
         return [
-            f'{cls.canonical_name()}/{
+            f'{cls.canonical_name()}/{common_utils.removeprefix(c, "ssh-")}'
             for c in cls.existing_allowed_contexts(silent=True)
         ]
 
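
The expand_infras change above reports SSH node pool contexts without their internal 'ssh-' prefix. A minimal sketch of the prefix stripping it relies on, assuming common_utils.removeprefix follows str.removeprefix semantics (the context names below are hypothetical):

# Illustrative sketch, not part of the package diff.
def removeprefix(value: str, prefix: str) -> str:
    # Assumed str.removeprefix semantics.
    return value[len(prefix):] if value.startswith(prefix) else value

contexts = ['ssh-my-pool', 'ssh-lab-cluster']  # hypothetical context names
print([removeprefix(c, 'ssh-') for c in contexts])
# -> ['my-pool', 'lab-cluster']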