skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/catalog/shadeform_catalog.py
CHANGED
@@ -7,12 +7,15 @@ and can be used to query instance types and pricing information for Shadeform.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
-import pandas as pd
-
+from sky.adaptors import common as adaptors_common
 from sky.catalog import common
 
 if typing.TYPE_CHECKING:
+    import pandas as pd
+
     from sky.clouds import cloud
+else:
+    pd = adaptors_common.LazyImport('pandas')
 
 # We'll use dynamic fetching, so no static CSV file to load
 _df = None
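Note: the change above swaps an eager top-level `import pandas as pd` for SkyPilot's lazy-import adaptor, so importing the catalog module no longer pays the pandas import cost. A minimal sketch of the pattern — the `LazyImport` class below is a simplified stand-in written for illustration, not the actual implementation in sky/adaptors/common.py:

import importlib
import typing


class LazyImport:
    """Stand-in for sky.adaptors.common.LazyImport: defers the real
    import until an attribute on the module is first accessed."""

    def __init__(self, module_name: str):
        self._module_name = module_name
        self._module = None

    def __getattr__(self, attr: str):
        if self._module is None:
            # The import happens here, on first use, not at load time.
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, attr)


if typing.TYPE_CHECKING:
    import pandas as pd  # Real import, visible only to type checkers.
else:
    pd = LazyImport('pandas')  # No pandas cost until pd.<attr> is touched.

Type checkers still see the real pandas types via the TYPE_CHECKING branch, while at runtime the first access such as `pd.DataFrame(...)` triggers the actual import.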
sky/catalog/slurm_catalog.py
ADDED
@@ -0,0 +1,236 @@
+"""Slurm Catalog."""
+
+import collections
+import re
+from typing import Dict, List, Optional, Set, Tuple
+
+from sky import check as sky_check
+from sky import clouds as sky_clouds
+from sky import sky_logging
+from sky.catalog import common
+from sky.clouds import cloud
+from sky.provision.slurm import utils as slurm_utils
+from sky.utils import resources_utils
+
+logger = sky_logging.init_logger(__name__)
+
+_DEFAULT_NUM_VCPUS = 2
+_DEFAULT_MEMORY_CPU_RATIO = 1
+
+
+def instance_type_exists(instance_type: str) -> bool:
+    """Check if the given instance type is valid for Slurm."""
+    return slurm_utils.SlurmInstanceType.is_valid_instance_type(instance_type)
+
+
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
+    # Delete unused parameters.
+    del disk_tier, region, zone
+
+    # Slurm provisions resources via --cpus-per-task and --mem.
+    instance_cpus = float(
+        cpus.strip('+')) if cpus is not None else _DEFAULT_NUM_VCPUS
+    if memory is not None:
+        if memory.endswith('+'):
+            instance_mem = float(memory[:-1])
+        elif memory.endswith('x'):
+            instance_mem = float(memory[:-1]) * instance_cpus
+        else:
+            instance_mem = float(memory)
+    else:
+        instance_mem = instance_cpus * _DEFAULT_MEMORY_CPU_RATIO
+    virtual_instance_type = slurm_utils.SlurmInstanceType(
+        instance_cpus, instance_mem).name
+    return virtual_instance_type
+
+
+def list_accelerators(
+        gpus_only: bool,
+        name_filter: Optional[str],
+        region_filter: Optional[str],
+        quantity_filter: Optional[int],
+        case_sensitive: bool = True,
+        all_regions: bool = False,
+        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """List accelerators in Slurm clusters.
+
+    Returns a dictionary mapping GPU type to a list of InstanceTypeInfo objects.
+    """
+    return list_accelerators_realtime(gpus_only, name_filter, region_filter,
+                                      quantity_filter, case_sensitive,
+                                      all_regions, require_price)[0]
+
+
+def list_accelerators_realtime(
+    gpus_only: bool = True,
+    name_filter: Optional[str] = None,
+    region_filter: Optional[str] = None,
+    quantity_filter: Optional[int] = None,
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = False,
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """Fetches real-time accelerator information from the Slurm cluster.
+
+    Uses the `get_slurm_node_info_list` helper function.
+
+    Args:
+        gpus_only: If True, only return GPU accelerators.
+        name_filter: Regex filter for accelerator names (e.g., 'V100', 'gpu').
+        region_filter: Optional filter for Slurm partitions.
+        quantity_filter: Minimum number of accelerators required per node.
+        case_sensitive: Whether name_filter is case-sensitive.
+        all_regions: Unused in Slurm context.
+        require_price: Unused in Slurm context.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Maps GPU type to set of InstanceTypeInfo objects for unique
+          counts found per node.
+        - total_capacity: Maps GPU type to total count across all nodes.
+        - total_available: Maps GPU type to total free count across all nodes.
+    """
+    del gpus_only, all_regions, require_price
+
+    enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
+        cloud.CloudCapability.COMPUTE)
+    if not sky_clouds.cloud_in_iterable(sky_clouds.Slurm(), enabled_clouds):
+        return {}, {}, {}
+
+    if region_filter is None:
+        # Get the first available cluster as default
+        all_clusters = slurm_utils.get_all_slurm_cluster_names()
+        if not all_clusters:
+            return {}, {}, {}
+        slurm_cluster = all_clusters[0]
+    else:
+        slurm_cluster = region_filter
+
+    slurm_nodes_info = slurm_utils.slurm_node_info(
+        slurm_cluster_name=slurm_cluster)
+
+    if not slurm_nodes_info:
+        # Customize error message based on filters
+        err_msg = 'No matching GPU nodes found in the Slurm cluster'
+        filters_applied = []
+        if name_filter:
+            filters_applied.append(f'gpu_name={name_filter!r}')
+        if quantity_filter:
+            filters_applied.append(f'quantity>={quantity_filter}')
+        if filters_applied:
+            err_msg += f' with filters ({", ".join(filters_applied)})'
+        err_msg += '.'
+        logger.error(
+            err_msg)  # Log as error as it indicates no usable resources found
+        raise ValueError(err_msg)
+
+    # Aggregate results into the required format
+    qtys_map: Dict[str,
+                   Set[common.InstanceTypeInfo]] = collections.defaultdict(set)
+    total_capacity: Dict[str, int] = collections.defaultdict(int)
+    total_available: Dict[str, int] = collections.defaultdict(int)
+
+    for node_info in slurm_nodes_info:
+        gpu_type = node_info['gpu_type']
+        node_total_gpus = node_info['total_gpus']
+        node_free_gpus = node_info['free_gpus']
+        partition = node_info['partition']
+
+        # Apply name filter to the determined GPU type
+        regex_flags = 0 if case_sensitive else re.IGNORECASE
+        if name_filter and not re.match(
+                name_filter, gpu_type, flags=regex_flags):
+            continue
+
+        # Apply quantity filter (total GPUs on node must meet this)
+        if quantity_filter and node_total_gpus < quantity_filter:
+            continue
+
+        # Apply partition filter if specified
+        # TODO(zhwu): when a node is in multiple partitions, the partition
+        # mapping from node to partition does not work.
+        # if partition_filter and partition != partition_filter:
+        #     continue
+
+        # Create InstanceTypeInfo objects for various GPU counts
+        # Similar to Kubernetes, generate powers of 2 up to node_total_gpus
+        if node_total_gpus > 0:
+            count = 1
+            while count <= node_total_gpus:
+                instance_info = common.InstanceTypeInfo(
+                    instance_type=None,  # Slurm doesn't have instance types
+                    accelerator_name=gpu_type,
+                    accelerator_count=count,
+                    cpu_count=node_info['vcpu_count'],
+                    memory=node_info['memory_gb'],
+                    price=0.0,  # Slurm doesn't have price info
+                    region=partition,  # Use partition as region
+                    cloud='slurm',  # Specify cloud as 'slurm'
+                    device_memory=0.0,  # No GPU memory info from Slurm
+                    spot_price=0.0,  # Slurm doesn't have spot pricing
+                )
+                qtys_map[gpu_type].add(instance_info)
+                count *= 2
+
+            # Add the actual total if it's not already included
+            # (e.g., if node has 12 GPUs, include counts 1, 2, 4, 8, 12)
+            if count // 2 != node_total_gpus:
+                instance_info = common.InstanceTypeInfo(
+                    instance_type=None,
+                    accelerator_name=gpu_type,
+                    accelerator_count=node_total_gpus,
+                    cpu_count=node_info['vcpu_count'],
+                    memory=node_info['memory_gb'],
+                    price=0.0,
+                    region=partition,
+                    cloud='slurm',
+                    device_memory=0.0,
+                    spot_price=0.0,
+                )
+                qtys_map[gpu_type].add(instance_info)
+
+        # Map of GPU type -> total count across all matched nodes
+        total_capacity[gpu_type] += node_total_gpus
+
+        # Map of GPU type -> total *free* count across all matched nodes
+        total_available[gpu_type] += node_free_gpus
+
+    # Check if any GPUs were found after applying filters
+    if not total_capacity:
+        err_msg = 'No matching GPU nodes found in the Slurm cluster'
+        filters_applied = []
+        if name_filter:
+            filters_applied.append(f'gpu_name={name_filter!r}')
+        if quantity_filter:
+            filters_applied.append(f'quantity>={quantity_filter}')
+        if filters_applied:
+            err_msg += f' with filters ({", ".join(filters_applied)})'
+        err_msg += '.'
+        logger.error(err_msg)
+        raise ValueError(err_msg)
+
+    # Convert sets of InstanceTypeInfo to sorted lists
+    final_qtys_map = {
+        gpu: sorted(list(instances), key=lambda x: x.accelerator_count)
+        for gpu, instances in qtys_map.items()
+    }
+
+    logger.debug(f'Aggregated Slurm GPU Info: '
+                 f'qtys={final_qtys_map}, '
+                 f'capacity={dict(total_capacity)}, '
+                 f'available={dict(total_available)}')
+
+    return final_qtys_map, dict(total_capacity), dict(total_available)
+
+
+def validate_region_zone(
+    region_name: Optional[str],
+    zone_name: Optional[str],
+) -> Tuple[Optional[str], Optional[str]]:
+    return (region_name, zone_name)
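Two pieces of logic in this new file are worth calling out: get_default_instance_type's memory-string convention ('16+' means at least 16 GB, '4x' means 4 GB per vCPU) and list_accelerators_realtime's Kubernetes-style enumeration of per-node GPU counts (powers of 2, plus the exact total when it is not a power of 2). A minimal sketch of both, with the helper names invented here for illustration:

from typing import List, Optional


def parse_memory(memory: Optional[str], instance_cpus: float,
                 default_ratio: float = 1.0) -> float:
    """Mirrors get_default_instance_type: '16+' = at least 16 GB,
    '4x' = 4 GB per vCPU, '16' = exactly 16 GB, None = ratio * vCPUs."""
    if memory is None:
        return instance_cpus * default_ratio
    if memory.endswith('+'):
        return float(memory[:-1])
    if memory.endswith('x'):
        return float(memory[:-1]) * instance_cpus
    return float(memory)


def candidate_gpu_counts(node_total_gpus: int) -> List[int]:
    """Mirrors the while-loop in list_accelerators_realtime: powers of 2
    up to the node's total, plus the total itself if not a power of 2."""
    counts = []
    count = 1
    while count <= node_total_gpus:
        counts.append(count)
        count *= 2
    if node_total_gpus > 0 and count // 2 != node_total_gpus:
        counts.append(node_total_gpus)
    return counts


assert parse_memory('4x', instance_cpus=2.0) == 8.0
assert parse_memory('16+', instance_cpus=2.0) == 16.0
assert candidate_gpu_counts(8) == [1, 2, 4, 8]
assert candidate_gpu_counts(12) == [1, 2, 4, 8, 12]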
sky/catalog/vast_catalog.py
CHANGED
@@ -7,7 +7,10 @@ query instance types and pricing information for Vast.ai.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
+import pandas as pd
+
 from sky.catalog import common
+from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -16,6 +19,17 @@ if typing.TYPE_CHECKING:
 _df = common.read_catalog('vast/vms.csv')
 
 
+def _apply_datacenter_filter(df: pd.DataFrame,
+                             datacenter_only: bool) -> pd.DataFrame:
+    """Filter dataframe by hosting_type if datacenter_only is True.
+
+    hosting_type: 0 = Consumer hosted, 1 = Datacenter hosted
+    """
+    if not datacenter_only or 'HostingType' not in df.columns:
+        return df
+    return df[df['HostingType'] >= 1]
+
+
 def instance_type_exists(instance_type: str) -> bool:
     return common.instance_type_exists_impl(_df, instance_type)
 
@@ -48,13 +62,16 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
                               region: Optional[str] = None,
-                              zone: Optional[str] = None) -> Optional[str]:
+                              zone: Optional[str] = None,
+                              datacenter_only: bool = False) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_cpus_mem_impl(df, cpus, memory, region,
                                                       zone)
 
 
@@ -70,12 +87,19 @@ def get_instance_type_for_accelerator(
     memory: Optional[str] = None,
     use_spot: bool = False,
     region: Optional[str] = None,
-    zone: Optional[str] = None
-) -> Tuple[Optional[List[str]], List[str]]:
+    zone: Optional[str] = None,
+    datacenter_only: bool = False) -> Tuple[Optional[List[str]], List[str]]:
+    """Returns a list of instance types that have the given accelerator.
+
+    Args:
+        datacenter_only: If True, only return instances hosted in datacenters
+            (hosting_type >= 1).
+    """
     if zone is not None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Vast does not support zones.')
-    return common.get_instance_type_for_accelerator_impl(df=_df,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_accelerator_impl(df=df,
                                                          acc_name=acc_name,
                                                          acc_count=acc_count,
                                                          cpus=cpus,
sky/check.py
CHANGED
@@ -528,8 +528,9 @@ def _print_checked_cloud(
     # `dict` reasons for K8s and SSH will be printed in detail in
     # _format_enabled_cloud. Skip here unless the cloud is disabled.
     if not isinstance(reason, str):
-        if not ok and isinstance(
-                cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
+        if not ok and isinstance(
+                cloud_tuple[1],
+                (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
             if reason is not None:
                 reason_str = _format_context_details(cloud_tuple[1],
                                                      show_details=True,
@@ -555,7 +556,9 @@ def _print_checked_cloud(
     capability_string = f'[{", ".join(enabled_capabilities)}]'
     if verbose and cloud is not cloudflare and cloud is not coreweave:
         activated_account = cloud.get_active_user_identity_str()
-        if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
+        if isinstance(
+                cloud_tuple[1],
+                (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
             detail_string = _format_context_details(cloud_tuple[1],
                                                     show_details=True,
                                                     ctx2text=ctx2text)
@@ -586,6 +589,9 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
     if isinstance(cloud_type, sky_clouds.SSH):
         # Get the cluster names by reading from the node pools file
         contexts = sky_clouds.SSH.get_ssh_node_pool_contexts()
+    elif isinstance(cloud_type, sky_clouds.Slurm):
+        # Get the cluster names from SLURM config
+        contexts = sky_clouds.Slurm.existing_allowed_clusters()
     else:
         assert isinstance(cloud_type, sky_clouds.Kubernetes)
         contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
@@ -650,15 +656,19 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
                         'configuration.'))
             else:
                 # Default case - not set up
-                text_suffix = (': ' + _red_color('disabled. ') +
-
-
-
-
+                text_suffix = (': ' + _red_color('disabled. ') + _dim_color(
+                    'Reason: Not set up. Use '
+                    '`sky ssh up --infra '
+                    f'{common_utils.removeprefix(context, "ssh-")}` '
+                    'to set up.'))
         contexts_formatted.append(
             f'\n {symbol}{cleaned_context}{text_suffix}')
-
-
+    if isinstance(cloud_type, sky_clouds.SSH):
+        identity_str = 'SSH Node Pools'
+    elif isinstance(cloud_type, sky_clouds.Slurm):
+        identity_str = 'Allowed clusters'
+    else:
+        identity_str = 'Allowed contexts'
     return f'\n {identity_str}:{"".join(contexts_formatted)}'
 
 
@@ -677,7 +687,11 @@ def _format_enabled_cloud(cloud_name: str,
     cloud_and_capabilities = f'{cloud_name} [{", ".join(capabilities)}]'
     title = _green_color(cloud_and_capabilities)
 
-    if cloud_name in [repr(sky_clouds.Kubernetes()), repr(sky_clouds.SSH())]:
+    if cloud_name in [
+            repr(sky_clouds.Kubernetes()),
+            repr(sky_clouds.SSH()),
+            repr(sky_clouds.Slurm())
+    ]:
         return (f'{title}' + _format_context_details(
             cloud_name, show_details=False, ctx2text=ctx2text))
     return _green_color(cloud_and_capabilities)