skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +122 -3
- sky/clouds/__init__.py +5 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +30 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +160 -23
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +2 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +59 -17
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +56 -7
- sky/jobs/server/core.py +4 -2
- sky/optimizer.py +29 -15
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +177 -4
- sky/serve/server/core.py +2 -4
- sky/server/common.py +46 -9
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +80 -8
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +12 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/resources_utils.py +14 -0
- sky/utils/schemas.py +67 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/clouds/oci.py
CHANGED
@@ -401,13 +401,15 @@ class OCI(clouds.Cloud):
             fuzzy_candidate_list, None)

     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         OCI's compute service."""
         return cls._check_credentials()

     @classmethod
-    def _check_storage_credentials(
+    def _check_storage_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         OCI's storage service."""
         # TODO(seungjin): Implement separate check for
sky/clouds/paperspace.py
CHANGED
@@ -255,7 +255,8 @@ class Paperspace(clouds.Cloud):
             fuzzy_candidate_list, None)

     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         Paperspace's compute service."""
         try:
sky/clouds/runpod.py
CHANGED
@@ -259,7 +259,8 @@ class RunPod(clouds.Cloud):
             fuzzy_candidate_list, None)

     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         RunPod's compute service."""
         return cls._check_credentials()
sky/clouds/scp.py
CHANGED
@@ -316,7 +316,8 @@ class SCP(clouds.Cloud):
             fuzzy_candidate_list, None)

     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         SCP's compute service."""
         try:
sky/clouds/service_catalog/__init__.py
CHANGED
@@ -9,6 +9,8 @@ from sky.clouds.service_catalog.constants import ALL_CLOUDS
 from sky.clouds.service_catalog.constants import CATALOG_DIR
 from sky.clouds.service_catalog.constants import CATALOG_SCHEMA_VERSION
 from sky.clouds.service_catalog.constants import HOSTED_CATALOG_DIR_URL
+from sky.clouds.service_catalog.constants import (
+    HOSTED_CATALOG_DIR_URL_S3_MIRROR)
 from sky.utils import resources_utils
 from sky.utils import subprocess_utils

@@ -383,6 +385,7 @@ __all__ = [
     # Constants
     'ALL_CLOUDS',
     'HOSTED_CATALOG_DIR_URL',
+    'HOSTED_CATALOG_DIR_URL_S3_MIRROR',
     'CATALOG_SCHEMA_VERSION',
     'CATALOG_DIR',
 ]
sky/clouds/service_catalog/common.py
CHANGED
@@ -197,6 +197,8 @@ def read_catalog(filename: str,
     with filelock.FileLock(meta_path + '.lock'):
         if _need_update():
             url = f'{constants.HOSTED_CATALOG_DIR_URL}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            url_fallback = f'{constants.HOSTED_CATALOG_DIR_URL_S3_MIRROR}/{constants.CATALOG_SCHEMA_VERSION}/{filename}'  # pylint: disable=line-too-long
+            headers = {'User-Agent': 'SkyPilot/0.7'}
             update_frequency_str = ''
             if pull_frequency_hours is not None:
                 update_frequency_str = (
@@ -206,8 +208,13 @@
                     f'Updating {cloud} catalog: {filename}') +
                     f'{update_frequency_str}'):
                 try:
-                    r = requests.get(url=url,
-
+                    r = requests.get(url=url, headers=headers)
+                    if r.status_code == 429:
+                        # fallback to s3 mirror, github introduced rate
+                        # limit after 2025-05, see
+                        # https://github.com/skypilot-org/skypilot/issues/5438
+                        # for more details
+                        r = requests.get(url=url_fallback, headers=headers)
                     r.raise_for_status()
                 except requests.exceptions.RequestException as e:
                     error_str = (f'Failed to fetch {cloud} catalog '
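
The rate-limit fallback in the read_catalog hunk above is compact enough to sketch on its own. The following is a minimal, self-contained rendering of the same pattern, not SkyPilot's actual helper: fetch_catalog and its parameters are illustrative names, while the two endpoint URLs and the 429-triggered retry come straight from the diff.

import requests

# Endpoints from sky/clouds/service_catalog/constants.py in this release.
PRIMARY = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'
MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs'


def fetch_catalog(filename: str, schema_version: str = 'v7') -> bytes:
    """Fetch a catalog file, falling back to the S3 mirror on HTTP 429."""
    headers = {'User-Agent': 'SkyPilot/0.7'}
    r = requests.get(f'{PRIMARY}/{schema_version}/{filename}', headers=headers)
    if r.status_code == 429:
        # GitHub raw content is now rate limited; retry against the mirror.
        r = requests.get(f'{MIRROR}/{schema_version}/{filename}',
                         headers=headers)
    r.raise_for_status()
    return r.content

Falling back only on 429, rather than on any error, keeps GitHub as the source of truth and touches the mirror just when the rate limit actually bites.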
sky/clouds/service_catalog/constants.py
CHANGED
@@ -1,7 +1,8 @@
 """Constants used for service catalog."""
 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
+HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs'  # pylint: disable=line-too-long
 CATALOG_SCHEMA_VERSION = 'v7'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace', 'do', 'nebius')
+              'paperspace', 'do', 'nebius', 'ssh')
sky/clouds/service_catalog/ssh_catalog.py
ADDED
@@ -0,0 +1,167 @@
+"""SSH Catalog.
+
+This catalog inherits from the Kubernetes catalog as SSH cloud is a wrapper
+around Kubernetes that uses SSH-specific contexts.
+"""
+import typing
+from typing import Dict, List, Optional, Tuple
+
+from sky import sky_logging
+from sky.clouds import ssh
+from sky.clouds.service_catalog import CloudFilter
+from sky.clouds.service_catalog import common
+from sky.clouds.service_catalog import kubernetes_catalog
+
+logger = sky_logging.init_logger(__name__)
+
+if typing.TYPE_CHECKING:
+    import pandas as pd
+else:
+    from sky.adaptors import common as adaptors_common
+    pd = adaptors_common.LazyImport('pandas')
+
+_PULL_FREQUENCY_HOURS = 7
+
+# Reuse the Kubernetes images catalog for SSH cloud.
+# We keep pull_frequency_hours so we can remotely update the default image paths
+_image_df = common.read_catalog('kubernetes/images.csv',
+                                pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+
+
+def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
+    """Returns the image id from the tag.
+
+    Delegates to Kubernetes catalog implementation.
+    """
+    return kubernetes_catalog.get_image_id_from_tag(tag, region)
+
+
+def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
+    """Returns whether the image tag is valid.
+
+    Delegates to Kubernetes catalog implementation.
+    """
+    return kubernetes_catalog.is_image_tag_valid(tag, region)
+
+
+def list_accelerators(
+        gpus_only: bool,
+        name_filter: Optional[str],
+        region_filter: Optional[str],
+        quantity_filter: Optional[int],
+        case_sensitive: bool = True,
+        all_regions: bool = False,
+        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """List accelerators in SSH-based Kubernetes clusters.
+
+    Delegates to the Kubernetes _list_accelerators function but restricts to
+    SSH contexts.
+    """
+    return _list_accelerators(gpus_only,
+                              name_filter,
+                              region_filter,
+                              quantity_filter,
+                              case_sensitive,
+                              all_regions,
+                              require_price,
+                              realtime=False)[0]
+
+
+def list_accelerators_realtime(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """List accelerators in SSH Node Pools with real-time information.
+
+    Delegates to the Kubernetes _list_accelerators function but restricts to
+    SSH contexts.
+    """
+    return _list_accelerators(gpus_only,
+                              name_filter,
+                              region_filter,
+                              quantity_filter,
+                              case_sensitive,
+                              all_regions,
+                              require_price,
+                              realtime=True)
+
+
+def _list_accelerators(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True,
+    realtime: bool = False
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """List accelerators in SSH-based Kubernetes clusters.
+
+    This is a wrapper around the Kubernetes _list_accelerators function that
+    restricts the contexts to SSH-specific contexts only.
+
+    If region_filter is specified and it's not an SSH context, no results will
+    be returned.
+    """
+    # If a specific region is requested, ensure it's an SSH context
+    if region_filter is not None and not region_filter.startswith('ssh-'):
+        return {}, {}, {}
+
+    # Get SSH contexts
+    ssh_contexts = ssh.SSH.existing_allowed_contexts()
+
+    # If no contexts found, return empty results
+    if not ssh_contexts:
+        return {}, {}, {}
+
+    # If a region filter is specified and it's not a SSH context return empty
+    # results
+    if region_filter is not None and region_filter not in ssh_contexts:
+        return {}, {}, {}
+
+    # If region_filter is None, use the first context if all_regions is False
+    if region_filter is None and not all_regions and ssh_contexts:
+        # Use the first SSH context if no specific region requested
+        region_filter = ssh_contexts[0]
+
+    # Call the Kubernetes _list_accelerators with the appropriate region filter
+    if realtime:
+        return kubernetes_catalog.list_accelerators_realtime(
+            gpus_only, name_filter, region_filter, quantity_filter,
+            case_sensitive, all_regions, require_price)
+    else:
+        result = kubernetes_catalog.list_accelerators(
+            gpus_only, name_filter, region_filter, quantity_filter,
+            case_sensitive, all_regions, require_price)
+        return result, {}, {}
+
+
+def validate_region_zone(
+        region_name: Optional[str],
+        zone_name: Optional[str],
+        clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]:
+    """Validates the region and zone for SSH cloud.
+
+    Delegates to the Kubernetes catalog implementation but ensures
+    the region is a valid SSH context.
+    """
+    # Delegate to Kubernetes implementation
+    region, zone = kubernetes_catalog.validate_region_zone(
+        region_name, zone_name, clouds)
+
+    # Get SSH contexts
+    ssh_contexts = ssh.SSH.existing_allowed_contexts()
+
+    # If a region is specified, ensure it's in the list of SSH contexts
+    if region is not None and region not in ssh_contexts:
+        return None, None
+
+    return region, zone
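
For orientation, a hedged usage sketch of the realtime entry point defined above; the context name is invented and the call assumes `sky ssh up` has created at least one 'ssh-' context. Per the signature, the first element of the returned tuple maps accelerator names to InstanceTypeInfo entries, and the second and third map accelerator names to total and currently-free counts.

from sky.clouds.service_catalog import ssh_catalog

# Query GPUs in one SSH Node Pool context.
infos, capacity, available = ssh_catalog.list_accelerators_realtime(
    gpus_only=True,
    name_filter=None,
    region_filter='ssh-my-pool',  # hypothetical context name
    quantity_filter=None)
for acc, total in capacity.items():
    print(f'{acc}: {available.get(acc, 0)}/{total} free')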
sky/clouds/ssh.py
ADDED
@@ -0,0 +1,203 @@
+"""SSH Node Pools"""
+
+import os
+import typing
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+import yaml
+
+from sky import sky_logging
+from sky.adaptors import kubernetes as kubernetes_adaptor
+from sky.clouds import kubernetes
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import annotations
+from sky.utils import registry
+
+if typing.TYPE_CHECKING:
+    # Renaming to avoid shadowing variables.
+    from sky import resources as resources_lib
+
+logger = sky_logging.init_logger(__name__)
+
+SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+
+
+@registry.CLOUD_REGISTRY.register()
+class SSH(kubernetes.Kubernetes):
+    """SSH cloud implementation.
+
+    This is used by SSH Node Pools in SkyPilot, which use Kubernetes to manage
+    the SSH clusters.
+
+    This cloud is a thin wrapper around Kubernetes that only uses contexts
+    starting with 'ssh-', which are managed through `sky ssh up` command.
+    """
+
+    _REPR = 'SSH'
+
+    # Keep track of contexts that have been logged as unreachable
+    logged_unreachable_contexts: Set[str] = set()
+
+    def __repr__(self):
+        return self._REPR
+
+    @classmethod
+    def _unsupported_features_for_resources(
+        cls, resources: 'resources_lib.Resources'
+    ) -> Dict[kubernetes.clouds.CloudImplementationFeatures, str]:
+        # Inherit all Kubernetes unsupported features
+        return super()._unsupported_features_for_resources(resources)
+
+    @classmethod
+    def get_ssh_node_pool_contexts(cls) -> List[str]:
+        """Get context names from ssh_node_pools.yaml file.
+
+        Reads the SSH node pools configuration file and returns
+        a list of context names by prepending 'ssh-' to each Node Pool name.
+
+        Returns:
+            A list of SSH Kubernetes context names derived from the Node Pools
+            in the SSH node pools file.
+        """
+        contexts = []
+
+        if os.path.exists(SSH_NODE_POOLS_PATH):
+            try:
+                with open(SSH_NODE_POOLS_PATH, 'r', encoding='utf-8') as f:
+                    ssh_config = yaml.safe_load(f)
+                    if ssh_config:
+                        # Get cluster names and prepend 'ssh-' to match
+                        # context naming convention
+                        contexts = [
+                            f'ssh-{cluster_name}'
+                            for cluster_name in ssh_config.keys()
+                        ]
+            except Exception:  # pylint: disable=broad-except
+                # If there's an error reading the file, return empty list
+                pass
+
+        return contexts
+
+    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+        if region == kubernetes_adaptor.in_cluster_context_name():
+            # If running incluster, we set region to IN_CLUSTER_REGION
+            # since there is no context name available.
+            return region, zone
+
+        all_contexts = self.existing_allowed_contexts()
+
+        if region is not None and region not in all_contexts:
+            region_name = region.lstrip('ssh-')
+            available_contexts = [c.lstrip('ssh-') for c in all_contexts]
+            err_str = (f'SSH Node Pool {region_name!r} is not set up. '
+                       'Run `sky check` for more details. ')
+            if available_contexts:
+                err_str += f'Available node pools: {available_contexts}'
+            raise ValueError(err_str)
+        if zone is not None:
+            raise ValueError('SSH Node Pools do not support setting zone.')
+        return region, zone
+
+    @classmethod
+    @annotations.lru_cache(scope='global', maxsize=1)
+    def _ssh_log_skipped_contexts_once(
+            cls, skipped_contexts: Tuple[str, ...]) -> None:
+        """Log skipped contexts for only once.
+
+        We don't directly cache the result of _filter_existing_allowed_contexts
+        as the admin policy may update the allowed contexts.
+        """
+        if skipped_contexts:
+            count = len(set(skipped_contexts))
+            is_singular = count == 1
+            logger.warning(
+                f'SSH Node {("Pool" if is_singular else "Pools")} '
+                f'{set(skipped_contexts)!r} specified in '
+                f'{SSH_NODE_POOLS_PATH} {("has" if is_singular else "have")} '
+                'not been set up. Skipping '
+                f'{("that pool" if is_singular else "those pools")}. '
+                'Run `sky ssh up` to set up.')
+
+    @classmethod
+    def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
+        """Get existing allowed contexts that start with 'ssh-'.
+
+        Override the Kubernetes implementation to only return contexts that
+        start with 'ssh-', which are created by `sky ssh up`.
+
+        Returns contexts based on clusters defined in ~/.sky/ssh_node_pools.yaml
+        """
+        # Get all contexts from the Kubernetes implementation
+        all_contexts = kubernetes_utils.get_all_kube_context_names()
+        if not all_contexts:
+            return []
+
+        all_contexts = set(all_contexts)
+
+        # Filter for SSH contexts (those starting with 'ssh-')
+        ssh_contexts = [
+            context for context in all_contexts if context.startswith('ssh-')
+        ]
+
+        # Get contexts from SSH node pools file
+        allowed_contexts = cls.get_ssh_node_pool_contexts()
+
+        if allowed_contexts:
+            # Only include allowed contexts that exist
+            existing_contexts = []
+            skipped_contexts = []
+            for context in allowed_contexts:
+                if context in ssh_contexts:
+                    existing_contexts.append(context)
+                else:
+                    skipped_contexts.append(context)
+            if not silent:
+                cls._ssh_log_skipped_contexts_once(tuple(skipped_contexts))
+            return existing_contexts
+
+        # If no allowed_contexts found, return all SSH contexts
+        return ssh_contexts
+
+    @classmethod
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
+        """Check if the user has access credentials to SSH contexts."""
+        # Check for port forward dependencies - reuse Kubernetes implementation
+        reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
+        if reasons is not None:
+            formatted = '\n'.join(
+                [reasons[0]] +
+                [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
+            return (False, formatted)
+
+        # Get SSH contexts
+        try:
+            existing_allowed_contexts = cls.existing_allowed_contexts()
+        except Exception as e:  # pylint: disable=broad-except
+            return (False, f'Failed to get SSH contexts: {str(e)}')
+
+        if not existing_allowed_contexts:
+            return (False,
+                    'No SSH Node Pools are up. Run `sky ssh up` to set up '
+                    f'Node Pools from {SSH_NODE_POOLS_PATH}.')
+
+        # Check credentials for each context
+        ctx2text = {}
+        success = False
+        for context in existing_allowed_contexts:
+            suc, text = super()._check_single_context(context)
+            success = success or suc
+            ctx2text[context] = text
+
+        return success, ctx2text
+
+    @classmethod
+    def get_infras(cls) -> List[str]:
+        return [
+            f'{cls._REPR.lower()}/{c.lstrip("ssh-")}'
+            for c in cls.existing_allowed_contexts(silent=True)
+        ]
+
+    @classmethod
+    def display_name(cls) -> str:
+        return 'SSH Node Pools'
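
To make the naming convention behind get_ssh_node_pool_contexts concrete: the top-level keys of ~/.sky/ssh_node_pools.yaml are Node Pool names, and each becomes a kubeconfig context by prepending 'ssh-'. A small self-contained sketch with invented pool names:

import yaml

# Hypothetical ~/.sky/ssh_node_pools.yaml contents. Only the top-level keys
# matter for context derivation; host details under them are ignored here.
example = yaml.safe_load("""
my-gpu-pool:
  hosts:
    - 192.168.1.10
office-rack:
  hosts:
    - 10.0.0.5
""")

contexts = [f'ssh-{name}' for name in example]
print(contexts)  # ['ssh-my-gpu-pool', 'ssh-office-rack']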
sky/clouds/vast.py
CHANGED
@@ -237,7 +237,8 @@ class Vast(clouds.Cloud):
             fuzzy_candidate_list, None)

     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has valid credentials for
         Vast's compute service. """
         try:
sky/clouds/vsphere.py
CHANGED
@@ -261,7 +261,8 @@ class Vsphere(clouds.Cloud):
             fuzzy_candidate_list, None)

     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         vSphere's compute service."""

sky/core.py
CHANGED
@@ -78,14 +78,12 @@ def optimize(
     # is shown on `sky launch`. The optimizer is also invoked during failover,
     # but we do not apply the admin policy there. We should apply the admin
     # policy in the optimizer, but that will require some refactoring.
-
-
-
-
-
-
-                                        blocked_resources=blocked_resources,
-                                        quiet=quiet)
+    with admin_policy_utils.apply_and_use_config_in_current_request(
+            dag, request_options=request_options) as dag:
+        return optimizer.Optimizer.optimize(dag=dag,
+                                            minimize=minimize,
+                                            blocked_resources=blocked_resources,
+                                            quiet=quiet)


 @usage_lib.entrypoint
@@ -472,7 +470,10 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
         message = ('Stopping spot instances is currently not supported on '
                    f'{resources.cloud}')
     else:
-
+        cloud_name = resources.cloud.display_name(
+        ) if resources.cloud else resources.cloud
+        message = ('Stopping is currently not supported for '
+                   f'{cloud_name}')
     return message


@@ -1017,11 +1018,20 @@ def enabled_clouds() -> List[clouds.Cloud]:
 def realtime_kubernetes_gpu_availability(
     context: Optional[str] = None,
     name_filter: Optional[str] = None,
-    quantity_filter: Optional[int] = None
+    quantity_filter: Optional[int] = None,
+    is_ssh: Optional[bool] = None
 ) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:

     if context is None:
-
+        # Include contexts from both Kubernetes and SSH clouds
+        kubernetes_contexts = clouds.Kubernetes.existing_allowed_contexts()
+        ssh_contexts = clouds.SSH.existing_allowed_contexts()
+        if is_ssh is None:
+            context_list = kubernetes_contexts + ssh_contexts
+        elif is_ssh:
+            context_list = ssh_contexts
+        else:
+            context_list = kubernetes_contexts
     else:
         context_list = [context]

@@ -1032,7 +1042,7 @@ def realtime_kubernetes_gpu_availability(
     ) -> List[models.RealtimeGpuAvailability]:
         counts, capacity, available = service_catalog.list_accelerator_realtime(
             gpus_only=True,
-            clouds='kubernetes',
+            clouds='ssh' if is_ssh else 'kubernetes',
             name_filter=name_filter,
             region_filter=context,
             quantity_filter=quantity_filter,
@@ -1064,16 +1074,19 @@ def realtime_kubernetes_gpu_availability(
             name_filter=name_filter,
             quantity_filter=quantity_filter), context_list)

+    cloud_identity = 'ssh' if is_ssh else 'kubernetes'
+    cloud_identity_capital = 'SSH' if is_ssh else 'Kubernetes'
+
     for ctx, queried in zip(context_list, parallel_queried):
         cumulative_count += len(queried)
         if len(queried) == 0:
             # don't add gpu results for clusters that don't have any
-            logger.debug(f'No gpus found in
+            logger.debug(f'No gpus found in {cloud_identity} cluster {ctx}')
             continue
         availability_lists.append((ctx, queried))

     if cumulative_count == 0:
-        err_msg = 'No GPUs found in any
+        err_msg = f'No GPUs found in any {cloud_identity_capital} clusters. '
         debug_msg = 'To further debug, run: sky check '
         if name_filter is not None:
             gpu_info_msg = f' {name_filter!r}'
@@ -1081,9 +1094,9 @@ def realtime_kubernetes_gpu_availability(
             gpu_info_msg += (' with requested quantity'
                              f' {quantity_filter}')
         err_msg = (f'Resources{gpu_info_msg} not found '
-                   'in
-        debug_msg = ('To show available accelerators on
-                     ' run: sky show-gpus --cloud
+                   f'in {cloud_identity_capital} clusters. ')
+        debug_msg = (f'To show available accelerators on {cloud_identity}, '
+                     f' run: sky show-gpus --cloud {cloud_identity} ')
         full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                         debug_msg)
         raise ValueError(full_err_msg)
@@ -1181,3 +1194,32 @@ def local_down() -> None:
         ux_utils.finishing_message('Local cluster removed.',
                                    log_path=log_path,
                                    is_local=True))
+
+
+@usage_lib.entrypoint
+def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
+    """Deploys or tears down a Kubernetes cluster on SSH targets.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_node_pools.yaml.
+            If None, the first cluster in the file is used.
+        cleanup: If True, clean up the cluster instead of deploying.
+    """
+    kubernetes_deploy_utils.deploy_ssh_cluster(
+        cleanup=cleanup,
+        infra=infra,
+    )
+
+
+def get_all_contexts() -> List[str]:
+    """Get all available contexts from Kubernetes and SSH clouds.
+
+    Returns:
+        List[str]: A list of all available context names.
+    """
+    kube_contexts = clouds.Kubernetes.existing_allowed_contexts()
+    ssh_contexts = clouds.SSH.get_ssh_node_pool_contexts()
+    # Ensure ssh_contexts are prefixed appropriately if not already
+    # For now, assuming get_ssh_node_pool_contexts already returns them
+    # in the desired format (e.g., 'ssh-my-cluster')
+    return sorted(list(set(kube_contexts + ssh_contexts)))
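
A short usage sketch for the two additions at the bottom of sky/core.py, assuming a pool named 'my-gpu-pool' exists in ~/.sky/ssh_node_pools.yaml (the pool name and the printed contexts are illustrative):

from sky import core

# Deploy Kubernetes onto the SSH hosts of one configured pool, then list
# every context SkyPilot now knows about (Kubernetes and SSH combined).
core.ssh_up(infra='my-gpu-pool')
print(core.get_all_contexts())  # e.g. ['kind-skypilot', 'ssh-my-gpu-pool']

# Tear the same pool back down.
core.ssh_up(infra='my-gpu-pool', cleanup=True)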
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/7e7ce4ff31d3977b.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/7e7ce4ff31d3977b.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/7e7ce4ff31d3977b.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/7e7ce4ff31d3977b.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div style="font-family:system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div style="line-height:48px"><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding-right:23px;font-size:24px;font-weight:500;vertical-align:top">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:28px">This page could not be found<!-- -->.</h2></div></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"ECKwDNS9v9y3_IKFZ2lpp","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js
RENAMED
@@ -1 +1 @@
-self.__BUILD_MANIFEST=function(s,c,e,t,a,r){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-6b0d9e5031b70c58.js"],"/_error":["static/chunks/pages/_error-1be831200e60c5c0.js"],"/clusters":[s,e,c,t,a,"static/chunks/pages/clusters-3a748bd76e5c2984.js"],"/clusters/[cluster]":[s,e,c,t,r,a,"static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js"],"/clusters/[cluster]/[job]":[s,c,"static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js"],"/infra":[s,c,"static/chunks/pages/infra-
+self.__BUILD_MANIFEST=function(s,c,e,t,a,r){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-6b0d9e5031b70c58.js"],"/_error":["static/chunks/pages/_error-1be831200e60c5c0.js"],"/clusters":[s,e,c,t,a,"static/chunks/pages/clusters-3a748bd76e5c2984.js"],"/clusters/[cluster]":[s,e,c,t,r,a,"static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js"],"/clusters/[cluster]/[job]":[s,c,"static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js"],"/infra":[s,c,"static/chunks/pages/infra-abf08c4384190a39.js"],"/jobs":[s,e,c,t,r,"static/chunks/pages/jobs-ecd804b9272f4a7c.js"],"/jobs/[job]":[s,c,"static/chunks/pages/jobs/[job]-70756c2dad850a7e.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/infra","/jobs","/jobs/[job]"]}}("static/chunks/582-683f4f27b81996dc.js","static/chunks/480-f28cd152a98997de.js","static/chunks/312-c3c8845990db8ffc.js","static/chunks/393-e1eaa440481337ec.js","static/chunks/37-d584022b0da4ac3b.js","static/chunks/236-1a3a9440417720eb.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();