skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl
This diff shows the content changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +19 -5
- sky/check.py +398 -171
- sky/cli.py +302 -98
- sky/client/cli.py +302 -98
- sky/client/sdk.py +104 -12
- sky/clouds/__init__.py +3 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +24 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +23 -5
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +58 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +42 -19
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +29 -7
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/fluidstack/instance.py +1 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +4 -1
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/server.py +108 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +83 -9
- sky/templates/nebius-ray.yml.j2 +12 -0
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +218 -1
- sky/utils/schemas.py +75 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
- sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/clouds/oci.py
CHANGED
@@ -401,13 +401,15 @@ class OCI(clouds.Cloud):
                                          fuzzy_candidate_list, None)
 
     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         OCI's compute service."""
         return cls._check_credentials()
 
     @classmethod
-    def _check_storage_credentials(
+    def _check_storage_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         OCI's storage service."""
         # TODO(seungjin): Implement separate check for
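The same signature change recurs across the cloud modules above and below: `_check_compute_credentials` now returns `Tuple[bool, Optional[Union[str, Dict[str, str]]]]`, so the hint may be a single string or a per-context mapping (the dict form is what the new SSH cloud returns). A minimal caller-side sketch of normalizing both shapes, assuming only the types shown in the diff (the helper name is hypothetical):

from typing import Dict, Optional, Union


def format_credential_hint(hint: Optional[Union[str, Dict[str, str]]]) -> str:
    # None means no extra detail; a str is a single hint; a dict maps a
    # context name (e.g. an SSH Node Pool context) to its reason.
    if hint is None:
        return ''
    if isinstance(hint, str):
        return hint
    return '\n'.join(f'{ctx}: {reason}' for ctx, reason in hint.items())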
sky/clouds/paperspace.py
CHANGED
@@ -255,7 +255,8 @@ class Paperspace(clouds.Cloud):
                                          fuzzy_candidate_list, None)
 
     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         Paperspace's compute service."""
         try:
sky/clouds/runpod.py
CHANGED
@@ -259,7 +259,8 @@ class RunPod(clouds.Cloud):
                                          fuzzy_candidate_list, None)
 
     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         RunPod's compute service."""
         return cls._check_credentials()
sky/clouds/scp.py
CHANGED
@@ -316,7 +316,8 @@ class SCP(clouds.Cloud):
                                          fuzzy_candidate_list, None)
 
     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         SCP's compute service."""
         try:
sky/clouds/service_catalog/ssh_catalog.py
ADDED
@@ -0,0 +1,167 @@
+"""SSH Catalog.
+
+This catalog inherits from the Kubernetes catalog as SSH cloud is a wrapper
+around Kubernetes that uses SSH-specific contexts.
+"""
+import typing
+from typing import Dict, List, Optional, Tuple
+
+from sky import sky_logging
+from sky.clouds import ssh
+from sky.clouds.service_catalog import CloudFilter
+from sky.clouds.service_catalog import common
+from sky.clouds.service_catalog import kubernetes_catalog
+
+logger = sky_logging.init_logger(__name__)
+
+if typing.TYPE_CHECKING:
+    import pandas as pd
+else:
+    from sky.adaptors import common as adaptors_common
+    pd = adaptors_common.LazyImport('pandas')
+
+_PULL_FREQUENCY_HOURS = 7
+
+# Reuse the Kubernetes images catalog for SSH cloud.
+# We keep pull_frequency_hours so we can remotely update the default image paths
+_image_df = common.read_catalog('kubernetes/images.csv',
+                                pull_frequency_hours=_PULL_FREQUENCY_HOURS)
+
+
+def get_image_id_from_tag(tag: str, region: Optional[str]) -> Optional[str]:
+    """Returns the image id from the tag.
+
+    Delegates to Kubernetes catalog implementation.
+    """
+    return kubernetes_catalog.get_image_id_from_tag(tag, region)
+
+
+def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
+    """Returns whether the image tag is valid.
+
+    Delegates to Kubernetes catalog implementation.
+    """
+    return kubernetes_catalog.is_image_tag_valid(tag, region)
+
+
+def list_accelerators(
+        gpus_only: bool,
+        name_filter: Optional[str],
+        region_filter: Optional[str],
+        quantity_filter: Optional[int],
+        case_sensitive: bool = True,
+        all_regions: bool = False,
+        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
+    """List accelerators in SSH-based Kubernetes clusters.
+
+    Delegates to the Kubernetes _list_accelerators function but restricts to
+    SSH contexts.
+    """
+    return _list_accelerators(gpus_only,
+                              name_filter,
+                              region_filter,
+                              quantity_filter,
+                              case_sensitive,
+                              all_regions,
+                              require_price,
+                              realtime=False)[0]
+
+
+def list_accelerators_realtime(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """List accelerators in SSH Node Pools with real-time information.
+
+    Delegates to the Kubernetes _list_accelerators function but restricts to
+    SSH contexts.
+    """
+    return _list_accelerators(gpus_only,
+                              name_filter,
+                              region_filter,
+                              quantity_filter,
+                              case_sensitive,
+                              all_regions,
+                              require_price,
+                              realtime=True)
+
+
+def _list_accelerators(
+    gpus_only: bool,
+    name_filter: Optional[str],
+    region_filter: Optional[str],
+    quantity_filter: Optional[int],
+    case_sensitive: bool = True,
+    all_regions: bool = False,
+    require_price: bool = True,
+    realtime: bool = False
+) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
+                                                                          int]]:
+    """List accelerators in SSH-based Kubernetes clusters.
+
+    This is a wrapper around the Kubernetes _list_accelerators function that
+    restricts the contexts to SSH-specific contexts only.
+
+    If region_filter is specified and it's not an SSH context, no results will
+    be returned.
+    """
+    # If a specific region is requested, ensure it's an SSH context
+    if region_filter is not None and not region_filter.startswith('ssh-'):
+        return {}, {}, {}
+
+    # Get SSH contexts
+    ssh_contexts = ssh.SSH.existing_allowed_contexts()
+
+    # If no contexts found, return empty results
+    if not ssh_contexts:
+        return {}, {}, {}
+
+    # If a region filter is specified and it's not a SSH context return empty
+    # results
+    if region_filter is not None and region_filter not in ssh_contexts:
+        return {}, {}, {}
+
+    # If region_filter is None, use the first context if all_regions is False
+    if region_filter is None and not all_regions and ssh_contexts:
+        # Use the first SSH context if no specific region requested
+        region_filter = ssh_contexts[0]
+
+    # Call the Kubernetes _list_accelerators with the appropriate region filter
+    if realtime:
+        return kubernetes_catalog.list_accelerators_realtime(
+            gpus_only, name_filter, region_filter, quantity_filter,
+            case_sensitive, all_regions, require_price)
+    else:
+        result = kubernetes_catalog.list_accelerators(
+            gpus_only, name_filter, region_filter, quantity_filter,
+            case_sensitive, all_regions, require_price)
+        return result, {}, {}
+
+
+def validate_region_zone(
+        region_name: Optional[str],
+        zone_name: Optional[str],
+        clouds: CloudFilter = None) -> Tuple[Optional[str], Optional[str]]:
+    """Validates the region and zone for SSH cloud.
+
+    Delegates to the Kubernetes catalog implementation but ensures
+    the region is a valid SSH context.
+    """
+    # Delegate to Kubernetes implementation
+    region, zone = kubernetes_catalog.validate_region_zone(
+        region_name, zone_name, clouds)
+
+    # Get SSH contexts
+    ssh_contexts = ssh.SSH.existing_allowed_contexts()
+
+    # If a region is specified, ensure it's in the list of SSH contexts
+    if region is not None and region not in ssh_contexts:
+        return None, None
+
+    return region, zone
sky/clouds/ssh.py
ADDED
@@ -0,0 +1,203 @@
+"""SSH Node Pools"""
+
+import os
+import typing
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+import yaml
+
+from sky import sky_logging
+from sky.adaptors import kubernetes as kubernetes_adaptor
+from sky.clouds import kubernetes
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import annotations
+from sky.utils import registry
+
+if typing.TYPE_CHECKING:
+    # Renaming to avoid shadowing variables.
+    from sky import resources as resources_lib
+
+logger = sky_logging.init_logger(__name__)
+
+SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+
+
+@registry.CLOUD_REGISTRY.register()
+class SSH(kubernetes.Kubernetes):
+    """SSH cloud implementation.
+
+    This is used by SSH Node Pools in SkyPilot, which use Kubernetes to manage
+    the SSH clusters.
+
+    This cloud is a thin wrapper around Kubernetes that only uses contexts
+    starting with 'ssh-', which are managed through `sky ssh up` command.
+    """
+
+    _REPR = 'SSH'
+
+    # Keep track of contexts that have been logged as unreachable
+    logged_unreachable_contexts: Set[str] = set()
+
+    def __repr__(self):
+        return self._REPR
+
+    @classmethod
+    def _unsupported_features_for_resources(
+        cls, resources: 'resources_lib.Resources'
+    ) -> Dict[kubernetes.clouds.CloudImplementationFeatures, str]:
+        # Inherit all Kubernetes unsupported features
+        return super()._unsupported_features_for_resources(resources)
+
+    @classmethod
+    def get_ssh_node_pool_contexts(cls) -> List[str]:
+        """Get context names from ssh_node_pools.yaml file.
+
+        Reads the SSH node pools configuration file and returns
+        a list of context names by prepending 'ssh-' to each Node Pool name.
+
+        Returns:
+            A list of SSH Kubernetes context names derived from the Node Pools
+            in the SSH node pools file.
+        """
+        contexts = []
+
+        if os.path.exists(SSH_NODE_POOLS_PATH):
+            try:
+                with open(SSH_NODE_POOLS_PATH, 'r', encoding='utf-8') as f:
+                    ssh_config = yaml.safe_load(f)
+                if ssh_config:
+                    # Get cluster names and prepend 'ssh-' to match
+                    # context naming convention
+                    contexts = [
+                        f'ssh-{cluster_name}'
+                        for cluster_name in ssh_config.keys()
+                    ]
+            except Exception:  # pylint: disable=broad-except
+                # If there's an error reading the file, return empty list
+                pass
+
+        return contexts
+
+    def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
+        if region == kubernetes_adaptor.in_cluster_context_name():
+            # If running incluster, we set region to IN_CLUSTER_REGION
+            # since there is no context name available.
+            return region, zone
+
+        all_contexts = self.existing_allowed_contexts()
+
+        if region is not None and region not in all_contexts:
+            region_name = region.lstrip('ssh-')
+            available_contexts = [c.lstrip('ssh-') for c in all_contexts]
+            err_str = (f'SSH Node Pool {region_name!r} is not set up. '
+                       'Run `sky check` for more details. ')
+            if available_contexts:
+                err_str += f'Available node pools: {available_contexts}'
+            raise ValueError(err_str)
+        if zone is not None:
+            raise ValueError('SSH Node Pools do not support setting zone.')
+        return region, zone
+
+    @classmethod
+    @annotations.lru_cache(scope='global', maxsize=1)
+    def _ssh_log_skipped_contexts_once(
+            cls, skipped_contexts: Tuple[str, ...]) -> None:
+        """Log skipped contexts for only once.
+
+        We don't directly cache the result of _filter_existing_allowed_contexts
+        as the admin policy may update the allowed contexts.
+        """
+        if skipped_contexts:
+            count = len(set(skipped_contexts))
+            is_singular = count == 1
+            logger.warning(
+                f'SSH Node {("Pool" if is_singular else "Pools")} '
+                f'{set(skipped_contexts)!r} specified in '
+                f'{SSH_NODE_POOLS_PATH} {("has" if is_singular else "have")} '
+                'not been set up. Skipping '
+                f'{("that pool" if is_singular else "those pools")}. '
+                'Run `sky ssh up` to set up.')
+
+    @classmethod
+    def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
+        """Get existing allowed contexts that start with 'ssh-'.
+
+        Override the Kubernetes implementation to only return contexts that
+        start with 'ssh-', which are created by `sky ssh up`.
+
+        Returns contexts based on clusters defined in ~/.sky/ssh_node_pools.yaml
+        """
+        # Get all contexts from the Kubernetes implementation
+        all_contexts = kubernetes_utils.get_all_kube_context_names()
+        if not all_contexts:
+            return []
+
+        all_contexts = set(all_contexts)
+
+        # Filter for SSH contexts (those starting with 'ssh-')
+        ssh_contexts = [
+            context for context in all_contexts if context.startswith('ssh-')
+        ]
+
+        # Get contexts from SSH node pools file
+        allowed_contexts = cls.get_ssh_node_pool_contexts()
+
+        if allowed_contexts:
+            # Only include allowed contexts that exist
+            existing_contexts = []
+            skipped_contexts = []
+            for context in allowed_contexts:
+                if context in ssh_contexts:
+                    existing_contexts.append(context)
+                else:
+                    skipped_contexts.append(context)
+            if not silent:
+                cls._ssh_log_skipped_contexts_once(tuple(skipped_contexts))
+            return existing_contexts
+
+        # If no allowed_contexts found, return all SSH contexts
+        return ssh_contexts
+
+    @classmethod
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
+        """Check if the user has access credentials to SSH contexts."""
+        # Check for port forward dependencies - reuse Kubernetes implementation
+        reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
+        if reasons is not None:
+            formatted = '\n'.join(
+                [reasons[0]] +
+                [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
+            return (False, formatted)
+
+        # Get SSH contexts
+        try:
+            existing_allowed_contexts = cls.existing_allowed_contexts()
+        except Exception as e:  # pylint: disable=broad-except
+            return (False, f'Failed to get SSH contexts: {str(e)}')
+
+        if not existing_allowed_contexts:
+            return (False,
+                    'No SSH Node Pools are up. Run `sky ssh up` to set up '
+                    f'Node Pools from {SSH_NODE_POOLS_PATH}.')
+
+        # Check credentials for each context
+        ctx2text = {}
+        success = False
+        for context in existing_allowed_contexts:
+            suc, text = super()._check_single_context(context)
+            success = success or suc
+            ctx2text[context] = text
+
+        return success, ctx2text
+
+    @classmethod
+    def get_infras(cls) -> List[str]:
+        return [
+            f'{cls._REPR.lower()}/{c.lstrip("ssh-")}'
+            for c in cls.existing_allowed_contexts(silent=True)
+        ]
+
+    @classmethod
+    def display_name(cls) -> str:
+        return 'SSH Node Pools'
sky/clouds/vast.py
CHANGED
@@ -237,7 +237,8 @@ class Vast(clouds.Cloud):
                                          fuzzy_candidate_list, None)
 
     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has valid credentials for
         Vast's compute service. """
         try:
sky/clouds/vsphere.py
CHANGED
@@ -261,7 +261,8 @@ class Vsphere(clouds.Cloud):
                                          fuzzy_candidate_list, None)
 
     @classmethod
-    def _check_compute_credentials(
+    def _check_compute_credentials(
+            cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
         """Checks if the user has access credentials to
         vSphere's compute service."""
 
sky/core.py
CHANGED
@@ -17,6 +17,7 @@ from sky import global_user_state
 from sky import models
 from sky import optimizer
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.clouds import cloud as sky_cloud
@@ -470,7 +471,10 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
         message = ('Stopping spot instances is currently not supported on '
                    f'{resources.cloud}')
     else:
-
+        cloud_name = resources.cloud.display_name(
+        ) if resources.cloud else resources.cloud
+        message = ('Stopping is currently not supported for '
+                   f'{cloud_name}')
     return message
 
 
@@ -1006,20 +1010,31 @@ def storage_delete(name: str) -> None:
 # = Catalog Observe =
 # ===================
 @usage_lib.entrypoint
-def enabled_clouds() -> List[clouds.Cloud]:
+def enabled_clouds(workspace: Optional[str] = None) -> List[clouds.Cloud]:
+    if workspace is None:
+        workspace = skypilot_config.get_active_workspace()
     return global_user_state.get_cached_enabled_clouds(
-        sky_cloud.CloudCapability.COMPUTE)
+        sky_cloud.CloudCapability.COMPUTE, workspace=workspace)
 
 
 @usage_lib.entrypoint
 def realtime_kubernetes_gpu_availability(
     context: Optional[str] = None,
     name_filter: Optional[str] = None,
-    quantity_filter: Optional[int] = None
+    quantity_filter: Optional[int] = None,
+    is_ssh: Optional[bool] = None
 ) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
 
     if context is None:
-
+        # Include contexts from both Kubernetes and SSH clouds
+        kubernetes_contexts = clouds.Kubernetes.existing_allowed_contexts()
+        ssh_contexts = clouds.SSH.existing_allowed_contexts()
+        if is_ssh is None:
+            context_list = kubernetes_contexts + ssh_contexts
+        elif is_ssh:
+            context_list = ssh_contexts
+        else:
+            context_list = kubernetes_contexts
     else:
         context_list = [context]
 
@@ -1030,7 +1045,7 @@ def realtime_kubernetes_gpu_availability(
 ) -> List[models.RealtimeGpuAvailability]:
     counts, capacity, available = service_catalog.list_accelerator_realtime(
         gpus_only=True,
-        clouds='kubernetes',
+        clouds='ssh' if is_ssh else 'kubernetes',
         name_filter=name_filter,
         region_filter=context,
         quantity_filter=quantity_filter,
@@ -1062,16 +1077,19 @@ def realtime_kubernetes_gpu_availability(
         name_filter=name_filter,
         quantity_filter=quantity_filter), context_list)
 
+    cloud_identity = 'ssh' if is_ssh else 'kubernetes'
+    cloud_identity_capital = 'SSH' if is_ssh else 'Kubernetes'
+
     for ctx, queried in zip(context_list, parallel_queried):
         cumulative_count += len(queried)
         if len(queried) == 0:
             # don't add gpu results for clusters that don't have any
-            logger.debug(f'No gpus found in
+            logger.debug(f'No gpus found in {cloud_identity} cluster {ctx}')
             continue
         availability_lists.append((ctx, queried))
 
     if cumulative_count == 0:
-        err_msg = 'No GPUs found in any
+        err_msg = f'No GPUs found in any {cloud_identity_capital} clusters. '
         debug_msg = 'To further debug, run: sky check '
         if name_filter is not None:
             gpu_info_msg = f' {name_filter!r}'
@@ -1079,9 +1097,9 @@ def realtime_kubernetes_gpu_availability(
             gpu_info_msg += (' with requested quantity'
                              f' {quantity_filter}')
         err_msg = (f'Resources{gpu_info_msg} not found '
-                   'in
-        debug_msg = ('To show available accelerators on
-                     ' run: sky show-gpus --cloud
+                   f'in {cloud_identity_capital} clusters. ')
+        debug_msg = (f'To show available accelerators on {cloud_identity}, '
+                     f' run: sky show-gpus --cloud {cloud_identity} ')
         full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                         debug_msg)
         raise ValueError(full_err_msg)
@@ -1179,3 +1197,32 @@ def local_down() -> None:
             ux_utils.finishing_message('Local cluster removed.',
                                        log_path=log_path,
                                        is_local=True))
+
+
+@usage_lib.entrypoint
+def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
+    """Deploys or tears down a Kubernetes cluster on SSH targets.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_node_pools.yaml.
+            If None, the first cluster in the file is used.
+        cleanup: If True, clean up the cluster instead of deploying.
+    """
+    kubernetes_deploy_utils.deploy_ssh_cluster(
+        cleanup=cleanup,
+        infra=infra,
+    )
+
+
+def get_all_contexts() -> List[str]:
+    """Get all available contexts from Kubernetes and SSH clouds.
+
+    Returns:
+        List[str]: A list of all available context names.
+    """
+    kube_contexts = clouds.Kubernetes.existing_allowed_contexts()
+    ssh_contexts = clouds.SSH.get_ssh_node_pool_contexts()
+    # Ensure ssh_contexts are prefixed appropriately if not already
+    # For now, assuming get_ssh_node_pool_contexts already returns them
+    # in the desired format (e.g., 'ssh-my-cluster')
+    return sorted(list(set(kube_contexts + ssh_contexts)))
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/d2cdba64c9202dd7.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/d2cdba64c9202dd7.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-deda68c926e8d0bc.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/aHej19bZyl4hoHgrzPCn7/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div style="font-family:system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div style="line-height:48px"><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding-right:23px;font-size:24px;font-weight:500;vertical-align:top">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:28px">This page could not be found<!-- -->.</h2></div></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"aHej19bZyl4hoHgrzPCn7","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js
ADDED
@@ -0,0 +1 @@
+self.__BUILD_MANIFEST=function(s,c,e,t,a,r){return{__rewrites:{afterFiles:[],beforeFiles:[],fallback:[]},"/":["static/chunks/pages/index-6b0d9e5031b70c58.js"],"/_error":["static/chunks/pages/_error-1be831200e60c5c0.js"],"/clusters":[s,e,c,t,r,"static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js"],"/clusters/[cluster]":[s,e,c,t,a,r,"static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js"],"/clusters/[cluster]/[job]":[s,c,"static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js"],"/infra":[s,c,"static/chunks/pages/infra-e690d864aa00e2ea.js"],"/jobs":[s,e,c,t,a,"static/chunks/pages/jobs-73d5e0c369d00346.js"],"/jobs/[job]":[s,c,"static/chunks/pages/jobs/[job]-db6558a5ec687011.js"],"/users":[s,c,"static/chunks/pages/users-2d319455c3f1c3e2.js"],"/workspaces":["static/chunks/9f96d65d-5a3e4af68c26849e.js",s,e,"static/chunks/498-d7722313e5e5b4e6.js",c,t,a,"static/chunks/pages/workspaces-02a7b60f2ead275f.js"],sortedPages:["/","/_app","/_error","/clusters","/clusters/[cluster]","/clusters/[cluster]/[job]","/infra","/jobs","/jobs/[job]","/users","/workspaces"]}}("static/chunks/573-f17bd89d9f9118b3.js","static/chunks/480-ee58038f1a4afd5c.js","static/chunks/488-50d843fdb5396d32.js","static/chunks/734-5f5ce8f347b7f417.js","static/chunks/938-f347f6144075b0c8.js","static/chunks/578-7a4795009a56430c.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|