skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/ssh_node_pools/deploy/tunnel_utils.py
ADDED
@@ -0,0 +1,199 @@
+"""Utilities to setup SSH Tunnel"""
+import os
+import random
+import re
+import subprocess
+import sys
+from typing import Set
+
+import colorama
+
+from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools.deploy import utils as deploy_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Get the directory of this script
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def _get_used_localhost_ports() -> Set[int]:
+    """Get SSH port forwardings already in use on localhost"""
+    used_ports = set()
+
+    # Get ports from netstat (works on macOS and Linux)
+    try:
+        if sys.platform == 'darwin':
+            # macOS
+            result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
+                                    capture_output=True,
+                                    text=True,
+                                    check=False)
+        else:
+            # Linux and other Unix-like systems
+            result = subprocess.run(['netstat', '-tln'],
+                                    capture_output=True,
+                                    text=True,
+                                    check=False)
+
+        if result.returncode == 0:
+            # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
+            for line in result.stdout.splitlines():
+                if '127.0.0.1:' in line or 'localhost:' in line:
+                    match = re.search(r':(64\d\d)\s', line)
+                    if match:
+                        port = int(match.group(1))
+                        if 6400 <= port <= 6500:  # Only consider our range
+                            used_ports.add(port)
+    except (subprocess.SubprocessError, FileNotFoundError):
+        # If netstat fails, try another approach
+        pass
+
+    # Also check ports from existing kubeconfig entries
+    try:
+        result = subprocess.run([
+            'kubectl', 'config', 'view', '-o',
+            'jsonpath=\'{.clusters[*].cluster.server}\''
+        ],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+
+        if result.returncode == 0:
+            # Look for localhost URLs with ports
+            for url in result.stdout.split():
+                if 'localhost:' in url or '127.0.0.1:' in url:
+                    match = re.search(r':(\d+)', url)
+                    if match:
+                        port = int(match.group(1))
+                        if 6400 <= port <= 6500:  # Only consider our range
+                            used_ports.add(port)
+    except subprocess.SubprocessError:
+        pass
+
+    return used_ports
+
+
+def get_available_port(start: int = 6443, end: int = 6499) -> int:
+    """Get an available port in the given range not used by other tunnels"""
+    used_ports = _get_used_localhost_ports()
+
+    # Try to use port 6443 first if available for the first cluster
+    if start == 6443 and start not in used_ports:
+        return start
+
+    # Otherwise find any available port in the range
+    available_ports = list(set(range(start, end + 1)) - used_ports)
+
+    if not available_ports:
+        # If all ports are used, pick a random one from our range
+        # (we'll terminate any existing connection in the setup)
+        return random.randint(start, end)
+
+    # Sort to get deterministic allocation
+    available_ports.sort()
+    return available_ports[0]
+
+
+def setup_kubectl_ssh_tunnel(head_node,
+                             ssh_user,
+                             ssh_key,
+                             context_name,
+                             use_ssh_config=False):
+    """Set up kubeconfig exec credential plugin for SSH tunnel"""
+    logger.info(f'{colorama.Fore.YELLOW}➜ Setting up SSH tunnel for '
+                f'Kubernetes API access...{colorama.Style.RESET_ALL}')
+
+    # Get an available port for this cluster
+    port = get_available_port()
+
+    # Paths to scripts
+    tunnel_script = os.path.join(SCRIPT_DIR, 'tunnel', 'ssh-tunnel.sh')
+
+    # Make sure scripts are executable
+    os.chmod(tunnel_script, 0o755)
+
+    # Certificate files
+    client_cert_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                    f'{context_name}-cert.pem')
+    client_key_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                   f'{context_name}-key.pem')
+
+    # Update kubeconfig to use localhost with the selected port
+    deploy_utils.run_command([
+        'kubectl', 'config', 'set-cluster', context_name,
+        f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
+    ])
+
+    # Build the exec args list based on auth method
+    exec_args = [
+        '--exec-command', tunnel_script, '--exec-api-version',
+        'client.authentication.k8s.io/v1beta1'
+    ]
+
+    # Set credential TTL to force frequent tunnel checks
+    ttl_seconds = 30
+
+    # Verify if we have extracted certificate data files
+    has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
+        client_key_file)
+    if has_cert_files:
+        logger.info(f'{colorama.Fore.GREEN}Client certificate data extracted '
+                    'and will be used for authentication'
+                    f'{colorama.Style.RESET_ALL}')
+
+    if use_ssh_config:
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+            [
+                '--exec-arg=--context', f'--exec-arg={context_name}',
+                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
+                '--exec-arg=--host', f'--exec-arg={head_node}'
+            ])
+    else:
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+            [
+                '--exec-arg=--context', f'--exec-arg={context_name}',
+                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                f'--exec-arg={ttl_seconds}', '--exec-arg=--host',
+                f'--exec-arg={head_node}', '--exec-arg=--user',
+                f'--exec-arg={ssh_user}', '--exec-arg=--ssh-key',
+                f'--exec-arg={ssh_key}'
+            ])
+
+    logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel configured through '
+                'kubectl credential plugin on port '
+                f'{port}{colorama.Style.RESET_ALL}')
+    logger.info('Your kubectl connection is now tunneled through SSH '
+                f'(port {port}).')
+    logger.info('This tunnel will be automatically established when needed.')
+    logger.info(f'Credential TTL set to {ttl_seconds}s to ensure tunnel '
+                'health is checked frequently.')
+    return port
+
+
+def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
+    """Clean up the SSH tunnel for a specific context"""
+    logger.info(f'{colorama.Fore.YELLOW}➜ Cleaning up SSH tunnel for '
+                f'`{cluster_name}`...{colorama.Style.RESET_ALL}')
+
+    # Path to cleanup script
+    cleanup_script = os.path.join(SCRIPT_DIR, 'tunnel', 'cleanup-tunnel.sh')
+
+    # Make sure script is executable
+    if os.path.exists(cleanup_script):
+        os.chmod(cleanup_script, 0o755)
+
+        # Run the cleanup script
+        subprocess.run([cleanup_script, context_name],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=False)
+        logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel for `{cluster_name}` '
+                    f'cleaned up.{colorama.Style.RESET_ALL}')
+    else:
+        logger.error(f'{colorama.Fore.YELLOW}Cleanup script not found: '
+                     f'{cleanup_script}{colorama.Style.RESET_ALL}')
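For orientation, a minimal usage sketch of the helpers above. The head-node address, user, key path, and context name are hypothetical placeholders, not values from this release; the real call sites presumably live in sky/ssh_node_pools/deploy/deploy.py.

# Illustrative sketch only; all values below are hypothetical.
from sky.ssh_node_pools.deploy import tunnel_utils

# Points the 'ssh-my-pool' kubeconfig entry at https://127.0.0.1:<port> and
# registers ssh-tunnel.sh as the exec credential plugin, so kubectl opens
# the SSH tunnel on demand whenever credentials are refreshed.
port = tunnel_utils.setup_kubectl_ssh_tunnel(
    head_node='10.0.0.1',            # hypothetical head node address
    ssh_user='ubuntu',               # hypothetical SSH user
    ssh_key='/path/to/ssh_key',      # hypothetical private key path
    context_name='ssh-my-pool')      # kubectl context for the node pool
print(f'Kubernetes API now reachable via https://127.0.0.1:{port}')

# Later, tear the tunnel down via cleanup-tunnel.sh for the same context.
tunnel_utils.cleanup_kubectl_ssh_tunnel('my-pool', 'ssh-my-pool')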
sky/ssh_node_pools/deploy/utils.py
ADDED
@@ -0,0 +1,173 @@
+"""Utilities for SSH Node Pools Deployment"""
+import os
+import subprocess
+from typing import List, Optional
+
+import colorama
+
+from sky import sky_logging
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def check_ssh_cluster_dependencies(
+        raise_error: bool = True) -> Optional[List[str]]:
+    """Checks if the dependencies for ssh cluster are installed.
+
+    Args:
+        raise_error: set to true when the dependency needs to be present.
+        set to false for `sky check`, where reason strings are compiled
+        at the end.
+
+    Returns: the reasons list if there are missing dependencies.
+    """
+    # error message
+    jq_message = ('`jq` is required to setup ssh cluster.')
+
+    # save
+    reasons = []
+    required_binaries = []
+
+    # Ensure jq is installed
+    try:
+        subprocess.run(['jq', '--version'],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=True)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        required_binaries.append('jq')
+        reasons.append(jq_message)
+
+    if required_binaries:
+        reasons.extend([
+            'On Debian/Ubuntu, install the missing dependenc(ies) with:',
+            f'  $ sudo apt install {" ".join(required_binaries)}',
+            'On MacOS, install with: ',
+            f'  $ brew install {" ".join(required_binaries)}',
+        ])
+        if raise_error:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('\n'.join(reasons))
+        return reasons
+    return None
+
+
+def run_command(cmd, shell=False, silent=False):
+    """Run a local command and return the output."""
+    process = subprocess.run(cmd,
+                             shell=shell,
+                             capture_output=True,
+                             text=True,
+                             check=False)
+    if process.returncode != 0:
+        if not silent:
+            logger.error(f'{colorama.Fore.RED}Error executing command: {cmd}\n'
+                         f'{colorama.Style.RESET_ALL}STDOUT: {process.stdout}\n'
+                         f'STDERR: {process.stderr}')
+        return None
+    return process.stdout.strip()
+
+
+def get_effective_host_ip(hostname: str) -> str:
+    """Get the effective IP for a hostname from SSH config."""
+    try:
+        result = subprocess.run(['ssh', '-G', hostname],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+        if result.returncode == 0:
+            for line in result.stdout.splitlines():
+                if line.startswith('hostname '):
+                    return line.split(' ', 1)[1].strip()
+    except Exception:  # pylint: disable=broad-except
+        pass
+    return hostname  # Return the original hostname if lookup fails
+
+
+def run_remote(node,
+               cmd,
+               user='',
+               ssh_key='',
+               connect_timeout=30,
+               use_ssh_config=False,
+               print_output=False,
+               use_shell=False,
+               silent=False):
+    """Run a command on a remote machine via SSH."""
+    ssh_cmd: List[str]
+    if use_ssh_config:
+        # Use SSH config for connection parameters
+        ssh_cmd = ['ssh', node, cmd]
+    else:
+        # Use explicit parameters
+        ssh_cmd = [
+            'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
+            '-o', f'ConnectTimeout={connect_timeout}', '-o',
+            'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
+        ]
+
+        if ssh_key:
+            if not os.path.isfile(ssh_key):
+                raise ValueError(f'SSH key not found: {ssh_key}')
+            ssh_cmd.extend(['-i', ssh_key])
+
+        ssh_cmd.append(f'{user}@{node}' if user else node)
+        ssh_cmd.append(cmd)
+
+    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
+    process = subprocess.run(subprocess_cmd,
+                             capture_output=True,
+                             text=True,
+                             check=False,
+                             shell=use_shell)
+    if process.returncode != 0:
+        if not silent:
+            logger.error(f'{colorama.Fore.RED}Error executing command {cmd} on '
+                         f'{node}:{colorama.Style.RESET_ALL} {process.stderr}')
+        return None
+    if print_output:
+        logger.info(process.stdout)
+    return process.stdout.strip()
+
+
+def ensure_directory_exists(path):
+    """Ensure the directory for the specified file path exists."""
+    directory = os.path.dirname(path)
+    if directory and not os.path.exists(directory):
+        os.makedirs(directory, exist_ok=True)
+
+
+def check_gpu(node, user, ssh_key, use_ssh_config=False, is_head=False):
+    """Check if a node has a GPU."""
+    cmd = ('command -v nvidia-smi &> /dev/null && '
+           'nvidia-smi --query-gpu=gpu_name --format=csv,noheader')
+    result = run_remote(node,
+                        cmd,
+                        user,
+                        ssh_key,
+                        use_ssh_config=use_ssh_config,
+                        silent=True)
+    if result is not None:
+        # Check that all GPUs have the same type.
+        # Currently, SkyPilot does not support heterogeneous GPU node
+        # (i.e. more than one GPU type on the same node).
+        gpu_names = {
+            line.strip() for line in result.splitlines() if line.strip()
+        }
+        if not gpu_names:
+            # This can happen if nvidia-smi returns only whitespace.
+            # Set result to None to ensure this function returns False.
+            result = None
+        elif len(gpu_names) > 1:
+            # Sort for a deterministic error message.
+            sorted_gpu_names = sorted(list(gpu_names))
+            raise RuntimeError(
+                f'Node {node} has more than one GPU types '
+                f'({", ".join(sorted_gpu_names)}). '
+                'SkyPilot does not support a node with multiple GPU types.')
+        else:
+            logger.info(f'{colorama.Fore.YELLOW}➜ GPU {list(gpu_names)[0]} '
+                        f'detected on {"head" if is_head else "worker"} '
+                        f'node ({node}).{colorama.Style.RESET_ALL}')
+    return result is not None
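A similar sketch for the deployment helpers above; the host alias, user, and key path are hypothetical placeholders.

# Illustrative sketch only; host and credentials are hypothetical.
from sky.ssh_node_pools.deploy import utils as deploy_utils

# Fail fast if a required local dependency (currently jq) is missing.
deploy_utils.check_ssh_cluster_dependencies(raise_error=True)

# Resolve an SSH-config alias to its effective address, then run a command.
ip = deploy_utils.get_effective_host_ip('my-node')
kernel = deploy_utils.run_remote('my-node',
                                 'uname -r',
                                 user='ubuntu',
                                 ssh_key='/path/to/ssh_key')

# True iff nvidia-smi reports GPUs of a single type on the node;
# raises if the node mixes GPU types.
has_gpu = deploy_utils.check_gpu('my-node', 'ubuntu', '/path/to/ssh_key')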
sky/ssh_node_pools/server.py
CHANGED
@@ -4,12 +4,11 @@ from typing import Any, Dict, List
 
 import fastapi
 
-from sky import core as sky_core
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
-from sky.ssh_node_pools import core
+from sky.ssh_node_pools import core
 from sky.utils import common_utils
 
 router = fastapi.APIRouter()
@@ -19,7 +18,7 @@ router = fastapi.APIRouter()
 def get_ssh_node_pools() -> Dict[str, Any]:
     """Get all SSH Node Pool configurations."""
     try:
-        return
+        return core.get_all_pools()
     except Exception as e:
         raise fastapi.HTTPException(
             status_code=500,
@@ -31,7 +30,7 @@ def get_ssh_node_pools() -> Dict[str, Any]:
 def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
     """Update SSH Node Pool configurations."""
     try:
-
+        core.update_pools(pools_config)
         return {'status': 'success'}
     except Exception as e:
         raise fastapi.HTTPException(status_code=400,
@@ -43,7 +42,7 @@ def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
 def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
     """Delete a SSH Node Pool configuration."""
     try:
-        if
+        if core.delete_pool(pool_name):
             return {'status': 'success'}
         else:
             raise fastapi.HTTPException(
@@ -70,8 +69,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
                 detail='Missing key_name or key_file')
 
         key_content = await key_file.read()
-        key_path =
-            key_content.decode())
+        key_path = core.upload_ssh_key(key_name, key_content.decode())
 
         return {'status': 'success', 'key_path': key_path}
     except fastapi.HTTPException:
@@ -87,7 +85,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
 def list_ssh_keys() -> List[str]:
     """List available SSH keys."""
     try:
-        return
+        return core.list_ssh_keys()
     except Exception as e:
         exception_msg = common_utils.format_exception(e)
         raise fastapi.HTTPException(
@@ -104,7 +102,7 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
         request_body=ssh_up_body,
-        func=
+        func=core.ssh_up,
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -129,7 +127,7 @@ async def deploy_ssh_node_pool_general(
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
         request_body=ssh_up_body,
-        func=
+        func=core.ssh_up,
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -155,7 +153,7 @@ async def down_ssh_node_pool(request: fastapi.Request,
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
-        func=
+        func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -183,7 +181,7 @@ async def down_ssh_node_pool_general(
         request_id=request.state.request_id,
         request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
         request_body=ssh_up_body,
-        func=
+        func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
         schedule_type=requests_lib.ScheduleType.LONG,
     )
 
@@ -206,7 +204,7 @@ def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
     try:
         # Call ssh_status to check the context
         context_name = f'ssh-{pool_name}'
-        is_ready, reason =
+        is_ready, reason = core.ssh_status(context_name)
 
         # Strip ANSI escape codes from the reason text
         def strip_ansi_codes(text):
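The handler changes above follow one pattern: cheap reads call into sky.ssh_node_pools.core synchronously, while long-running operations are queued on the request executor (func=core.ssh_up) rather than awaited inline. A generic, self-contained sketch of that pattern, using hypothetical routes and stand-in functions rather than SkyPilot's actual ones:

# Generic illustration of the delegation pattern; names and routes are
# hypothetical and not SkyPilot's actual API.
import asyncio
from typing import Any, Dict

import fastapi

router = fastapi.APIRouter()

_POOLS: Dict[str, Any] = {}  # stand-in for the on-disk pool config


def _load_pools() -> Dict[str, Any]:
    """Cheap synchronous read, analogous to core.get_all_pools()."""
    return dict(_POOLS)


async def _deploy_pool(pool_name: str) -> None:
    """Stand-in for a long-running deployment such as core.ssh_up."""
    await asyncio.sleep(0)


@router.get('/example/pools')  # hypothetical route
def get_pools() -> Dict[str, Any]:
    try:
        return _load_pools()
    except Exception as e:  # pylint: disable=broad-except
        raise fastapi.HTTPException(status_code=500, detail=str(e)) from e


@router.post('/example/pools/{pool_name}/deploy')  # hypothetical route
async def deploy_pool(pool_name: str) -> Dict[str, str]:
    # Long-running work is scheduled instead of awaited inline; SkyPilot
    # does this via executor.schedule_request(..., func=core.ssh_up, ...).
    asyncio.create_task(_deploy_pool(pool_name))
    return {'status': 'scheduled'}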
sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py}
RENAMED
@@ -5,13 +5,14 @@ import subprocess
 from typing import Any, Callable, Dict, List, Optional
 import uuid
 
+import colorama
 import yaml
 
+from sky import sky_logging
+from sky.ssh_node_pools import constants
 from sky.utils import ux_utils
 
-
-RED = '\033[0;31m'
-NC = '\033[0m'  # No color
+logger = sky_logging.init_logger(__name__)
 
 
 def check_host_in_ssh_config(hostname: str) -> bool:
@@ -92,7 +93,8 @@ def load_ssh_targets(file_path: str) -> Dict[str, Any]:
 def get_cluster_config(
         targets: Dict[str, Any],
         cluster_name: Optional[str] = None,
-        file_path: str = DEFAULT_SSH_NODE_POOLS_PATH
+        file_path: str = constants.DEFAULT_SSH_NODE_POOLS_PATH
+) -> Dict[str, Any]:
     """Get configuration for specific clusters or all clusters."""
     if not targets:
         with ux_utils.print_exception_no_traceback():
@@ -186,8 +188,9 @@ def prepare_hosts_info(
         else:
             # It's a dict with potential overrides
             if 'ip' not in host:
-
-
+                logger.warning(f'{colorama.Fore.RED}Warning: Host missing'
+                               f'\'ip\' field, skipping: {host}'
+                               f'{colorama.Style.RESET_ALL}')
                 continue
 
             # Check if this is an SSH config hostname
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -523,6 +523,14 @@ available_node_types:
               resourceFieldRef:
                 containerName: ray-node
                 resource: requests.memory
+          # Disable Ray memory monitor to prevent Ray's memory manager
+          # from interfering with kubernetes resource manager.
+          # If ray memory monitor is enabled, the ray memory monitor kills
+          # the running job is the job uses more than 95% of allocated memory,
+          # even if the job is not misbehaving or using its full allocated memory.
+          # This behavior does not give a chance for k8s scheduler to evict the pod.
+          - name: RAY_memory_monitor_refresh_ms
+            value: "0"
           {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
           - name: {{ key }}
             value: {{ value }}
sky/templates/slurm-ray.yml.j2
ADDED
@@ -0,0 +1,85 @@
+cluster_name: {{cluster_name_on_cloud}}
+
+# The maximum number of workers nodes to launch in addition to the head node.
+max_workers: {{num_nodes - 1}}
+upscaling_speed: {{num_nodes - 1}}
+idle_timeout_minutes: 60
+
+provider:
+  type: external
+  module: sky.provision.slurm
+
+  cluster: {{slurm_cluster}}
+  partition: {{slurm_partition}}
+
+  ssh:
+    hostname: {{ssh_hostname}}
+    port: {{ssh_port}}
+    user: {{ssh_user}}
+    private_key: {{slurm_private_key}}
+    {% if slurm_proxy_command is not none %}
+    proxycommand: {{slurm_proxy_command | tojson }}
+    {% endif %}
+
+auth:
+  ssh_user: {{ssh_user}}
+  # TODO(jwj): Modify this tmp workaround.
+  # ssh_private_key: {{ssh_private_key}}
+  ssh_private_key: {{slurm_private_key}}
+  ssh_proxy_command: {{slurm_proxy_command | tojson }}
+
+available_node_types:
+  ray_head_default:
+    resources: {}
+    node_config:
+      # From clouds/slurm.py::Slurm.make_deploy_resources_variables.
+      instance_type: {{instance_type}}
+      disk_size: {{disk_size}}
+      cpus: {{cpus}}
+      memory: {{memory}}
+      accelerator_type: {{accelerator_type}}
+      accelerator_count: {{accelerator_count}}
+
+      # TODO: more configs that is required by the provisioner to create new
+      # instances on the FluffyCloud:
+      # sky/provision/fluffycloud/instance.py::run_instances
+
+head_node_type: ray_head_default
+
+# Format: `REMOTE_PATH : LOCAL_PATH`
+file_mounts: {
+  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
+  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
+{%- for remote_path, local_path in credentials.items() %}
+  "{{remote_path}}": "{{local_path}}",
+{%- endfor %}
+}
+
+rsync_exclude: []
+
+initialization_commands: []
+
+# List of shell commands to run to set up nodes.
+# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
+# connection, which is expensive. Try your best to co-locate commands into fewer
+# items!
+#
+# Increment the following for catching performance bugs easier:
+# current num items (num SSH connections): 1
+setup_commands:
+  - {%- for initial_setup_command in initial_setup_commands %}
+    {{ initial_setup_command }}
+    {%- endfor %}
+    {{ setup_sky_dirs_commands }}
+    {{ conda_installation_commands }}
+    {{ skypilot_wheel_installation_commands }}
+    {{ copy_skypilot_templates_commands }}
+
+head_node: {}
+worker_nodes: {}
+
+# These fields are required for external cloud providers.
+head_setup_commands: []
+worker_setup_commands: []
+cluster_synced_files: []
+file_mounts_sync_continuously: False
sky/templates/vast-ray.yml.j2
CHANGED
sky/users/model.conf
CHANGED
sky/users/permission.py
CHANGED
@@ -69,6 +69,26 @@ class PermissionService:
             'Enforcer should be initialized after _lazy_initialize()')
         return self.enforcer
 
+    def _get_plugin_rbac_rules(self):
+        """Get RBAC rules from loaded plugins.
+
+        Returns:
+            Dictionary of plugin RBAC rules, or empty dict if plugins module
+            is not available or no rules are defined.
+        """
+        try:
+            # pylint: disable=import-outside-toplevel
+            from sky.server import plugins as server_plugins
+            return server_plugins.get_plugin_rbac_rules()
+        except ImportError:
+            # Plugin module not available (e.g., not running as server)
+            logger.debug(
+                'Plugin module not available, skipping plugin RBAC rules')
+            return {}
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(f'Failed to get plugin RBAC rules: {e}')
+            return {}
+
     def _maybe_initialize_basic_auth_user(self) -> None:
         """Initialize basic auth user if it is enabled."""
         basic_auth = os.environ.get(constants.SKYPILOT_INITIAL_BASIC_AUTH)
@@ -101,9 +121,12 @@ class PermissionService:
         enforcer = self._ensure_enforcer()
         existing_policies = enforcer.get_policy()
 
+        # Get plugin RBAC rules dynamically
+        plugin_rules = self._get_plugin_rbac_rules()
+
         # If we already have policies for the expected roles, skip
         # initialization
-        role_permissions = rbac.get_role_permissions()
+        role_permissions = rbac.get_role_permissions(plugin_rules=plugin_rules)
         expected_policies = []
         for role, permissions in role_permissions.items():
             if permissions['permissions'] and 'blocklist' in permissions[
|