skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_ssh_node_pools.py → sky/ssh_node_pools/deploy/deploy.py

@@ -3,36 +3,24 @@
 import base64
 import concurrent.futures as cf
 import os
-import random
 import re
 import shlex
 import shutil
-import subprocess
-import sys
 import tempfile
-from typing import List, Optional
+from typing import List, Optional

 import colorama
 import yaml

 from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools import utils as ssh_utils
+from sky.ssh_node_pools.deploy import tunnel_utils
+from sky.ssh_node_pools.deploy import utils as deploy_utils
 from sky.utils import rich_utils
 from sky.utils import ux_utils
-from sky.utils.kubernetes import ssh_utils
-
-# Colors for nicer UX
-RED = '\033[0;31m'
-GREEN = '\033[0;32m'
-YELLOW = '\033[1;33m'
-WARNING_YELLOW = '\x1b[33m'
-NC = '\033[0m'  # No color
-DIM = colorama.Style.DIM
-CYAN = colorama.Fore.CYAN
-RESET_ALL = colorama.Style.RESET_ALL

-
-SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
-NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
+RESET_ALL = colorama.Style.RESET_ALL

 # Get the directory of this script
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -40,113 +28,14 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 logger = sky_logging.init_logger(__name__)


-def run_command(cmd, shell=False, silent=False):
-    """Run a local command and return the output."""
-    process = subprocess.run(cmd,
-                             shell=shell,
-                             capture_output=True,
-                             text=True,
-                             check=False)
-    if process.returncode != 0:
-        if not silent:
-            logger.error(f'{RED}Error executing command: {cmd}{NC}\n'
-                         f'STDOUT: {process.stdout}\n'
-                         f'STDERR: {process.stderr}')
-        return None
-    return process.stdout.strip()
-
-
-def get_effective_host_ip(hostname: str) -> str:
-    """Get the effective IP for a hostname from SSH config."""
-    try:
-        result = subprocess.run(['ssh', '-G', hostname],
-                                capture_output=True,
-                                text=True,
-                                check=False)
-        if result.returncode == 0:
-            for line in result.stdout.splitlines():
-                if line.startswith('hostname '):
-                    return line.split(' ', 1)[1].strip()
-    except Exception:  # pylint: disable=broad-except
-        pass
-    return hostname  # Return the original hostname if lookup fails
-
-
-def run_remote(node,
-               cmd,
-               user='',
-               ssh_key='',
-               connect_timeout=30,
-               use_ssh_config=False,
-               print_output=False,
-               use_shell=False,
-               silent=False):
-    """Run a command on a remote machine via SSH.
-
-    silent is used for gpu checking (will show error logs when no gpus are found)"""
-    ssh_cmd: List[str]
-    if use_ssh_config:
-        # Use SSH config for connection parameters
-        ssh_cmd = ['ssh', node, cmd]
-    else:
-        # Use explicit parameters
-        ssh_cmd = [
-            'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
-            '-o', f'ConnectTimeout={connect_timeout}', '-o',
-            'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
-        ]
-
-        if ssh_key:
-            if not os.path.isfile(ssh_key):
-                raise ValueError(f'SSH key not found: {ssh_key}')
-            ssh_cmd.extend(['-i', ssh_key])
-
-        ssh_cmd.append(f'{user}@{node}' if user else node)
-        ssh_cmd.append(cmd)
-
-    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
-    process = subprocess.run(subprocess_cmd,
-                             capture_output=True,
-                             text=True,
-                             check=False,
-                             shell=use_shell)
-    if process.returncode != 0:
-        if not silent:
-            logger.error(f'{RED}Error executing command {cmd} on {node}:{NC} '
-                         f'{process.stderr}')
-        return None
-    if print_output:
-        logger.info(process.stdout)
-    return process.stdout.strip()
-
-
-def create_askpass_script(password):
-    """Create an askpass script block for sudo with password."""
-    if not password:
-        return ''
-
-    return f"""
-# Create temporary askpass script
-ASKPASS_SCRIPT=$(mktemp)
-trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
-cat > $ASKPASS_SCRIPT << EOF
-#!/bin/bash
-echo {password}
-EOF
-chmod 700 $ASKPASS_SCRIPT
-# Use askpass
-export SUDO_ASKPASS=$ASKPASS_SCRIPT
-"""
-
-
 def progress_message(message):
     """Show a progress message."""
-    logger.info(f'{YELLOW}➜ {message}{
+    logger.info(f'{colorama.Fore.YELLOW}➜ {message}{RESET_ALL}')


 def success_message(message):
     """Show a success message."""
-    logger.info(f'{GREEN}✔ {message}{
+    logger.info(f'{colorama.Fore.GREEN}✔ {message}{RESET_ALL}')


 def force_update_status(message):
@@ -154,283 +43,61 @@
     rich_utils.force_update_status(ux_utils.spinner_message(message))


-def
-
-
-
-                        use_ssh_config=False):
-    """Uninstall k3s and clean up the state on a server node."""
-    force_update_status(f'Cleaning up head node ({node})...')
-    cmd = f"""
-        {askpass_block}
-        echo 'Uninstalling k3s...' &&
-        sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
-        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
-    """
-    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
-    if result is None:
-        logger.error(f'{RED}Failed to clean up head node ({node}).{NC}')
-    else:
-        success_message(f'Node {node} cleaned up successfully.')
+def run(cleanup: bool = False,
+        infra: Optional[str] = None,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH):
+    """Deploy a Kubernetes cluster on SSH targets.

+    This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
+    Kubernetes cluster on the specified machines.

-
-
-
-
-
-
-    force_update_status(f'Cleaning up worker node ({node})...')
-    cmd = f"""
-        {askpass_block}
-        echo 'Uninstalling k3s...' &&
-        sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
-        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+    Args:
+        cleanup: Whether to clean up the cluster instead of deploying.
+        infra: Name of the cluster in ssh_node_pools.yaml to use.
+            If None, the first cluster in the file will be used.
+        kubeconfig_path: Path to save the Kubernetes configuration file.
+            If None, the default ~/.kube/config will be used.
     """
-
-    if
-
-    else:
-        success_message(f'Node {node} cleaned up successfully.')
-
-
-def start_agent_node(node,
-                     master_addr,
-                     k3s_token,
-                     user,
-                     ssh_key,
-                     askpass_block,
-                     use_ssh_config=False):
-    """Start a k3s agent node.
-    Returns: if the start is successful, and if the node has a GPU."""
-    logger.info(f'Deploying worker node ({node}).')
-    cmd = f"""
-        {askpass_block}
-        curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
-        K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
-    """
-    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
-    if result is None:
-        logger.error(
-            f'{RED}✗ Failed to deploy K3s on worker node ({node}).{NC}')
-        return node, False, False
-    success_message(
-        f'SkyPilot runtime successfully deployed on worker node ({node}).')
-    # Check if worker node has a GPU
-    if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
-        logger.info(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
-        return node, True, True
-    return node, True, False
-
-
-def check_gpu(node, user, ssh_key, use_ssh_config=False):
-    """Check if a node has a GPU."""
-    cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
-    result = run_remote(node,
-                        cmd,
-                        user,
-                        ssh_key,
-                        use_ssh_config=use_ssh_config,
-                        silent=True)
-    return result is not None
+    deploy_utils.check_ssh_cluster_dependencies()
+    action = 'Cleanup' if cleanup else 'Deployment'
+    msg_str = f'Initializing SSH Node Pools {action}...'

+    with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
+        try:
+            deploy_multiple_clusters(infra=infra,
+                                     cleanup=cleanup,
+                                     kubeconfig_path=kubeconfig_path)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(str(e))
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Failed to deploy SkyPilot on some Node Pools.') from e

-
-
-
-    if directory and not os.path.exists(directory):
-        os.makedirs(directory, exist_ok=True)
-
-
-def get_used_localhost_ports() -> Set[int]:
-    """Get SSH port forwardings already in use on localhost"""
-    used_ports = set()
-
-    # Get ports from netstat (works on macOS and Linux)
-    try:
-        if sys.platform == 'darwin':
-            # macOS
-            result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
-                                    capture_output=True,
-                                    text=True,
-                                    check=False)
-        else:
-            # Linux and other Unix-like systems
-            result = subprocess.run(['netstat', '-tln'],
-                                    capture_output=True,
-                                    text=True,
-                                    check=False)
-
-        if result.returncode == 0:
-            # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
-            for line in result.stdout.splitlines():
-                if '127.0.0.1:' in line or 'localhost:' in line:
-                    match = re.search(r':(64\d\d)\s', line)
-                    if match:
-                        port = int(match.group(1))
-                        if 6400 <= port <= 6500:  # Only consider our range
-                            used_ports.add(port)
-    except (subprocess.SubprocessError, FileNotFoundError):
-        # If netstat fails, try another approach
-        pass
-
-    # Also check ports from existing kubeconfig entries
-    try:
-        result = subprocess.run([
-            'kubectl', 'config', 'view', '-o',
-            'jsonpath=\'{.clusters[*].cluster.server}\''
-        ],
-                                capture_output=True,
-                                text=True,
-                                check=False)
-
-        if result.returncode == 0:
-            # Look for localhost URLs with ports
-            for url in result.stdout.split():
-                if 'localhost:' in url or '127.0.0.1:' in url:
-                    match = re.search(r':(\d+)', url)
-                    if match:
-                        port = int(match.group(1))
-                        if 6400 <= port <= 6500:  # Only consider our range
-                            used_ports.add(port)
-    except subprocess.SubprocessError:
-        pass
-
-    return used_ports
-
-
-def get_available_port(start: int = 6443, end: int = 6499) -> int:
-    """Get an available port in the given range that's not used by other tunnels"""
-    used_ports = get_used_localhost_ports()
-
-    # Try to use port 6443 first if available for the first cluster
-    if start == 6443 and start not in used_ports:
-        return start
-
-    # Otherwise find any available port in the range
-    available_ports = list(set(range(start, end + 1)) - used_ports)
-
-    if not available_ports:
-        # If all ports are used, pick a random one from our range
-        # (we'll terminate any existing connection in the setup)
-        return random.randint(start, end)
-
-    # Sort to get deterministic allocation
-    available_ports.sort()
-    return available_ports[0]
-
-
-def setup_kubectl_ssh_tunnel(head_node,
-                             ssh_user,
-                             ssh_key,
-                             context_name,
-                             use_ssh_config=False):
-    """Set up kubeconfig exec credential plugin for SSH tunnel"""
-    progress_message('Setting up SSH tunnel for Kubernetes API access...')
-
-    # Get an available port for this cluster
-    port = get_available_port()
-
-    # Paths to scripts
-    tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')
-
-    # Make sure scripts are executable
-    os.chmod(tunnel_script, 0o755)
-
-    # Certificate files
-    client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
-                                    f'{context_name}-cert.pem')
-    client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
-                                   f'{context_name}-key.pem')
-
-    # Update kubeconfig to use localhost with the selected port
-    run_command([
-        'kubectl', 'config', 'set-cluster', context_name,
-        f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
-    ])
-
-    # Build the exec args list based on auth method
-    exec_args = [
-        '--exec-command', tunnel_script, '--exec-api-version',
-        'client.authentication.k8s.io/v1beta1'
-    ]
-
-    # Set credential TTL to force frequent tunnel checks
-    ttl_seconds = 30
-
-    # Verify if we have extracted certificate data files
-    has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
-        client_key_file)
-    if has_cert_files:
+    # Add empty line for ux-purposes.
+    logger.info('')
+    if cleanup:
         logger.info(
-
-
-
-    if use_ssh_config:
-        run_command(
-            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
-            [
-                '--exec-arg=--context', f'--exec-arg={context_name}',
-                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
-                f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
-                '--exec-arg=--host', f'--exec-arg={head_node}'
-            ])
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools cleaned up successfully.'))
     else:
-
-
-
-
-
-
-
-
-
-
-        success_message(
-            f'SSH tunnel configured through kubectl credential plugin on port {port}'
-        )
-        logger.info(
-            f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
-        )
-        logger.info(
-            f'{GREEN}This tunnel will be automatically established when needed.{NC}'
-        )
-        logger.info(
-            f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
-        )
-
-    return port
-
-
-def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
-    """Clean up the SSH tunnel for a specific context"""
-    progress_message(f'Cleaning up SSH tunnel for `{cluster_name}`...')
-
-    # Path to cleanup script
-    cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
-
-    # Make sure script is executable
-    if os.path.exists(cleanup_script):
-        os.chmod(cleanup_script, 0o755)
-
-        # Run the cleanup script
-        subprocess.run([cleanup_script, context_name],
-                       stdout=subprocess.DEVNULL,
-                       stderr=subprocess.DEVNULL,
-                       check=False)
-
-        success_message(f'SSH tunnel for `{cluster_name}` cleaned up.')
-    else:
-        logger.error(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
+        logger.info(
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools set up successfully. ',
+                follow_up_message=(
+                    f'Run `{colorama.Style.BRIGHT}'
+                    f'sky check ssh'
+                    f'{colorama.Style.RESET_ALL}` to verify access, '
+                    f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
+                    f'{colorama.Style.RESET_ALL}` to launch a cluster.')))


-def
+def deploy_multiple_clusters(
         infra: Optional[str],
-        ssh_node_pools_file: str =
-        kubeconfig_path:
+        ssh_node_pools_file: str = constants.DEFAULT_SSH_NODE_POOLS_PATH,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH,
         cleanup: bool = True):

-    kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
+    kubeconfig_path = kubeconfig_path or constants.DEFAULT_KUBECONFIG_PATH
     kubeconfig_path = os.path.expanduser(kubeconfig_path)

     failed_clusters = []
@@ -445,7 +112,7 @@ def deploy_clusters(
     num_clusters = len(clusters_config)
     cluster_names = list(clusters_config.keys())
     cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
-    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{
+    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{RESET_ALL}')

     # Process each cluster
     for cluster_name, cluster_config in clusters_config.items():
@@ -457,15 +124,15 @@ def deploy_clusters(

         if not hosts_info:
             logger.warning(
-                f'{RED}Error: No valid hosts found
-
+                f'{colorama.Fore.RED}Error: No valid hosts found '
+                f'for cluster {cluster_name!r}. Skipping.{RESET_ALL}')
             continue

         context_name = f'ssh-{cluster_name}'

         # Check cluster history
-        os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
-        history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
+        os.makedirs(constants.NODE_POOLS_INFO_DIR, exist_ok=True)
+        history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                          f'{context_name}-history.yaml')

         history = None
@@ -517,7 +184,7 @@ def deploy_clusters(
             password = head_host['password']

             # Deploy this cluster
-            unsuccessful_workers =
+            unsuccessful_workers = deploy_single_cluster(
                 cluster_name,
                 head_node,
                 worker_nodes,
@@ -556,67 +223,70 @@ def deploy_clusters(
         except Exception as e:  # pylint: disable=broad-except
             reason = str(e)
             failed_clusters.append((cluster_name, reason))
+            action = 'cleaning' if cleanup else 'deploying'
             logger.debug(
-                f'Error
+                f'Error {action} SSH Node Pool `{cluster_name}`: {reason}')

     if failed_clusters:
         action = 'clean' if cleanup else 'deploy'
-        msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {
-        msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {
+        msg = f'{colorama.Fore.GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {RESET_ALL}'
+        msg += f'{colorama.Fore.RED}Failed to {action} {len(failed_clusters)} cluster(s): {RESET_ALL}'
         for cluster_name, reason in failed_clusters:
             msg += f'\n  {cluster_name}: {reason}'
         raise RuntimeError(msg)


-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def deploy_single_cluster(cluster_name,
+                          head_node,
+                          worker_nodes,
+                          ssh_user,
+                          ssh_key,
+                          context_name,
+                          password,
+                          head_use_ssh_config,
+                          worker_use_ssh_config,
+                          kubeconfig_path,
+                          cleanup,
+                          worker_hosts=None,
+                          history_worker_nodes=None,
+                          history_workers_info=None,
+                          history_use_ssh_config=None) -> List[str]:
     """Deploy or clean up a single Kubernetes cluster.

     Returns: List of unsuccessful worker nodes.
     """
-    history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
+    history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                      f'{context_name}-history.yaml')
-    cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
+    cert_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                   f'{context_name}-cert.pem')
-    key_file_path = os.path.join(NODE_POOLS_INFO_DIR,
-
+    key_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                 f'{context_name}-key.pem')
+    tunnel_log_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                         f'{context_name}-tunnel.log')

     # Generate the askpass block if password is provided
     askpass_block = create_askpass_script(password)

     # Token for k3s
-
+    # TODO (kyuds): make this configurable?
+    k3s_token = constants.K3S_TOKEN

     # Pre-flight checks
     logger.info(f'Checking SSH connection to head node ({head_node})...')
-    result = run_remote(
-
-
-
-
-
-
-
-    if not cleanup and result is None:
+    result = deploy_utils.run_remote(
+        head_node,
+        f'echo \'SSH connection successful ({head_node})\'',
+        ssh_user,
+        ssh_key,
+        use_ssh_config=head_use_ssh_config)
+    if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
                 f'Failed to SSH to head node ({head_node}). '
                 f'Please check the SSH configuration and logs for more details.'
             )
+    elif result.startswith('SSH connection successful'):
+        success_message(f'SSH connection established to head node {head_node}.')

     # Checking history
     history_exists = (history_worker_nodes is not None and
@@ -670,47 +340,58 @@ def deploy_cluster(cluster_name,
                 ))

         # Clean up head node
-
-
-
-
-
+        cleanup_node(head_node,
+                     ssh_user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=head_use_ssh_config,
+                     is_worker=False)
     # Clean up worker nodes
     force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
     with cf.ThreadPoolExecutor() as executor:
-        executor.map(lambda kwargs:
+        executor.map(lambda kwargs: cleanup_node(**kwargs),
                      worker_nodes_to_cleanup)

     with cf.ThreadPoolExecutor() as executor:
-        executor.map(lambda cmd: run_command(cmd, shell=True),
+        executor.map(lambda cmd: deploy_utils.run_command(cmd, shell=True),
                      remove_worker_cmds)

     if cleanup:
-
         # Remove the context from local kubeconfig if it exists
         if os.path.isfile(kubeconfig_path):
             logger.debug(
                 f'Removing context {context_name!r} from local kubeconfig...')
-            run_command(
-
-
-
-            run_command(
-
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-context', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-cluster', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-user', context_name],
+                shell=False,
+                silent=True)

             # Update the current context to the first available context
-            contexts = run_command([
+            contexts = deploy_utils.run_command([
                 'kubectl', 'config', 'view', '-o',
                 'jsonpath=\'{.contexts[0].name}\''
             ],
-
+                shell=False,
+                silent=True)
             if contexts:
-                run_command(
-
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'use-context', contexts],
+                    shell=False,
+                    silent=True)
             else:
                 # If no context is available, simply unset the current context
-                run_command(
-
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'unset', 'current-context'],
+                    shell=False,
+                    silent=True)

             logger.debug(
                 f'Context {context_name!r} removed from local kubeconfig.')
@@ -721,7 +402,7 @@ def deploy_cluster(cluster_name,

         # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
         # will restart the ssh tunnel if it's not running.
-        cleanup_kubectl_ssh_tunnel(cluster_name, context_name)
+        tunnel_utils.cleanup_kubectl_ssh_tunnel(cluster_name, context_name)

         success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
         return []
@@ -735,12 +416,12 @@ def deploy_cluster(cluster_name,
           '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
           f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
           'fi')
-    result = run_remote(head_node,
-
-
-
-
-
+    result = deploy_utils.run_remote(head_node,
+                                     shlex.quote(cmd),
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config,
+                                     use_shell=True)
     if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
@@ -749,10 +430,9 @@ def deploy_cluster(cluster_name,

     # Get effective IP for master node if using SSH config - needed for workers to connect
     if head_use_ssh_config:
-        effective_master_ip = get_effective_host_ip(head_node)
-        logger.info(
-
-        )
+        effective_master_ip = deploy_utils.get_effective_host_ip(head_node)
+        logger.info(f'{colorama.Fore.GREEN}Resolved head node {head_node} '
+                    f'to {effective_master_ip} from SSH config{RESET_ALL}')
     else:
         effective_master_ip = head_node

@@ -780,11 +460,11 @@ def deploy_cluster(cluster_name,
        exit 1
     fi
     """
-    result = run_remote(head_node,
-
-
-
-
+    result = deploy_utils.run_remote(head_node,
+                                     cmd,
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config)
     if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
@@ -794,19 +474,19 @@ def deploy_cluster(cluster_name,

     # Check if head node has a GPU
     install_gpu = False
-    if check_gpu(head_node,
-
-
-
-
+    if deploy_utils.check_gpu(head_node,
+                              ssh_user,
+                              ssh_key,
+                              use_ssh_config=head_use_ssh_config,
+                              is_head=True):
         install_gpu = True

     # Fetch the head node's internal IP (this will be passed to worker nodes)
-    master_addr = run_remote(head_node,
-
-
-
-
+    master_addr = deploy_utils.run_remote(head_node,
+                                          'hostname -I | awk \'{print $1}\'',
+                                          ssh_user,
+                                          ssh_key,
+                                          use_ssh_config=head_use_ssh_config)
     if master_addr is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
@@ -824,7 +504,7 @@ def deploy_cluster(cluster_name,
                 i] in history_workers_info:
             logger.info(
                 f'{colorama.Style.DIM}✔ SkyPilot runtime already deployed on worker node {node}. '
-                f'Skipping...{
+                f'Skipping...{RESET_ALL}')
             return node, True, False
         worker_user = worker_hosts[i]['user']
         worker_key = worker_hosts[i]['identity_file']
@@ -881,10 +561,10 @@ def deploy_cluster(cluster_name,
             'IdentitiesOnly=yes', '-i', ssh_key,
             f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
         ]
-        run_command(scp_cmd, shell=False)
+        deploy_utils.run_command(scp_cmd, shell=False)

         # Create the directory for the kubeconfig file if it doesn't exist
-        ensure_directory_exists(kubeconfig_path)
+        deploy_utils.ensure_directory_exists(kubeconfig_path)

         # Create empty kubeconfig if it doesn't exist
         if not os.path.isfile(kubeconfig_path):
@@ -993,10 +673,12 @@ def deploy_cluster(cluster_name,
                     )
                 else:
                     logger.error(
-                        f'{RED}Error:
+                        f'{colorama.Fore.RED}Error: '
+                        f'Certificate file is empty{RESET_ALL}')
             except Exception as e:  # pylint: disable=broad-except
-                logger.error(
-
+                logger.error(f'{colorama.Fore.RED}'
+                             f'Error processing certificate data: {e}'
+                             f'{RESET_ALL}')

             if client_key_data:
                 # Decode base64 data and save as PEM
@@ -1077,28 +759,34 @@ def deploy_cluster(cluster_name,
                         'Warning: Key may not be in proper PEM format'
                     )
                 else:
-                    logger.error(f'{RED}Error:
+                    logger.error(f'{colorama.Fore.RED}Error: '
+                                 f'Key file is empty{RESET_ALL}')
             except Exception as e:  # pylint: disable=broad-except
-                logger.error(f'{RED}
+                logger.error(f'{colorama.Fore.RED}'
+                             f'Error processing key data: {e}'
+                             f'{RESET_ALL}')

         # First check if context name exists and delete it if it does
         # TODO(romilb): Should we throw an error here instead?
-        run_command(
-
-
-
-
-
-
-
-
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-context', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-cluster', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-user', context_name],
+            shell=False,
+            silent=True)

         # Merge the configurations using kubectl
         merged_config = os.path.join(temp_dir, 'merged_config')
         os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
         with open(merged_config, 'w', encoding='utf-8') as merged_file:
             kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
-            result = run_command(kubectl_cmd, shell=False)
+            result = deploy_utils.run_command(kubectl_cmd, shell=False)
             if result:
                 merged_file.write(result)

@@ -1106,15 +794,17 @@ def deploy_cluster(cluster_name,
         shutil.move(merged_config, kubeconfig_path)

     # Set the new context as the current context
-    run_command(
-
+    deploy_utils.run_command(
+        ['kubectl', 'config', 'use-context', context_name],
+        shell=False,
+        silent=True)

     # Always set up SSH tunnel since we assume only port 22 is accessible
-    setup_kubectl_ssh_tunnel(head_node,
-
-
-
-
+    tunnel_utils.setup_kubectl_ssh_tunnel(head_node,
+                                          ssh_user,
+                                          ssh_key,
+                                          context_name,
+                                          use_ssh_config=head_use_ssh_config)

     logger.debug(f'kubectl configured with new context \'{context_name}\'.')
     success_message(f'SkyPilot runtime is up [{cluster_name}].')
@@ -1144,13 +834,14 @@ def deploy_cluster(cluster_name,
         done
         echo 'GPU operator installed successfully.'
         """
-        result = run_remote(head_node,
-
-
-
-
+        result = deploy_utils.run_remote(head_node,
+                                         cmd,
+                                         ssh_user,
+                                         ssh_key,
+                                         use_ssh_config=head_use_ssh_config)
         if result is None:
-            logger.error(f'{RED}Failed to install GPU Operator.
+            logger.error(f'{colorama.Fore.RED}Failed to install GPU Operator.'
+                         f'{RESET_ALL}')
         else:
             success_message('GPU Operator installed.')
     else:
@@ -1158,7 +849,7 @@ def deploy_cluster(cluster_name,

     # The env var KUBECONFIG ensures sky check uses the right kubeconfig
     os.environ['KUBECONFIG'] = kubeconfig_path
-    run_command(['sky', 'check', 'ssh'], shell=False)
+    deploy_utils.run_command(['sky', 'check', 'ssh'], shell=False)

     success_message('SkyPilot configured successfully.')

@@ -1167,11 +858,95 @@ def deploy_cluster(cluster_name,
             f'"{worker}"' for worker in unsuccessful_workers
         ]

-        logger.info(
-
-
-
+        logger.info(f'{colorama.Fore.YELLOW}'
+                    'Failed to deploy Kubernetes on the following nodes: '
+                    f'{", ".join(quoted_unsuccessful_workers)}. Please check '
+                    f'the logs for more details.{RESET_ALL}')
     else:
         success_message(f'Node Pool `{cluster_name}` deployed successfully.')

     return unsuccessful_workers
+
+
+def create_askpass_script(password):
+    """Create an askpass script block for sudo with password."""
+    if not password:
+        return ''
+
+    return f"""
+# Create temporary askpass script
+ASKPASS_SCRIPT=$(mktemp)
+trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
+cat > $ASKPASS_SCRIPT << EOF
+#!/bin/bash
+echo {password}
+EOF
+chmod 700 $ASKPASS_SCRIPT
+# Use askpass
+export SUDO_ASKPASS=$ASKPASS_SCRIPT
+"""
+
+
+def cleanup_node(node,
+                 user,
+                 ssh_key,
+                 askpass_block,
+                 use_ssh_config=False,
+                 is_worker=True):
+    """Uninstall k3s and clean up the state on a node."""
+    ntype = 'worker' if is_worker else 'head'
+    force_update_status(f'Cleaning up {ntype} node ({node})...')
+    script = f'k3s{"-agent" if is_worker else ""}-uninstall.sh'
+    cmd = f"""
+        {askpass_block}
+        echo 'Uninstalling k3s...' &&
+        sudo -A /usr/local/bin/{script} || true &&
+        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+    """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}Failed to clean up {ntype} '
+                     f'node ({node}).{RESET_ALL}')
+    else:
+        success_message(f'Node {node} cleaned up successfully.')
+
+
+def start_agent_node(node,
+                     master_addr,
+                     k3s_token,
+                     user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=False):
+    """Start a k3s agent node.
+    Returns: if the start is successful, and whether the node has a GPU."""
+    logger.info(f'Deploying worker node ({node}).')
+    cmd = f"""
+        {askpass_block}
+        curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
+        K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
+    """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}✗ Failed to deploy K3s on worker '
+                     f'node ({node}).{RESET_ALL}')
+        return node, False, False
+    success_message(
+        f'SkyPilot runtime successfully deployed on worker node ({node}).')
+    # Check if worker node has a GPU
+    if deploy_utils.check_gpu(node,
+                              user,
+                              ssh_key,
+                              use_ssh_config=use_ssh_config):
+        logger.info(f'{colorama.Fore.YELLOW}GPU detected on worker node '
+                    f'({node}).{RESET_ALL}')
+        return node, True, True
+    return node, True, False