skypilot-nightly 1.0.0.dev20250523__py3-none-any.whl → 1.0.0.dev20250526__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +3 -1
- sky/check.py +335 -170
- sky/cli.py +56 -13
- sky/client/cli.py +56 -13
- sky/client/sdk.py +54 -10
- sky/clouds/gcp.py +19 -3
- sky/core.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +6 -0
- sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-96a715a6fb01e228.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-3b5aad09a25f64b7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-abb7d744ecf15109.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-48dc8d67d4b60be1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-b8acf6e6735323a2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-bbf436f41381e169.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7733c960685b4385.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/28558d57108b05ae.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +606 -543
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +6 -3
- sky/provision/fluidstack/instance.py +1 -0
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +28 -0
- sky/server/server.py +59 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +107 -11
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/db_utils.py +53 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +166 -147
- sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
- sky/utils/kubernetes/ssh-tunnel.sh +20 -28
- sky/utils/log_utils.py +4 -0
- sky/utils/schemas.py +54 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +295 -0
- sky/workspaces/server.py +62 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/RECORD +79 -63
- sky/dashboard/out/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- /sky/dashboard/out/_next/static/{ECKwDNS9v9y3_IKFZ2lpp → 7GEgRyZKRaSnYZCV1Jwol}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py
CHANGED
@@ -14,6 +14,8 @@ from typing import Any, Dict, List, Optional, Set
 
 import yaml
 
+from sky.utils import ux_utils
+
 # Colors for nicer UX
 RED = '\033[0;31m'
 GREEN = '\033[0;32m'
@@ -117,21 +119,19 @@ def parse_args():
 def load_ssh_targets(file_path: str) -> Dict[str, Any]:
     """Load SSH targets from YAML file."""
     if not os.path.exists(file_path):
-        …
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'SSH Node Pools file not found: {file_path}')
 
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             targets = yaml.load(f, Loader=UniqueKeySafeLoader)
             return targets
     except yaml.constructor.ConstructorError as e:
-        …
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(e.note) from e
     except (yaml.YAMLError, IOError, OSError) as e:
-        …
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
 
 
 def check_host_in_ssh_config(hostname: str) -> bool:
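The hunks above and below repeatedly replace print-plus-sys.exit error handling with exceptions raised inside ux_utils.print_exception_no_traceback(). A minimal sketch of how such a context manager can work, as an illustration only (this is not SkyPilot's actual ux_utils code):

import contextlib
import sys


@contextlib.contextmanager
def print_exception_no_traceback():
    """Suppress the traceback so only the exception message reaches the user."""
    original_limit = getattr(sys, 'tracebacklimit', 1000)
    sys.tracebacklimit = 0
    yield
    # Restored only on the no-exception path, so an exception escaping the
    # `with` block is printed without its stack frames.
    sys.tracebacklimit = original_limit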
@@ -181,31 +181,28 @@ def get_cluster_config(targets: Dict[str, Any],
                        file_path: Optional[str] = None) -> Dict[str, Any]:
     """Get configuration for specific clusters or all clusters."""
     if not targets:
-        …
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No clusters defined in SSH Node Pools file {file_path}')
 
     if cluster_name:
         if cluster_name not in targets:
-            …
-            f'…
-            …
-                  file=sys.stderr)
-            sys.exit(1)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Cluster {cluster_name!r} not found in '
+                                 f'SSH Node Pools file {file_path}')
         return {cluster_name: targets[cluster_name]}
 
     # Return all clusters if no specific cluster is specified
     return targets
 
 
-def prepare_hosts_info(…
+def prepare_hosts_info(cluster_name: str,
+                       cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
     """Prepare list of hosts with resolved user, identity_file, and password."""
     if 'hosts' not in cluster_config or not cluster_config['hosts']:
-        …
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No hosts defined in cluster {cluster_name} configuration')
 
     # Get cluster-level defaults
     cluster_user = cluster_config.get('user', '')
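For context, an illustrative SSH Node Pools file that these helpers operate on. The field names (user, identity_file, hosts, ip, use_ssh_config) are taken from the code in this diff; the concrete values are made up:

import yaml

EXAMPLE_NODE_POOLS = """
my-cluster:
  user: ubuntu
  identity_file: ~/.ssh/id_rsa
  hosts:
    - 192.168.1.10          # bare string entry
    - ip: 192.168.1.11      # mapping entry with per-host overrides
      use_ssh_config: false
"""

targets = yaml.safe_load(EXAMPLE_NODE_POOLS)
cluster_config = targets['my-cluster']
# Mirrors the str-vs-dict host handling in main() below.
host_ips = [
    h if isinstance(h, str) else h['ip'] for h in cluster_config['hosts']
]
print(cluster_config['user'], host_ips)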
@@ -627,6 +624,9 @@ def main():
     kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
     global_use_ssh_config = args.use_ssh_config
 
+    failed_clusters = []
+    successful_clusters = []
+
     # Print cleanup mode marker if applicable
     if args.cleanup:
         print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
@@ -636,23 +636,20 @@ def main():
     # Using command line arguments - legacy mode
     if args.ssh_key and not os.path.isfile(
            args.ssh_key) and not global_use_ssh_config:
-        …
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'SSH key not found: {args.ssh_key}')
 
     if not os.path.isfile(args.ips_file):
-        …
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'IPs file not found: {args.ips_file}')
 
     with open(args.ips_file, 'r', encoding='utf-8') as f:
         hosts = [line.strip() for line in f if line.strip()]
 
     if not hosts:
-        …
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                'Hosts file is empty or not formatted correctly.')
 
     head_node = hosts[0]
     worker_nodes = hosts[1:]
@@ -688,108 +685,132 @@ def main():
 
     # Process each cluster
     for cluster_name, cluster_config in clusters_config.items():
-        …
-        hosts_info = prepare_hosts_info(cluster_config)
-        …
-        if not hosts_info:
+        try:
+            print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
             print(
-                f'{…
-            )
-        …
+                f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
+            hosts_info = prepare_hosts_info(cluster_name, cluster_config)
+
+            if not hosts_info:
+                print(
+                    f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
+                )
+                continue
+
+            # Generate a unique context name for each cluster
+            context_name = args.context_name
+            if context_name == 'default':
+                context_name = 'ssh-' + cluster_name
+
+            # Check cluster history
+            os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
+            history_yaml_file = os.path.join(
+                NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
+
+            history = None
+            if os.path.exists(history_yaml_file):
+                print(
+                    f'{YELLOW}Loading history from {history_yaml_file}{NC}')
+                with open(history_yaml_file, 'r', encoding='utf-8') as f:
+                    history = yaml.safe_load(f)
+            else:
+                print(f'{YELLOW}No history found for {context_name}.{NC}')
+
+            history_workers_info = None
+            history_worker_nodes = None
+            history_use_ssh_config = None
+            # Do not support changing anything besides hosts for now
+            if history is not None:
+                for key in ['user', 'identity_file', 'password']:
+                    if history.get(key) != cluster_config.get(key):
+                        raise ValueError(
+                            f'Cluster configuration has changed for field {key!r}. '
+                            f'Previous value: {history.get(key)}, '
+                            f'Current value: {cluster_config.get(key)}')
+                history_hosts_info = prepare_hosts_info(
+                    cluster_name, history)
+                if history_hosts_info[0] != hosts_info[0]:
                     raise ValueError(
-                        f'Cluster configuration has changed for …
-                        f'Previous value: {…
-                        f'Current value: {…
-        …
+                        f'Cluster configuration has changed for master node. '
+                        f'Previous value: {history_hosts_info[0]}, '
+                        f'Current value: {hosts_info[0]}')
+                history_workers_info = history_hosts_info[1:] if len(
+                    history_hosts_info) > 1 else []
+                history_worker_nodes = [
+                    h['ip'] for h in history_workers_info
+                ]
+                history_use_ssh_config = [
+                    h.get('use_ssh_config', False)
+                    for h in history_workers_info
+                ]
+
+            # Use the first host as the head node and the rest as worker nodes
+            head_host = hosts_info[0]
+            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+            head_node = head_host['ip']
+            worker_nodes = [h['ip'] for h in worker_hosts]
+            ssh_user = head_host['user']
+            ssh_key = head_host['identity_file']
+            head_use_ssh_config = global_use_ssh_config or head_host.get(
+                'use_ssh_config', False)
+            worker_use_ssh_config = [
+                global_use_ssh_config or h.get('use_ssh_config', False)
+                for h in worker_hosts
             ]
+            password = head_host['password']
+
+            # Deploy this cluster
+            unsuccessful_workers = deploy_cluster(
+                head_node,
+                worker_nodes,
+                ssh_user,
+                ssh_key,
+                context_name,
+                password,
+                head_use_ssh_config,
+                worker_use_ssh_config,
+                kubeconfig_path,
+                args.cleanup,
+                worker_hosts=worker_hosts,
+                history_worker_nodes=history_worker_nodes,
+                history_workers_info=history_workers_info,
+                history_use_ssh_config=history_use_ssh_config)
+
+            if not args.cleanup:
+                successful_hosts = []
+                for host in cluster_config['hosts']:
+                    if isinstance(host, str):
+                        host_node = host
+                    else:
+                        host_node = host['ip']
+                    if host_node not in unsuccessful_workers:
+                        successful_hosts.append(host)
+                cluster_config['hosts'] = successful_hosts
+                with open(history_yaml_file, 'w', encoding='utf-8') as f:
+                    print(
+                        f'{YELLOW}Writing history to {history_yaml_file}{NC}'
+                    )
+                    yaml.dump(cluster_config, f)
 
-        …
-            head_node,
-            worker_nodes,
-            ssh_user,
-            ssh_key,
-            context_name,
-            password,
-            head_use_ssh_config,
-            worker_use_ssh_config,
-            kubeconfig_path,
-            args.cleanup,
-            worker_hosts=worker_hosts,
-            history_worker_nodes=history_worker_nodes,
-            history_workers_info=history_workers_info,
-            history_use_ssh_config=history_use_ssh_config)
-
-        if not args.cleanup:
-            successful_hosts = []
-            for host in cluster_config['hosts']:
-                if isinstance(host, str):
-                    host_node = host
-                else:
-                    host_node = host['ip']
-                if host_node not in unsuccessful_workers:
-                    successful_hosts.append(host)
-            cluster_config['hosts'] = successful_hosts
-            with open(history_yaml_file, 'w', encoding='utf-8') as f:
-                print(f'{YELLOW}Writing history to {history_yaml_file}{NC}')
-                yaml.dump(cluster_config, f)
-
-        print(
-            f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
-        )
+            print(
+                f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
+            )
+            successful_clusters.append(cluster_name)
+        except Exception as e:  # pylint: disable=broad-except
+            reason = str(e)
+            failed_clusters.append((cluster_name, reason))
+            print(
+                f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
+            )  # Print for internal logging
+
+    if failed_clusters:
+        action = 'clean' if args.cleanup else 'deploy'
+        msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
+        msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
+        for cluster_name, reason in failed_clusters:
+            msg += f'\n {cluster_name}: {reason}'
+        raise RuntimeError(msg)
 
 
 def deploy_cluster(head_node,
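The per-cluster try/except introduced above changes the script's failure mode: one bad Node Pool no longer aborts the rest, and failures are collected and reported once at the end. A self-contained rerun of that summary logic, with hypothetical cluster names:

successful_clusters = ['pool-a']
failed_clusters = [('pool-b', 'SSH key not found: ~/.ssh/missing_key')]
cleanup = False

if failed_clusters:
    action = 'clean' if cleanup else 'deploy'
    msg = (f'Successfully {action}ed {len(successful_clusters)} cluster(s) '
           f'({", ".join(successful_clusters)}). ')
    msg += f'Failed to {action} {len(failed_clusters)} cluster(s): '
    for cluster_name, reason in failed_clusters:
        msg += f'\n {cluster_name}: {reason}'
    # main() raises RuntimeError(msg); printed here for illustration.
    print(msg)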
@@ -839,11 +860,11 @@ def deploy_cluster(head_node,
         # For SkySSHUpLineProcessor
         print_output=True)
     if result is None:
-        …
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to SSH to head node ({head_node}). '
+                f'Please check the SSH configuration and logs for more details.'
+            )
 
     # Checking history
     history_exists = (history_worker_nodes is not None and
@@ -981,10 +1002,10 @@ def deploy_cluster(head_node,
         print_output=True,
         use_shell=True)
     if result is None:
-        …
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to setup TCP forwarding on head node ({head_node}). '
+                f'Please check the SSH configuration.')
 
     # Get effective IP for master node if using SSH config - needed for workers to connect
     if head_use_ssh_config:
@@ -1024,9 +1045,9 @@ def deploy_cluster(head_node,
         ssh_key,
         use_ssh_config=head_use_ssh_config)
     if result is None:
-        …
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to deploy K3s on head node ({head_node}).')
     success_message(f'K3s deployed on head node ({head_node}).')
 
     # Check if head node has a GPU
@@ -1045,11 +1066,9 @@ def deploy_cluster(head_node,
         ssh_key,
         use_ssh_config=head_use_ssh_config)
     if master_addr is None:
-        …
-              f'…
-        …
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+                               f'Please check the SSH configuration.')
     print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
 
     # Step 2: Install k3s on worker nodes and join them to the master node
sky/utils/kubernetes/kubernetes_deploy_utils.py
CHANGED
@@ -26,6 +26,48 @@ logger = sky_logging.init_logger(__name__)
 DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
 
 
+def check_ssh_cluster_dependencies(
+        raise_error: bool = True) -> Optional[List[str]]:
+    """Checks if the dependencies for ssh cluster are installed.
+
+    Args:
+        raise_error: set to true when the dependency needs to be present.
+            set to false for `sky check`, where reason strings are compiled
+            at the end.
+
+    Returns: the reasons list if there are missing dependencies.
+    """
+    # error message
+    jq_message = ('`jq` is required to setup ssh cluster.')
+
+    # save
+    reasons = []
+    required_binaries = []
+
+    # Ensure jq is installed
+    try:
+        subprocess.run(['jq', '--version'],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=True)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        required_binaries.append('jq')
+        reasons.append(jq_message)
+
+    if required_binaries:
+        reasons.extend([
+            'On Debian/Ubuntu, install the missing dependenc(ies) with:',
+            f'  $ sudo apt install {" ".join(required_binaries)}',
+            'On MacOS, install with: ',
+            f'  $ brew install {" ".join(required_binaries)}',
+        ])
+        if raise_error:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('\n'.join(reasons))
+        return reasons
+    return None
+
+
 def deploy_ssh_cluster(cleanup: bool = False,
                        infra: Optional[str] = None,
                        kubeconfig_path: Optional[str] = None):
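Per its docstring, the new helper serves two call sites: the deploy path raises immediately, while `sky check` collects reason strings for later display. A usage sketch (the calling code shown here is assumed, not part of this diff):

from sky.utils.kubernetes import kubernetes_deploy_utils

# Deploy path: raises RuntimeError with install hints if `jq` is missing.
kubernetes_deploy_utils.check_ssh_cluster_dependencies()

# `sky check` path: collect reasons for display instead of raising.
reasons = kubernetes_deploy_utils.check_ssh_cluster_dependencies(
    raise_error=False)
if reasons:
    print('\n'.join(reasons))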
@@ -41,6 +83,8 @@ def deploy_ssh_cluster(cleanup: bool = False,
        kubeconfig_path: Path to save the Kubernetes configuration file.
            If None, the default ~/.kube/config will be used.
     """
+    check_ssh_cluster_dependencies()
+
     # Prepare command to call deploy_remote_cluster.py script
     # TODO(romilb): We should move this to a native python method/class call
     # instead of invoking a script with subprocess.
@@ -81,9 +125,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
         cmd=deploy_command,
         log_path=log_path,
         require_outputs=True,
-        stream_logs=False,
+        stream_logs=False,
         line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
-                                                       is_local=…
+                                                       is_local=False),
         cwd=cwd,
         env=env)
 
@@ -91,9 +135,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
         success = True
     else:
         with ux_utils.print_exception_no_traceback():
-            log_hint = ux_utils.log_path_hint(log_path, is_local=…
-            raise RuntimeError('Failed to deploy SkyPilot on …
-                               f'…
+            log_hint = ux_utils.log_path_hint(log_path, is_local=False)
+            raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
+                               f'{log_hint}'
                                f'\nError: {stderr}')
 
     if success:
sky/utils/kubernetes/ssh-tunnel.sh
CHANGED
@@ -188,14 +188,17 @@ generate_credentials_json() {
     debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
 
     # Check if we can create proper JSON with `jq`
-    if command -v jq &>/dev/null; then
-        …
+    if ! command -v jq &>/dev/null; then
+        echo "jq is not installed. Please install jq to use this script." >&2
+        exit 1
+    fi
+    debug_log "Using jq for JSON formatting"
+
+    # Create a temporary file for the JSON output to avoid shell escaping issues
+    local TEMP_JSON_FILE=$(mktemp)
+
+    # Write the JSON to the temporary file using jq for proper JSON formatting
+    cat > "$TEMP_JSON_FILE" << EOL
 {
   "apiVersion": "client.authentication.k8s.io/v1beta1",
   "kind": "ExecCredential",
@@ -207,25 +210,14 @@ generate_credentials_json() {
 }
 EOL
 
-    …
-    else
-        debug_log "jq is not available, using simpler formatting method"
-        …
-        # Alternative approach: encode with base64 and use the token field instead
-        # This works because kubectl will decode token data properly
-        local combined_data=$(echo -n "${client_cert_data}:${client_key_data}" | base64 | tr -d '\n')
-        …
-        echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"$combined_data\",\"expirationTimestamp\":\"$expiration_time\"}}"
-        …
-        debug_log "Sent certificate data as encoded token instead of direct certificate fields"
-    fi
+    # Read the JSON from the file
+    local json_response=$(cat "$TEMP_JSON_FILE")
+
+    # Clean up
+    rm -f "$TEMP_JSON_FILE"
+
+    # Output the JSON
+    echo "$json_response"
     else
         # Fallback to token-based credential for tunnel-only authentication
         echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
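Both branches of generate_credentials_json emit a Kubernetes ExecCredential object. The token fallback from the last context line above, rebuilt as a Python dict for readability (the expiration timestamp is a placeholder value):

import json

exec_credential = {
    'apiVersion': 'client.authentication.k8s.io/v1beta1',
    'kind': 'ExecCredential',
    'status': {
        'token': 'k8s-ssh-tunnel-token',
        'expirationTimestamp': '2025-05-26T00:00:00Z',  # placeholder
    },
}
print(json.dumps(exec_credential))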
@@ -384,4 +376,4 @@ fi
 
 # Return valid credential format with certificates if available
 generate_credentials_json
-exit 0
+exit 0
sky/utils/log_utils.py
CHANGED
@@ -497,6 +497,10 @@ class SkySSHUpLineProcessor(LineProcessor):
                 f'✗ Failed to setup TCP forwarding on head node {node_name}.'
                 f'{colorama.Style.RESET_ALL}')
 
+        if 'Error in deploying SSH Target' in log_line:
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.RED}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
     def __exit__(self, except_type: Optional[Type[BaseException]],
                  except_value: Optional[BaseException],
                  traceback: Optional[types.TracebackType]) -> None:
sky/utils/schemas.py
CHANGED
@@ -1173,6 +1173,54 @@ def get_config_schema():
         }
     }
 
+    workspace_schema = {'type': 'string'}
+
+    allowed_workspace_cloud_names = list(
+        service_catalog.ALL_CLOUDS) + ['cloudflare']
+    # Create pattern for non-GCP clouds (all clouds except gcp)
+    non_gcp_clouds = [
+        cloud for cloud in allowed_workspace_cloud_names
+        if cloud.lower() != 'gcp'
+    ]
+    non_gcp_cloud_regex = '|'.join(non_gcp_clouds)
+    workspaces_schema = {
+        'type': 'object',
+        'required': [],
+        # each key is a workspace name
+        'additionalProperties': {
+            'type': 'object',
+            'additionalProperties': False,
+            'patternProperties': {
+                # Pattern for non-GCP clouds - only allows 'disabled' property
+                f'^({non_gcp_cloud_regex})$': {
+                    'type': 'object',
+                    'additionalProperties': False,
+                    'properties': {
+                        'disabled': {
+                            'type': 'boolean'
+                        }
+                    },
+                },
+            },
+            'properties': {
+                # Explicit definition for GCP allows both project_id and
+                # disabled
+                'gcp': {
+                    'type': 'object',
+                    'properties': {
+                        'project_id': {
+                            'type': 'string'
+                        },
+                        'disabled': {
+                            'type': 'boolean'
+                        }
+                    },
+                    'additionalProperties': False,
+                },
+            },
+        },
+    }
+
     provision_configs = {
         'type': 'object',
         'required': [],
@@ -1199,6 +1247,10 @@ def get_config_schema():
         'required': [],
         'additionalProperties': False,
         'properties': {
+            # TODO Replace this with whatever syang cooks up
+            'workspace': {
+                'type': 'string',
+            },
             'jobs': controller_resources_schema,
             'serve': controller_resources_schema,
             'allowed_clouds': allowed_clouds,
@@ -1206,6 +1258,8 @@ def get_config_schema():
             'docker': docker_configs,
             'nvidia_gpus': gpu_configs,
             'api_server': api_server,
+            'active_workspace': workspace_schema,
+            'workspaces': workspaces_schema,
             'provision': provision_configs,
             **cloud_configs,
         },
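A hedged example of a config the new workspaces schema accepts: a gcp entry may set project_id and disabled, while any other cloud key may only set disabled. The snippet assumes the standard jsonschema package and shortens the generated non-GCP regex to two clouds for illustration:

import jsonschema

workspaces_schema = {
    'type': 'object',
    'additionalProperties': {
        'type': 'object',
        'additionalProperties': False,
        'patternProperties': {
            '^(aws|kubernetes)$': {  # stand-in for the generated regex
                'type': 'object',
                'additionalProperties': False,
                'properties': {'disabled': {'type': 'boolean'}},
            },
        },
        'properties': {
            'gcp': {
                'type': 'object',
                'additionalProperties': False,
                'properties': {
                    'project_id': {'type': 'string'},
                    'disabled': {'type': 'boolean'},
                },
            },
        },
    },
}

config = {
    'default': {'gcp': {'project_id': 'my-project-123'}},
    'dev': {'aws': {'disabled': True}},
}
jsonschema.validate(config, workspaces_schema)  # passes silently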
File without changes