skypilot-nightly 1.0.0.dev20250523__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +3 -1
- sky/check.py +332 -170
- sky/cli.py +44 -11
- sky/client/cli.py +44 -11
- sky/client/sdk.py +54 -10
- sky/clouds/gcp.py +19 -3
- sky/core.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +42 -19
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +6 -3
- sky/provision/fluidstack/instance.py +1 -0
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +11 -0
- sky/server/server.py +68 -5
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +83 -9
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/kubernetes/deploy_remote_cluster.py +150 -147
- sky/utils/log_utils.py +4 -0
- sky/utils/schemas.py +54 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +66 -59
- sky/dashboard/out/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- /sky/dashboard/out/_next/static/{ECKwDNS9v9y3_IKFZ2lpp → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/utils/cli_utils/status_utils.py
CHANGED
@@ -48,7 +48,8 @@ class StatusColumn:
 def show_status_table(cluster_records: List[_ClusterRecord],
                       show_all: bool,
                       show_user: bool,
-                      query_clusters: Optional[List[str]] = None) -> int:
+                      query_clusters: Optional[List[str]] = None,
+                      show_workspaces: bool = False) -> int:
     """Compute cluster table values and display.
 
     Returns:
@@ -56,7 +57,6 @@ def show_status_table(cluster_records: List[_ClusterRecord],
         STOPPED.
     """
     # TODO(zhwu): Update the information for autostop clusters.
-
     status_columns = [
         StatusColumn('NAME', _get_name),
     ]
@@ -66,6 +66,9 @@ def show_status_table(cluster_records: List[_ClusterRecord],
         StatusColumn('USER_ID', _get_user_hash, show_by_default=False))
 
     status_columns += [
+        StatusColumn('WORKSPACE',
+                     _get_workspace,
+                     show_by_default=show_workspaces),
         StatusColumn('INFRA', _get_infra, truncate=not show_all),
         StatusColumn('RESOURCES', _get_resources, truncate=not show_all),
         StatusColumn('STATUS', _get_status_colored),
@@ -106,12 +109,13 @@ def show_status_table(cluster_records: List[_ClusterRecord],
             for cluster in query_clusters
             if cluster not in cluster_names
         ]
-
-
-
-
-
-
+        if not_found_clusters:
+            cluster_str = 'Cluster'
+            if len(not_found_clusters) > 1:
+                cluster_str += 's'
+            cluster_str += ' '
+            cluster_str += ', '.join(not_found_clusters)
+            click.echo(f'{cluster_str} not found.')
     elif not cluster_records:
         click.echo('No existing clusters.')
     return num_pending_autostop
@@ -243,6 +247,12 @@ def _get_status(cluster_record: _ClusterRecord,
     return cluster_record['status']
 
 
+def _get_workspace(cluster_record: _ClusterRecord,
+                   truncate: bool = True) -> str:
+    del truncate
+    return cluster_record['workspace']
+
+
 def _get_status_colored(cluster_record: _ClusterRecord,
                         truncate: bool = True) -> str:
     del truncate
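The WORKSPACE column added above follows the file's existing StatusColumn pattern: each column pairs a value calculator with a show_by_default flag, and an option such as show_workspaces simply flips that flag when the table is assembled. A minimal, self-contained sketch of the pattern (hypothetical Column and render_status_table names, not SkyPilot's actual classes):

from dataclasses import dataclass
from typing import Any, Callable, Dict, List

@dataclass
class Column:
    name: str
    calc: Callable[[Dict[str, Any]], str]  # value calculator per record
    show_by_default: bool = True

def render_status_table(records: List[Dict[str, Any]],
                        show_workspaces: bool = False) -> str:
    columns = [
        Column('NAME', lambda r: r['name']),
        # Hidden unless explicitly requested, like WORKSPACE above.
        Column('WORKSPACE', lambda r: r.get('workspace', 'default'),
               show_by_default=show_workspaces),
        Column('STATUS', lambda r: r['status']),
    ]
    visible = [c for c in columns if c.show_by_default]
    header = '  '.join(c.name for c in visible)
    rows = ('  '.join(c.calc(r) for c in visible) for r in records)
    return '\n'.join([header, *rows])

print(render_status_table(
    [{'name': 'dev', 'status': 'UP', 'workspace': 'team-a'}],
    show_workspaces=True))

With show_workspaces=True the sketch prints a WORKSPACE column; with the default False it is omitted, mirroring how hidden-by-default columns such as USER_ID behave.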
sky/utils/kubernetes/deploy_remote_cluster.py
CHANGED
@@ -14,6 +14,8 @@ from typing import Any, Dict, List, Optional, Set
 
 import yaml
 
+from sky.utils import ux_utils
+
 # Colors for nicer UX
 RED = '\033[0;31m'
 GREEN = '\033[0;32m'
@@ -117,21 +119,19 @@ def parse_args():
 def load_ssh_targets(file_path: str) -> Dict[str, Any]:
     """Load SSH targets from YAML file."""
     if not os.path.exists(file_path):
-
-
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'SSH Node Pools file not found: {file_path}')
 
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             targets = yaml.load(f, Loader=UniqueKeySafeLoader)
             return targets
     except yaml.constructor.ConstructorError as e:
-
-
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(e.note) from e
     except (yaml.YAMLError, IOError, OSError) as e:
-
-
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
 
 
 def check_host_in_ssh_config(hostname: str) -> bool:
@@ -181,31 +181,28 @@ def get_cluster_config(targets: Dict[str, Any],
                        file_path: Optional[str] = None) -> Dict[str, Any]:
     """Get configuration for specific clusters or all clusters."""
     if not targets:
-
-
-
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No clusters defined in SSH Node Pools file {file_path}')
 
     if cluster_name:
         if cluster_name not in targets:
-
-                f'
-
-                file=sys.stderr)
-            sys.exit(1)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Cluster {cluster_name!r} not found in '
                                  f'SSH Node Pools file {file_path}')
         return {cluster_name: targets[cluster_name]}
 
     # Return all clusters if no specific cluster is specified
     return targets
 
 
-def prepare_hosts_info(
+def prepare_hosts_info(cluster_name: str,
+                       cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
     """Prepare list of hosts with resolved user, identity_file, and password."""
     if 'hosts' not in cluster_config or not cluster_config['hosts']:
-
-
-
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No hosts defined in cluster {cluster_name} configuration')
 
     # Get cluster-level defaults
     cluster_user = cluster_config.get('user', '')
@@ -636,23 +633,20 @@ def main():
     # Using command line arguments - legacy mode
     if args.ssh_key and not os.path.isfile(
             args.ssh_key) and not global_use_ssh_config:
-
-
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'SSH key not found: {args.ssh_key}')
 
     if not os.path.isfile(args.ips_file):
-
-
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'IPs file not found: {args.ips_file}')
 
     with open(args.ips_file, 'r', encoding='utf-8') as f:
        hosts = [line.strip() for line in f if line.strip()]
 
     if not hosts:
-
-
-
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                'Hosts file is empty or not formatted correctly.')
 
     head_node = hosts[0]
     worker_nodes = hosts[1:]
@@ -688,108 +682,121 @@ def main():
 
     # Process each cluster
     for cluster_name, cluster_config in clusters_config.items():
-
-
-        hosts_info = prepare_hosts_info(cluster_config)
-
-        if not hosts_info:
+        try:
+            print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
             print(
-                f'{
-            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
+            hosts_info = prepare_hosts_info(cluster_name, cluster_config)
+
+            if not hosts_info:
+                print(
+                    f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
+                )
+                continue
+
+            # Generate a unique context name for each cluster
+            context_name = args.context_name
+            if context_name == 'default':
+                context_name = 'ssh-' + cluster_name
+
+            # Check cluster history
+            os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
+            history_yaml_file = os.path.join(
+                NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
+
+            history = None
+            if os.path.exists(history_yaml_file):
+                print(
+                    f'{YELLOW}Loading history from {history_yaml_file}{NC}')
+                with open(history_yaml_file, 'r', encoding='utf-8') as f:
+                    history = yaml.safe_load(f)
+            else:
+                print(f'{YELLOW}No history found for {context_name}.{NC}')
+
+            history_workers_info = None
+            history_worker_nodes = None
+            history_use_ssh_config = None
+            # Do not support changing anything besides hosts for now
+            if history is not None:
+                for key in ['user', 'identity_file', 'password']:
+                    if history.get(key) != cluster_config.get(key):
+                        raise ValueError(
+                            f'Cluster configuration has changed for field {key!r}. '
+                            f'Previous value: {history.get(key)}, '
+                            f'Current value: {cluster_config.get(key)}')
+                history_hosts_info = prepare_hosts_info(
+                    cluster_name, history)
+                if history_hosts_info[0] != hosts_info[0]:
                     raise ValueError(
-                        f'Cluster configuration has changed for
-                        f'Previous value: {
-                        f'Current value: {
-
-
-
-
-
-
-
-
-
-
-
+                        f'Cluster configuration has changed for master node. '
+                        f'Previous value: {history_hosts_info[0]}, '
+                        f'Current value: {hosts_info[0]}')
+                history_workers_info = history_hosts_info[1:] if len(
+                    history_hosts_info) > 1 else []
+                history_worker_nodes = [
+                    h['ip'] for h in history_workers_info
+                ]
+                history_use_ssh_config = [
+                    h.get('use_ssh_config', False)
+                    for h in history_workers_info
+                ]
+
+            # Use the first host as the head node and the rest as worker nodes
+            head_host = hosts_info[0]
+            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+            head_node = head_host['ip']
+            worker_nodes = [h['ip'] for h in worker_hosts]
+            ssh_user = head_host['user']
+            ssh_key = head_host['identity_file']
+            head_use_ssh_config = global_use_ssh_config or head_host.get(
+                'use_ssh_config', False)
+            worker_use_ssh_config = [
+                global_use_ssh_config or h.get('use_ssh_config', False)
+                for h in worker_hosts
             ]
+            password = head_host['password']
+
+            # Deploy this cluster
+            unsuccessful_workers = deploy_cluster(
+                head_node,
+                worker_nodes,
+                ssh_user,
+                ssh_key,
+                context_name,
+                password,
+                head_use_ssh_config,
+                worker_use_ssh_config,
+                kubeconfig_path,
+                args.cleanup,
+                worker_hosts=worker_hosts,
+                history_worker_nodes=history_worker_nodes,
+                history_workers_info=history_workers_info,
+                history_use_ssh_config=history_use_ssh_config)
+
+            if not args.cleanup:
+                successful_hosts = []
+                for host in cluster_config['hosts']:
+                    if isinstance(host, str):
+                        host_node = host
+                    else:
+                        host_node = host['ip']
+                    if host_node not in unsuccessful_workers:
+                        successful_hosts.append(host)
+                cluster_config['hosts'] = successful_hosts
+                with open(history_yaml_file, 'w', encoding='utf-8') as f:
+                    print(
+                        f'{YELLOW}Writing history to {history_yaml_file}{NC}'
+                    )
+                    yaml.dump(cluster_config, f)
 
-
-
-
-
-
-
-
-        ssh_key = head_host['identity_file']
-        head_use_ssh_config = global_use_ssh_config or head_host.get(
-            'use_ssh_config', False)
-        worker_use_ssh_config = [
-            global_use_ssh_config or h.get('use_ssh_config', False)
-            for h in worker_hosts
-        ]
-        password = head_host['password']
-
-        # Deploy this cluster
-        unsuccessful_workers = deploy_cluster(
-            head_node,
-            worker_nodes,
-            ssh_user,
-            ssh_key,
-            context_name,
-            password,
-            head_use_ssh_config,
-            worker_use_ssh_config,
-            kubeconfig_path,
-            args.cleanup,
-            worker_hosts=worker_hosts,
-            history_worker_nodes=history_worker_nodes,
-            history_workers_info=history_workers_info,
-            history_use_ssh_config=history_use_ssh_config)
-
-        if not args.cleanup:
-            successful_hosts = []
-            for host in cluster_config['hosts']:
-                if isinstance(host, str):
-                    host_node = host
-                else:
-                    host_node = host['ip']
-                if host_node not in unsuccessful_workers:
-                    successful_hosts.append(host)
-            cluster_config['hosts'] = successful_hosts
-            with open(history_yaml_file, 'w', encoding='utf-8') as f:
-                print(f'{YELLOW}Writing history to {history_yaml_file}{NC}')
-                yaml.dump(cluster_config, f)
-
-        print(
-            f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
-        )
+            print(
+                f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
+            )
+        except Exception as e:  # pylint: disable=broad-except
+            print(
+                f'{RED}Error deploying SSH Node Pool {cluster_name}: {e}{NC}'
+            )
 
 
 def deploy_cluster(head_node,
@@ -839,11 +846,9 @@ def deploy_cluster(head_node,
         # For SkySSHUpLineProcessor
         print_output=True)
     if result is None:
-
-        f'
-
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+                               f'Please check the SSH configuration.')
 
     # Checking history
     history_exists = (history_worker_nodes is not None and
@@ -981,10 +986,10 @@ def deploy_cluster(head_node,
         print_output=True,
         use_shell=True)
     if result is None:
-
-
-
-
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to setup TCP forwarding on head node ({head_node}). '
+                f'Please check the SSH configuration.')
 
     # Get effective IP for master node if using SSH config - needed for workers to connect
     if head_use_ssh_config:
@@ -1024,9 +1029,9 @@ def deploy_cluster(head_node,
         ssh_key,
         use_ssh_config=head_use_ssh_config)
     if result is None:
-
-
-
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to deploy K3s on head node ({head_node}).')
     success_message(f'K3s deployed on head node ({head_node}).')
 
     # Check if head node has a GPU
@@ -1045,11 +1050,9 @@ def deploy_cluster(head_node,
         ssh_key,
         use_ssh_config=head_use_ssh_config)
     if master_addr is None:
-
-        f'
-
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+                               f'Please check the SSH configuration.')
     print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
 
     # Step 2: Install k3s on worker nodes and join them to the master node
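The recurring change in this file replaces print-plus-sys.exit(1) error handling with exceptions raised under ux_utils.print_exception_no_traceback(). A minimal sketch of what a context manager with that name and usage plausibly does (an assumption inferred from the call sites here, not a verbatim copy of sky/utils/ux_utils):

import contextlib
import sys

@contextlib.contextmanager
def print_exception_no_traceback():
    # tracebacklimit=0 makes Python's top-level handler print only the
    # exception type and message, without the stack trace.
    original_limit = getattr(sys, 'tracebacklimit', 1000)
    sys.tracebacklimit = 0
    yield
    # Reached only if the block did not raise; when an exception does
    # propagate, the limit stays at 0 so the user sees a clean one-liner.
    sys.tracebacklimit = original_limit

Compared with sys.exit(1), raising ValueError or RuntimeError gives callers (and tests) a real exception type to catch, while the suppressed traceback keeps the CLI output as terse as the old print-based path.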
sky/utils/log_utils.py
CHANGED
@@ -497,6 +497,10 @@ class SkySSHUpLineProcessor(LineProcessor):
                 f'✗ Failed to setup TCP forwarding on head node {node_name}.'
                 f'{colorama.Style.RESET_ALL}')
 
+        if 'Error in deploying SSH Target' in log_line:
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.RED}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
     def __exit__(self, except_type: Optional[Type[BaseException]],
                  except_value: Optional[BaseException],
                  traceback: Optional[types.TracebackType]) -> None:
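SkySSHUpLineProcessor scans deployment output line by line; the hunk above adds a match for the 'Error in deploying SSH Target' marker emitted during SSH Node Pool deployment. A toy sketch of the same scan-and-recolor idea (hypothetical class, not the real LineProcessor interface):

import colorama

class ToyLineProcessor:
    """Re-emits log lines that contain an error marker, in red."""

    MARKER = 'Error in deploying SSH Target'

    def process_line(self, log_line: str) -> None:
        if self.MARKER in log_line:
            print(f'{colorama.Fore.RED}{log_line.strip()}'
                  f'{colorama.Style.RESET_ALL}')

processor = ToyLineProcessor()
for line in ('Setting up head node...',
             'Error in deploying SSH Target worker-1: connection refused'):
    processor.process_line(line)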
sky/utils/schemas.py
CHANGED
@@ -1173,6 +1173,54 @@ def get_config_schema():
         }
     }
 
+    workspace_schema = {'type': 'string'}
+
+    allowed_workspace_cloud_names = list(
+        service_catalog.ALL_CLOUDS) + ['cloudflare']
+    # Create pattern for non-GCP clouds (all clouds except gcp)
+    non_gcp_clouds = [
+        cloud for cloud in allowed_workspace_cloud_names
+        if cloud.lower() != 'gcp'
+    ]
+    non_gcp_cloud_regex = '|'.join(non_gcp_clouds)
+    workspaces_schema = {
+        'type': 'object',
+        'required': [],
+        # each key is a workspace name
+        'additionalProperties': {
+            'type': 'object',
+            'additionalProperties': False,
+            'patternProperties': {
+                # Pattern for non-GCP clouds - only allows 'disabled' property
+                f'^({non_gcp_cloud_regex})$': {
+                    'type': 'object',
+                    'additionalProperties': False,
+                    'properties': {
+                        'disabled': {
+                            'type': 'boolean'
+                        }
+                    },
+                },
+            },
+            'properties': {
+                # Explicit definition for GCP allows both project_id and
+                # disabled
+                'gcp': {
+                    'type': 'object',
+                    'properties': {
+                        'project_id': {
+                            'type': 'string'
+                        },
+                        'disabled': {
+                            'type': 'boolean'
+                        }
+                    },
+                    'additionalProperties': False,
+                },
+            },
+        },
+    }
+
     provision_configs = {
         'type': 'object',
         'required': [],
@@ -1199,6 +1247,10 @@ def get_config_schema():
         'required': [],
         'additionalProperties': False,
         'properties': {
+            # TODO Replace this with whatever syang cooks up
+            'workspace': {
+                'type': 'string',
+            },
             'jobs': controller_resources_schema,
             'serve': controller_resources_schema,
             'allowed_clouds': allowed_clouds,
@@ -1206,6 +1258,8 @@ def get_config_schema():
             'docker': docker_configs,
             'nvidia_gpus': gpu_configs,
             'api_server': api_server,
+            'active_workspace': workspace_schema,
+            'workspaces': workspaces_schema,
             'provision': provision_configs,
             **cloud_configs,
         },
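To make the shape of the new workspaces schema concrete, here is a hedged validation sketch using the jsonschema package. The schema literal mirrors the structure built in get_config_schema() above, with '^(aws|azure)$' standing in for the regex generated from service_catalog.ALL_CLOUDS:

import jsonschema

workspaces_schema = {
    'type': 'object',
    # Each top-level key is a workspace name.
    'additionalProperties': {
        'type': 'object',
        'additionalProperties': False,
        'patternProperties': {
            # Non-GCP clouds may only be toggled on or off per workspace.
            '^(aws|azure)$': {
                'type': 'object',
                'additionalProperties': False,
                'properties': {'disabled': {'type': 'boolean'}},
            },
        },
        'properties': {
            # GCP additionally allows a per-workspace project_id.
            'gcp': {
                'type': 'object',
                'additionalProperties': False,
                'properties': {
                    'project_id': {'type': 'string'},
                    'disabled': {'type': 'boolean'},
                },
            },
        },
    },
}

config = {
    'dev': {'gcp': {'project_id': 'my-dev-project'}},
    'prod': {'aws': {'disabled': True}},
}
jsonschema.validate(config, workspaces_schema)  # passes

bad = {'dev': {'aws': {'project_id': 'x'}}}  # project_id is GCP-only
# jsonschema.validate(bad, workspaces_schema)  # raises ValidationError

The separate active_workspace field (a plain string per workspace_schema) presumably selects which of these workspaces a request runs under.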