skypilot-nightly 1.0.0.dev20250523__py3-none-any.whl → 1.0.0.dev20250526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +62 -45
  3. sky/backends/cloud_vm_ray_backend.py +3 -1
  4. sky/check.py +335 -170
  5. sky/cli.py +56 -13
  6. sky/client/cli.py +56 -13
  7. sky/client/sdk.py +54 -10
  8. sky/clouds/gcp.py +19 -3
  9. sky/core.py +5 -2
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  15. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  16. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  17. sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/_app-96a715a6fb01e228.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-3b5aad09a25f64b7.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  25. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/infra-abb7d744ecf15109.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-48dc8d67d4b60be1.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  29. sky/dashboard/out/_next/static/chunks/pages/users-b8acf6e6735323a2.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/workspace/new-bbf436f41381e169.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7733c960685b4385.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  34. sky/dashboard/out/_next/static/css/28558d57108b05ae.css +3 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/dashboard/out/users.html +1 -0
  43. sky/dashboard/out/workspace/new.html +1 -0
  44. sky/dashboard/out/workspaces/[name].html +1 -0
  45. sky/dashboard/out/workspaces.html +1 -0
  46. sky/data/storage.py +1 -1
  47. sky/global_user_state.py +606 -543
  48. sky/jobs/constants.py +1 -1
  49. sky/jobs/server/core.py +72 -56
  50. sky/jobs/state.py +26 -5
  51. sky/jobs/utils.py +65 -13
  52. sky/optimizer.py +6 -3
  53. sky/provision/fluidstack/instance.py +1 -0
  54. sky/serve/server/core.py +9 -6
  55. sky/server/html/token_page.html +6 -1
  56. sky/server/requests/executor.py +1 -0
  57. sky/server/requests/payloads.py +28 -0
  58. sky/server/server.py +59 -5
  59. sky/setup_files/dependencies.py +1 -0
  60. sky/skylet/constants.py +4 -1
  61. sky/skypilot_config.py +107 -11
  62. sky/utils/cli_utils/status_utils.py +18 -8
  63. sky/utils/db_utils.py +53 -0
  64. sky/utils/kubernetes/config_map_utils.py +133 -0
  65. sky/utils/kubernetes/deploy_remote_cluster.py +166 -147
  66. sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
  67. sky/utils/kubernetes/ssh-tunnel.sh +20 -28
  68. sky/utils/log_utils.py +4 -0
  69. sky/utils/schemas.py +54 -0
  70. sky/workspaces/__init__.py +0 -0
  71. sky/workspaces/core.py +295 -0
  72. sky/workspaces/server.py +62 -0
  73. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/METADATA +2 -1
  74. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/RECORD +79 -63
  75. sky/dashboard/out/_next/static/ECKwDNS9v9y3_IKFZ2lpp/_buildManifest.js +0 -1
  76. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  77. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  78. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  79. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  82. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +0 -1
  87. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  89. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  90. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  91. /sky/dashboard/out/_next/static/{ECKwDNS9v9y3_IKFZ2lpp → 7GEgRyZKRaSnYZCV1Jwol}/_ssgManifest.js +0 -0
  92. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20250523.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py CHANGED
@@ -14,6 +14,8 @@ from typing import Any, Dict, List, Optional, Set
 
 import yaml
 
+from sky.utils import ux_utils
+
 # Colors for nicer UX
 RED = '\033[0;31m'
 GREEN = '\033[0;32m'
@@ -117,21 +119,19 @@ def parse_args():
 def load_ssh_targets(file_path: str) -> Dict[str, Any]:
     """Load SSH targets from YAML file."""
     if not os.path.exists(file_path):
-        print(f'{RED}Error: SSH Node Pools file not found: {file_path}{NC}',
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'SSH Node Pools file not found: {file_path}')
 
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             targets = yaml.load(f, Loader=UniqueKeySafeLoader)
             return targets
     except yaml.constructor.ConstructorError as e:
-        print(f'{RED}{e.note}{NC}', file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(e.note) from e
     except (yaml.YAMLError, IOError, OSError) as e:
-        print(f'{RED}Error loading SSH Node Pools file: {e}{NC}',
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
 
 
 def check_host_in_ssh_config(hostname: str) -> bool:
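
The recurring change in deploy_remote_cluster.py replaces print-to-stderr followed by sys.exit(1) with raising an exception inside ux_utils.print_exception_no_traceback(), so programmatic callers get a catchable exception while CLI users still see a clean one-line error. A minimal sketch of that context-manager pattern, assuming an implementation along these lines (the real helper lives in sky/utils/ux_utils.py and may differ):

import contextlib
import sys


@contextlib.contextmanager
def print_exception_no_traceback():
    """Hide stack frames for any exception raised inside this block."""
    original_limit = getattr(sys, 'tracebacklimit', 1000)
    sys.tracebacklimit = 0
    # No try/finally on purpose: if the block raises, the limit stays 0,
    # so the interpreter prints only the exception message.
    yield
    sys.tracebacklimit = original_limit


def load_targets(file_path: str) -> None:
    # Mirrors the new error path above: raise instead of sys.exit(1).
    with print_exception_no_traceback():
        raise ValueError(f'SSH Node Pools file not found: {file_path}')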
@@ -181,31 +181,28 @@ def get_cluster_config(targets: Dict[str, Any],
                        file_path: Optional[str] = None) -> Dict[str, Any]:
     """Get configuration for specific clusters or all clusters."""
     if not targets:
-        print(
-            f'{RED}Error: No clusters defined in SSH Node Pools '
-            f'file {file_path}{NC}',
-            file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No clusters defined in SSH Node Pools file {file_path}')
 
     if cluster_name:
         if cluster_name not in targets:
-            print(
-                f'{RED}Error: Cluster {cluster_name!r} not found in '
-                f'SSH Node Pools file {file_path}{NC}',
-                file=sys.stderr)
-            sys.exit(1)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Cluster {cluster_name!r} not found in '
+                                 f'SSH Node Pools file {file_path}')
         return {cluster_name: targets[cluster_name]}
 
     # Return all clusters if no specific cluster is specified
     return targets
 
 
-def prepare_hosts_info(cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
+def prepare_hosts_info(cluster_name: str,
+                       cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
     """Prepare list of hosts with resolved user, identity_file, and password."""
     if 'hosts' not in cluster_config or not cluster_config['hosts']:
-        print(f'{RED}Error: No hosts defined in cluster configuration{NC}',
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(
+                f'No hosts defined in cluster {cluster_name} configuration')
 
     # Get cluster-level defaults
     cluster_user = cluster_config.get('user', '')
@@ -627,6 +624,9 @@ def main():
     kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
     global_use_ssh_config = args.use_ssh_config
 
+    failed_clusters = []
+    successful_clusters = []
+
     # Print cleanup mode marker if applicable
     if args.cleanup:
         print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
@@ -636,23 +636,20 @@
         # Using command line arguments - legacy mode
         if args.ssh_key and not os.path.isfile(
                 args.ssh_key) and not global_use_ssh_config:
-            print(f'{RED}Error: SSH key not found: {args.ssh_key}{NC}',
-                  file=sys.stderr)
-            sys.exit(1)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'SSH key not found: {args.ssh_key}')
 
         if not os.path.isfile(args.ips_file):
-            print(f'{RED}Error: IPs file not found: {args.ips_file}{NC}',
-                  file=sys.stderr)
-            sys.exit(1)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'IPs file not found: {args.ips_file}')
 
         with open(args.ips_file, 'r', encoding='utf-8') as f:
             hosts = [line.strip() for line in f if line.strip()]
 
         if not hosts:
-            print(
-                f'{RED}Error: Hosts file is empty or not formatted correctly.{NC}',
-                file=sys.stderr)
-            sys.exit(1)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Hosts file is empty or not formatted correctly.')
 
         head_node = hosts[0]
         worker_nodes = hosts[1:]
@@ -688,108 +685,132 @@
 
     # Process each cluster
     for cluster_name, cluster_config in clusters_config.items():
-        print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
-        print(f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
-        hosts_info = prepare_hosts_info(cluster_config)
-
-        if not hosts_info:
-            print(
-                f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
-            )
-            continue
-
-        # Generate a unique context name for each cluster
-        context_name = args.context_name
-        if context_name == 'default':
-            context_name = 'ssh-' + cluster_name
-
-        # Check cluster history
-        os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
-        history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
-                                         f'{context_name}-history.yaml')
-
-        history = None
-        if os.path.exists(history_yaml_file):
-            print(f'{YELLOW}Loading history from {history_yaml_file}{NC}')
-            with open(history_yaml_file, 'r', encoding='utf-8') as f:
-                history = yaml.safe_load(f)
-        else:
-            print(f'{YELLOW}No history found for {context_name}.{NC}')
-
-        history_workers_info = None
-        history_worker_nodes = None
-        history_use_ssh_config = None
-        # Do not support changing anything besides hosts for now
-        if history is not None:
-            for key in ['user', 'identity_file', 'password']:
-                if history.get(key) != cluster_config.get(key):
-                    raise ValueError(
-                        f'Cluster configuration has changed for field {key!r}. '
-                        f'Previous value: {history.get(key)}, '
-                        f'Current value: {cluster_config.get(key)}')
-            history_hosts_info = prepare_hosts_info(history)
-            if history_hosts_info[0] != hosts_info[0]:
-                raise ValueError(
-                    f'Cluster configuration has changed for master node. '
-                    f'Previous value: {history_hosts_info[0]}, '
-                    f'Current value: {hosts_info[0]}')
-            history_workers_info = history_hosts_info[1:] if len(
-                history_hosts_info) > 1 else []
-            history_worker_nodes = [h['ip'] for h in history_workers_info]
-            history_use_ssh_config = [
-                h.get('use_ssh_config', False) for h in history_workers_info
-            ]
-
-        # Use the first host as the head node and the rest as worker nodes
-        head_host = hosts_info[0]
-        worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
-
-        head_node = head_host['ip']
-        worker_nodes = [h['ip'] for h in worker_hosts]
-        ssh_user = head_host['user']
-        ssh_key = head_host['identity_file']
-        head_use_ssh_config = global_use_ssh_config or head_host.get(
-            'use_ssh_config', False)
-        worker_use_ssh_config = [
-            global_use_ssh_config or h.get('use_ssh_config', False)
-            for h in worker_hosts
-        ]
-        password = head_host['password']
-
-        # Deploy this cluster
-        unsuccessful_workers = deploy_cluster(
-            head_node,
-            worker_nodes,
-            ssh_user,
-            ssh_key,
-            context_name,
-            password,
-            head_use_ssh_config,
-            worker_use_ssh_config,
-            kubeconfig_path,
-            args.cleanup,
-            worker_hosts=worker_hosts,
-            history_worker_nodes=history_worker_nodes,
-            history_workers_info=history_workers_info,
-            history_use_ssh_config=history_use_ssh_config)
-
-        if not args.cleanup:
-            successful_hosts = []
-            for host in cluster_config['hosts']:
-                if isinstance(host, str):
-                    host_node = host
-                else:
-                    host_node = host['ip']
-                if host_node not in unsuccessful_workers:
-                    successful_hosts.append(host)
-            cluster_config['hosts'] = successful_hosts
-            with open(history_yaml_file, 'w', encoding='utf-8') as f:
-                print(f'{YELLOW}Writing history to {history_yaml_file}{NC}')
-                yaml.dump(cluster_config, f)
-
-        print(
-            f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
-        )
+        try:
+            print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
+            print(
+                f'{YELLOW}==== Deploying cluster: {cluster_name} ====${NC}')
+            hosts_info = prepare_hosts_info(cluster_name, cluster_config)
+
+            if not hosts_info:
+                print(
+                    f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
+                )
+                continue
+
+            # Generate a unique context name for each cluster
+            context_name = args.context_name
+            if context_name == 'default':
+                context_name = 'ssh-' + cluster_name
+
+            # Check cluster history
+            os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
+            history_yaml_file = os.path.join(
+                NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
+
+            history = None
+            if os.path.exists(history_yaml_file):
+                print(
+                    f'{YELLOW}Loading history from {history_yaml_file}{NC}')
+                with open(history_yaml_file, 'r', encoding='utf-8') as f:
+                    history = yaml.safe_load(f)
+            else:
+                print(f'{YELLOW}No history found for {context_name}.{NC}')
+
+            history_workers_info = None
+            history_worker_nodes = None
+            history_use_ssh_config = None
+            # Do not support changing anything besides hosts for now
+            if history is not None:
+                for key in ['user', 'identity_file', 'password']:
+                    if history.get(key) != cluster_config.get(key):
+                        raise ValueError(
+                            f'Cluster configuration has changed for field {key!r}. '
+                            f'Previous value: {history.get(key)}, '
+                            f'Current value: {cluster_config.get(key)}')
+                history_hosts_info = prepare_hosts_info(
+                    cluster_name, history)
+                if history_hosts_info[0] != hosts_info[0]:
+                    raise ValueError(
+                        f'Cluster configuration has changed for master node. '
+                        f'Previous value: {history_hosts_info[0]}, '
+                        f'Current value: {hosts_info[0]}')
+                history_workers_info = history_hosts_info[1:] if len(
+                    history_hosts_info) > 1 else []
+                history_worker_nodes = [
+                    h['ip'] for h in history_workers_info
+                ]
+                history_use_ssh_config = [
+                    h.get('use_ssh_config', False)
+                    for h in history_workers_info
+                ]
+
+            # Use the first host as the head node and the rest as worker nodes
+            head_host = hosts_info[0]
+            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+            head_node = head_host['ip']
+            worker_nodes = [h['ip'] for h in worker_hosts]
+            ssh_user = head_host['user']
+            ssh_key = head_host['identity_file']
+            head_use_ssh_config = global_use_ssh_config or head_host.get(
+                'use_ssh_config', False)
+            worker_use_ssh_config = [
+                global_use_ssh_config or h.get('use_ssh_config', False)
+                for h in worker_hosts
+            ]
+            password = head_host['password']
+
+            # Deploy this cluster
+            unsuccessful_workers = deploy_cluster(
+                head_node,
+                worker_nodes,
+                ssh_user,
+                ssh_key,
+                context_name,
+                password,
+                head_use_ssh_config,
+                worker_use_ssh_config,
+                kubeconfig_path,
+                args.cleanup,
+                worker_hosts=worker_hosts,
+                history_worker_nodes=history_worker_nodes,
+                history_workers_info=history_workers_info,
+                history_use_ssh_config=history_use_ssh_config)
+
+            if not args.cleanup:
+                successful_hosts = []
+                for host in cluster_config['hosts']:
+                    if isinstance(host, str):
+                        host_node = host
+                    else:
+                        host_node = host['ip']
+                    if host_node not in unsuccessful_workers:
+                        successful_hosts.append(host)
+                cluster_config['hosts'] = successful_hosts
+                with open(history_yaml_file, 'w', encoding='utf-8') as f:
+                    print(
+                        f'{YELLOW}Writing history to {history_yaml_file}{NC}'
+                    )
+                    yaml.dump(cluster_config, f)
+
+            print(
+                f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
+            )
+            successful_clusters.append(cluster_name)
+        except Exception as e:  # pylint: disable=broad-except
+            reason = str(e)
+            failed_clusters.append((cluster_name, reason))
+            print(
+                f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
+            )  # Print for internal logging
+
+    if failed_clusters:
+        action = 'clean' if args.cleanup else 'deploy'
+        msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
+        msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
+        for cluster_name, reason in failed_clusters:
+            msg += f'\n {cluster_name}: {reason}'
+        raise RuntimeError(msg)
 
 
 def deploy_cluster(head_node,
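
The hunk above also moves main() from fail-fast to continue-on-error: each Node Pool deploys inside a try/except, successes and failures are tallied, and a single summary RuntimeError is raised at the end. A condensed sketch of that control flow, with a hypothetical deploy_one callable standing in for the script's per-cluster deployment:

from typing import Any, Callable, Dict, List, Tuple


def deploy_all(clusters: Dict[str, Any],
               deploy_one: Callable[[str, Any], None]) -> None:
    successful: List[str] = []
    failed: List[Tuple[str, str]] = []
    for name, config in clusters.items():
        try:
            deploy_one(name, config)
            successful.append(name)
        except Exception as e:  # pylint: disable=broad-except
            # Record the failure and keep going so one bad pool
            # cannot block the remaining clusters.
            failed.append((name, str(e)))
    if failed:
        lines = [f'Successfully deployed {len(successful)} cluster(s): '
                 f'{", ".join(successful)}.',
                 f'Failed to deploy {len(failed)} cluster(s):']
        lines.extend(f'  {name}: {reason}' for name, reason in failed)
        # One aggregated error at the end, as in the diff above.
        raise RuntimeError('\n'.join(lines))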
@@ -839,11 +860,11 @@
         # For SkySSHUpLineProcessor
         print_output=True)
     if result is None:
-        print(
-            f'{RED}Failed to SSH to head node ({head_node}). '
-            f'Please check the SSH configuration.{NC}',
-            file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to SSH to head node ({head_node}). '
+                f'Please check the SSH configuration and logs for more details.'
+            )
 
     # Checking history
     history_exists = (history_worker_nodes is not None and
@@ -981,10 +1002,10 @@
         print_output=True,
         use_shell=True)
     if result is None:
-        print(
-            f'{RED}Failed to setup TCP forwarding on head node ({head_node}). '
-            f'Please check the SSH configuration.{NC}',
-            file=sys.stderr)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to setup TCP forwarding on head node ({head_node}). '
+                f'Please check the SSH configuration.')
 
     # Get effective IP for master node if using SSH config - needed for workers to connect
     if head_use_ssh_config:
@@ -1024,9 +1045,9 @@
         ssh_key,
         use_ssh_config=head_use_ssh_config)
     if result is None:
-        print(f'{RED}Failed to deploy K3s on head node ({head_node}). {NC}',
-              file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to deploy K3s on head node ({head_node}).')
     success_message(f'K3s deployed on head node ({head_node}).')
 
     # Check if head node has a GPU
@@ -1045,11 +1066,9 @@
         ssh_key,
         use_ssh_config=head_use_ssh_config)
     if master_addr is None:
-        print(
-            f'{RED}Failed to SSH to head node ({head_node}). '
-            f'Please check the SSH configuration.{NC}',
-            file=sys.stderr)
-        sys.exit(1)
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+                               f'Please check the SSH configuration.')
     print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
 
     # Step 2: Install k3s on worker nodes and join them to the master node
sky/utils/kubernetes/kubernetes_deploy_utils.py CHANGED
@@ -26,6 +26,48 @@ logger = sky_logging.init_logger(__name__)
 DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
 
 
+def check_ssh_cluster_dependencies(
+        raise_error: bool = True) -> Optional[List[str]]:
+    """Checks if the dependencies for ssh cluster are installed.
+
+    Args:
+        raise_error: set to true when the dependency needs to be present.
+            set to false for `sky check`, where reason strings are compiled
+            at the end.
+
+    Returns: the reasons list if there are missing dependencies.
+    """
+    # error message
+    jq_message = ('`jq` is required to setup ssh cluster.')
+
+    # save
+    reasons = []
+    required_binaries = []
+
+    # Ensure jq is installed
+    try:
+        subprocess.run(['jq', '--version'],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=True)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        required_binaries.append('jq')
+        reasons.append(jq_message)
+
+    if required_binaries:
+        reasons.extend([
+            'On Debian/Ubuntu, install the missing dependenc(ies) with:',
+            f'  $ sudo apt install {" ".join(required_binaries)}',
+            'On MacOS, install with: ',
+            f'  $ brew install {" ".join(required_binaries)}',
+        ])
+        if raise_error:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('\n'.join(reasons))
+        return reasons
+    return None
+
+
 def deploy_ssh_cluster(cleanup: bool = False,
                        infra: Optional[str] = None,
                        kubeconfig_path: Optional[str] = None):
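
The helper above is written for two call styles: deployment paths call it bare so a missing binary raises immediately, while `sky check` passes raise_error=False and folds the returned reason strings into its report. A usage sketch based on the signature and docstring above:

# Hard-failure path (as deploy_ssh_cluster does below): raises
# RuntimeError with install hints if `jq` is missing.
check_ssh_cluster_dependencies()

# `sky check` style: collect reasons instead of raising.
reasons = check_ssh_cluster_dependencies(raise_error=False)
if reasons is not None:
    print('SSH Node Pools unavailable:\n' + '\n'.join(reasons))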
@@ -41,6 +83,8 @@ def deploy_ssh_cluster(cleanup: bool = False,
         kubeconfig_path: Path to save the Kubernetes configuration file.
             If None, the default ~/.kube/config will be used.
     """
+    check_ssh_cluster_dependencies()
+
     # Prepare command to call deploy_remote_cluster.py script
     # TODO(romilb): We should move this to a native python method/class call
     # instead of invoking a script with subprocess.
@@ -81,9 +125,9 @@
         cmd=deploy_command,
         log_path=log_path,
         require_outputs=True,
-        stream_logs=False,  # TODO: Fixme to False after we fix the logging
+        stream_logs=False,
         line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
-                                                       is_local=True),
+                                                       is_local=False),
         cwd=cwd,
         env=env)
 
@@ -91,9 +135,9 @@
         success = True
     else:
         with ux_utils.print_exception_no_traceback():
-            log_hint = ux_utils.log_path_hint(log_path, is_local=True)
-            raise RuntimeError('Failed to deploy SkyPilot on SSH targets. '
-                               f'Full log: {log_hint}'
+            log_hint = ux_utils.log_path_hint(log_path, is_local=False)
+            raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
+                               f'{log_hint}'
                                f'\nError: {stderr}')
 
     if success:
sky/utils/kubernetes/ssh-tunnel.sh CHANGED
@@ -188,14 +188,17 @@ generate_credentials_json() {
     debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
 
     # Check if we can create proper JSON with `jq`
-    if command -v jq &>/dev/null; then
-      debug_log "Using jq for JSON formatting"
-
-      # Create a temporary file for the JSON output to avoid shell escaping issues
-      local TEMP_JSON_FILE=$(mktemp)
-
-      # Write the JSON to the temporary file using jq for proper JSON formatting
-      cat > "$TEMP_JSON_FILE" << EOL
+    if ! command -v jq &>/dev/null; then
+      echo "jq is not installed. Please install jq to use this script." >&2
+      exit 1
+    fi
+    debug_log "Using jq for JSON formatting"
+
+    # Create a temporary file for the JSON output to avoid shell escaping issues
+    local TEMP_JSON_FILE=$(mktemp)
+
+    # Write the JSON to the temporary file using jq for proper JSON formatting
+    cat > "$TEMP_JSON_FILE" << EOL
 {
   "apiVersion": "client.authentication.k8s.io/v1beta1",
   "kind": "ExecCredential",
@@ -207,25 +210,14 @@
   }
 EOL
 
-      # Read the JSON from the file
-      local json_response=$(cat "$TEMP_JSON_FILE")
-
-      # Clean up
-      rm -f "$TEMP_JSON_FILE"
-
-      # Output the JSON
-      echo "$json_response"
-    else
-      debug_log "jq is not available, using simpler formatting method"
-
-      # Alternative approach: encode with base64 and use the token field instead
-      # This works because kubectl will decode token data properly
-      local combined_data=$(echo -n "${client_cert_data}:${client_key_data}" | base64 | tr -d '\n')
-
-      echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"$combined_data\",\"expirationTimestamp\":\"$expiration_time\"}}"
-
-      debug_log "Sent certificate data as encoded token instead of direct certificate fields"
-    fi
+    # Read the JSON from the file
+    local json_response=$(cat "$TEMP_JSON_FILE")
+
+    # Clean up
+    rm -f "$TEMP_JSON_FILE"
+
+    # Output the JSON
+    echo "$json_response"
   else
     # Fallback to token-based credential for tunnel-only authentication
     echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
@@ -384,4 +376,4 @@ fi
 
 # Return valid credential format with certificates if available
 generate_credentials_json
-exit 0
+exit 0
sky/utils/log_utils.py CHANGED
@@ -497,6 +497,10 @@ class SkySSHUpLineProcessor(LineProcessor):
                 f'✗ Failed to setup TCP forwarding on head node {node_name}.'
                 f'{colorama.Style.RESET_ALL}')
 
+        if 'Error in deploying SSH Target' in log_line:
+            logger.info(f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.RED}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
     def __exit__(self, except_type: Optional[Type[BaseException]],
                  except_value: Optional[BaseException],
                  traceback: Optional[types.TracebackType]) -> None:
sky/utils/schemas.py CHANGED
@@ -1173,6 +1173,54 @@ def get_config_schema():
         }
     }
 
+    workspace_schema = {'type': 'string'}
+
+    allowed_workspace_cloud_names = list(
+        service_catalog.ALL_CLOUDS) + ['cloudflare']
+    # Create pattern for non-GCP clouds (all clouds except gcp)
+    non_gcp_clouds = [
+        cloud for cloud in allowed_workspace_cloud_names
+        if cloud.lower() != 'gcp'
+    ]
+    non_gcp_cloud_regex = '|'.join(non_gcp_clouds)
+    workspaces_schema = {
+        'type': 'object',
+        'required': [],
+        # each key is a workspace name
+        'additionalProperties': {
+            'type': 'object',
+            'additionalProperties': False,
+            'patternProperties': {
+                # Pattern for non-GCP clouds - only allows 'disabled' property
+                f'^({non_gcp_cloud_regex})$': {
+                    'type': 'object',
+                    'additionalProperties': False,
+                    'properties': {
+                        'disabled': {
+                            'type': 'boolean'
+                        }
+                    },
+                },
+            },
+            'properties': {
+                # Explicit definition for GCP allows both project_id and
+                # disabled
+                'gcp': {
+                    'type': 'object',
+                    'properties': {
+                        'project_id': {
+                            'type': 'string'
+                        },
+                        'disabled': {
+                            'type': 'boolean'
+                        }
+                    },
+                    'additionalProperties': False,
+                },
+            },
+        },
+    }
+
     provision_configs = {
         'type': 'object',
         'required': [],
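
The new workspaces schema lets each workspace override per-cloud settings: GCP entries may set project_id and disabled, while every other cloud (matched by the generated regex) may only set disabled. A self-contained sketch of that shape validated with the jsonschema package, using a stand-in two-cloud regex in place of the generated non-GCP pattern (SkyPilot's own validation path may differ):

import jsonschema

# Stand-in for the generated non-GCP cloud pattern in the diff above.
NON_GCP_REGEX = '^(aws|azure)$'

workspaces_schema = {
    'type': 'object',
    'additionalProperties': {
        'type': 'object',
        'additionalProperties': False,
        'patternProperties': {
            # Non-GCP clouds: only `disabled` is allowed.
            NON_GCP_REGEX: {
                'type': 'object',
                'additionalProperties': False,
                'properties': {'disabled': {'type': 'boolean'}},
            },
        },
        'properties': {
            # GCP additionally allows `project_id`.
            'gcp': {
                'type': 'object',
                'additionalProperties': False,
                'properties': {
                    'project_id': {'type': 'string'},
                    'disabled': {'type': 'boolean'},
                },
            },
        },
    },
}

ok = {'dev': {'gcp': {'project_id': 'my-dev-project'}},
      'prod': {'aws': {'disabled': True}}}
jsonschema.validate(ok, workspaces_schema)  # passes

bad = {'dev': {'aws': {'project_id': 'nope'}}}  # project_id is GCP-only
try:
    jsonschema.validate(bad, workspaces_schema)
except jsonschema.ValidationError as e:
    print(f'Rejected as expected: {e.message}')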
@@ -1199,6 +1247,10 @@
         'required': [],
         'additionalProperties': False,
         'properties': {
+            # TODO Replace this with whatever syang cooks up
+            'workspace': {
+                'type': 'string',
+            },
             'jobs': controller_resources_schema,
             'serve': controller_resources_schema,
             'allowed_clouds': allowed_clouds,
@@ -1206,6 +1258,8 @@
             'docker': docker_configs,
             'nvidia_gpus': gpu_configs,
             'api_server': api_server,
+            'active_workspace': workspace_schema,
+            'workspaces': workspaces_schema,
             'provision': provision_configs,
             **cloud_configs,
         },