skypilot-nightly 1.0.0.dev20250524__py3-none-any.whl → 1.0.0.dev20250526__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. sky/__init__.py +2 -2
  2. sky/check.py +4 -1
  3. sky/cli.py +13 -3
  4. sky/client/cli.py +13 -3
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/7GEgRyZKRaSnYZCV1Jwol/_buildManifest.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/25-062253ea41fb8eec.js +6 -0
  8. sky/dashboard/out/_next/static/chunks/480-5a0de8b6570ea105.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/578-d351125af46c293f.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/734-a6e01d7f98904741.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/938-59956af3950b02ed.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/pages/_app-96a715a6fb01e228.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/pages/{infra-e690d864aa00e2ea.js → infra-abb7d744ecf15109.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/pages/workspace/new-bbf436f41381e169.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7733c960685b4385.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/pages/workspaces-5ed48b3201b998c8.js +1 -0
  17. sky/dashboard/out/_next/static/css/28558d57108b05ae.css +3 -0
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/index.html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -0
  27. sky/dashboard/out/workspaces/[name].html +1 -0
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/global_user_state.py +592 -552
  30. sky/server/requests/payloads.py +17 -0
  31. sky/server/server.py +4 -13
  32. sky/setup_files/dependencies.py +1 -0
  33. sky/skypilot_config.py +31 -9
  34. sky/utils/db_utils.py +53 -0
  35. sky/utils/kubernetes/config_map_utils.py +133 -0
  36. sky/utils/kubernetes/deploy_remote_cluster.py +20 -4
  37. sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
  38. sky/utils/kubernetes/ssh-tunnel.sh +20 -28
  39. sky/workspaces/__init__.py +0 -0
  40. sky/workspaces/core.py +295 -0
  41. sky/workspaces/server.py +62 -0
  42. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/METADATA +2 -1
  43. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/RECORD +51 -42
  44. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +0 -1
  45. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +0 -1
  46. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +0 -6
  47. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +0 -1
  48. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +0 -1
  49. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +0 -1
  50. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +0 -1
  51. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +0 -3
  52. /sky/dashboard/out/_next/static/{aHej19bZyl4hoHgrzPCn7 → 7GEgRyZKRaSnYZCV1Jwol}/_ssgManifest.js +0 -0
  53. /sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-37c042a356f8e608.js → [job]-3b5aad09a25f64b7.js} +0 -0
  54. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-db6558a5ec687011.js → [job]-48dc8d67d4b60be1.js} +0 -0
  55. /sky/dashboard/out/_next/static/chunks/pages/{users-2d319455c3f1c3e2.js → users-b8acf6e6735323a2.js} +0 -0
  56. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/WHEEL +0 -0
  57. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250526.dist-info}/top_level.txt +0 -0
@@ -536,3 +536,20 @@ class UploadZipFileResponse(pydantic.BaseModel):
536
536
class EnabledCloudsBody(RequestBody):
    """Request body accepted by the enabled-clouds endpoint."""
    # Optional workspace to scope the enabled-clouds query to.
    workspace: Optional[str] = None
539
+
540
+
541
class UpdateWorkspaceBody(RequestBody):
    """Request body for updating one workspace's configuration."""
    # Filled in by the route handler from the URL path parameter.
    workspace_name: str = ''
    config: Dict[str, Any]
545
+
546
+
547
class CreateWorkspaceBody(RequestBody):
    """Request body for creating a new workspace."""
    # Filled in by the route handler from the URL path parameter.
    workspace_name: str = ''
    config: Dict[str, Any]
551
+
552
+
553
class DeleteWorkspaceBody(RequestBody):
    """Request body for deleting an existing workspace."""
    workspace_name: str
sky/server/server.py CHANGED
@@ -34,7 +34,6 @@ from sky import execution
34
34
  from sky import global_user_state
35
35
  from sky import models
36
36
  from sky import sky_logging
37
- from sky import skypilot_config
38
37
  from sky.clouds import service_catalog
39
38
  from sky.data import storage_utils
40
39
  from sky.jobs.server import server as jobs_rest
@@ -59,6 +58,7 @@ from sky.utils import dag_utils
59
58
  from sky.utils import env_options
60
59
  from sky.utils import status_lib
61
60
  from sky.utils import subprocess_utils
61
+ from sky.workspaces import server as workspaces_rest
62
62
 
63
63
  # pylint: disable=ungrouped-imports
64
64
  if sys.version_info >= (3, 10):
@@ -255,6 +255,9 @@ app.add_middleware(AuthProxyMiddleware)
255
255
  app.add_middleware(RequestIDMiddleware)
256
256
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
257
257
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
258
+ app.include_router(workspaces_rest.router,
259
+ prefix='/workspaces',
260
+ tags=['workspaces'])
258
261
 
259
262
 
260
263
  @app.get('/token')
@@ -323,18 +326,6 @@ async def enabled_clouds(request: fastapi.Request,
323
326
  )
324
327
 
325
328
 
326
- @app.get('/workspaces')
327
- async def get_workspace_config(request: fastapi.Request) -> None:
328
- """Gets workspace config on the server."""
329
- executor.schedule_request(
330
- request_id=request.state.request_id,
331
- request_name='workspaces',
332
- request_body=payloads.RequestBody(),
333
- func=skypilot_config.get_workspaces,
334
- schedule_type=requests_lib.ScheduleType.SHORT,
335
- )
336
-
337
-
338
329
  @app.post('/realtime_kubernetes_gpu_availability')
339
330
  async def realtime_kubernetes_gpu_availability(
340
331
  request: fastapi.Request,
@@ -56,6 +56,7 @@ install_requires = [
56
56
  'aiofiles',
57
57
  'httpx',
58
58
  'setproctitle',
59
+ 'sqlalchemy',
59
60
  ]
60
61
 
61
62
  local_ray = [
sky/skypilot_config.py CHANGED
@@ -57,6 +57,8 @@ import threading
57
57
  import typing
58
58
  from typing import Any, Dict, Iterator, List, Optional, Tuple
59
59
 
60
+ import filelock
61
+
60
62
  from sky import exceptions
61
63
  from sky import sky_logging
62
64
  from sky.adaptors import common as adaptors_common
@@ -66,6 +68,7 @@ from sky.utils import config_utils
66
68
  from sky.utils import context
67
69
  from sky.utils import schemas
68
70
  from sky.utils import ux_utils
71
+ from sky.utils.kubernetes import config_map_utils
69
72
 
70
73
  if typing.TYPE_CHECKING:
71
74
  import yaml
@@ -120,10 +123,17 @@ class ConfigContext:
120
123
 
121
124
 
122
125
  # The global loaded config.
126
+ _active_workspace_context = threading.local()
123
127
  _global_config_context = ConfigContext()
124
- _reload_config_lock = threading.Lock()
125
128
 
126
- _active_workspace_context = threading.local()
129
SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config.lock'


def get_skypilot_config_lock_path() -> str:
    """Return the absolute path of the SkyPilot config lock file.

    The parent directory is created if needed, so callers can acquire a
    file lock at the returned path immediately.
    """
    path = os.path.expanduser(SKYPILOT_CONFIG_LOCK_PATH)
    parent_dir = os.path.dirname(path)
    os.makedirs(parent_dir, exist_ok=True)
    return path
127
137
 
128
138
 
129
139
  def _get_config_context() -> ConfigContext:
@@ -389,7 +399,7 @@ def overlay_skypilot_config(
389
399
 
390
400
def safe_reload_config() -> None:
    """Reload the config; safe to call from multiple processes concurrently.

    A cross-process file lock serializes reloads so concurrent callers do
    not observe a half-written config.
    """
    config_lock = filelock.FileLock(get_skypilot_config_lock_path())
    with config_lock:
        _reload_config()
394
404
 
395
405
 
@@ -691,9 +701,21 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
691
701
  return parsed_config
692
702
 
693
703
 
694
- def get_workspaces() -> Dict[str, Any]:
695
- """Returns the workspace config."""
696
- workspaces = get_nested(('workspaces',), default_value={})
697
- if constants.SKYPILOT_DEFAULT_WORKSPACE not in workspaces:
698
- workspaces[constants.SKYPILOT_DEFAULT_WORKSPACE] = {}
699
- return workspaces
704
def update_config_no_lock(config: config_utils.Config) -> None:
    """Persist the new config to disk and mirror it to the k8s ConfigMap.

    Args:
        config: The config to save and sync.
    """
    config_path = os.path.expanduser(get_user_config_path())

    # The on-disk file (PVC-backed inside Kubernetes, a plain local file
    # otherwise) is always the source of truth; write it first.
    common_utils.dump_yaml(config_path, dict(config))

    # Inside Kubernetes the config is additionally mirrored to a ConfigMap
    # for operator convenience; the file written above stays authoritative.
    if config_map_utils.is_running_in_kubernetes():
        config_map_utils.patch_configmap_with_config(config, config_path)

    _reload_config()
sky/utils/db_utils.py CHANGED
@@ -1,9 +1,17 @@
1
1
  """Utils for sky databases."""
2
2
  import contextlib
3
+ import enum
3
4
  import sqlite3
4
5
  import threading
6
+ import typing
5
7
  from typing import Any, Callable, Optional
6
8
 
9
+ import sqlalchemy
10
+ from sqlalchemy import exc as sqlalchemy_exc
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sqlalchemy.orm import Session
14
+
7
15
  # This parameter (passed to sqlite3.connect) controls how long we will wait to
8
16
  # obtains a database lock (not necessarily during connection, but whenever it is
9
17
  # needed). It is not a connection timeout.
@@ -21,6 +29,11 @@ from typing import Any, Callable, Optional
21
29
  _DB_TIMEOUT_S = 60
22
30
 
23
31
 
32
class SQLAlchemyDialect(enum.Enum):
    """Database dialects recognized by the SQLAlchemy-based DB helpers."""
    SQLITE = 'sqlite'
    POSTGRESQL = 'postgresql'
35
+
36
+
24
37
  @contextlib.contextmanager
25
38
  def safe_cursor(db_path: str):
26
39
  """A newly created, auto-committing, auto-closing cursor."""
@@ -71,6 +84,46 @@ def add_column_to_table(
71
84
  conn.commit()
72
85
 
73
86
 
87
def add_column_to_table_sqlalchemy(
    session: 'Session',
    table_name: str,
    column_name: str,
    column_type: str,
    copy_from: Optional[str] = None,
    value_to_replace_existing_entries: Optional[Any] = None,
):
    """Add a column to a table, committing on success.

    Args:
        session: Active SQLAlchemy session; committed on success, rolled
            back when the dialect is unsupported or the column exists.
        table_name: Name of the table to alter.
        column_name: Name of the column to add.
        column_type: SQL type string for the new column.
        copy_from: Optional existing column whose values seed the new one.
        value_to_replace_existing_entries: Optional value written to rows
            where the new column is still NULL.

    Raises:
        ValueError: If the session's database dialect is not supported.
    """
    dialect = session.bind.dialect
    if dialect.name != SQLAlchemyDialect.SQLITE.value:
        # TODO(syang) support postgres dialect
        session.rollback()
        raise ValueError('Unsupported database dialect')
    try:
        session.execute(
            sqlalchemy.text(f'ALTER TABLE {table_name} '
                            f'ADD COLUMN {column_name} {column_type}'))
        if copy_from is not None:
            session.execute(
                sqlalchemy.text(f'UPDATE {table_name} '
                                f'SET {column_name} = {copy_from}'))
        if value_to_replace_existing_entries is not None:
            session.execute(
                sqlalchemy.text(f'UPDATE {table_name} '
                                f'SET {column_name} = :replacement_value '
                                f'WHERE {column_name} IS NULL'),
                {'replacement_value': value_to_replace_existing_entries})
    except sqlalchemy_exc.OperationalError as e:
        if 'duplicate column name' in str(e):
            # The column already exists. Roll back so the session is
            # usable again: committing a transaction that raised an
            # OperationalError would raise PendingRollbackError.
            session.rollback()
            return
        raise
    session.commit()
+
126
+
74
127
  def rename_column(
75
128
  cursor: 'sqlite3.Cursor',
76
129
  conn: 'sqlite3.Connection',
@@ -0,0 +1,133 @@
1
+ """Utilities for Kubernetes ConfigMap operations in SkyPilot."""
2
+ import os
3
+
4
+ from sky import sky_logging
5
+ from sky import skypilot_config
6
+ from sky.adaptors import kubernetes
7
+ from sky.utils import common_utils
8
+
9
+ logger = sky_logging.init_logger(__name__)
10
+
11
+ # Kubernetes ConfigMap sync constants
12
+ _KUBE_SERVICE_ACCOUNT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount'
13
+ _CONFIGMAP_SYNC_TIMEOUT = 10 # seconds
14
+
15
+
16
+ def is_running_in_kubernetes() -> bool:
17
+ """Check if we're running inside a Kubernetes pod."""
18
+ return os.path.exists(f'{_KUBE_SERVICE_ACCOUNT_PATH}/token')
19
+
20
+
21
def _get_kubernetes_namespace() -> str:
    """Return the pod's namespace, or 'default' if it cannot be read."""
    namespace_file = os.path.join(_KUBE_SERVICE_ACCOUNT_PATH, 'namespace')
    try:
        with open(namespace_file, encoding='utf-8') as f:
            return f.read().strip()
    except (OSError, IOError):
        # Covers both a missing mount and a read failure.
        return 'default'
31
+
32
+
33
+ def _get_configmap_name() -> str:
34
+ """Get the ConfigMap name for the SkyPilot config."""
35
+ release_name = (os.getenv('HELM_RELEASE_NAME') or
36
+ os.getenv('SKYPILOT_RELEASE_NAME') or 'skypilot')
37
+ return f'{release_name}-config'
38
+
39
+
40
def initialize_configmap_sync_on_startup(config_file_path: str) -> None:
    """Initialize ConfigMap sync on API server startup.

    Syncs an existing PVC config file to a newly created ConfigMap when no
    ConfigMap exists yet. This covers upgrades of deployments that have
    workspace configs on PVC but predate ConfigMap syncing.

    Args:
        config_file_path: Path to the config file to sync.
    """
    config_file_path = os.path.expanduser(config_file_path)
    if not is_running_in_kubernetes():
        return
    if not os.path.exists(config_file_path):
        return

    try:
        namespace = _get_kubernetes_namespace()
        configmap_name = _get_configmap_name()

        # Probe for an existing ConfigMap; only a 404 means we may create.
        try:
            kubernetes.core_api().read_namespaced_config_map(
                name=configmap_name, namespace=namespace)
        except kubernetes.kubernetes.client.rest.ApiException as e:
            if e.status != 404:
                raise
            # Missing ConfigMap: fall through and create it below.
        else:
            # An existing ConfigMap is never overwritten here.
            logger.debug(f'ConfigMap {configmap_name} already exists')
            return

        current_config = skypilot_config.parse_and_validate_config_file(
            config_file_path)
        config_yaml = common_utils.dump_yaml_str(dict(current_config))

        metadata = {
            'name': configmap_name,
            'namespace': namespace,
            'labels': {
                'app.kubernetes.io/name': 'skypilot',
                'app.kubernetes.io/component': 'config'
            }
        }
        configmap_body = {
            'apiVersion': 'v1',
            'kind': 'ConfigMap',
            'metadata': metadata,
            'data': {
                'config.yaml': config_yaml
            }
        }

        kubernetes.core_api().create_namespaced_config_map(
            namespace=namespace,
            body=configmap_body,
            _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
        logger.info(f'Synced PVC config to new ConfigMap {configmap_name}')
    except Exception as e:  # pylint: disable=broad-except
        # Best-effort sync: startup must not fail because of this.
        logger.warning(f'Failed to initialize ConfigMap sync: {e}')
99
+
100
+
101
def patch_configmap_with_config(config, config_file_path: str) -> None:
    """Patch the Kubernetes ConfigMap with the updated config.

    Args:
        config: The updated config to sync to the ConfigMap.
        config_file_path: Path to the config file for fallback sync.
    """
    if not is_running_in_kubernetes():
        return

    try:
        namespace = _get_kubernetes_namespace()
        configmap_name = _get_configmap_name()
        patch_body = {
            'data': {
                'config.yaml': common_utils.dump_yaml_str(dict(config))
            }
        }
        try:
            kubernetes.core_api().patch_namespaced_config_map(
                name=configmap_name,
                namespace=namespace,
                body=patch_body,
                _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
            logger.debug(f'Synced config to ConfigMap {configmap_name}')
        except kubernetes.kubernetes.client.rest.ApiException as e:
            if e.status != 404:
                raise
            # No ConfigMap to patch yet; create it from the file on disk.
            logger.info(f'ConfigMap {configmap_name} not found, creating')
            initialize_configmap_sync_on_startup(config_file_path)
    except Exception as e:  # pylint: disable=broad-except
        # Best-effort mirror: the file on disk is the source of truth.
        logger.warning(f'Failed to sync config to ConfigMap: {e}')
@@ -624,6 +624,9 @@ def main():
624
624
  kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
625
625
  global_use_ssh_config = args.use_ssh_config
626
626
 
627
+ failed_clusters = []
628
+ successful_clusters = []
629
+
627
630
  # Print cleanup mode marker if applicable
628
631
  if args.cleanup:
629
632
  print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
@@ -793,10 +796,21 @@ def main():
793
796
  print(
794
797
  f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
795
798
  )
799
+ successful_clusters.append(cluster_name)
796
800
  except Exception as e: # pylint: disable=broad-except
801
+ reason = str(e)
802
+ failed_clusters.append((cluster_name, reason))
797
803
  print(
798
- f'{RED}Error deploying SSH Node Pool {cluster_name}: {e}{NC}'
799
- )
804
+ f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
805
+ ) # Print for internal logging
806
+
807
+ if failed_clusters:
808
+ action = 'clean' if args.cleanup else 'deploy'
809
+ msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
810
+ msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
811
+ for cluster_name, reason in failed_clusters:
812
+ msg += f'\n {cluster_name}: {reason}'
813
+ raise RuntimeError(msg)
800
814
 
801
815
 
802
816
  def deploy_cluster(head_node,
@@ -847,8 +861,10 @@ def deploy_cluster(head_node,
847
861
  print_output=True)
848
862
  if result is None:
849
863
  with ux_utils.print_exception_no_traceback():
850
- raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
851
- f'Please check the SSH configuration.')
864
+ raise RuntimeError(
865
+ f'Failed to SSH to head node ({head_node}). '
866
+ f'Please check the SSH configuration and logs for more details.'
867
+ )
852
868
 
853
869
  # Checking history
854
870
  history_exists = (history_worker_nodes is not None and
@@ -26,6 +26,48 @@ logger = sky_logging.init_logger(__name__)
26
26
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
27
27
 
28
28
 
29
def check_ssh_cluster_dependencies(
        raise_error: bool = True) -> Optional[List[str]]:
    """Checks if the dependencies for ssh cluster are installed.

    Args:
        raise_error: set to true when the dependency needs to be present.
          set to false for `sky check`, where reason strings are compiled
          at the end.

    Returns: the reasons list if there are missing dependencies.
    """
    jq_message = ('`jq` is required to setup ssh cluster.')

    reasons = []
    required_binaries = []

    # Probe for jq by invoking it; a missing binary or a non-zero exit
    # both count as "not installed".
    try:
        subprocess.run(['jq', '--version'],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        required_binaries.append('jq')
        reasons.append(jq_message)

    # Everything present: nothing to report.
    if not required_binaries:
        return None

    reasons.extend([
        'On Debian/Ubuntu, install the missing dependenc(ies) with:',
        f'  $ sudo apt install {" ".join(required_binaries)}',
        'On MacOS, install with: ',
        f'  $ brew install {" ".join(required_binaries)}',
    ])
    if raise_error:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError('\n'.join(reasons))
    return reasons
69
+
70
+
29
71
  def deploy_ssh_cluster(cleanup: bool = False,
30
72
  infra: Optional[str] = None,
31
73
  kubeconfig_path: Optional[str] = None):
@@ -41,6 +83,8 @@ def deploy_ssh_cluster(cleanup: bool = False,
41
83
  kubeconfig_path: Path to save the Kubernetes configuration file.
42
84
  If None, the default ~/.kube/config will be used.
43
85
  """
86
+ check_ssh_cluster_dependencies()
87
+
44
88
  # Prepare command to call deploy_remote_cluster.py script
45
89
  # TODO(romilb): We should move this to a native python method/class call
46
90
  # instead of invoking a script with subprocess.
@@ -81,9 +125,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
81
125
  cmd=deploy_command,
82
126
  log_path=log_path,
83
127
  require_outputs=True,
84
- stream_logs=False, # TODO: Fixme to False after we fix the logging
128
+ stream_logs=False,
85
129
  line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
86
- is_local=True),
130
+ is_local=False),
87
131
  cwd=cwd,
88
132
  env=env)
89
133
 
@@ -91,9 +135,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
91
135
  success = True
92
136
  else:
93
137
  with ux_utils.print_exception_no_traceback():
94
- log_hint = ux_utils.log_path_hint(log_path, is_local=True)
95
- raise RuntimeError('Failed to deploy SkyPilot on SSH targets. '
96
- f'Full log: {log_hint}'
138
+ log_hint = ux_utils.log_path_hint(log_path, is_local=False)
139
+ raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
140
+ f'{log_hint}'
97
141
  f'\nError: {stderr}')
98
142
 
99
143
  if success:
@@ -188,14 +188,17 @@ generate_credentials_json() {
188
188
  debug_log "Key data length: $(echo -n "$client_key_data" | wc -c) bytes"
189
189
 
190
190
  # Check if we can create proper JSON with `jq`
191
- if command -v jq &>/dev/null; then
192
- debug_log "Using jq for JSON formatting"
193
-
194
- # Create a temporary file for the JSON output to avoid shell escaping issues
195
- local TEMP_JSON_FILE=$(mktemp)
196
-
197
- # Write the JSON to the temporary file using jq for proper JSON formatting
198
- cat > "$TEMP_JSON_FILE" << EOL
191
+ if ! command -v jq &>/dev/null; then
192
+ echo "jq is not installed. Please install jq to use this script." >&2
193
+ exit 1
194
+ fi
195
+ debug_log "Using jq for JSON formatting"
196
+
197
+ # Create a temporary file for the JSON output to avoid shell escaping issues
198
+ local TEMP_JSON_FILE=$(mktemp)
199
+
200
+ # Write the JSON to the temporary file using jq for proper JSON formatting
201
+ cat > "$TEMP_JSON_FILE" << EOL
199
202
  {
200
203
  "apiVersion": "client.authentication.k8s.io/v1beta1",
201
204
  "kind": "ExecCredential",
@@ -207,25 +210,14 @@ generate_credentials_json() {
207
210
  }
208
211
  EOL
209
212
 
210
- # Read the JSON from the file
211
- local json_response=$(cat "$TEMP_JSON_FILE")
212
-
213
- # Clean up
214
- rm -f "$TEMP_JSON_FILE"
215
-
216
- # Output the JSON
217
- echo "$json_response"
218
- else
219
- debug_log "jq is not available, using simpler formatting method"
220
-
221
- # Alternative approach: encode with base64 and use the token field instead
222
- # This works because kubectl will decode token data properly
223
- local combined_data=$(echo -n "${client_cert_data}:${client_key_data}" | base64 | tr -d '\n')
224
-
225
- echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"$combined_data\",\"expirationTimestamp\":\"$expiration_time\"}}"
226
-
227
- debug_log "Sent certificate data as encoded token instead of direct certificate fields"
228
- fi
213
+ # Read the JSON from the file
214
+ local json_response=$(cat "$TEMP_JSON_FILE")
215
+
216
+ # Clean up
217
+ rm -f "$TEMP_JSON_FILE"
218
+
219
+ # Output the JSON
220
+ echo "$json_response"
229
221
  else
230
222
  # Fallback to token-based credential for tunnel-only authentication
231
223
  echo "{\"apiVersion\":\"client.authentication.k8s.io/v1beta1\",\"kind\":\"ExecCredential\",\"status\":{\"token\":\"k8s-ssh-tunnel-token\",\"expirationTimestamp\":\"$expiration_time\"}}"
@@ -384,4 +376,4 @@ fi
384
376
 
385
377
  # Return valid credential format with certificates if available
386
378
  generate_credentials_json
387
- exit 0
379
+ exit 0
File without changes