skypilot-nightly 1.0.0.dev20250524__py3-none-any.whl → 1.0.0.dev20250527__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sky/__init__.py +2 -2
  2. sky/check.py +32 -6
  3. sky/cli.py +17 -24
  4. sky/client/cli.py +17 -24
  5. sky/client/sdk.py +5 -2
  6. sky/clouds/cloud.py +2 -2
  7. sky/clouds/kubernetes.py +10 -5
  8. sky/clouds/service_catalog/kubernetes_catalog.py +4 -0
  9. sky/clouds/ssh.py +24 -8
  10. sky/core.py +20 -2
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/D5bjIfl4Ob3SV3LJz3CO0/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/236-e220ba0c35bf089e.js +6 -0
  14. sky/dashboard/out/_next/static/chunks/{498-d7722313e5e5b4e6.js → 320-afea3ddcc5bd1c6c.js} +1 -16
  15. sky/dashboard/out/_next/static/chunks/470-1d784f5c8750744a.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/578-24f35aa98d38d638.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/627-31b701e69f52db0c.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/843-e35d71cf1c7f706e.js +11 -0
  19. sky/dashboard/out/_next/static/chunks/990-f85643b521f7ca65.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/_app-3985f074c163a856.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-339b59921ccfe266.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e23fcddf60578a0d.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e6d1ec6e1ac5b29.js → clusters-8afda8efa5b74997.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/config-72b8c6c2edfd0e39.js +6 -0
  25. sky/dashboard/out/_next/static/chunks/pages/infra-1521baab6992916b.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4d913940b4fa6f5a.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/jobs-ff7e8e377d02b651.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/users-9900af52acf8648d.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/workspace/new-63763ffa3edb4508.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-3ede7a13caf23375.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/workspaces-72330c4d0fc9a4a2.js +1 -0
  32. sky/dashboard/out/_next/static/css/6a1c0d711a4bdaf1.css +3 -0
  33. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  34. sky/dashboard/out/clusters/[cluster].html +1 -1
  35. sky/dashboard/out/clusters.html +1 -1
  36. sky/dashboard/out/config.html +1 -0
  37. sky/dashboard/out/index.html +1 -1
  38. sky/dashboard/out/infra.html +1 -1
  39. sky/dashboard/out/jobs/[job].html +1 -1
  40. sky/dashboard/out/jobs.html +1 -1
  41. sky/dashboard/out/users.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -0
  43. sky/dashboard/out/workspaces/[name].html +1 -0
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/global_user_state.py +592 -552
  46. sky/server/constants.py +1 -1
  47. sky/server/requests/payloads.py +33 -3
  48. sky/server/requests/serializers/decoders.py +0 -11
  49. sky/server/server.py +23 -22
  50. sky/setup_files/dependencies.py +1 -0
  51. sky/skypilot_config.py +35 -9
  52. sky/utils/db_utils.py +53 -0
  53. sky/utils/kubernetes/config_map_utils.py +133 -0
  54. sky/utils/kubernetes/deploy_remote_cluster.py +20 -4
  55. sky/utils/kubernetes/exec_kubeconfig_converter.py +19 -0
  56. sky/utils/kubernetes/kubernetes_deploy_utils.py +49 -5
  57. sky/utils/kubernetes/ssh-tunnel.sh +20 -28
  58. sky/utils/schemas.py +57 -5
  59. sky/workspaces/__init__.py +0 -0
  60. sky/workspaces/core.py +431 -0
  61. sky/workspaces/server.py +87 -0
  62. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/METADATA +2 -1
  63. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/RECORD +69 -57
  64. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/WHEEL +1 -1
  65. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +0 -1
  66. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +0 -1
  67. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +0 -6
  68. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +0 -1
  69. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +0 -1
  70. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +0 -16
  73. sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +0 -1
  74. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +0 -1
  75. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +0 -1
  78. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +0 -3
  79. /sky/dashboard/out/_next/static/{aHej19bZyl4hoHgrzPCn7 → D5bjIfl4Ob3SV3LJz3CO0}/_ssgManifest.js +0 -0
  80. /sky/dashboard/out/_next/static/chunks/{573-f17bd89d9f9118b3.js → 573-82bd40a37af834f1.js} +0 -0
  81. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/entry_points.txt +0 -0
  82. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/licenses/LICENSE +0 -0
  83. {skypilot_nightly-1.0.0.dev20250524.dist-info → skypilot_nightly-1.0.0.dev20250527.dist-info}/top_level.txt +0 -0
sky/server/constants.py CHANGED
@@ -7,7 +7,7 @@ from sky.skylet import constants
7
7
  # API server version, whenever there is a change in API server that requires a
8
8
  # restart of the local API server or error out when the client does not match
9
9
  # the server version.
10
- API_VERSION = '5'
10
+ API_VERSION = '6'
11
11
 
12
12
  # Prefix for API request names.
13
13
  REQUEST_NAME_PREFIX = 'sky.'
sky/server/requests/payloads.py CHANGED
@@ -72,6 +72,8 @@ def request_body_env_vars() -> dict:
72
72
 
73
73
  def get_override_skypilot_config_from_client() -> Dict[str, Any]:
74
74
  """Returns the override configs from the client."""
75
+ if annotations.is_on_api_server:
76
+ return {}
75
77
  config = skypilot_config.to_dict()
76
78
  # Remove the API server config, as we should not specify the SkyPilot
77
79
  # server endpoint on the server side. This avoids the warning at
@@ -134,6 +136,12 @@ class CheckBody(RequestBody):
134
136
  workspace: Optional[str] = None
135
137
 
136
138
 
139
+ class EnabledCloudsBody(RequestBody):
140
+ """The request body for the enabled clouds endpoint."""
141
+ workspace: Optional[str] = None
142
+ expand: bool = False
143
+
144
+
137
145
  class DagRequestBody(RequestBody):
138
146
  """Request body base class for endpoints with a dag."""
139
147
  dag: str
@@ -533,6 +541,28 @@ class UploadZipFileResponse(pydantic.BaseModel):
533
541
  missing_chunks: Optional[List[str]] = None
534
542
 
535
543
 
536
- class EnabledCloudsBody(RequestBody):
537
- """The request body for the enabled clouds endpoint."""
538
- workspace: Optional[str] = None
544
+ class UpdateWorkspaceBody(RequestBody):
545
+ """The request body for updating a specific workspace configuration."""
546
+ workspace_name: str = '' # Will be set from path parameter
547
+ config: Dict[str, Any]
548
+
549
+
550
+ class CreateWorkspaceBody(RequestBody):
551
+ """The request body for creating a new workspace."""
552
+ workspace_name: str = '' # Will be set from path parameter
553
+ config: Dict[str, Any]
554
+
555
+
556
+ class DeleteWorkspaceBody(RequestBody):
557
+ """The request body for deleting a workspace."""
558
+ workspace_name: str
559
+
560
+
561
+ class UpdateConfigBody(RequestBody):
562
+ """The request body for updating the entire SkyPilot configuration."""
563
+ config: Dict[str, Any]
564
+
565
+
566
+ class GetConfigBody(RequestBody):
567
+ """The request body for getting the entire SkyPilot configuration."""
568
+ pass
sky/server/requests/serializers/decoders.py CHANGED
@@ -12,7 +12,6 @@ from sky.provision.kubernetes import utils as kubernetes_utils
12
12
  from sky.serve import serve_state
13
13
  from sky.server import constants as server_constants
14
14
  from sky.skylet import job_lib
15
- from sky.utils import registry
16
15
  from sky.utils import status_lib
17
16
 
18
17
  if typing.TYPE_CHECKING:
@@ -135,16 +134,6 @@ def decode_cost_report(
135
134
  return return_value
136
135
 
137
136
 
138
- @register_decoders('enabled_clouds')
139
- def decode_enabled_clouds(return_value: List[str]) -> List['clouds.Cloud']:
140
- clouds = []
141
- for cloud_name in return_value:
142
- cloud = registry.CLOUD_REGISTRY.from_str(cloud_name)
143
- assert cloud is not None, return_value
144
- clouds.append(cloud)
145
- return clouds
146
-
147
-
148
137
  @register_decoders('list_accelerators')
149
138
  def decode_list_accelerators(
150
139
  return_value: Dict[str, List[List[Any]]]
sky/server/server.py CHANGED
@@ -34,7 +34,6 @@ from sky import execution
34
34
  from sky import global_user_state
35
35
  from sky import models
36
36
  from sky import sky_logging
37
- from sky import skypilot_config
38
37
  from sky.clouds import service_catalog
39
38
  from sky.data import storage_utils
40
39
  from sky.jobs.server import server as jobs_rest
@@ -59,6 +58,7 @@ from sky.utils import dag_utils
59
58
  from sky.utils import env_options
60
59
  from sky.utils import status_lib
61
60
  from sky.utils import subprocess_utils
61
+ from sky.workspaces import server as workspaces_rest
62
62
 
63
63
  # pylint: disable=ungrouped-imports
64
64
  if sys.version_info >= (3, 10):
@@ -127,6 +127,11 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
127
127
 
128
128
  async def dispatch(self, request: fastapi.Request, call_next):
129
129
  auth_user = _get_auth_user_header(request)
130
+
131
+ # Add user to database if auth_user is present
132
+ if auth_user is not None:
133
+ global_user_state.add_or_update_user(auth_user)
134
+
130
135
  body = await request.body()
131
136
  if auth_user and body:
132
137
  try:
@@ -137,10 +142,16 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
137
142
  logger.debug(f'Overriding user for {request.state.request_id}: '
138
143
  f'{auth_user.name}, {auth_user.id}')
139
144
  if 'env_vars' in original_json:
140
- original_json['env_vars'][
141
- constants.USER_ID_ENV_VAR] = auth_user.id
142
- original_json['env_vars'][
143
- constants.USER_ENV_VAR] = auth_user.name
145
+ if isinstance(original_json.get('env_vars'), dict):
146
+ original_json['env_vars'][
147
+ constants.USER_ID_ENV_VAR] = auth_user.id
148
+ original_json['env_vars'][
149
+ constants.USER_ENV_VAR] = auth_user.name
150
+ else:
151
+ logger.warning(
152
+ f'"env_vars" in request body is not a dictionary '
153
+ f'for request {request.state.request_id}. '
154
+ 'Skipping user info injection into body.')
144
155
  request._body = json.dumps(original_json).encode('utf-8') # pylint: disable=protected-access
145
156
  return await call_next(request)
146
157
 
@@ -255,14 +266,14 @@ app.add_middleware(AuthProxyMiddleware)
255
266
  app.add_middleware(RequestIDMiddleware)
256
267
  app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
257
268
  app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
269
+ app.include_router(workspaces_rest.router,
270
+ prefix='/workspaces',
271
+ tags=['workspaces'])
258
272
 
259
273
 
260
274
  @app.get('/token')
261
275
  async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
262
- # If we have auth info, save this user to the database.
263
276
  user = _get_auth_user_header(request)
264
- if user is not None:
265
- global_user_state.add_or_update_user(user)
266
277
 
267
278
  token_data = {
268
279
  'v': 1, # Token version number, bump for backwards incompatible.
@@ -312,29 +323,19 @@ async def check(request: fastapi.Request,
312
323
 
313
324
  @app.get('/enabled_clouds')
314
325
  async def enabled_clouds(request: fastapi.Request,
315
- workspace: Optional[str] = None) -> None:
326
+ workspace: Optional[str] = None,
327
+ expand: bool = False) -> None:
316
328
  """Gets enabled clouds on the server."""
317
329
  executor.schedule_request(
318
330
  request_id=request.state.request_id,
319
331
  request_name='enabled_clouds',
320
- request_body=payloads.EnabledCloudsBody(workspace=workspace),
332
+ request_body=payloads.EnabledCloudsBody(workspace=workspace,
333
+ expand=expand),
321
334
  func=core.enabled_clouds,
322
335
  schedule_type=requests_lib.ScheduleType.SHORT,
323
336
  )
324
337
 
325
338
 
326
- @app.get('/workspaces')
327
- async def get_workspace_config(request: fastapi.Request) -> None:
328
- """Gets workspace config on the server."""
329
- executor.schedule_request(
330
- request_id=request.state.request_id,
331
- request_name='workspaces',
332
- request_body=payloads.RequestBody(),
333
- func=skypilot_config.get_workspaces,
334
- schedule_type=requests_lib.ScheduleType.SHORT,
335
- )
336
-
337
-
338
339
  @app.post('/realtime_kubernetes_gpu_availability')
339
340
  async def realtime_kubernetes_gpu_availability(
340
341
  request: fastapi.Request,
sky/setup_files/dependencies.py CHANGED
@@ -56,6 +56,7 @@ install_requires = [
56
56
  'aiofiles',
57
57
  'httpx',
58
58
  'setproctitle',
59
+ 'sqlalchemy',
59
60
  ]
60
61
 
61
62
  local_ray = [
sky/skypilot_config.py CHANGED
@@ -57,6 +57,8 @@ import threading
57
57
  import typing
58
58
  from typing import Any, Dict, Iterator, List, Optional, Tuple
59
59
 
60
+ import filelock
61
+
60
62
  from sky import exceptions
61
63
  from sky import sky_logging
62
64
  from sky.adaptors import common as adaptors_common
@@ -66,6 +68,7 @@ from sky.utils import config_utils
66
68
  from sky.utils import context
67
69
  from sky.utils import schemas
68
70
  from sky.utils import ux_utils
71
+ from sky.utils.kubernetes import config_map_utils
69
72
 
70
73
  if typing.TYPE_CHECKING:
71
74
  import yaml
@@ -120,10 +123,17 @@ class ConfigContext:
120
123
 
121
124
 
122
125
  # The global loaded config.
126
+ _active_workspace_context = threading.local()
123
127
  _global_config_context = ConfigContext()
124
- _reload_config_lock = threading.Lock()
125
128
 
126
- _active_workspace_context = threading.local()
129
+ SKYPILOT_CONFIG_LOCK_PATH = '~/.sky/locks/.skypilot_config.lock'
130
+
131
+
132
+ def get_skypilot_config_lock_path() -> str:
133
+ """Get the path for the SkyPilot config lock file."""
134
+ lock_path = os.path.expanduser(SKYPILOT_CONFIG_LOCK_PATH)
135
+ os.makedirs(os.path.dirname(lock_path), exist_ok=True)
136
+ return lock_path
127
137
 
128
138
 
129
139
  def _get_config_context() -> ConfigContext:
@@ -289,6 +299,10 @@ def get_nested(keys: Tuple[str, ...],
289
299
  def get_workspace_cloud(cloud: str,
290
300
  workspace: Optional[str] = None) -> config_utils.Config:
291
301
  """Returns the workspace config."""
302
+ # TODO(zhwu): Instead of just returning the workspace specific config, we
303
+ # should return the config that already merges the global config, so that
304
+ # the caller does not need to manually merge the global config with
305
+ # the workspace specific config.
292
306
  if workspace is None:
293
307
  workspace = get_active_workspace()
294
308
  clouds = get_nested(keys=(
@@ -389,7 +403,7 @@ def overlay_skypilot_config(
389
403
 
390
404
  def safe_reload_config() -> None:
391
405
  """Reloads the config, safe to be called concurrently."""
392
- with _reload_config_lock:
406
+ with filelock.FileLock(get_skypilot_config_lock_path()):
393
407
  _reload_config()
394
408
 
395
409
 
@@ -691,9 +705,21 @@ def apply_cli_config(cli_config: Optional[List[str]]) -> Dict[str, Any]:
691
705
  return parsed_config
692
706
 
693
707
 
694
- def get_workspaces() -> Dict[str, Any]:
695
- """Returns the workspace config."""
696
- workspaces = get_nested(('workspaces',), default_value={})
697
- if constants.SKYPILOT_DEFAULT_WORKSPACE not in workspaces:
698
- workspaces[constants.SKYPILOT_DEFAULT_WORKSPACE] = {}
699
- return workspaces
708
+ def update_config_no_lock(config: config_utils.Config) -> None:
709
+ """Dumps the new config to a file and syncs to ConfigMap if in Kubernetes.
710
+
711
+ Args:
712
+ config: The config to save and sync.
713
+ """
714
+ global_config_path = os.path.expanduser(get_user_config_path())
715
+
716
+ # Always save to the local file (PVC in Kubernetes, local file otherwise)
717
+ common_utils.dump_yaml(global_config_path, dict(config))
718
+
719
+ if config_map_utils.is_running_in_kubernetes():
720
+ # In Kubernetes, sync the PVC config to ConfigMap for user convenience
721
+ # PVC file is the source of truth, ConfigMap is just a mirror for easy
722
+ # access
723
+ config_map_utils.patch_configmap_with_config(config, global_config_path)
724
+
725
+ _reload_config()
sky/utils/db_utils.py CHANGED
@@ -1,9 +1,17 @@
1
1
  """Utils for sky databases."""
2
2
  import contextlib
3
+ import enum
3
4
  import sqlite3
4
5
  import threading
6
+ import typing
5
7
  from typing import Any, Callable, Optional
6
8
 
9
+ import sqlalchemy
10
+ from sqlalchemy import exc as sqlalchemy_exc
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sqlalchemy.orm import Session
14
+
7
15
  # This parameter (passed to sqlite3.connect) controls how long we will wait to
8
16
  # obtains a database lock (not necessarily during connection, but whenever it is
9
17
  # needed). It is not a connection timeout.
@@ -21,6 +29,11 @@ from typing import Any, Callable, Optional
21
29
  _DB_TIMEOUT_S = 60
22
30
 
23
31
 
32
+ class SQLAlchemyDialect(enum.Enum):
33
+ SQLITE = 'sqlite'
34
+ POSTGRESQL = 'postgresql'
35
+
36
+
24
37
  @contextlib.contextmanager
25
38
  def safe_cursor(db_path: str):
26
39
  """A newly created, auto-committing, auto-closing cursor."""
@@ -71,6 +84,46 @@ def add_column_to_table(
71
84
  conn.commit()
72
85
 
73
86
 
87
+ def add_column_to_table_sqlalchemy(
88
+ session: 'Session',
89
+ table_name: str,
90
+ column_name: str,
91
+ column_type: str,
92
+ copy_from: Optional[str] = None,
93
+ value_to_replace_existing_entries: Optional[Any] = None,
94
+ ):
95
+ """Add a column to a table."""
96
+ dialect = session.bind.dialect
97
+ if dialect.name == SQLAlchemyDialect.SQLITE.value:
98
+ try:
99
+ session.execute(
100
+ sqlalchemy.text(f'ALTER TABLE {table_name} '
101
+ f'ADD COLUMN {column_name} {column_type}'))
102
+ if copy_from is not None:
103
+ session.execute(
104
+ sqlalchemy.text(f'UPDATE {table_name} '
105
+ f'SET {column_name} = {copy_from}'))
106
+ if value_to_replace_existing_entries is not None:
107
+ session.execute(
108
+ sqlalchemy.text(f'UPDATE {table_name} '
109
+ f'SET {column_name} = :replacement_value '
110
+ f'WHERE {column_name} IS NULL'),
111
+ {'replacement_value': value_to_replace_existing_entries})
112
+ except sqlalchemy_exc.OperationalError as e:
113
+ if 'duplicate column name' in str(e):
114
+ pass
115
+ else:
116
+ raise
117
+ elif dialect.name == SQLAlchemyDialect.POSTGRESQL.value:
118
+ # TODO(syang) support postgres dialect
119
+ session.rollback()
120
+ raise ValueError('Unsupported database dialect')
121
+ else:
122
+ session.rollback()
123
+ raise ValueError('Unsupported database dialect')
124
+ session.commit()
125
+
126
+
74
127
  def rename_column(
75
128
  cursor: 'sqlite3.Cursor',
76
129
  conn: 'sqlite3.Connection',
sky/utils/kubernetes/config_map_utils.py ADDED
@@ -0,0 +1,133 @@
1
+ """Utilities for Kubernetes ConfigMap operations in SkyPilot."""
2
+ import os
3
+
4
+ from sky import sky_logging
5
+ from sky import skypilot_config
6
+ from sky.adaptors import kubernetes
7
+ from sky.utils import common_utils
8
+
9
+ logger = sky_logging.init_logger(__name__)
10
+
11
+ # Kubernetes ConfigMap sync constants
12
+ _KUBE_SERVICE_ACCOUNT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount'
13
+ _CONFIGMAP_SYNC_TIMEOUT = 10 # seconds
14
+
15
+
16
+ def is_running_in_kubernetes() -> bool:
17
+ """Check if we're running inside a Kubernetes pod."""
18
+ return os.path.exists(f'{_KUBE_SERVICE_ACCOUNT_PATH}/token')
19
+
20
+
21
+ def _get_kubernetes_namespace() -> str:
22
+ """Get the current Kubernetes namespace from the service account."""
23
+ try:
24
+ namespace_file = f'{_KUBE_SERVICE_ACCOUNT_PATH}/namespace'
25
+ if os.path.exists(namespace_file):
26
+ with open(namespace_file, encoding='utf-8') as f:
27
+ return f.read().strip()
28
+ except (OSError, IOError):
29
+ pass
30
+ return 'default'
31
+
32
+
33
+ def _get_configmap_name() -> str:
34
+ """Get the ConfigMap name for the SkyPilot config."""
35
+ release_name = (os.getenv('HELM_RELEASE_NAME') or
36
+ os.getenv('SKYPILOT_RELEASE_NAME') or 'skypilot')
37
+ return f'{release_name}-config'
38
+
39
+
40
+ def initialize_configmap_sync_on_startup(config_file_path: str) -> None:
41
+ """Initialize ConfigMap sync on API server startup.
42
+
43
+ This syncs existing PVC config to ConfigMap if ConfigMap doesn't exist.
44
+ This handles the upgrade scenario where an existing deployment has
45
+ workspace configs on PVC but no ConfigMap exists.
46
+
47
+ Args:
48
+ config_file_path: Path to the config file to sync.
49
+ """
50
+ config_file_path = os.path.expanduser(config_file_path)
51
+ if not is_running_in_kubernetes() or not os.path.exists(config_file_path):
52
+ return
53
+
54
+ try:
55
+ namespace = _get_kubernetes_namespace()
56
+ configmap_name = _get_configmap_name()
57
+
58
+ # Check if ConfigMap exists
59
+ try:
60
+ kubernetes.core_api().read_namespaced_config_map(
61
+ name=configmap_name, namespace=namespace)
62
+ # ConfigMap exists, don't overwrite it
63
+ logger.debug(f'ConfigMap {configmap_name} already exists')
64
+ return
65
+ except kubernetes.kubernetes.client.rest.ApiException as e:
66
+ if e.status != 404:
67
+ raise
68
+ # ConfigMap doesn't exist, create it
69
+
70
+ current_config = skypilot_config.parse_and_validate_config_file(
71
+ config_file_path)
72
+ config_yaml = common_utils.dump_yaml_str(dict(current_config))
73
+
74
+ configmap_body = {
75
+ 'apiVersion': 'v1',
76
+ 'kind': 'ConfigMap',
77
+ 'metadata': {
78
+ 'name': configmap_name,
79
+ 'namespace': namespace,
80
+ 'labels': {
81
+ 'app.kubernetes.io/name': 'skypilot',
82
+ 'app.kubernetes.io/component': 'config'
83
+ }
84
+ },
85
+ 'data': {
86
+ 'config.yaml': config_yaml
87
+ }
88
+ }
89
+
90
+ kubernetes.core_api().create_namespaced_config_map(
91
+ namespace=namespace,
92
+ body=configmap_body,
93
+ _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
94
+
95
+ logger.info(f'Synced PVC config to new ConfigMap {configmap_name}')
96
+
97
+ except Exception as e: # pylint: disable=broad-except
98
+ logger.warning(f'Failed to initialize ConfigMap sync: {e}')
99
+
100
+
101
+ def patch_configmap_with_config(config, config_file_path: str) -> None:
102
+ """Patch the Kubernetes ConfigMap with the updated config.
103
+
104
+ Args:
105
+ config: The updated config to sync to the ConfigMap.
106
+ config_file_path: Path to the config file for fallback sync.
107
+ """
108
+ if not is_running_in_kubernetes():
109
+ return
110
+
111
+ try:
112
+ namespace = _get_kubernetes_namespace()
113
+ configmap_name = _get_configmap_name()
114
+ config_yaml = common_utils.dump_yaml_str(dict(config))
115
+ patch_body = {'data': {'config.yaml': config_yaml}}
116
+
117
+ try:
118
+ kubernetes.core_api().patch_namespaced_config_map(
119
+ name=configmap_name,
120
+ namespace=namespace,
121
+ body=patch_body,
122
+ _request_timeout=_CONFIGMAP_SYNC_TIMEOUT)
123
+ logger.debug(f'Synced config to ConfigMap {configmap_name}')
124
+ except kubernetes.kubernetes.client.rest.ApiException as e:
125
+ if e.status == 404:
126
+ # ConfigMap doesn't exist, create it
127
+ logger.info(f'ConfigMap {configmap_name} not found, creating')
128
+ initialize_configmap_sync_on_startup(config_file_path)
129
+ else:
130
+ raise
131
+
132
+ except Exception as e: # pylint: disable=broad-except
133
+ logger.warning(f'Failed to sync config to ConfigMap: {e}')
sky/utils/kubernetes/deploy_remote_cluster.py CHANGED
@@ -624,6 +624,9 @@ def main():
624
624
  kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
625
625
  global_use_ssh_config = args.use_ssh_config
626
626
 
627
+ failed_clusters = []
628
+ successful_clusters = []
629
+
627
630
  # Print cleanup mode marker if applicable
628
631
  if args.cleanup:
629
632
  print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
@@ -793,10 +796,21 @@ def main():
793
796
  print(
794
797
  f'{GREEN}==== Completed deployment for cluster: {cluster_name} ====${NC}'
795
798
  )
799
+ successful_clusters.append(cluster_name)
796
800
  except Exception as e: # pylint: disable=broad-except
801
+ reason = str(e)
802
+ failed_clusters.append((cluster_name, reason))
797
803
  print(
798
- f'{RED}Error deploying SSH Node Pool {cluster_name}: {e}{NC}'
799
- )
804
+ f'{RED}Error deploying SSH Node Pool {cluster_name}: {reason}{NC}'
805
+ ) # Print for internal logging
806
+
807
+ if failed_clusters:
808
+ action = 'clean' if args.cleanup else 'deploy'
809
+ msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
810
+ msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
811
+ for cluster_name, reason in failed_clusters:
812
+ msg += f'\n {cluster_name}: {reason}'
813
+ raise RuntimeError(msg)
800
814
 
801
815
 
802
816
  def deploy_cluster(head_node,
@@ -847,8 +861,10 @@ def deploy_cluster(head_node,
847
861
  print_output=True)
848
862
  if result is None:
849
863
  with ux_utils.print_exception_no_traceback():
850
- raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
851
- f'Please check the SSH configuration.')
864
+ raise RuntimeError(
865
+ f'Failed to SSH to head node ({head_node}). '
866
+ f'Please check the SSH configuration and logs for more details.'
867
+ )
852
868
 
853
869
  # Checking history
854
870
  history_exists = (history_worker_nodes is not None and
sky/utils/kubernetes/exec_kubeconfig_converter.py CHANGED
@@ -5,6 +5,9 @@ the 'command' field in the exec configuration, leaving only the executable name.
5
5
  This is useful when moving between different environments where auth plugin
6
6
  executables might be installed in different locations.
7
7
 
8
+ For Nebius kubeconfigs, it also changes the --profile argument to 'sky' to
9
+ ensure compatibility with SkyPilot's expected profile configuration.
10
+
8
11
  It assumes the target environment has the auth executable available in PATH.
9
12
  If not, you'll need to update your environment container to include the auth
10
13
  executable in PATH.
@@ -21,6 +24,8 @@ import yaml
21
24
  def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
22
25
  """Strip path information from exec plugin commands in a kubeconfig file.
23
26
 
27
+ For Nebius kubeconfigs, also changes the --profile argument to 'sky'.
28
+
24
29
  Args:
25
30
  kubeconfig_path (str): Path to the input kubeconfig file
26
31
  output_path (str): Path where the modified kubeconfig will be saved
@@ -40,6 +45,20 @@ def strip_auth_plugin_paths(kubeconfig_path: str, output_path: str):
40
45
  exec_info['command'] = executable
41
46
  updated = True
42
47
 
48
+ # Handle Nebius kubeconfigs: change --profile to 'sky'
49
+ if executable == 'nebius' or current_command == 'nebius':
50
+ args = exec_info.get('args', [])
51
+ if args and '--profile' in args:
52
+ try:
53
+ profile_index = args.index('--profile')
54
+ if profile_index + 1 < len(args):
55
+ old_profile = args[profile_index + 1]
56
+ if old_profile != 'sky':
57
+ args[profile_index + 1] = 'sky'
58
+ updated = True
59
+ except ValueError:
60
+ pass # --profile not found in args
61
+
43
62
  if updated:
44
63
  with open(output_path, 'w', encoding='utf-8') as file:
45
64
  yaml.safe_dump(config, file)
sky/utils/kubernetes/kubernetes_deploy_utils.py CHANGED
@@ -26,6 +26,48 @@ logger = sky_logging.init_logger(__name__)
26
26
  DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
27
27
 
28
28
 
29
+ def check_ssh_cluster_dependencies(
30
+ raise_error: bool = True) -> Optional[List[str]]:
31
+ """Checks if the dependencies for ssh cluster are installed.
32
+
33
+ Args:
34
+ raise_error: set to true when the dependency needs to be present.
35
+ set to false for `sky check`, where reason strings are compiled
36
+ at the end.
37
+
38
+ Returns: the reasons list if there are missing dependencies.
39
+ """
40
+ # error message
41
+ jq_message = ('`jq` is required to setup ssh cluster.')
42
+
43
+ # save
44
+ reasons = []
45
+ required_binaries = []
46
+
47
+ # Ensure jq is installed
48
+ try:
49
+ subprocess.run(['jq', '--version'],
50
+ stdout=subprocess.DEVNULL,
51
+ stderr=subprocess.DEVNULL,
52
+ check=True)
53
+ except (FileNotFoundError, subprocess.CalledProcessError):
54
+ required_binaries.append('jq')
55
+ reasons.append(jq_message)
56
+
57
+ if required_binaries:
58
+ reasons.extend([
59
+ 'On Debian/Ubuntu, install the missing dependenc(ies) with:',
60
+ f' $ sudo apt install {" ".join(required_binaries)}',
61
+ 'On MacOS, install with: ',
62
+ f' $ brew install {" ".join(required_binaries)}',
63
+ ])
64
+ if raise_error:
65
+ with ux_utils.print_exception_no_traceback():
66
+ raise RuntimeError('\n'.join(reasons))
67
+ return reasons
68
+ return None
69
+
70
+
29
71
  def deploy_ssh_cluster(cleanup: bool = False,
30
72
  infra: Optional[str] = None,
31
73
  kubeconfig_path: Optional[str] = None):
@@ -41,6 +83,8 @@ def deploy_ssh_cluster(cleanup: bool = False,
41
83
  kubeconfig_path: Path to save the Kubernetes configuration file.
42
84
  If None, the default ~/.kube/config will be used.
43
85
  """
86
+ check_ssh_cluster_dependencies()
87
+
44
88
  # Prepare command to call deploy_remote_cluster.py script
45
89
  # TODO(romilb): We should move this to a native python method/class call
46
90
  # instead of invoking a script with subprocess.
@@ -81,9 +125,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
81
125
  cmd=deploy_command,
82
126
  log_path=log_path,
83
127
  require_outputs=True,
84
- stream_logs=False, # TODO: Fixme to False after we fix the logging
128
+ stream_logs=False,
85
129
  line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
86
- is_local=True),
130
+ is_local=False),
87
131
  cwd=cwd,
88
132
  env=env)
89
133
 
@@ -91,9 +135,9 @@ def deploy_ssh_cluster(cleanup: bool = False,
91
135
  success = True
92
136
  else:
93
137
  with ux_utils.print_exception_no_traceback():
94
- log_hint = ux_utils.log_path_hint(log_path, is_local=True)
95
- raise RuntimeError('Failed to deploy SkyPilot on SSH targets. '
96
- f'Full log: {log_hint}'
138
+ log_hint = ux_utils.log_path_hint(log_path, is_local=False)
139
+ raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
140
+ f'{log_hint}'
97
141
  f'\nError: {stderr}')
98
142
 
99
143
  if success: