skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250808__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (61)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -1
  3. sky/backends/cloud_vm_ray_backend.py +9 -2
  4. sky/client/cli/command.py +40 -26
  5. sky/client/sdk.py +132 -65
  6. sky/client/sdk_async.py +1 -1
  7. sky/core.py +5 -2
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → -DXZksWqf2waNHeU9YTQe}/_buildManifest.js +1 -1
  10. sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
  15. sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -1
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  25. sky/dashboard/out/jobs.html +1 -1
  26. sky/dashboard/out/users.html +1 -1
  27. sky/dashboard/out/volumes.html +1 -1
  28. sky/dashboard/out/workspace/new.html +1 -1
  29. sky/dashboard/out/workspaces/[name].html +1 -1
  30. sky/dashboard/out/workspaces.html +1 -1
  31. sky/execution.py +6 -4
  32. sky/global_user_state.py +8 -1
  33. sky/jobs/client/sdk.py +27 -20
  34. sky/jobs/controller.py +2 -1
  35. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  36. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  37. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  38. sky/serve/client/impl.py +11 -8
  39. sky/serve/client/sdk.py +7 -7
  40. sky/serve/serve_state.py +437 -340
  41. sky/serve/server/impl.py +2 -2
  42. sky/server/common.py +12 -8
  43. sky/server/constants.py +1 -1
  44. sky/setup_files/alembic.ini +4 -0
  45. sky/utils/cli_utils/status_utils.py +1 -1
  46. sky/utils/db/db_utils.py +31 -0
  47. sky/utils/db/migration_utils.py +5 -1
  48. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  49. sky/utils/resource_checker.py +162 -21
  50. sky/volumes/client/sdk.py +4 -4
  51. sky/workspaces/core.py +210 -6
  52. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +2 -2
  53. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +58 -55
  54. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
  55. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
  56. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
  57. /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
  58. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
  59. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
  60. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
  61. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/serve/server/impl.py CHANGED
@@ -198,9 +198,9 @@ def up(
     # We need a unique integer per sky.serve.up call to avoid name
     # conflict. Originally in non-consolidation mode, this is the ray
     # job id; now we use the request id hash instead. Here we also
-    # make sure it is a 63-bit integer to avoid overflow on sqlalchemy.
+    # make sure it is a 32-bit integer to avoid overflow on sqlalchemy.
     rid = common_utils.get_current_request_id()
-    controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFFFFFFFFFF
+    controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF
 
     vars_to_fill = {
         'remote_task_yaml_path': remote_tmp_task_yaml_path,
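
Note: why the smaller mask works: `uuid.UUID(rid).int` is a 128-bit integer, `hash()` folds it into a machine int (deterministically for ints in CPython, unlike salted str hashing), and `& 0x7FFFFFFF` keeps only the low 31 bits, which always fits a signed 32-bit INTEGER column. A standalone sketch of the same arithmetic (not SkyPilot's module):

import uuid


def request_id_to_job_id(request_id: str) -> int:
    """Map a request-id UUID to an int that fits a signed 32-bit column."""
    # hash() of an int is deterministic in CPython; masking to 31 bits
    # guarantees 0 <= result <= 2**31 - 1.
    return hash(uuid.UUID(request_id).int) & 0x7FFFFFFF


job_id = request_id_to_job_id(str(uuid.uuid4()))
assert 0 <= job_id <= 2**31 - 1
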
sky/server/common.py CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, cast, Dict, Literal, Optional, Tuple,
-                    TypeVar, Union)
+from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
+                    Tuple, TypeVar, Union)
 from urllib import parse
 import uuid
 
@@ -89,16 +89,20 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
     'restarting the API server.'
     f'{colorama.Style.RESET_ALL}')
 
-RequestId = str
+T = TypeVar('T')
+P = ParamSpec('P')
+
+
+class RequestId(str, Generic[T]):
+    pass
+
+
 ApiVersion = Optional[str]
 
 logger = sky_logging.init_logger(__name__)
 
 hinted_for_server_install_version_mismatch = False
 
-T = TypeVar('T')
-P = ParamSpec('P')
-
 
 class ApiServerStatus(enum.Enum):
     HEALTHY = 'healthy'
@@ -491,7 +495,7 @@ def handle_request_error(response: 'requests.Response') -> None:
                 f'{response.text}')
 
 
-def get_request_id(response: 'requests.Response') -> RequestId:
+def get_request_id(response: 'requests.Response') -> RequestId[T]:
     handle_request_error(response)
     request_id = response.headers.get('X-Skypilot-Request-ID')
     if request_id is None:
@@ -502,7 +506,7 @@ def get_request_id(response: 'requests.Response') -> RequestId:
         'Failed to get request ID from SkyPilot API server at '
         f'{get_server_url()}. Response: {response.status_code} '
         f'{response.text}')
-    return request_id
+    return RequestId[T](request_id)
 
 
 def _start_api_server(deploy: bool = False,
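
Note: `RequestId` is now a phantom-typed string: instances are ordinary `str` objects at runtime, while the otherwise-unused parameter `T` lets a type checker carry each request's eventual result type from the SDK call that submitted it to the code that later resolves it. A minimal self-contained sketch of the pattern (the `submit_ls`/`resolve` names and the fake result store are illustrative, not SkyPilot's API):

from typing import Any, Dict, Generic, List, TypeVar

T = TypeVar('T')


class RequestId(str, Generic[T]):
    """A plain str whose type parameter records the result type."""


_fake_results: Dict[str, Any] = {'req-1': ['vol-a', 'vol-b']}


def submit_ls() -> 'RequestId[List[str]]':
    # Calling a subscripted generic instantiates the origin class, so at
    # runtime this is just the string 'req-1'.
    return RequestId[List[str]]('req-1')


def resolve(request_id: 'RequestId[T]') -> T:
    return _fake_results[request_id]


names: List[str] = resolve(submit_ls())  # checker infers List[str]
assert names == ['vol-a', 'vol-b']
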
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 15
+API_VERSION = 16
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
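
Note: the bump from 15 to 16 tracks the protocol-visible `RequestId` change above. Pairing a current version with a minimum-supported floor lets mixed-version client/server deployments refuse cleanly instead of misbehaving; a toy sketch of that kind of check (the floor value and function name are illustrative, not SkyPilot's negotiation code):

API_VERSION = 16
# Illustrative floor; the real constant lives alongside API_VERSION.
MIN_COMPATIBLE_API_VERSION = 10


def check_peer(peer_version: int) -> None:
    # Refuse peers older than the floor rather than failing mid-request.
    if peer_version < MIN_COMPATIBLE_API_VERSION:
        raise RuntimeError(
            f'Peer speaks API v{peer_version}; this build requires at '
            f'least v{MIN_COMPATIBLE_API_VERSION}. Please upgrade.')
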
sky/setup_files/alembic.ini CHANGED
@@ -94,6 +94,10 @@ version_table = alembic_version_state_db
 version_locations = %(here)s/../schemas/db/spot_jobs
 version_table = alembic_version_spot_jobs_db
 
+[serve_db]
+version_locations = %(here)s/../schemas/db/serve_state
+version_table = alembic_version_serve_state_db
+
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
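
Note: with this addition the single alembic.ini carries a third independent migration environment, differing only in where its revision scripts live and which version table it stamps. A hedged sketch of selecting the new section programmatically (the database URL is illustrative; `Config(ini_section=...)` and `command.upgrade` are standard Alembic API):

from alembic import command
from alembic.config import Config

# Point Alembic at the [serve_db] section of the shared ini, which supplies
# version_locations (schemas/db/serve_state) and the serve version table.
cfg = Config('sky/setup_files/alembic.ini', ini_section='serve_db')
cfg.set_main_option('sqlalchemy.url', 'sqlite:///serve_state.db')
command.upgrade(cfg, 'head')
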
sky/utils/cli_utils/status_utils.py CHANGED
@@ -401,7 +401,7 @@ def _get_estimated_cost_for_cost_report(
 
 
 def show_kubernetes_cluster_status_table(
-        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
+        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
         show_all: bool) -> None:
     """Compute cluster table values and display for Kubernetes clusters."""
     status_columns = [
sky/utils/db/db_utils.py CHANGED
@@ -205,6 +205,37 @@ def add_column_to_table_alembic(
             raise
 
 
+def drop_column_from_table_alembic(
+    table_name: str,
+    column_name: str,
+):
+    """Drop a column from a table using Alembic operations.
+
+    Args:
+        table_name: Name of the table to drop column from.
+        column_name: Name of the column to drop.
+    """
+    from alembic import op  # pylint: disable=import-outside-toplevel
+
+    # Check if column exists before trying to drop it
+    bind = op.get_bind()
+    inspector = sqlalchemy.inspect(bind)
+    columns = [col['name'] for col in inspector.get_columns(table_name)]
+
+    if column_name not in columns:
+        # Column doesn't exist; nothing to do
+        return
+
+    try:
+        op.drop_column(table_name, column_name)
+    except (sqlalchemy_exc.ProgrammingError,
+            sqlalchemy_exc.OperationalError) as e:
+        if 'does not exist' in str(e).lower():
+            pass  # Already dropped
+        else:
+            raise
+
+
 class SQLiteConn(threading.local):
     """Thread-local connection to the sqlite3 database."""
 
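
Note: the helper is deliberately idempotent: it inspects the live table first and also swallows "does not exist" errors, so a downgrade can be re-run safely. A hypothetical revision body using it (table, column, and revision ids are illustrative, loosely patterned after the new 004_is_managed migration in the file list):

"""Hypothetical Alembic revision: add and later drop an is_managed column."""
from alembic import op
import sqlalchemy as sa

from sky.utils.db import db_utils

revision = '004'
down_revision = '003'


def upgrade():
    # Plain Alembic op for the forward migration.
    op.add_column(
        'clusters',
        sa.Column('is_managed', sa.Boolean(), server_default='0'))


def downgrade():
    # Safe even if the column was never created or is already gone.
    db_utils.drop_column_from_table_alembic('clusters', 'is_managed')
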
sky/utils/db/migration_utils.py CHANGED
@@ -19,13 +19,17 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '002'
+GLOBAL_USER_STATE_VERSION = '004'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
 SPOT_JOBS_VERSION = '003'
 SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'
 
+SERVE_DB_NAME = 'serve_db'
+SERVE_VERSION = '001'
+SERVE_LOCK_PATH = '~/.sky/locks/.serve_db.lock'
+
 
 def get_engine(db_name: str):
     conn_string = None
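
Note: the serve database now gets the same name/version/lock triple that the state and spot-jobs databases already have, which suggests its schema setup follows the same lock-then-migrate pattern. A sketch under that assumption (only the constants and `get_engine` appear in this diff; the rest is illustrative):

import os

import filelock

from sky.utils.db import migration_utils


def init_serve_db_sketch():
    lock_path = os.path.expanduser(migration_utils.SERVE_LOCK_PATH)
    os.makedirs(os.path.dirname(lock_path), exist_ok=True)
    # Serialize concurrent initializers, as the other SkyPilot DBs do.
    with filelock.FileLock(
            lock_path, timeout=migration_utils.DB_INIT_LOCK_TIMEOUT_SECONDS):
        engine = migration_utils.get_engine(migration_utils.SERVE_DB_NAME)
        # ...run the [serve_db] Alembic upgrade to SERVE_VERSION ('001') here.
        return engine
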
sky/utils/kubernetes/deploy_remote_cluster.py CHANGED
@@ -1276,7 +1276,9 @@ def deploy_cluster(head_node,
     print(
         ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
     )
-    print(' • Connect to pod with VSCode: code --remote ssh-remote+devbox ')
+    print(
+        ' • Connect to pod with VSCode: code --remote ssh-remote+devbox "/home"'
+    )
     # Print completion marker for current cluster
     print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
 
sky/utils/resource_checker.py CHANGED
@@ -74,27 +74,7 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
         ValueError: If any resource has active clusters or managed jobs.
     """
 
-    def get_all_clusters():
-        return global_user_state.get_clusters()
-
-    def get_all_managed_jobs():
-        # pylint: disable=import-outside-toplevel
-        from sky.jobs.server import core as managed_jobs_core
-        try:
-            return managed_jobs_core.queue(refresh=False,
-                                           skip_finished=True,
-                                           all_users=True)
-        except exceptions.ClusterNotUpError:
-            logger.warning('All jobs should be finished.')
-            return []
-
-    # Fetch both clusters and jobs in parallel
-    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
-        clusters_future = executor.submit(get_all_clusters)
-        jobs_future = executor.submit(get_all_managed_jobs)
-
-        all_clusters = clusters_future.result()
-        all_managed_jobs = jobs_future.result()
+    all_clusters, all_managed_jobs = _get_active_resources()
 
     # Collect all error messages instead of raising immediately
     error_messages = []
@@ -134,6 +114,11 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
         # If this resource has issues, add to overall error messages
         if resource_errors:
             resource_error_summary = ' and '.join(resource_errors)
+            if resource_type == 'user':
+                # resource_name is user_id
+                user_info = global_user_state.get_user(resource_name)
+                if user_info and user_info.name:
+                    resource_name = user_info.name
             error_messages.append(
                 f'Cannot {operation} {resource_type} {resource_name!r} '
                 f'because it has {resource_error_summary}.')
@@ -151,3 +136,159 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
             '\n'.join(f'• {msg}' for msg in error_messages) +
             '\nPlease terminate these resources first.')
         raise ValueError(full_message)
+
+
+def check_users_workspaces_active_resources(
+        user_ids: List[str],
+        workspace_names: List[str]) -> Tuple[str, List[str]]:
+    """Check if all the active clusters or managed jobs in workspaces
+    belong to the user_ids. If not, return the error message.
+
+    Args:
+        user_ids: List of user_id.
+        workspace_names: List of workspace_name.
+
+    Returns:
+        resource_error_summary: str
+        missed_users_names: List[str]
+    """
+    all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
+        workspace_names)
+    resource_errors = []
+    missed_users = set()
+    active_cluster_names = []
+    active_job_names = []
+    # Check clusters
+    if all_clusters:
+        for cluster in all_clusters:
+            user_hash = cluster.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_cluster_names.append(cluster['name'])
+        if active_cluster_names:
+            cluster_list = ', '.join(active_cluster_names)
+            resource_errors.append(
+                f'{len(active_cluster_names)} active cluster(s):'
+                f' {cluster_list}')
+
+    # Check managed jobs
+    if all_managed_jobs:
+        for job in all_managed_jobs:
+            user_hash = job.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_job_names.append(str(job['job_id']))
+        if active_job_names:
+            job_list = ', '.join(active_job_names)
+            resource_errors.append(f'{len(active_job_names)} active'
+                                   f' managed job(s): {job_list}')
+
+    resource_error_summary = ''
+    if resource_errors:
+        resource_error_summary = ' and '.join(resource_errors)
+    missed_users_names = []
+    if missed_users:
+        all_users = global_user_state.get_all_users()
+        missed_users_names = [
+            user.name if user.name else user.id
+            for user in all_users
+            if user.id in missed_users
+        ]
+    return resource_error_summary, missed_users_names
+
+
+def _get_active_resources_for_workspaces(
+    workspace_names: List[str]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs for workspaces.
+
+    Args:
+        workspace_names: List of workspace_name.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+    if not workspace_names:
+        return [], []
+
+    def filter_by_workspaces(workspace_names: List[str]):
+        return lambda resource: (resource.get(
+            'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+                                 workspace_names)
+
+    return _get_active_resources_by_names(workspace_names, filter_by_workspaces)
+
+
+def _get_active_resources_by_names(
+    resource_names: List[str],
+    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs.
+
+    Args:
+        resource_names: List of resource_name.
+        filter_factory: Function that takes resource names and returns a filter
+            function for clusters/jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    all_clusters, all_managed_jobs = _get_active_resources()
+
+    resource_clusters = []
+    resource_active_jobs = []
+
+    # Check each resource against the fetched data,
+    # return the active resources by names
+    resource_filter = filter_factory(resource_names)
+
+    # Filter clusters for this resource
+    if all_clusters:
+        resource_clusters = [
+            cluster for cluster in all_clusters if resource_filter(cluster)
+        ]
+
+    # Filter managed jobs for this resource
+    if all_managed_jobs:
+        resource_active_jobs = [
+            job for job in all_managed_jobs if resource_filter(job)
+        ]
+
+    return resource_clusters, resource_active_jobs
+
+
+def _get_active_resources(
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get all active clusters and managed jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    def get_all_clusters():
+        return global_user_state.get_clusters()
+
+    def get_all_managed_jobs():
+        # pylint: disable=import-outside-toplevel
+        from sky.jobs.server import core as managed_jobs_core
+        try:
+            return managed_jobs_core.queue(refresh=False,
+                                           skip_finished=True,
+                                           all_users=True)
+        except exceptions.ClusterNotUpError:
+            logger.warning('All jobs should be finished.')
+            return []
+
+    # Fetch both clusters and jobs in parallel
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        clusters_future = executor.submit(get_all_clusters)
+        jobs_future = executor.submit(get_all_managed_jobs)
+
+        all_clusters = clusters_future.result()
+        all_managed_jobs = jobs_future.result()
+
+    return all_clusters, all_managed_jobs
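
Note: the refactor splits one parallel fetch (`_get_active_resources`) from the filtering, and the filter is built by a factory so the same query path can be scoped by workspace today and by other keys (for example user) without refetching. A self-contained sketch of the shape (toy records, not SkyPilot's data model):

from typing import Any, Callable, Dict, List

Resource = Dict[str, Any]
FilterFactory = Callable[[List[str]], Callable[[Resource], bool]]


def filter_by_workspaces(names: List[str]) -> Callable[[Resource], bool]:
    return lambda r: r.get('workspace', 'default') in names


def filter_by_users(user_ids: List[str]) -> Callable[[Resource], bool]:
    return lambda r: r.get('user_hash') in user_ids


def select_active(resources: List[Resource], names: List[str],
                  factory: FilterFactory) -> List[Resource]:
    keep = factory(names)
    return [r for r in resources if keep(r)]


clusters = [
    {'name': 'dev', 'workspace': 'ws1', 'user_hash': 'u1'},
    {'name': 'train', 'workspace': 'ws2', 'user_hash': 'u2'},
]
assert select_active(clusters, ['ws1'], filter_by_workspaces) == [clusters[0]]
assert select_active(clusters, ['u2'], filter_by_users) == [clusters[1]]
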
sky/volumes/client/sdk.py CHANGED
@@ -1,7 +1,7 @@
 """SDK functions for managed jobs."""
 import json
 import typing
-from typing import List
+from typing import Any, Dict, List
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -24,7 +24,7 @@ logger = sky_logging.init_logger(__name__)
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def apply(volume: volume_lib.Volume) -> server_common.RequestId:
+def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
     """Creates or registers a volume.
 
     Args:
@@ -50,7 +50,7 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId:
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def ls() -> server_common.RequestId:
+def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
     """Lists all volumes.
 
     Returns:
@@ -65,7 +65,7 @@ def ls() -> server_common.RequestId:
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def delete(names: List[str]) -> server_common.RequestId:
+def delete(names: List[str]) -> server_common.RequestId[None]:
     """Deletes volumes.
 
     Args:
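
Note: these annotations are where the generic `RequestId[T]` pays off: each volumes call now states what resolving its request yields. A hedged usage sketch (assuming the usual pattern of passing the request id to `sky.get`, whose annotation would need to accept `RequestId[T]` for the inference to kick in):

from typing import Any, Dict, List

import sky
from sky.volumes.client import sdk as volumes_sdk

# ls() is typed RequestId[List[Dict[str, Any]]], so the resolved value
# can be annotated without a cast.
volumes: List[Dict[str, Any]] = sky.get(volumes_sdk.ls())

# apply() and delete() are typed RequestId[None]: no payload to consume.
sky.get(volumes_sdk.delete(names=[v['name'] for v in volumes]))
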
sky/workspaces/core.py CHANGED
@@ -1,5 +1,6 @@
 """Workspace management core."""
 
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Tuple
 
 import filelock
@@ -9,12 +9,14 @@ from sky import exceptions
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
+from sky.backends import backend_utils
 from sky.skylet import constants
 from sky.usage import usage_lib
 from sky.users import permission
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
+from sky.utils import locks
 from sky.utils import resource_checker
 from sky.utils import schemas
 from sky.workspaces import utils as workspaces_utils
@@ -24,6 +27,37 @@ logger = sky_logging.init_logger(__name__)
 # Lock for workspace configuration updates to prevent race conditions
 _WORKSPACE_CONFIG_LOCK_TIMEOUT_SECONDS = 60
 
+
+@dataclass
+class WorkspaceConfigComparison:
+    """Result of comparing current and new workspace configurations.
+
+    This class encapsulates the results of analyzing differences between
+    workspace configurations, particularly focusing on user access changes
+    and their implications for resource validation.
+
+    Attributes:
+        only_user_access_changes: True if only allowed_users or private changed
+        private_changed: True if private setting changed
+        private_old: Old private setting value
+        private_new: New private setting value
+        allowed_users_changed: True if allowed_users changed
+        allowed_users_old: Old allowed users list
+        allowed_users_new: New allowed users list
+        removed_users: Users removed from allowed_users
+        added_users: Users added to allowed_users
+    """
+    only_user_access_changes: bool
+    private_changed: bool
+    private_old: bool
+    private_new: bool
+    allowed_users_changed: bool
+    allowed_users_old: List[str]
+    allowed_users_new: List[str]
+    removed_users: List[str]
+    added_users: List[str]
+
+
 # =========================
 # = Workspace Management =
 # =========================
@@ -95,6 +129,153 @@ def _validate_workspace_config(workspace_name: str,
             raise ValueError(str(e)) from e
 
 
+def _compare_workspace_configs(
+    current_config: Dict[str, Any],
+    new_config: Dict[str, Any],
+) -> WorkspaceConfigComparison:
+    """Compare current and new workspace configurations.
+
+    Args:
+        current_config: The current workspace configuration.
+        new_config: The new workspace configuration.
+
+    Returns:
+        WorkspaceConfigComparison object containing the comparison results.
+    """
+    # Get private settings
+    private_old = current_config.get('private', False)
+    private_new = new_config.get('private', False)
+    private_changed = private_old != private_new
+
+    # Get allowed users (resolve to user IDs for comparison)
+    allowed_users_old = workspaces_utils.get_workspace_users(
+        current_config) if private_old else []
+    allowed_users_new = workspaces_utils.get_workspace_users(
+        new_config) if private_new else []
+
+    # Convert to sets for easier comparison
+    old_users_set = set(allowed_users_old)
+    new_users_set = set(allowed_users_new)
+
+    allowed_users_changed = old_users_set != new_users_set
+    removed_users = list(old_users_set - new_users_set)
+    added_users = list(new_users_set - old_users_set)
+
+    # Check if only user access related fields changed
+    # Create copies without the user access fields for comparison
+    current_without_access = {
+        k: v
+        for k, v in current_config.items()
+        if k not in ['private', 'allowed_users']
+    }
+    new_without_access = {
+        k: v
+        for k, v in new_config.items()
+        if k not in ['private', 'allowed_users']
+    }
+
+    only_user_access_changes = current_without_access == new_without_access
+
+    return WorkspaceConfigComparison(
+        only_user_access_changes=only_user_access_changes,
+        private_changed=private_changed,
+        private_old=private_old,
+        private_new=private_new,
+        allowed_users_changed=allowed_users_changed,
+        allowed_users_old=allowed_users_old,
+        allowed_users_new=allowed_users_new,
+        removed_users=removed_users,
+        added_users=added_users)
+
+
+def _validate_workspace_config_changes(workspace_name: str,
+                                       current_config: Dict[str, Any],
+                                       new_config: Dict[str, Any]) -> None:
+    """Validate workspace configuration changes based on active resources.
+
+    This function implements the logic:
+    - If only allowed_users or private changed:
+      - If private changed from true to false: allow it
+      - If private changed from false to true: check that all active resources
+        belong to allowed_users
+      - If private didn't change: check that removed users don't have active
+        resources
+    - Otherwise: check that workspace has no active resources
+
+    Args:
+        workspace_name: The name of the workspace.
+        current_config: The current workspace configuration.
+        new_config: The new workspace configuration.
+
+    Raises:
+        ValueError: If the configuration change is not allowed due to active
+            resources.
+    """
+    config_comparison = _compare_workspace_configs(current_config, new_config)
+
+    if config_comparison.only_user_access_changes:
+        # Only user access settings changed
+        if config_comparison.private_changed:
+            if (config_comparison.private_old and
+                    not config_comparison.private_new):
+                # Changed from private to public - always allow
+                logger.info(
+                    f'Workspace {workspace_name!r} changed from private to'
+                    f' public.')
+                return
+            elif (not config_comparison.private_old and
+                  config_comparison.private_new):
+                # Changed from public to private - check that all active
+                # resources belong to the new allowed users
+                logger.info(
+                    f'Workspace {workspace_name!r} changed from public to'
+                    f' private. Checking that all active resources belong'
+                    f' to allowed users.')
+
+                error_summary, missed_users_names = (
+                    resource_checker.check_users_workspaces_active_resources(
+                        config_comparison.allowed_users_new, [workspace_name]))
+                if error_summary:
+                    error_msg = f'Cannot change workspace {workspace_name!r}' \
+                        f' to private '
+                    if missed_users_names:
+                        missed_users_list = ', '.join(missed_users_names)
+                        if len(missed_users_names) == 1:
+                            error_msg += f'because the user ' \
+                                f'{missed_users_list!r} has {error_summary}'
+                        else:
+                            error_msg += f'because the users ' \
+                                f'{missed_users_list!r} have {error_summary}'
+                    error_msg += ' but not in the allowed_users list.' \
+                        ' Please either add the users to allowed_users or' \
+                        ' ask them to terminate their resources.'
+                    raise ValueError(error_msg)
+        else:
+            # Private setting didn't change, but allowed_users changed
+            if (config_comparison.allowed_users_changed and
+                    config_comparison.removed_users):
+                # Check that removed users don't have active resources
+                logger.info(
+                    f'Checking that removed users'
+                    f' {config_comparison.removed_users} do not have'
+                    f' active resources in workspace {workspace_name!r}.')
+                user_operations = []
+                for user_id in config_comparison.removed_users:
+                    user_operations.append((user_id, 'remove'))
+                resource_checker.check_no_active_resources_for_users(
+                    user_operations)
+    else:
+        # Other configuration changes - check that workspace has no active
+        # resources
+        logger.info(
+            f'Non-user-access configuration changes detected for'
+            f' workspace {workspace_name!r}. Checking that workspace has'
+            f' no active resources.')
+        resource_checker.check_no_active_resources_for_workspaces([
+            (workspace_name, 'update')
+        ])
+
+
 @usage_lib.entrypoint
 def update_workspace(workspace_name: str, config: Dict[str,
                                                        Any]) -> Dict[str, Any]:
@@ -109,17 +290,40 @@ def update_workspace(workspace_name: str, config: Dict[str,
 
     Raises:
         ValueError: If the workspace configuration is invalid, or if there are
-            active clusters or managed jobs in the workspace.
+            active clusters or managed jobs that prevent the configuration
+            change.
+            The validation logic depends on what changed:
+            - If only allowed_users or private changed:
+              - Private true->false: Always allowed
+              - Private false->true: All active resources must belong to
+                allowed_users
+              - allowed_users changes: Removed users must not have active
+                resources
+            - Other changes: Workspace must have no active resources
         FileNotFoundError: If the config file cannot be found.
         PermissionError: If the config file cannot be written.
     """
     _validate_workspace_config(workspace_name, config)
 
-    # Check for active clusters and managed jobs in the workspace
-    # TODO(zhwu): we should allow the edits that only contain changes to
-    # allowed_users or private.
-    resource_checker.check_no_active_resources_for_workspaces([(workspace_name,
-                                                                'update')])
+    # Get the current workspace configuration for comparison
+    current_workspaces = skypilot_config.get_nested(('workspaces',),
+                                                    default_value={})
+    current_config = current_workspaces.get(workspace_name, {})
+
+    if current_config:
+        lock_id = backend_utils.workspace_lock_id(workspace_name)
+        lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
+        try:
+            with locks.get_lock(lock_id, lock_timeout):
+                # Validate the configuration changes based on active resources
+                _validate_workspace_config_changes(workspace_name,
+                                                   current_config, config)
+        except locks.LockTimeout as e:
+            raise RuntimeError(
+                f'Failed to validate workspace {workspace_name!r} due to '
+                'a timeout when trying to access database. Please '
+                f'try again or manually remove the lock at {lock_id}. '
+                f'{common_utils.format_exception(e)}') from None
 
     def update_workspace_fn(workspaces: Dict[str, Any]) -> None:
         """Function to update workspace inside the lock."""
{skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: skypilot-nightly
-Version: 1.0.0.dev20250807
+Version: 1.0.0.dev20250808
 Summary: SkyPilot: Run AI on Any Infra — Unified, Faster, Cheaper.
 Author: SkyPilot Team
 License: Apache 2.0
@@ -238,7 +238,7 @@ Dynamic: summary
 ----
 
 :fire: *News* :fire:
-- [Aug 2025] Run and serve **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**example**](./llm/gpt-oss/)
+- [Aug 2025] Serve and finetune **OpenAI GPT-OSS models** (gpt-oss-120b, gpt-oss-20b) with one command on any infra: [**serve**](./llm/gpt-oss/) + [**LoRA and full finetuning**](./llm/gpt-oss-finetuning/)
 - [Jul 2025] Run distributed **RL training for LLMs** with Verl (PPO, GRPO) on any cloud: [**example**](./llm/verl/)
 - [Jul 2025] 🎉 SkyPilot v0.10.0 released! [**blog post**](https://blog.skypilot.co/announcing-skypilot-0.10.0/), [**release notes**](https://github.com/skypilot-org/skypilot/releases/tag/v0.10.0)
 - [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)