skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of skypilot-nightly might be problematic.

Files changed (91)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +57 -7
  4. sky/backends/cloud_vm_ray_backend.py +50 -8
  5. sky/client/cli/command.py +60 -26
  6. sky/client/sdk.py +132 -65
  7. sky/client/sdk_async.py +1 -1
  8. sky/core.py +10 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
  18. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  19. sky/dashboard/out/clusters/[cluster].html +1 -1
  20. sky/dashboard/out/clusters.html +1 -1
  21. sky/dashboard/out/config.html +1 -1
  22. sky/dashboard/out/index.html +1 -1
  23. sky/dashboard/out/infra/[context].html +1 -1
  24. sky/dashboard/out/infra.html +1 -1
  25. sky/dashboard/out/jobs/[job].html +1 -1
  26. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/volumes.html +1 -1
  30. sky/dashboard/out/workspace/new.html +1 -1
  31. sky/dashboard/out/workspaces/[name].html +1 -1
  32. sky/dashboard/out/workspaces.html +1 -1
  33. sky/execution.py +21 -4
  34. sky/global_user_state.py +110 -1
  35. sky/jobs/client/sdk.py +27 -20
  36. sky/jobs/controller.py +2 -1
  37. sky/jobs/recovery_strategy.py +3 -0
  38. sky/jobs/server/core.py +4 -0
  39. sky/jobs/utils.py +9 -2
  40. sky/provision/__init__.py +3 -2
  41. sky/provision/aws/instance.py +5 -4
  42. sky/provision/azure/instance.py +5 -4
  43. sky/provision/cudo/instance.py +5 -4
  44. sky/provision/do/instance.py +5 -4
  45. sky/provision/fluidstack/instance.py +5 -4
  46. sky/provision/gcp/instance.py +5 -4
  47. sky/provision/hyperbolic/instance.py +5 -4
  48. sky/provision/kubernetes/instance.py +36 -6
  49. sky/provision/lambda_cloud/instance.py +5 -4
  50. sky/provision/nebius/instance.py +5 -4
  51. sky/provision/oci/instance.py +5 -4
  52. sky/provision/paperspace/instance.py +5 -4
  53. sky/provision/provisioner.py +6 -0
  54. sky/provision/runpod/instance.py +5 -4
  55. sky/provision/scp/instance.py +5 -5
  56. sky/provision/vast/instance.py +5 -5
  57. sky/provision/vsphere/instance.py +5 -4
  58. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  59. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  60. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  61. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  62. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  63. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  64. sky/serve/client/impl.py +11 -8
  65. sky/serve/client/sdk.py +7 -7
  66. sky/serve/serve_state.py +437 -340
  67. sky/serve/serve_utils.py +37 -3
  68. sky/serve/server/impl.py +2 -2
  69. sky/server/common.py +12 -8
  70. sky/server/constants.py +1 -1
  71. sky/setup_files/alembic.ini +4 -0
  72. sky/skypilot_config.py +4 -4
  73. sky/users/permission.py +1 -1
  74. sky/utils/cli_utils/status_utils.py +10 -1
  75. sky/utils/db/db_utils.py +53 -1
  76. sky/utils/db/migration_utils.py +5 -1
  77. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  78. sky/utils/resource_checker.py +162 -21
  79. sky/volumes/client/sdk.py +4 -4
  80. sky/workspaces/core.py +210 -6
  81. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
  82. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
  83. sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
  85. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
  87. /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
  88. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
  89. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
  90. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
@@ -259,14 +260,47 @@ def get_service_filelock_path(pool: str) -> str:
     return str(path)


+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is enabled, but the '
+                    f'controller cluster {controller_cn} is still running. '
+                    'Please terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is disabled, but there are '
+                    f'still {len(all_services)} {noun}s running. Please '
+                    f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode(pool: bool = False) -> bool:
     # Use jobs config for pool consolidation mode.
-    controller_type = 'jobs' if pool else 'serve'
+    controller = controller_utils.get_controller_for_pool(pool).value
     consolidation_mode = skypilot_config.get_nested(
-        (controller_type, 'controller', 'consolidation_mode'),
+        (controller.controller_type, 'controller', 'consolidation_mode'),
         default_value=False)
-    # _check_consolidation_mode_consistency(consolidation_mode, pool)
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
     return consolidation_mode

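
Note: the new _validate_consolidation_mode_config only runs on the API server (gated by ENV_VAR_IS_SKYPILOT_SERVER) and rejects flipping consolidation_mode while the previous deployment style still has live state. A minimal, self-contained sketch of that consistency rule follows; the function and argument names are illustrative stand-ins, not SkyPilot APIs:

def check_consolidation_switch(consolidation_enabled: bool,
                               controller_cluster_exists: bool,
                               num_live_services: int) -> None:
    # Hedged sketch of the rule enforced by _validate_consolidation_mode_config.
    if consolidation_enabled and controller_cluster_exists:
        raise RuntimeError('Consolidation mode is enabled, but the controller '
                           'cluster is still running; terminate it first.')
    if not consolidation_enabled and num_live_services > 0:
        raise RuntimeError(f'Consolidation mode is disabled, but '
                           f'{num_live_services} service(s)/pool(s) are still '
                           'running; terminate them first.')

# Example: enabling consolidation while the old controller cluster is up fails.
check_consolidation_switch(True, controller_cluster_exists=True,
                           num_live_services=0)  # raises RuntimeError
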
sky/serve/server/impl.py CHANGED
@@ -198,9 +198,9 @@ def up(
     # We need a unique integer per sky.serve.up call to avoid name
     # conflict. Originally in non-consolidation mode, this is the ray
     # job id; now we use the request id hash instead. Here we also
-    # make sure it is a 63-bit integer to avoid overflow on sqlalchemy.
+    # make sure it is a 32-bit integer to avoid overflow on sqlalchemy.
     rid = common_utils.get_current_request_id()
-    controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFFFFFFFFFF
+    controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF

     vars_to_fill = {
         'remote_task_yaml_path': remote_tmp_task_yaml_path,
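
Note: masking with 0x7FFFFFFF keeps only 31 value bits, so the derived controller job id always fits a signed 32-bit integer column. A quick standard-library illustration of the masking used above:

import uuid

rid = str(uuid.uuid4())  # stand-in for the request id
controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF
assert 0 <= controller_job_id <= 2**31 - 1  # fits a signed 32-bit column
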
sky/server/common.py CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, cast, Dict, Literal, Optional, Tuple,
-                    TypeVar, Union)
+from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
+                    Tuple, TypeVar, Union)
 from urllib import parse
 import uuid

@@ -89,16 +89,20 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
     'restarting the API server.'
     f'{colorama.Style.RESET_ALL}')

-RequestId = str
+T = TypeVar('T')
+P = ParamSpec('P')
+
+
+class RequestId(str, Generic[T]):
+    pass
+
+
 ApiVersion = Optional[str]

 logger = sky_logging.init_logger(__name__)

 hinted_for_server_install_version_mismatch = False

-T = TypeVar('T')
-P = ParamSpec('P')
-

 class ApiServerStatus(enum.Enum):
     HEALTHY = 'healthy'
@@ -491,7 +495,7 @@ def handle_request_error(response: 'requests.Response') -> None:
             f'{response.text}')


-def get_request_id(response: 'requests.Response') -> RequestId:
+def get_request_id(response: 'requests.Response') -> RequestId[T]:
     handle_request_error(response)
     request_id = response.headers.get('X-Skypilot-Request-ID')
     if request_id is None:
@@ -502,7 +506,7 @@ def get_request_id(response: 'requests.Response') -> RequestId:
                 'Failed to get request ID from SkyPilot API server at '
                 f'{get_server_url()}. Response: {response.status_code} '
                 f'{response.text}')
-    return request_id
+    return RequestId[T](request_id)


 def _start_api_server(deploy: bool = False,
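
Note: RequestId is now a str subclass that is generic over the type the request eventually resolves to, so SDK signatures can advertise their result type while remaining plain strings at runtime. A hedged, standalone sketch of the pattern (not the SkyPilot definitions themselves):

from typing import Generic, List, TypeVar

T = TypeVar('T')


class RequestId(str, Generic[T]):
    """A request id that carries its result type for type checkers."""


def submit_ls() -> RequestId[List[str]]:
    # Illustrative stand-in for an SDK call returning a typed request id.
    return RequestId[List[str]]('req-1234')


req = submit_ls()
assert isinstance(req, str)  # still usable anywhere a plain str is expected
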
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 15
+API_VERSION = 16

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/setup_files/alembic.ini CHANGED
@@ -94,6 +94,10 @@ version_table = alembic_version_state_db
 version_locations = %(here)s/../schemas/db/spot_jobs
 version_table = alembic_version_spot_jobs_db

+[serve_db]
+version_locations = %(here)s/../schemas/db/serve_state
+version_table = alembic_version_serve_state_db
+
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
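
Note: the new [serve_db] section gives the serve_state database its own Alembic version table and migration scripts, alongside the existing global-user-state and spot-jobs sections. A hedged sketch of driving a named ini section programmatically with Alembic's Config API (paths and URL wiring are illustrative; SkyPilot drives this through its own migration utilities):

from alembic import command
from alembic.config import Config

# Load the named ini section and upgrade the serve_state schema to head.
cfg = Config('sky/setup_files/alembic.ini', ini_section='serve_db')
command.upgrade(cfg, 'head')
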
sky/skypilot_config.py CHANGED
@@ -575,8 +575,8 @@ def _reload_config_as_server() -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                      poolclass=NullPool)
-        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
-                                             sqlalchemy_engine)
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
+                                                 sqlalchemy_engine)

     def _get_config_yaml_from_db(
             key: str) -> Optional[config_utils.Config]:
@@ -867,8 +867,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                      poolclass=NullPool)
-        db_utils.add_tables_to_db_sqlalchemy(Base.metadata,
-                                             sqlalchemy_engine)
+        db_utils.add_all_tables_to_db_sqlalchemy(
+            Base.metadata, sqlalchemy_engine)

     def _set_config_yaml_to_db(key: str,
                                config: config_utils.Config):
sky/users/permission.py CHANGED
@@ -44,7 +44,7 @@ class PermissionService:
         if _enforcer_instance is None:
             _enforcer_instance = self
             engine = global_user_state.initialize_and_get_db()
-            db_utils.add_tables_to_db_sqlalchemy(
+            db_utils.add_all_tables_to_db_sqlalchemy(
                 sqlalchemy_adapter.Base.metadata, engine)
             adapter = sqlalchemy_adapter.Adapter(engine)
             model_path = os.path.join(os.path.dirname(__file__),
sky/utils/cli_utils/status_utils.py CHANGED
@@ -81,6 +81,7 @@ def show_status_table(cluster_records: List[_ClusterRecord],
                      _get_command,
                      truncate=not show_all,
                      show_by_default=False),
+        StatusColumn('LAST_EVENT', _get_last_event, show_by_default=False),
     ]

     columns = []
@@ -314,6 +315,14 @@ def _get_head_ip(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
     return handle.head_ip


+def _get_last_event(cluster_record: _ClusterRecord,
+                    truncate: bool = True) -> str:
+    del truncate
+    if cluster_record.get('last_event', None) is None:
+        return 'No recorded events.'
+    return cluster_record['last_event']
+
+
 def _is_pending_autostop(cluster_record: _ClusterRecord) -> bool:
     # autostop < 0 means nothing scheduled.
     return cluster_record['autostop'] >= 0 and _get_status(
@@ -401,7 +410,7 @@ def _get_estimated_cost_for_cost_report(


 def show_kubernetes_cluster_status_table(
-        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
+        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
         show_all: bool) -> None:
     """Compute cluster table values and display for Kubernetes clusters."""
     status_columns = [
sky/utils/db/db_utils.py CHANGED
@@ -87,7 +87,7 @@ def add_column_to_table(
     conn.commit()


-def add_tables_to_db_sqlalchemy(
+def add_all_tables_to_db_sqlalchemy(
     metadata: sqlalchemy.MetaData,
     engine: sqlalchemy.Engine,
 ):
@@ -103,6 +103,27 @@ def add_tables_to_db_sqlalchemy(
         raise


+def add_table_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+    table_name: str,
+):
+    """Add a specific table to the database."""
+    try:
+        table = metadata.tables[table_name]
+    except KeyError as e:
+        raise e
+
+    try:
+        table.create(bind=engine, checkfirst=True)
+    except (sqlalchemy_exc.OperationalError,
+            sqlalchemy_exc.ProgrammingError) as e:
+        if 'already exists' in str(e):
+            pass
+        else:
+            raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,
@@ -205,6 +226,37 @@ def add_column_to_table_alembic(
         raise


+def drop_column_from_table_alembic(
+    table_name: str,
+    column_name: str,
+):
+    """Drop a column from a table using Alembic operations.
+
+    Args:
+        table_name: Name of the table to drop column from.
+        column_name: Name of the column to drop.
+    """
+    from alembic import op  # pylint: disable=import-outside-toplevel
+
+    # Check if column exists before trying to drop it
+    bind = op.get_bind()
+    inspector = sqlalchemy.inspect(bind)
+    columns = [col['name'] for col in inspector.get_columns(table_name)]
+
+    if column_name not in columns:
+        # Column doesn't exist; nothing to do
+        return
+
+    try:
+        op.drop_column(table_name, column_name)
+    except (sqlalchemy_exc.ProgrammingError,
+            sqlalchemy_exc.OperationalError) as e:
+        if 'does not exist' in str(e).lower():
+            pass  # Already dropped
+        else:
+            raise
+
+
 class SQLiteConn(threading.local):
     """Thread-local connection to the sqlite3 database."""

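
Note: add_all_tables_to_db_sqlalchemy is the renamed bulk helper, while the new add_table_to_db_sqlalchemy creates a single table from the metadata and tolerates the already-exists race. A self-contained sketch of that single-table pattern against an in-memory SQLite engine (the table name below is illustrative, not a SkyPilot schema):

import sqlalchemy

metadata = sqlalchemy.MetaData()
sqlalchemy.Table('cluster_events', metadata,
                 sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                 sqlalchemy.Column('event', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite://')
table = metadata.tables['cluster_events']
table.create(bind=engine, checkfirst=True)  # creates the table
table.create(bind=engine, checkfirst=True)  # no-op: it already exists
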
sky/utils/db/migration_utils.py CHANGED
@@ -19,13 +19,17 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10

 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '002'
+GLOBAL_USER_STATE_VERSION = '005'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'

 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
 SPOT_JOBS_VERSION = '003'
 SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'

+SERVE_DB_NAME = 'serve_db'
+SERVE_VERSION = '001'
+SERVE_LOCK_PATH = '~/.sky/locks/.serve_db.lock'
+

 def get_engine(db_name: str):
     conn_string = None
sky/utils/kubernetes/deploy_remote_cluster.py CHANGED
@@ -1276,7 +1276,9 @@ def deploy_cluster(head_node,
     print(
         ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
     )
-    print(' • Connect to pod with VSCode: code --remote ssh-remote+devbox ')
+    print(
+        ' • Connect to pod with VSCode: code --remote ssh-remote+devbox "/home"'
+    )
     # Print completion marker for current cluster
     print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')

sky/utils/resource_checker.py CHANGED
@@ -74,27 +74,7 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
         ValueError: If any resource has active clusters or managed jobs.
     """

-    def get_all_clusters():
-        return global_user_state.get_clusters()
-
-    def get_all_managed_jobs():
-        # pylint: disable=import-outside-toplevel
-        from sky.jobs.server import core as managed_jobs_core
-        try:
-            return managed_jobs_core.queue(refresh=False,
-                                           skip_finished=True,
-                                           all_users=True)
-        except exceptions.ClusterNotUpError:
-            logger.warning('All jobs should be finished.')
-            return []
-
-    # Fetch both clusters and jobs in parallel
-    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
-        clusters_future = executor.submit(get_all_clusters)
-        jobs_future = executor.submit(get_all_managed_jobs)
-
-        all_clusters = clusters_future.result()
-        all_managed_jobs = jobs_future.result()
+    all_clusters, all_managed_jobs = _get_active_resources()

     # Collect all error messages instead of raising immediately
     error_messages = []

@@ -134,6 +114,11 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
         # If this resource has issues, add to overall error messages
         if resource_errors:
             resource_error_summary = ' and '.join(resource_errors)
+            if resource_type == 'user':
+                # resource_name is user_id
+                user_info = global_user_state.get_user(resource_name)
+                if user_info and user_info.name:
+                    resource_name = user_info.name
             error_messages.append(
                 f'Cannot {operation} {resource_type} {resource_name!r} '
                 f'because it has {resource_error_summary}.')
@@ -151,3 +136,159 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
             '\n'.join(f'• {msg}' for msg in error_messages) +
             '\nPlease terminate these resources first.')
         raise ValueError(full_message)
+
+
+def check_users_workspaces_active_resources(
+        user_ids: List[str],
+        workspace_names: List[str]) -> Tuple[str, List[str]]:
+    """Check if all the active clusters or managed jobs in workspaces
+    belong to the user_ids. If not, return the error message.
+
+    Args:
+        user_ids: List of user_id.
+        workspace_names: List of workspace_name.
+
+    Returns:
+        resource_error_summary: str
+        missed_users_names: List[str]
+    """
+    all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
+        workspace_names)
+    resource_errors = []
+    missed_users = set()
+    active_cluster_names = []
+    active_job_names = []
+    # Check clusters
+    if all_clusters:
+        for cluster in all_clusters:
+            user_hash = cluster.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_cluster_names.append(cluster['name'])
+        if active_cluster_names:
+            cluster_list = ', '.join(active_cluster_names)
+            resource_errors.append(
+                f'{len(active_cluster_names)} active cluster(s):'
+                f' {cluster_list}')
+
+    # Check managed jobs
+    if all_managed_jobs:
+        for job in all_managed_jobs:
+            user_hash = job.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_job_names.append(str(job['job_id']))
+        if active_job_names:
+            job_list = ', '.join(active_job_names)
+            resource_errors.append(f'{len(active_job_names)} active'
+                                   f' managed job(s): {job_list}')
+
+    resource_error_summary = ''
+    if resource_errors:
+        resource_error_summary = ' and '.join(resource_errors)
+    missed_users_names = []
+    if missed_users:
+        all_users = global_user_state.get_all_users()
+        missed_users_names = [
+            user.name if user.name else user.id
+            for user in all_users
+            if user.id in missed_users
+        ]
+    return resource_error_summary, missed_users_names
+
+
+def _get_active_resources_for_workspaces(
+    workspace_names: List[str]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs for workspaces.
+
+    Args:
+        workspace_names: List of workspace_name.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+    if not workspace_names:
+        return [], []
+
+    def filter_by_workspaces(workspace_names: List[str]):
+        return lambda resource: (resource.get(
+            'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+                                 workspace_names)
+
+    return _get_active_resources_by_names(workspace_names, filter_by_workspaces)
+
+
+def _get_active_resources_by_names(
+    resource_names: List[str],
+    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs.
+
+    Args:
+        resource_names: List of resource_name.
+        filter_factory: Function that takes a resource_name and returns a filter
+            function for clusters/jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    all_clusters, all_managed_jobs = _get_active_resources()
+
+    resource_clusters = []
+    resource_active_jobs = []
+
+    # Check each resource against the fetched data,
+    # return the active resources by names
+    resource_filter = filter_factory(resource_names)
+
+    # Filter clusters for this resource
+    if all_clusters:
+        resource_clusters = [
+            cluster for cluster in all_clusters if resource_filter(cluster)
+        ]
+
+    # Filter managed jobs for this resource
+    if all_managed_jobs:
+        resource_active_jobs = [
+            job for job in all_managed_jobs if resource_filter(job)
+        ]
+
+    return resource_clusters, resource_active_jobs
+
+
+def _get_active_resources(
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get all active clusters and managed jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """

+    def get_all_clusters():
+        return global_user_state.get_clusters()
+
+    def get_all_managed_jobs():
+        # pylint: disable=import-outside-toplevel
+        from sky.jobs.server import core as managed_jobs_core
+        try:
+            return managed_jobs_core.queue(refresh=False,
+                                           skip_finished=True,
+                                           all_users=True)
+        except exceptions.ClusterNotUpError:
+            logger.warning('All jobs should be finished.')
+            return []
+
+    # Fetch both clusters and jobs in parallel
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        clusters_future = executor.submit(get_all_clusters)
+        jobs_future = executor.submit(get_all_managed_jobs)
+
+        all_clusters = clusters_future.result()
+        all_managed_jobs = jobs_future.result()
+
+    return all_clusters, all_managed_jobs
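
Note: the refactor centralizes cluster/job fetching in _get_active_resources and layers workspace- and user-ownership checks on top of it. A hedged, self-contained sketch of the ownership rule that check_users_workspaces_active_resources applies to each record (records are plain dicts here for illustration, and find_unowned is not a SkyPilot function):

from typing import Any, Dict, List, Set, Tuple


def find_unowned(resources: List[Dict[str, Any]],
                 user_ids: List[str]) -> Tuple[List[str], Set[str]]:
    """Return names of active resources owned by users outside `user_ids`."""
    names: List[str] = []
    missed_users: Set[str] = set()
    for res in resources:
        user_hash = res.get('user_hash')
        if user_hash and user_hash not in user_ids:
            missed_users.add(user_hash)
            names.append(str(res.get('name') or res.get('job_id')))
    return names, missed_users


clusters = [{'name': 'dev', 'user_hash': 'u-1'},
            {'name': 'train', 'user_hash': 'u-2'}]
print(find_unowned(clusters, user_ids=['u-1']))  # (['train'], {'u-2'})
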
sky/volumes/client/sdk.py CHANGED
@@ -1,7 +1,7 @@
 """SDK functions for managed jobs."""
 import json
 import typing
-from typing import List
+from typing import Any, Dict, List

 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -24,7 +24,7 @@ logger = sky_logging.init_logger(__name__)
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def apply(volume: volume_lib.Volume) -> server_common.RequestId:
+def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
     """Creates or registers a volume.

     Args:
@@ -50,7 +50,7 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId:
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def ls() -> server_common.RequestId:
+def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
     """Lists all volumes.

     Returns:
@@ -65,7 +65,7 @@ def ls() -> server_common.RequestId:
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def delete(names: List[str]) -> server_common.RequestId:
+def delete(names: List[str]) -> server_common.RequestId[None]:
     """Deletes volumes.

     Args: