skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +57 -7
- sky/backends/cloud_vm_ray_backend.py +50 -8
- sky/client/cli/command.py +60 -26
- sky/client/sdk.py +132 -65
- sky/client/sdk_async.py +1 -1
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +21 -4
- sky/global_user_state.py +110 -1
- sky/jobs/client/sdk.py +27 -20
- sky/jobs/controller.py +2 -1
- sky/jobs/recovery_strategy.py +3 -0
- sky/jobs/server/core.py +4 -0
- sky/jobs/utils.py +9 -2
- sky/provision/__init__.py +3 -2
- sky/provision/aws/instance.py +5 -4
- sky/provision/azure/instance.py +5 -4
- sky/provision/cudo/instance.py +5 -4
- sky/provision/do/instance.py +5 -4
- sky/provision/fluidstack/instance.py +5 -4
- sky/provision/gcp/instance.py +5 -4
- sky/provision/hyperbolic/instance.py +5 -4
- sky/provision/kubernetes/instance.py +36 -6
- sky/provision/lambda_cloud/instance.py +5 -4
- sky/provision/nebius/instance.py +5 -4
- sky/provision/oci/instance.py +5 -4
- sky/provision/paperspace/instance.py +5 -4
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +5 -4
- sky/provision/scp/instance.py +5 -5
- sky/provision/vast/instance.py +5 -5
- sky/provision/vsphere/instance.py +5 -4
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/serve/client/impl.py +11 -8
- sky/serve/client/sdk.py +7 -7
- sky/serve/serve_state.py +437 -340
- sky/serve/serve_utils.py +37 -3
- sky/serve/server/impl.py +2 -2
- sky/server/common.py +12 -8
- sky/server/constants.py +1 -1
- sky/setup_files/alembic.ini +4 -0
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +10 -1
- sky/utils/db/db_utils.py +53 -1
- sky/utils/db/migration_utils.py +5 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
- sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
- /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
@@ -37,6 +37,7 @@ from sky.skylet import job_lib
 from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import resources_utils
@@ -259,14 +260,47 @@ def get_service_filelock_path(pool: str) -> str:
     return str(path)


+def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
+                                        pool: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    controller = controller_utils.get_controller_for_pool(pool).value
+    if current_is_consolidation_mode:
+        controller_cn = controller.cluster_name
+        if global_user_state.get_cluster_from_name(controller_cn) is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is enabled, but the '
+                    f'controller cluster {controller_cn} is still running. '
+                    'Please terminate the controller cluster first.'
+                    f'{colorama.Style.RESET_ALL}')
+    else:
+        noun = 'pool' if pool else 'service'
+        all_services = [
+            svc for svc in serve_state.get_services() if svc['pool'] == pool
+        ]
+        if all_services:
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.InconsistentConsolidationModeError(
+                    f'{colorama.Fore.RED}Consolidation mode for '
+                    f'{controller.controller_type} is disabled, but there are '
+                    f'still {len(all_services)} {noun}s running. Please '
+                    f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
+
+
 @annotations.lru_cache(scope='request', maxsize=1)
 def is_consolidation_mode(pool: bool = False) -> bool:
     # Use jobs config for pool consolidation mode.
-
+    controller = controller_utils.get_controller_for_pool(pool).value
     consolidation_mode = skypilot_config.get_nested(
-        (controller_type, 'controller', 'consolidation_mode'),
+        (controller.controller_type, 'controller', 'consolidation_mode'),
         default_value=False)
-    #
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(skylet_constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode, pool)
     return consolidation_mode
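The new validation guards against flipping consolidation mode while the other deployment style is still live. Below is a minimal standalone sketch of the same decision logic, with plain booleans and counts standing in for SkyPilot's controller/cluster lookups (the names existing_controller_cluster and active_services are illustrative, not SkyPilot APIs):

def validate_consolidation_switch(consolidation_enabled: bool,
                                  existing_controller_cluster: bool,
                                  active_services: int) -> None:
    """Mirror of the check added above, expressed over plain inputs."""
    if consolidation_enabled and existing_controller_cluster:
        # Turning consolidation on while the dedicated controller VM exists.
        raise RuntimeError('Terminate the controller cluster first.')
    if not consolidation_enabled and active_services > 0:
        # Turning consolidation off while consolidated services still run.
        raise RuntimeError('Terminate the running services first.')


validate_consolidation_switch(True, existing_controller_cluster=False,
                              active_services=0)  # passes silently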
sky/serve/server/impl.py
CHANGED
@@ -198,9 +198,9 @@ def up(
     # We need a unique integer per sky.serve.up call to avoid name
     # conflict. Originally in non-consolidation mode, this is the ray
     # job id; now we use the request id hash instead. Here we also
-    # make sure it is a
+    # make sure it is a 32-bit integer to avoid overflow on sqlalchemy.
     rid = common_utils.get_current_request_id()
-    controller_job_id = hash(uuid.UUID(rid).int) &
+    controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF

     vars_to_fill = {
         'remote_task_yaml_path': remote_tmp_task_yaml_path,
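The 0x7FFFFFFF mask keeps the derived job id within the non-negative signed 32-bit range, so it fits a standard INTEGER column. A quick sketch of the masking step in isolation:

import uuid

rid = str(uuid.uuid4())  # stand-in for the request id
controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF
assert 0 <= controller_job_id <= 2**31 - 1  # always a valid signed 32-bit int
print(controller_job_id)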
sky/server/common.py
CHANGED
@@ -16,8 +16,8 @@ import tempfile
 import threading
 import time
 import typing
-from typing import (Any, Callable, cast, Dict, Literal, Optional,
-                    TypeVar, Union)
+from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
+                    Tuple, TypeVar, Union)
 from urllib import parse
 import uuid

@@ -89,16 +89,20 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
     'restarting the API server.'
     f'{colorama.Style.RESET_ALL}')

-
+T = TypeVar('T')
+P = ParamSpec('P')
+
+
+class RequestId(str, Generic[T]):
+    pass
+
+
 ApiVersion = Optional[str]

 logger = sky_logging.init_logger(__name__)

 hinted_for_server_install_version_mismatch = False

-T = TypeVar('T')
-P = ParamSpec('P')
-

 class ApiServerStatus(enum.Enum):
     HEALTHY = 'healthy'
@@ -491,7 +495,7 @@ def handle_request_error(response: 'requests.Response') -> None:
             f'{response.text}')


-def get_request_id(response: 'requests.Response') -> RequestId:
+def get_request_id(response: 'requests.Response') -> RequestId[T]:
     handle_request_error(response)
     request_id = response.headers.get('X-Skypilot-Request-ID')
     if request_id is None:
@@ -502,7 +506,7 @@ def get_request_id(response: 'requests.Response') -> RequestId:
             'Failed to get request ID from SkyPilot API server at '
             f'{get_server_url()}. Response: {response.status_code} '
             f'{response.text}')
-    return request_id
+    return RequestId[T](request_id)


 def _start_api_server(deploy: bool = False,
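RequestId is now a str subclass that is generic over the payload type of the request it identifies, so SDK functions can advertise what resolving the request will eventually return. A minimal sketch of the pattern (the submit_ls function and its payload are hypothetical, not SkyPilot APIs):

from typing import Any, Dict, Generic, List, TypeVar

T = TypeVar('T')


class RequestId(str, Generic[T]):
    """A plain string at runtime; the type parameter only informs type checkers."""


def submit_ls() -> 'RequestId[List[Dict[str, Any]]]':
    # Hypothetical: pretend a server handed back this request id.
    return RequestId[List[Dict[str, Any]]]('req-1234')


rid = submit_ls()
print(isinstance(rid, str), rid)  # True req-1234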
sky/server/constants.py
CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 16

 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/setup_files/alembic.ini
CHANGED
@@ -94,6 +94,10 @@ version_table = alembic_version_state_db
 version_locations = %(here)s/../schemas/db/spot_jobs
 version_table = alembic_version_spot_jobs_db

+[serve_db]
+version_locations = %(here)s/../schemas/db/serve_state
+version_table = alembic_version_serve_state_db
+
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
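With a dedicated [serve_db] section, the serve-state schema can be migrated independently of the state and spot-jobs databases. A hedged sketch of driving a named section like this with Alembic's standard Config/command API; the ini path, script location, and database URL below are placeholders, and SkyPilot's own migration utilities may set these values differently:

from alembic import command
from alembic.config import Config

# ini_section selects a named block (here, a serve_db-style section) out of a
# multi-database alembic.ini instead of the default [alembic] section.
cfg = Config(file_='alembic.ini', ini_section='serve_db')
cfg.set_main_option('script_location', 'schemas/db')                   # placeholder
cfg.set_main_option('version_locations', 'schemas/db/serve_state')     # mirrors the new section
cfg.set_main_option('sqlalchemy.url', 'sqlite:////tmp/serve_state.db') # placeholder
command.upgrade(cfg, 'head')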
sky/skypilot_config.py
CHANGED
@@ -575,8 +575,8 @@ def _reload_config_as_server() -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(db_url,
                                                      poolclass=NullPool)
-        db_utils.
-
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata,
+                                                 sqlalchemy_engine)

 def _get_config_yaml_from_db(
         key: str) -> Optional[config_utils.Config]:
@@ -867,8 +867,8 @@ def update_api_server_config_no_lock(config: config_utils.Config) -> None:
     with _DB_USE_LOCK:
         sqlalchemy_engine = sqlalchemy.create_engine(existing_db_url,
                                                      poolclass=NullPool)
-        db_utils.
-
+        db_utils.add_all_tables_to_db_sqlalchemy(
+            Base.metadata, sqlalchemy_engine)

 def _set_config_yaml_to_db(key: str,
                            config: config_utils.Config):
sky/users/permission.py
CHANGED
@@ -44,7 +44,7 @@ class PermissionService:
         if _enforcer_instance is None:
             _enforcer_instance = self
             engine = global_user_state.initialize_and_get_db()
-            db_utils.
+            db_utils.add_all_tables_to_db_sqlalchemy(
                 sqlalchemy_adapter.Base.metadata, engine)
             adapter = sqlalchemy_adapter.Adapter(engine)
             model_path = os.path.join(os.path.dirname(__file__),
sky/utils/cli_utils/status_utils.py
CHANGED
@@ -81,6 +81,7 @@ def show_status_table(cluster_records: List[_ClusterRecord],
             _get_command,
             truncate=not show_all,
             show_by_default=False),
+        StatusColumn('LAST_EVENT', _get_last_event, show_by_default=False),
     ]

     columns = []
@@ -314,6 +315,14 @@ def _get_head_ip(cluster_record: _ClusterRecord, truncate: bool = True) -> str:
     return handle.head_ip


+def _get_last_event(cluster_record: _ClusterRecord,
+                    truncate: bool = True) -> str:
+    del truncate
+    if cluster_record.get('last_event', None) is None:
+        return 'No recorded events.'
+    return cluster_record['last_event']
+
+
 def _is_pending_autostop(cluster_record: _ClusterRecord) -> bool:
     # autostop < 0 means nothing scheduled.
     return cluster_record['autostop'] >= 0 and _get_status(
@@ -401,7 +410,7 @@ def _get_estimated_cost_for_cost_report(


 def show_kubernetes_cluster_status_table(
-        clusters: List['kubernetes_utils.
+        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
         show_all: bool) -> None:
     """Compute cluster table values and display for Kubernetes clusters."""
     status_columns = [
sky/utils/db/db_utils.py
CHANGED
@@ -87,7 +87,7 @@ def add_column_to_table(
         conn.commit()


-def
+def add_all_tables_to_db_sqlalchemy(
     metadata: sqlalchemy.MetaData,
     engine: sqlalchemy.Engine,
 ):
@@ -103,6 +103,27 @@ def add_tables_to_db_sqlalchemy(
         raise


+def add_table_to_db_sqlalchemy(
+    metadata: sqlalchemy.MetaData,
+    engine: sqlalchemy.Engine,
+    table_name: str,
+):
+    """Add a specific table to the database."""
+    try:
+        table = metadata.tables[table_name]
+    except KeyError as e:
+        raise e
+
+    try:
+        table.create(bind=engine, checkfirst=True)
+    except (sqlalchemy_exc.OperationalError,
+            sqlalchemy_exc.ProgrammingError) as e:
+        if 'already exists' in str(e):
+            pass
+        else:
+            raise
+
+
 def add_column_to_table_sqlalchemy(
     session: 'Session',
     table_name: str,
@@ -205,6 +226,37 @@ def add_column_to_table_alembic(
         raise


+def drop_column_from_table_alembic(
+    table_name: str,
+    column_name: str,
+):
+    """Drop a column from a table using Alembic operations.
+
+    Args:
+        table_name: Name of the table to drop column from.
+        column_name: Name of the column to drop.
+    """
+    from alembic import op  # pylint: disable=import-outside-toplevel
+
+    # Check if column exists before trying to drop it
+    bind = op.get_bind()
+    inspector = sqlalchemy.inspect(bind)
+    columns = [col['name'] for col in inspector.get_columns(table_name)]
+
+    if column_name not in columns:
+        # Column doesn't exist; nothing to do
+        return
+
+    try:
+        op.drop_column(table_name, column_name)
+    except (sqlalchemy_exc.ProgrammingError,
+            sqlalchemy_exc.OperationalError) as e:
+        if 'does not exist' in str(e).lower():
+            pass  # Already dropped
+        else:
+            raise
+
+
 class SQLiteConn(threading.local):
     """Thread-local connection to the sqlite3 database."""

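The new add_table_to_db_sqlalchemy helper is essentially a guarded Table.create call for a single table. A small self-contained sketch of that underlying pattern against an in-memory SQLite engine (the events table here is made up for the example):

import sqlalchemy

metadata = sqlalchemy.MetaData()
sqlalchemy.Table(
    'events', metadata,
    sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column('message', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite://')
# checkfirst=True makes the create idempotent, which is what the
# 'already exists' handling above additionally guards against on backends that race.
metadata.tables['events'].create(bind=engine, checkfirst=True)
metadata.tables['events'].create(bind=engine, checkfirst=True)  # second call is a no-op
print(sqlalchemy.inspect(engine).get_table_names())  # ['events']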
sky/utils/db/migration_utils.py
CHANGED
@@ -19,13 +19,17 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10

 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '
+GLOBAL_USER_STATE_VERSION = '005'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'

 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
 SPOT_JOBS_VERSION = '003'
 SPOT_JOBS_LOCK_PATH = '~/.sky/locks/.spot_jobs_db.lock'

+SERVE_DB_NAME = 'serve_db'
+SERVE_VERSION = '001'
+SERVE_LOCK_PATH = '~/.sky/locks/.serve_db.lock'
+

 def get_engine(db_name: str):
     conn_string = None
sky/utils/kubernetes/deploy_remote_cluster.py
CHANGED
@@ -1276,7 +1276,9 @@ def deploy_cluster(head_node,
         print(
             ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
         )
-        print(
+        print(
+            ' • Connect to pod with VSCode: code --remote ssh-remote+devbox "/home"'
+        )
         # Print completion marker for current cluster
         print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
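The new serve database gets its own lock path and shares the 10-second init timeout. A hedged sketch of how such a per-database lock might be taken around a migration using the filelock package; how SkyPilot actually wraps this is not shown in the diff:

import os

import filelock

SERVE_LOCK_PATH = '~/.sky/locks/.serve_db.lock'
DB_INIT_LOCK_TIMEOUT_SECONDS = 10

lock_path = os.path.expanduser(SERVE_LOCK_PATH)
os.makedirs(os.path.dirname(lock_path), exist_ok=True)
with filelock.FileLock(lock_path, timeout=DB_INIT_LOCK_TIMEOUT_SECONDS):
    # Run the Alembic upgrade for the serve_state schema here; holding the
    # lock prevents two processes from initializing the DB concurrently.
    pass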
sky/utils/resource_checker.py
CHANGED
@@ -74,27 +74,7 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
         ValueError: If any resource has active clusters or managed jobs.
     """

-
-        return global_user_state.get_clusters()
-
-    def get_all_managed_jobs():
-        # pylint: disable=import-outside-toplevel
-        from sky.jobs.server import core as managed_jobs_core
-        try:
-            return managed_jobs_core.queue(refresh=False,
-                                           skip_finished=True,
-                                           all_users=True)
-        except exceptions.ClusterNotUpError:
-            logger.warning('All jobs should be finished.')
-            return []
-
-    # Fetch both clusters and jobs in parallel
-    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
-        clusters_future = executor.submit(get_all_clusters)
-        jobs_future = executor.submit(get_all_managed_jobs)
-
-        all_clusters = clusters_future.result()
-        all_managed_jobs = jobs_future.result()
+    all_clusters, all_managed_jobs = _get_active_resources()

     # Collect all error messages instead of raising immediately
     error_messages = []
@@ -134,6 +114,11 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
         # If this resource has issues, add to overall error messages
         if resource_errors:
             resource_error_summary = ' and '.join(resource_errors)
+            if resource_type == 'user':
+                # resource_name is user_id
+                user_info = global_user_state.get_user(resource_name)
+                if user_info and user_info.name:
+                    resource_name = user_info.name
             error_messages.append(
                 f'Cannot {operation} {resource_type} {resource_name!r} '
                 f'because it has {resource_error_summary}.')
@@ -151,3 +136,159 @@ def _check_active_resources(resource_operations: List[Tuple[str, str]],
             '\n'.join(f'• {msg}' for msg in error_messages) +
             '\nPlease terminate these resources first.')
         raise ValueError(full_message)
+
+
+def check_users_workspaces_active_resources(
+        user_ids: List[str],
+        workspace_names: List[str]) -> Tuple[str, List[str]]:
+    """Check if all the active clusters or managed jobs in workspaces
+    belong to the user_ids. If not, return the error message.
+
+    Args:
+        user_ids: List of user_id.
+        workspace_names: List of workspace_name.
+
+    Returns:
+        resource_error_summary: str
+        missed_users_names: List[str]
+    """
+    all_clusters, all_managed_jobs = _get_active_resources_for_workspaces(
+        workspace_names)
+    resource_errors = []
+    missed_users = set()
+    active_cluster_names = []
+    active_job_names = []
+    # Check clusters
+    if all_clusters:
+        for cluster in all_clusters:
+            user_hash = cluster.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_cluster_names.append(cluster['name'])
+        if active_cluster_names:
+            cluster_list = ', '.join(active_cluster_names)
+            resource_errors.append(
+                f'{len(active_cluster_names)} active cluster(s):'
+                f' {cluster_list}')
+
+    # Check managed jobs
+    if all_managed_jobs:
+        for job in all_managed_jobs:
+            user_hash = job.get('user_hash')
+            if user_hash and user_hash not in user_ids:
+                missed_users.add(user_hash)
+                active_job_names.append(str(job['job_id']))
+        if active_job_names:
+            job_list = ', '.join(active_job_names)
+            resource_errors.append(f'{len(active_job_names)} active'
+                                   f' managed job(s): {job_list}')
+
+    resource_error_summary = ''
+    if resource_errors:
+        resource_error_summary = ' and '.join(resource_errors)
+    missed_users_names = []
+    if missed_users:
+        all_users = global_user_state.get_all_users()
+        missed_users_names = [
+            user.name if user.name else user.id
+            for user in all_users
+            if user.id in missed_users
+        ]
+    return resource_error_summary, missed_users_names
+
+
+def _get_active_resources_for_workspaces(
+    workspace_names: List[str]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs for workspaces.
+
+    Args:
+        workspace_names: List of workspace_name.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+    if not workspace_names:
+        return [], []
+
+    def filter_by_workspaces(workspace_names: List[str]):
+        return lambda resource: (resource.get(
+            'workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
+                                 workspace_names)
+
+    return _get_active_resources_by_names(workspace_names, filter_by_workspaces)
+
+
+def _get_active_resources_by_names(
+    resource_names: List[str],
+    filter_factory: Callable[[List[str]], Callable[[Dict[str, Any]], bool]]
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get active clusters or managed jobs.
+
+    Args:
+        resource_names: List of resource_name.
+        filter_factory: Function that takes a resource_name and returns a filter
+            function for clusters/jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    all_clusters, all_managed_jobs = _get_active_resources()
+
+    resource_clusters = []
+    resource_active_jobs = []
+
+    # Check each resource against the fetched data,
+    # return the active resources by names
+    resource_filter = filter_factory(resource_names)
+
+    # Filter clusters for this resource
+    if all_clusters:
+        resource_clusters = [
+            cluster for cluster in all_clusters if resource_filter(cluster)
+        ]
+
+    # Filter managed jobs for this resource
+    if all_managed_jobs:
+        resource_active_jobs = [
+            job for job in all_managed_jobs if resource_filter(job)
+        ]
+
+    return resource_clusters, resource_active_jobs
+
+
+def _get_active_resources(
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+    """Get all active clusters and managed jobs.
+
+    Returns:
+        all_clusters: List[Dict[str, Any]]
+        all_managed_jobs: List[Dict[str, Any]]
+    """
+
+    def get_all_clusters():
+        return global_user_state.get_clusters()
+
+    def get_all_managed_jobs():
+        # pylint: disable=import-outside-toplevel
+        from sky.jobs.server import core as managed_jobs_core
+        try:
+            return managed_jobs_core.queue(refresh=False,
+                                           skip_finished=True,
+                                           all_users=True)
+        except exceptions.ClusterNotUpError:
+            logger.warning('All jobs should be finished.')
+            return []
+
+    # Fetch both clusters and jobs in parallel
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        clusters_future = executor.submit(get_all_clusters)
+        jobs_future = executor.submit(get_all_managed_jobs)
+
+        all_clusters = clusters_future.result()
+        all_managed_jobs = jobs_future.result()
+
+    return all_clusters, all_managed_jobs
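The refactor funnels workspace- and user-scoped checks through a filter factory. Below is a standalone sketch of the workspace filter from the diff applied to made-up cluster records; the 'default' workspace name is an assumption standing in for constants.SKYPILOT_DEFAULT_WORKSPACE:

from typing import Any, Dict, List

SKYPILOT_DEFAULT_WORKSPACE = 'default'  # assumed value of the real constant


def filter_by_workspaces(workspace_names: List[str]):
    # Same shape as the lambda in the diff: records without a workspace key
    # fall back to the default workspace before membership is tested.
    return lambda resource: (resource.get(
        'workspace', SKYPILOT_DEFAULT_WORKSPACE) in workspace_names)


clusters: List[Dict[str, Any]] = [
    {'name': 'dev', 'workspace': 'team-a'},
    {'name': 'train', 'workspace': 'team-b'},
    {'name': 'legacy'},  # no workspace key
]
keep = filter_by_workspaces(['team-a', 'default'])
print([c['name'] for c in clusters if keep(c)])  # ['dev', 'legacy']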
sky/volumes/client/sdk.py
CHANGED
@@ -1,7 +1,7 @@
 """SDK functions for managed jobs."""
 import json
 import typing
-from typing import List
+from typing import Any, Dict, List

 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -24,7 +24,7 @@ logger = sky_logging.init_logger(__name__)
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def apply(volume: volume_lib.Volume) -> server_common.RequestId:
+def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
     """Creates or registers a volume.

     Args:
@@ -50,7 +50,7 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId:
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def ls() -> server_common.RequestId:
+def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
     """Lists all volumes.

     Returns:
@@ -65,7 +65,7 @@ def ls() -> server_common.RequestId:
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
-def delete(names: List[str]) -> server_common.RequestId:
+def delete(names: List[str]) -> server_common.RequestId[None]:
     """Deletes volumes.

     Args: