skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +448 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +18 -1
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +24 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +72 -68
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -100,6 +100,13 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('current_cluster_name',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('job_id_on_pool_cluster',
+                      sqlalchemy.Integer,
+                      server_default=None),
 )

 ha_recovery_script_table = sqlalchemy.Table(
@@ -215,6 +222,9 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'priority': r['priority'],
         'entrypoint': r['entrypoint'],
         'original_user_yaml_path': r['original_user_yaml_path'],
+        'pool': r['pool'],
+        'current_cluster_name': r['current_cluster_name'],
+        'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
     }


@@ -451,8 +461,8 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):


 @_init_db
-def set_job_info_without_job_id(name: str, workspace: str,
+def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
+                                pool: Optional[str]) -> int:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -469,6 +479,7 @@ def set_job_info_without_job_id(name: str, workspace: str,
             schedule_state=ManagedJobScheduleState.INACTIVE.value,
             workspace=workspace,
             entrypoint=entrypoint,
+            pool=pool,
         )

         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -1278,6 +1289,56 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     return updated_count == 0


+@_init_db
+def get_pool_from_job_id(job_id: int) -> Optional[str]:
+    """Get the pool from the job id."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        pool = session.execute(
+            sqlalchemy.select(job_info_table.c.pool).where(
+                job_info_table.c.spot_job_id == job_id)).fetchone()
+        return pool[0] if pool else None
+
+
+@_init_db
+def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
+    """Set the current cluster name for a job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(job_info_table).filter(
+            job_info_table.c.spot_job_id == job_id).update(
+                {job_info_table.c.current_cluster_name: current_cluster_name})
+        session.commit()
+
+
+@_init_db
+def set_job_id_on_pool_cluster(job_id: int,
+                               job_id_on_pool_cluster: int) -> None:
+    """Set the job id on the pool cluster for a job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(job_info_table).filter(
+            job_info_table.c.spot_job_id == job_id).update({
+                job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
+            })
+        session.commit()
+
+
+@_init_db
+def get_pool_submit_info(job_id: int) -> Tuple[Optional[str], Optional[int]]:
+    """Get the cluster name and job id on the pool from the managed job id."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        info = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.current_cluster_name,
+                job_info_table.c.job_id_on_pool_cluster).where(
+                    job_info_table.c.spot_job_id == job_id)).fetchone()
+        if info is None:
+            return None, None
+        return info[0], info[1]
+
+
 @_init_db
 def scheduler_set_launching(job_id: int,
                             current_state: ManagedJobScheduleState) -> None:
@@ -1398,28 +1459,68 @@ def get_num_launching_jobs() -> int:
             sqlalchemy.select(
                 sqlalchemy.func.count()  # pylint: disable=not-callable
             ).select_from(job_info_table).where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state ==
+                    ManagedJobScheduleState.LAUNCHING.value,
+                    # We only count jobs that are not in the pool, because the
+                    # job in the pool does not actually calling the sky.launch.
+                    job_info_table.c.pool.is_(None)))).fetchone()[0]


 @_init_db
-def get_num_alive_jobs() -> int:
+def get_num_alive_jobs(pool: Optional[str] = None) -> int:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        where_conditions = [
+            job_info_table.c.schedule_state.in_([
+                ManagedJobScheduleState.ALIVE_WAITING.value,
+                ManagedJobScheduleState.LAUNCHING.value,
+                ManagedJobScheduleState.ALIVE.value,
+                ManagedJobScheduleState.ALIVE_BACKOFF.value,
+            ])
+        ]
+
+        if pool is not None:
+            where_conditions.append(job_info_table.c.pool == pool)
+
         return session.execute(
             sqlalchemy.select(
                 sqlalchemy.func.count()  # pylint: disable=not-callable
             ).select_from(job_info_table).where(
+                sqlalchemy.and_(*where_conditions))).fetchone()[0]
+
+
+@_init_db
+def get_nonterminal_job_ids_by_pool(pool: str,
+                                    cluster_name: Optional[str] = None
+                                   ) -> List[int]:
+    """Get nonterminal job ids in a pool."""
+    assert _SQLALCHEMY_ENGINE is not None
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = sqlalchemy.select(
+            spot_table.c.spot_job_id.distinct()).select_from(
+                spot_table.outerjoin(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+        and_conditions = [
+            ~spot_table.c.status.in_([
+                status.value for status in ManagedJobStatus.terminal_statuses()
+            ]),
+            job_info_table.c.pool == pool,
+        ]
+        if cluster_name is not None:
+            and_conditions.append(
+                job_info_table.c.current_cluster_name == cluster_name)
+        query = query.where(sqlalchemy.and_(*and_conditions)).order_by(
+            spot_table.c.spot_job_id.asc())
+        rows = session.execute(query).fetchall()
+        job_ids = [row[0] for row in rows if row[0] is not None]
+        return job_ids


 @_init_db
-def get_waiting_job() -> Optional[Dict[str, Any]]:
+def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.

     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1442,23 +1543,26 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
                 ManagedJobScheduleState.ALIVE_BACKOFF.value,
             ])).scalar_subquery()
         # Main query for waiting jobs
+        select_conds = [
+            job_info_table.c.schedule_state.in_([
+                ManagedJobScheduleState.WAITING.value,
+                ManagedJobScheduleState.ALIVE_WAITING.value,
+            ]),
+            job_info_table.c.priority >= sqlalchemy.func.coalesce(
+                max_priority_subquery, 0),
+        ]
+        if pool is not None:
+            select_conds.append(job_info_table.c.pool == pool)
         query = sqlalchemy.select(
             job_info_table.c.spot_job_id,
             job_info_table.c.schedule_state,
             job_info_table.c.dag_yaml_path,
             job_info_table.c.env_file_path,
-            ]),
-            job_info_table.c.priority >= sqlalchemy.func.coalesce(
-                max_priority_subquery, 0),
-            )).order_by(
-                job_info_table.c.priority.desc(),
-                job_info_table.c.spot_job_id.asc(),
-            ).limit(1)
+            job_info_table.c.pool,
+        ).where(sqlalchemy.and_(*select_conds)).order_by(
+            job_info_table.c.priority.desc(),
+            job_info_table.c.spot_job_id.asc(),
+        ).limit(1)
         waiting_job_row = session.execute(query).fetchone()
         if waiting_job_row is None:
             return None
@@ -1468,6 +1572,7 @@ def get_waiting_job() -> Optional[Dict[str, Any]]:
         'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
         'dag_yaml_path': waiting_job_row[2],
         'env_file_path': waiting_job_row[3],
+        'pool': waiting_job_row[4],
     }
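For context, the new pool-related state helpers above are plain getters and setters keyed on the managed job id. A minimal usage sketch follows (illustrative only: the job id and cluster name are made up, and the jobs controller is assumed to have already initialized the database through the @_init_db decorator):

from sky.jobs import state as managed_job_state

# Hypothetical values: record where a pooled managed job actually runs.
managed_job_state.set_current_cluster_name(
    job_id=7, current_cluster_name='my-pool-worker-1')
managed_job_state.set_job_id_on_pool_cluster(job_id=7, job_id_on_pool_cluster=3)

# Later (e.g. when tailing logs), resolve the submission target again.
cluster_name, job_id_on_cluster = managed_job_state.get_pool_submit_info(7)
pool = managed_job_state.get_pool_from_job_id(7)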
sky/jobs/utils.py
CHANGED
@@ -30,7 +30,6 @@ from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
-from sky.server import common as server_common
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
@@ -39,7 +38,6 @@ from sky.utils import annotations
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import controller_utils
-from sky.utils import env_options
 from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -136,12 +134,6 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
 def _validate_consolidation_mode_config(
         current_is_consolidation_mode: bool) -> None:
     """Validate the consolidation mode config."""
-    if (current_is_consolidation_mode and
-            not env_options.Options.IS_DEVELOPER.get() and
-            server_common.is_api_server_local()):
-        with ux_utils.print_exception_no_traceback():
-            raise exceptions.NotSupportedError(
-                'Consolidation mode is not supported when running locally.')
     # Check whether the consolidation mode config is changed.
     if current_is_consolidation_mode:
         controller_cn = (
@@ -239,8 +231,8 @@ def ha_recovery_for_consolidation_mode():
         f.write(f'Total recovery time: {time.time() - start} seconds\n')


-def get_job_status(backend: 'backends.CloudVmRayBackend',
+def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
+                   job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.

     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -253,10 +245,13 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
         logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
+    job_ids = None if job_id is None else [job_id]
     for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
         try:
             logger.info('=== Checking the job status... ===')
-            statuses = backend.get_job_status(handle,
+            statuses = backend.get_job_status(handle,
+                                              job_ids=job_ids,
+                                              stream_logs=False)
             status = list(statuses.values())[0]
             if status is None:
                 logger.info('No job found.')
@@ -323,13 +318,20 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     error_msg = None
     tasks = managed_job_state.get_managed_jobs(job_id)
     for task in tasks:
+        pool = task.get('pool', None)
+        if pool is None:
+            task_name = task['job_name']
+            cluster_name = generate_managed_job_cluster_name(
+                task_name, job_id)
+        else:
+            cluster_name, _ = (
+                managed_job_state.get_pool_submit_info(job_id))
         handle = global_user_state.get_handle_from_cluster_name(
             cluster_name)
         if handle is not None:
             try:
+                if pool is None:
+                    terminate_cluster(cluster_name)
             except Exception as e:  # pylint: disable=broad-except
                 error_msg = (
                     f'Failed to terminate cluster {cluster_name}: '
@@ -510,10 +512,10 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):


 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                      get_end_time: bool) -> float:
+                      job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
     code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=
+        job_id=job_id, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
@@ -527,14 +529,17 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,


 def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
-                            cluster_name: str) -> float:
+                            cluster_name: str, job_id: Optional[int]) -> float:
     """Try to get the end time of the job.

     If the job is preempted or we can't connect to the instance for whatever
     reason, fall back to the current time.
     """
     try:
-        return get_job_timestamp(backend,
+        return get_job_timestamp(backend,
+                                 cluster_name,
+                                 job_id=job_id,
+                                 get_end_time=True)
     except exceptions.CommandError as e:
         if e.returncode == 255:
             # Failed to connect - probably the instance was preempted since the
@@ -556,8 +561,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
     if event_callback is None or task is None:
         return
     event_callback = event_callback.strip()
+    pool = managed_job_state.get_pool_from_job_id(job_id)
+    if pool is not None:
+        cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
+    else:
+        cluster_name = generate_managed_job_cluster_name(
+            task.name, job_id) if task.name else None
     logger.info(f'=== START: event callback for {status!r} ===')
     log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
                             'managed_job_event',
@@ -684,6 +693,15 @@ def cancel_job_by_name(job_name: str,
     return f'{job_name!r} {msg}'


+def cancel_jobs_by_pool(pool_name: str,
+                        current_workspace: Optional[str] = None) -> str:
+    """Cancel all jobs in a pool."""
+    job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
+    if not job_ids:
+        return f'No running job found in pool {pool_name!r}.'
+    return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+
+
 def stream_logs_by_id(job_id: int,
                       follow: bool = True,
                       tail: Optional[int] = None) -> Tuple[str, int]:
@@ -777,12 +795,19 @@ def stream_logs_by_id(job_id: int,

     while should_keep_logging(managed_job_status):
         handle = None
+        job_id_to_tail = None
         if task_id is not None:
+            pool = managed_job_state.get_pool_from_job_id(job_id)
+            if pool is not None:
+                cluster_name, job_id_to_tail = (
+                    managed_job_state.get_pool_submit_info(job_id))
+            else:
+                task_name = managed_job_state.get_task_name(job_id, task_id)
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            if cluster_name is not None:
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)

         # Check the handle: The cluster can be preempted and removed from
         # the table before the managed job state is updated by the
@@ -814,7 +839,7 @@ def stream_logs_by_id(job_id: int,
             status_display.stop()
             tail_param = tail if tail is not None else 0
             returncode = backend.tail_logs(handle,
-                                           job_id=
+                                           job_id=job_id_to_tail,
                                            managed_job_id=job_id,
                                            follow=follow,
                                            tail=tail_param)
@@ -1132,9 +1157,15 @@ def dump_managed_job_queue() -> str:
         job['status'] = job['status'].value
         job['schedule_state'] = job['schedule_state'].value

+        pool = managed_job_state.get_pool_from_job_id(job['job_id'])
+        if pool is not None:
+            cluster_name, _ = managed_job_state.get_pool_submit_info(
+                job['job_id'])
+        else:
+            cluster_name = generate_managed_job_cluster_name(
+                job['task_name'], job['job_id'])
+        handle = global_user_state.get_handle_from_cluster_name(
+            cluster_name) if cluster_name is not None else None
         if isinstance(handle, backends.CloudVmRayResourceHandle):
             resources_str = resources_utils.get_readable_resources_repr(
                 handle, simplify=True)
@@ -1145,6 +1176,11 @@ def dump_managed_job_queue() -> str:
             job['cloud'] = str(handle.launched_resources.cloud)
             job['region'] = handle.launched_resources.region
             job['zone'] = handle.launched_resources.zone
+            job['infra'] = infra_utils.InfraInfo(
+                str(handle.launched_resources.cloud),
+                handle.launched_resources.region,
+                handle.launched_resources.zone).formatted_str()
+            job['accelerators'] = handle.launched_resources.accelerators
         else:
             # FIXME(zongheng): display the last cached values for these.
             job['cluster_resources'] = '-'
@@ -1152,6 +1188,7 @@ def dump_managed_job_queue() -> str:
             job['cloud'] = '-'
             job['region'] = '-'
             job['zone'] = '-'
+            job['infra'] = '-'

         # Add details about schedule state / backoff.
         state_details = None
@@ -1292,10 +1329,13 @@ def format_job_table(
         'JOB DURATION',
         '#RECOVERIES',
         'STATUS',
+        'WORKER_POOL',
     ]
     if show_all:
         # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
         columns += [
+            'WORKER_CLUSTER',
+            'WORKER_JOB_ID',
             'STARTED',
             'INFRA',
             'RESOURCES',
@@ -1405,11 +1445,14 @@ def format_job_table(
             job_duration,
             recovery_cnt,
             status_str,
+            job_tasks[0].get('pool', '-'),
         ]
         if show_all:
             details = job_tasks[current_task_id].get('details')
             failure_reason = job_tasks[current_task_id]['failure_reason']
             job_values.extend([
+                '-',
+                '-',
                 '-',
                 '-',
                 '-',
@@ -1445,37 +1488,43 @@ def format_job_table(
                 job_duration,
                 task['recovery_count'],
                 task['status'].colored_str(),
+                task.get('pool', '-'),
             ]
             if show_all:
                 # schedule_state is only set at the job level, so if we have
                 # more than one task, only display on the aggregated row.
                 schedule_state = (task['schedule_state']
                                   if len(job_tasks) == 1 else '-')
-                if
+                infra_str = task.get('infra')
+                if infra_str is None:
+                    cloud = task.get('cloud')
+                    if cloud is None:
+                        # Backward compatibility for old jobs controller without
+                        # cloud info returned, we parse it from the cluster
+                        # resources
+                        # TODO(zhwu): remove this after 0.12.0
+                        cloud = task['cluster_resources'].split('(')[0].split(
+                            'x')[-1]
+                        task['cluster_resources'] = task[
+                            'cluster_resources'].replace(f'{cloud}(',
+                                                         '(').replace(
+                                                             'x ', 'x')
+                    region = task['region']
+                    zone = task.get('zone')
+                    if cloud == '-':
+                        cloud = None
+                    if region == '-':
+                        region = None
+                    if zone == '-':
+                        zone = None
+                    infra_str = infra_utils.InfraInfo(cloud, region,
+                                                      zone).formatted_str()
                 values.extend([
+                    task.get('current_cluster_name', '-'),
+                    task.get('job_id_on_pool_cluster', '-'),
                     # STARTED
                     log_utils.readable_time_duration(task['start_at']),
+                    infra_str,
                     task['cluster_resources'],
                     schedule_state,
                     generate_details(task.get('details'),
@@ -1567,6 +1616,15 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)

+    @classmethod
+    def cancel_jobs_by_pool(cls, pool_name: str) -> str:
+        active_workspace = skypilot_config.get_active_workspace()
+        code = textwrap.dedent(f"""\
+        msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
+        print(msg, end="", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_version_and_job_table(cls) -> str:
         """Generate code to get controller version and raw job table."""
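The new cancel_jobs_by_pool helper above simply resolves the nonterminal job ids of a pool and delegates to the existing cancel_jobs_by_id path, and ManagedJobCodeGen.cancel_jobs_by_pool generates equivalent code for execution on the jobs controller. A hedged controller-side usage sketch (the pool and workspace names are placeholders, not values from this release):

from sky.jobs import utils as managed_job_utils

# Cancel everything still running in a hypothetical pool named 'my-pool'.
msg = managed_job_utils.cancel_jobs_by_pool('my-pool',
                                            current_workspace='default')
print(msg, end='', flush=True)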
sky/provision/nebius/constants.py
CHANGED

@@ -15,6 +15,9 @@ INFINIBAND_ENV_VARS = {
         'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
 }

+# pylint: disable=line-too-long
+INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'
+
 # Docker run options for InfiniBand support
 INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']
sky/py.typed
ADDED
File without changes
sky/resources.py
CHANGED
@@ -19,6 +19,7 @@ from sky.clouds import cloud as sky_cloud
 from sky.provision import docker_utils
 from sky.provision.gcp import constants as gcp_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.nebius import constants as nebius_constants
 from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import annotations
@@ -1260,15 +1261,19 @@
             ValueError: if the attribute is invalid.
         """

-        if
+        if self._network_tier == resources_utils.NetworkTier.BEST:
+            if isinstance(self._cloud, clouds.GCP):
+                # Handle GPU Direct TCPX requirement for docker images
+                if self._image_id is None:
+                    self._image_id = {
+                        self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
+                    }
+            elif isinstance(self._cloud, clouds.Nebius):
+                if self._image_id is None:
+                    self._image_id = {
+                        self._region: nebius_constants.INFINIBAND_IMAGE_ID
+                    }
+            elif self._image_id:
                 # Custom image specified - validate it's a docker image
                 # Check if any of the specified images are not docker images
                 non_docker_images = []
@@ -1280,14 +1285,13 @@
             if non_docker_images:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'When using network_tier=BEST
+                        f'When using network_tier=BEST, image_id '
                         f'must be a docker image. '
                         f'Found non-docker images: '
                         f'{", ".join(non_docker_images)}. '
                         f'Please either: (1) use a docker image '
                         f'(prefix with "docker:"), or '
-                        f'(2) leave image_id empty to use the default
-                        f'GPU Direct TCPX image.')
+                        f'(2) leave image_id empty to use the default')

         if self._image_id is None:
             return
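The rewritten network_tier=BEST branch above amounts to a small image-selection rule: with no image_id set, GCP falls back to the GPU Direct TCPX image and Nebius to the new InfiniBand NCCL-tests image, while an explicitly provided image_id must be a docker: image. A standalone restatement of that rule for illustration only (the helper name and the string cloud names below are assumptions, not SkyPilot API):

from typing import Dict, Optional

from sky.provision.gcp import constants as gcp_constants
from sky.provision.nebius import constants as nebius_constants


def default_best_tier_image(cloud_name: str,
                            region: Optional[str]) -> Dict[Optional[str], str]:
    """Mirror of the GCP/Nebius defaults applied when network_tier is BEST."""
    if cloud_name == 'GCP':
        # Same constant the diff references for GPU Direct TCPX.
        return {region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID}
    if cloud_name == 'Nebius':
        # New constant added in sky/provision/nebius/constants.py.
        return {region: nebius_constants.INFINIBAND_IMAGE_ID}
    raise ValueError(f'No network_tier=BEST default image for {cloud_name}')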
sky/schemas/db/spot_jobs/002_cluster_pool.py
ADDED

@@ -0,0 +1,42 @@
+"""Columns for cluster pool.
+
+Revision ID: 002
+Revises: 001
+Create Date: 2025-07-18
+
+"""
+# pylint: disable=invalid-name
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision: str = '002'
+down_revision: Union[str, Sequence[str], None] = '001'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade():
+    """Add columns for cluster pool."""
+    with op.get_context().autocommit_block():
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'pool',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'current_cluster_name',
+                                             sa.Text(),
+                                             server_default=None)
+        db_utils.add_column_to_table_alembic('job_info',
+                                             'job_id_on_pool_cluster',
+                                             sa.Integer(),
+                                             server_default=None)
+
+
+def downgrade():
+    """Remove columns for cluster pool."""
+    pass