skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/client/cli/command.py +118 -30
- sky/client/cli/table_utils.py +14 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
- sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +117 -17
- sky/jobs/client/sdk.py +28 -9
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +7 -3
- sky/jobs/server/server.py +11 -11
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +281 -166
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +7 -7
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +1 -13
- sky/server/requests/executor.py +20 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +51 -25
- sky/server/requests/serializers/decoders.py +23 -10
- sky/server/requests/serializers/encoders.py +5 -4
- sky/server/rest.py +35 -1
- sky/server/server.py +34 -34
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/log_lib.py +8 -1
- sky/skylet/services.py +5 -5
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/db/db_utils.py +32 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/utils/subprocess_utils.py +13 -1
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
- sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -108,6 +108,21 @@ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
     '~/.sky/.jobs_controller_consolidation_reloaded_signal')

+# The response fields for managed jobs that require cluster handle
+_CLUSTER_HANDLE_FIELDS = [
+    'cluster_resources',
+    'cluster_resources_full',
+    'cloud',
+    'region',
+    'zone',
+    'infra',
+    'accelerators',
+]
+
+# The response fields for managed jobs that are not stored in the database
+# These fields will be mapped to the DB fields in the `_update_fields`.
+_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
+

 class ManagedJobQueueResultType(enum.Enum):
     """The type of the managed job queue result."""
@@ -1313,11 +1328,85 @@ def dump_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> str:
     return message_utils.encode_payload(
         get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
                               workspace_match, name_match, pool_match, page,
-                              limit, user_hashes, statuses))
+                              limit, user_hashes, statuses, fields))
+
+
+def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
+    """Update the fields list to include the necessary fields.
+
+    Args:
+        fields: The fields to update.
+
+    It will:
+    - Add the necessary dependent fields to the list.
+    - Remove the fields that are not in the DB.
+    - Determine if cluster handle is required.
+
+    Returns:
+        A tuple containing the updated fields and a boolean indicating if
+        cluster handle is required.
+    """
+    cluster_handle_required = True
+    if _cluster_handle_not_required(fields):
+        cluster_handle_required = False
+    # Copy the list to avoid modifying the original list
+    new_fields = fields.copy()
+    # status and job_id are always included
+    if 'status' not in new_fields:
+        new_fields.append('status')
+    if 'job_id' not in new_fields:
+        new_fields.append('job_id')
+    # user_hash is required if user_name is present
+    if 'user_name' in new_fields and 'user_hash' not in new_fields:
+        new_fields.append('user_hash')
+    if 'job_duration' in new_fields:
+        if 'last_recovered_at' not in new_fields:
+            new_fields.append('last_recovered_at')
+        if 'end_at' not in new_fields:
+            new_fields.append('end_at')
+    if 'job_name' in new_fields and 'task_name' not in new_fields:
+        new_fields.append('task_name')
+    if 'details' in new_fields:
+        if 'schedule_state' not in new_fields:
+            new_fields.append('schedule_state')
+        if 'priority' not in new_fields:
+            new_fields.append('priority')
+        if 'failure_reason' not in new_fields:
+            new_fields.append('failure_reason')
+    if ('user_yaml' in new_fields and
+            'original_user_yaml_path' not in new_fields):
+        new_fields.append('original_user_yaml_path')
+    if cluster_handle_required:
+        if 'task_name' not in new_fields:
+            new_fields.append('task_name')
+        if 'current_cluster_name' not in new_fields:
+            new_fields.append('current_cluster_name')
+    # Remove _NON_DB_FIELDS
+    # These fields have been mapped to the DB fields in the above code, so we
+    # don't need to include them in the updated fields.
+    for field in _NON_DB_FIELDS:
+        if field in new_fields:
+            new_fields.remove(field)
+    return new_fields, cluster_handle_required
+
+
+def _cluster_handle_not_required(fields: List[str]) -> bool:
+    """Determine if cluster handle is not required.
+
+    Args:
+        fields: The fields to check if they contain any of the cluster handle
+            fields.
+
+    Returns:
+        True if the fields do not contain any of the cluster handle fields,
+        False otherwise.
+    """
+    return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)


 def get_managed_job_queue(
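The `_update_fields` helper above expands a caller-supplied field list with the DB columns those fields depend on, and reports whether the cluster handle must be fetched at all. A minimal, self-contained sketch of the same expansion idea follows; the dependency table and function name here are illustrative simplifications, not SkyPilot's actual implementation.

# Standalone sketch of the field-dependency expansion performed by
# `_update_fields` above. The dependency table is a hypothetical subset;
# the real helper also strips non-DB fields before querying.
from typing import Dict, List, Tuple

DEPENDENCIES: Dict[str, List[str]] = {
    'user_name': ['user_hash'],
    'job_duration': ['last_recovered_at', 'end_at'],
    'details': ['schedule_state', 'priority', 'failure_reason'],
}
CLUSTER_HANDLE_FIELDS = {'cloud', 'region', 'zone', 'accelerators'}


def expand_fields(fields: List[str]) -> Tuple[List[str], bool]:
    """Return (db_fields, cluster_handle_required) for a requested field list."""
    expanded = list(fields)
    for base in ('status', 'job_id'):  # always required
        if base not in expanded:
            expanded.append(base)
    for field in fields:
        for dep in DEPENDENCIES.get(field, []):
            if dep not in expanded:
                expanded.append(dep)
    handle_required = any(f in CLUSTER_HANDLE_FIELDS for f in fields)
    return expanded, handle_required


if __name__ == '__main__':
    print(expand_fields(['user_name', 'job_duration']))
    # (['user_name', 'job_duration', 'status', 'job_id', 'user_hash',
    #   'last_recovered_at', 'end_at'], False)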
@@ -1331,146 +1420,154 @@ def get_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Dict[str, Any]:
-
-    # detection) requires a full view of the jobs table.
-    jobs = managed_job_state.get_managed_jobs()
-
-    # Figure out what the highest priority blocking job is. We need to know in
-    # order to determine if other jobs are blocked by a higher priority job, or
-    # just by the limited controller resources.
-    highest_blocking_priority = constants.MIN_PRIORITY
-    for job in jobs:
-        if job['schedule_state'] not in (
-                # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
-                # lower priority.
-                managed_job_state.ManagedJobScheduleState.LAUNCHING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_BACKOFF,
-                # It's possible for a WAITING/ALIVE_WAITING job to be ready to
-                # launch, but the scheduler just hasn't run yet.
-                managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
-            # This job will not block others.
-            continue
+    """Get the managed job queue.

-
-
-
+    Args:
+        skip_finished: Whether to skip finished jobs.
+        accessible_workspaces: The accessible workspaces.
+        job_ids: The job ids.
+        workspace_match: The workspace name to match.
+        name_match: The job name to match.
+        pool_match: The pool name to match.
+        page: The page number.
+        limit: The limit number.
+        user_hashes: The user hashes.
+        statuses: The statuses.
+        fields: The fields to include in the response.

-
+    Returns:
+        A dictionary containing the managed job queue.
+    """
+    cluster_handle_required = True
+    updated_fields = None
+    # The caller only need to specify the fields in the
+    # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
+    # function will add the necessary dependent fields to the list, for
+    # example, if the caller specifies `['user_name']`, the `_update_fields`
+    # function will add `['user_hash']` to the list.
+    if fields:
+        updated_fields, cluster_handle_required = _update_fields(fields)
+
+    total_no_filter = managed_job_state.get_managed_jobs_total()
+
+    status_counts = managed_job_state.get_status_count_with_filters(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+    )
+
+    jobs, total = managed_job_state.get_managed_jobs_with_filters(
+        fields=updated_fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        page=page,
+        limit=limit,
+    )
+
+    if cluster_handle_required:
+        # Fetch the cluster name to handle map for managed clusters only.
+        cluster_name_to_handle = (
+            global_user_state.get_cluster_name_to_handle_map(is_managed=True))

-
-
-
-
-
-
-
-        if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
-        accessible_workspaces
-    ]
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(
-                lambda job: not managed_job_state.ManagedJobStatus(job[
-                    'status']).is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    jobs, total, status_counts = filter_jobs(jobs,
-                                             workspace_match,
-                                             name_match,
-                                             pool_match,
-                                             page,
-                                             limit,
-                                             statuses=statuses)
-
-    job_ids = set(job['job_id'] for job in jobs)
-    job_id_to_pool_info = (
-        managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
-    cluster_names: Dict[int, str] = {}
-    for job in jobs:
-        # pool info is (pool, cluster_name, job_id_on_pool_cluster)
-        pool_info = job_id_to_pool_info.get(job['job_id'], None)
-        if pool_info and pool_info[0]:
-            cluster_name = pool_info[1]
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        cluster_names[job['job_id']] = cluster_name
-    cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
-        set(cluster_names.values()))
+    highest_blocking_priority = constants.MIN_PRIORITY
+    if not fields or 'details' in fields:
+        # Figure out what the highest priority blocking job is. We need to know
+        # in order to determine if other jobs are blocked by a higher priority
+        # job, or just by the limited controller resources.
+        highest_blocking_priority = (
+            managed_job_state.get_managed_jobs_highest_priority())

     for job in jobs:
-
-
-        end_at
-
-
-
-
-
-
-
-
-
-
-
-
+        if not fields or 'job_duration' in fields:
+            end_at = job['end_at']
+            if end_at is None:
+                end_at = time.time()
+
+            job_submitted_at = job['last_recovered_at'] - job['job_duration']
+            if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
+                # When job is recovering, the duration is exact
+                # job['job_duration']
+                job_duration = job['job_duration']
+            elif job_submitted_at > 0:
+                job_duration = end_at - job_submitted_at
+            else:
+                # When job_start_at <= 0, that means the last_recovered_at
+                # is not set yet, i.e. the job is not started.
+                job_duration = 0
+            job['job_duration'] = job_duration
         job['status'] = job['status'].value
-
-
-        cluster_name = cluster_names[job['job_id']]
-        handle = cluster_name_to_handles.get(cluster_name, None)
-        if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = resources_utils.get_readable_resources_repr(
-                handle, simplify=False)
-            job['cluster_resources'] = resources_str
-            job['cluster_resources_full'] = resources_str_full
-            job['cloud'] = str(handle.launched_resources.cloud)
-            job['region'] = handle.launched_resources.region
-            job['zone'] = handle.launched_resources.zone
-            job['infra'] = infra_utils.InfraInfo(
-                str(handle.launched_resources.cloud),
-                handle.launched_resources.region,
-                handle.launched_resources.zone).formatted_str()
-            job['accelerators'] = handle.launched_resources.accelerators
+        if not fields or 'schedule_state' in fields:
+            job['schedule_state'] = job['schedule_state'].value
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            job['schedule_state'] = None
+
+        if cluster_handle_required:
+            cluster_name = job.get('current_cluster_name', None)
+            if cluster_name is None:
+                cluster_name = generate_managed_job_cluster_name(
+                    job['task_name'], job['job_id'])
+            handle = cluster_name_to_handle.get(
+                cluster_name, None) if cluster_name is not None else None
+            if isinstance(handle, backends.CloudVmRayResourceHandle):
+                resources_str = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+                resources_str_full = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=False))
+                job['cluster_resources'] = resources_str
+                job['cluster_resources_full'] = resources_str_full
+                job['cloud'] = str(handle.launched_resources.cloud)
+                job['region'] = handle.launched_resources.region
+                job['zone'] = handle.launched_resources.zone
+                job['infra'] = infra_utils.InfraInfo(
+                    str(handle.launched_resources.cloud),
+                    handle.launched_resources.region,
+                    handle.launched_resources.zone).formatted_str()
+                job['accelerators'] = handle.launched_resources.accelerators
             else:
-
-
-
-
-
-
-
-
-
-
+                # FIXME(zongheng): display the last cached values for these.
+                job['cluster_resources'] = '-'
+                job['cluster_resources_full'] = '-'
+                job['cloud'] = '-'
+                job['region'] = '-'
+                job['zone'] = '-'
+                job['infra'] = '-'
+
+        if not fields or 'details' in fields:
+            # Add details about schedule state / backoff.
+            state_details = None
+            if job['schedule_state'] == 'ALIVE_BACKOFF':
+                state_details = 'In backoff, waiting for resources'
+            elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+                priority = job.get('priority')
+                if (priority is not None and
+                        priority < highest_blocking_priority):
+                    # Job is lower priority than some other blocking job.
+                    state_details = 'Waiting for higher priority jobs to launch'
+                else:
+                    state_details = 'Waiting for other jobs to launch'
+
+            if state_details and job['failure_reason']:
+                job['details'] = f'{state_details} - {job["failure_reason"]}'
+            elif state_details:
+                job['details'] = state_details
+            elif job['failure_reason']:
+                job['details'] = f'Failure: {job["failure_reason"]}'
+            else:
+                job['details'] = None

     return {
         'jobs': jobs,
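The duration branch above derives the submit time as `last_recovered_at - job_duration` and falls back to the wall clock while the job is still running. A tiny standalone illustration of that arithmetic, with made-up numbers, mirroring the branch in `get_managed_job_queue`:

import time

# Illustrative values only: accumulated duration 120s, last recovery at t=1000s.
last_recovered_at = 1_000.0
job_duration = 120.0
end_at = None  # still running

effective_end = end_at if end_at is not None else time.time()
job_submitted_at = last_recovered_at - job_duration
if job_submitted_at > 0:
    duration = effective_end - job_submitted_at
else:
    # last_recovered_at not set yet, i.e. the job has not started.
    duration = 0
print(f'job ran for ~{duration:.0f}s')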
@@ -1581,21 +1678,14 @@ def load_managed_job_queue(
     total_no_filter = total
     result_type = ManagedJobQueueResultType.LIST

-
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
     for job in jobs:
+        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-
-    user_hash_to_user = global_user_state.get_users(
-        job_id_to_user_hash.values())
-
-    for job in jobs:
-        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
-        if job['job_id'] in job_id_to_user_hash:
-            user_hash = job_id_to_user_hash[job['job_id']]
-            user = user_hash_to_user.get(user_hash, None)
-            job['user_name'] = user.name if user is not None else None
+            job['user_name'] = all_users_map.get(job['user_hash'])
     return jobs, total, result_type, total_no_filter, status_counts


@@ -1620,29 +1710,37 @@ def _get_job_status_from_tasks(


 @typing.overload
-def format_job_table(
-
-
-
-
+def format_job_table(
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: Literal[False] = False,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> str:
     ...


 @typing.overload
-def format_job_table(
-
-
-
-
+def format_job_table(
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: Literal[True],
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> List[List[str]]:
     ...


 def format_job_table(
-
-
-
-
-
+    tasks: List[Dict[str, Any]],
+    show_all: bool,
+    show_user: bool,
+    return_rows: bool = False,
+    max_jobs: Optional[int] = None,
+    job_status_counts: Optional[Dict[str, int]] = None,
+) -> Union[str, List[List[str]]]:
     """Returns managed jobs as a formatted string.

     Args:
@@ -1651,6 +1749,7 @@ def format_job_table(
         max_jobs: The maximum number of jobs to show in the table.
         return_rows: If True, return the rows as a list of strings instead of
             all rows concatenated into a single string.
+        job_status_counts: The counts of each job status.

     Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
         a list of "rows" (each of which is a list of str).
@@ -1672,12 +1771,8 @@ def format_job_table(
         # by the task_id.
         jobs[get_hash(task)].append(task)

-    status_counts: Dict[str, int] = collections.defaultdict(int)
     workspaces = set()
     for job_tasks in jobs.values():
-        managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
-        if not managed_job_status.is_terminal():
-            status_counts[managed_job_status.value] += 1
         workspaces.add(job_tasks[0].get('workspace',
                                         constants.SKYPILOT_DEFAULT_WORKSPACE))

@@ -1720,9 +1815,15 @@ def format_job_table(
     job_table = log_utils.create_table(columns)

     status_counts: Dict[str, int] = collections.defaultdict(int)
-
-
-
+    if job_status_counts:
+        for status_value, count in job_status_counts.items():
+            status = managed_job_state.ManagedJobStatus(status_value)
+            if not status.is_terminal():
+                status_counts[status_value] = count
+    else:
+        for task in tasks:
+            if not task['status'].is_terminal():
+                status_counts[task['status'].value] += 1

     all_tasks = tasks
     if max_jobs is not None:
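The three `format_job_table` signatures above rely on `typing.overload` with `Literal` so that type checkers see a precise return type depending on `return_rows`. A generic, self-contained illustration of the same pattern (all names here are made up, not SkyPilot code):

from typing import List, Literal, Union, overload


@overload
def render(rows: List[List[str]], as_rows: Literal[True]) -> List[List[str]]:
    ...


@overload
def render(rows: List[List[str]], as_rows: Literal[False] = False) -> str:
    ...


def render(rows: List[List[str]],
           as_rows: bool = False) -> Union[str, List[List[str]]]:
    # The overloads above only affect static type checking; this single
    # implementation handles both call styles at runtime.
    if as_rows:
        return rows
    return '\n'.join(' | '.join(row) for row in rows)


table = render([['job', 'status'], ['1', 'RUNNING']])               # str
raw = render([['job', 'status'], ['1', 'RUNNING']], as_rows=True)   # List[List[str]]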
@@ -2014,6 +2115,7 @@ class ManagedJobCodeGen:
             limit: Optional[int] = None,
             user_hashes: Optional[List[Optional[str]]] = None,
             statuses: Optional[List[str]] = None,
+            fields: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
         if managed_job_version < 9:
@@ -2032,7 +2134,7 @@ class ManagedJobCodeGen:
                 page={page!r},
                 limit={limit!r},
                 user_hashes={user_hashes!r})
-
+        elif managed_job_version < 12:
             job_table = utils.dump_managed_job_queue(
                 skip_finished={skip_finished},
                 accessible_workspaces={accessible_workspaces!r},
@@ -2044,6 +2146,19 @@ class ManagedJobCodeGen:
                 limit={limit!r},
                 user_hashes={user_hashes!r},
                 statuses={statuses!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r},
+                fields={fields!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
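The codegen above interpolates arguments with `!r` so that lists, strings and `None` are embedded as valid Python literals in the snippet executed on the controller. A minimal sketch of that idiom; the function name is illustrative and `utils.dump_managed_job_queue` is only referenced inside the generated string:

import textwrap


def build_queue_call(fields, statuses) -> str:
    # repr()-style interpolation ({...!r}) keeps None/list/str values
    # round-trippable as Python source in the generated snippet.
    return textwrap.dedent(f"""\
        job_table = utils.dump_managed_job_queue(
            statuses={statuses!r},
            fields={fields!r})
        print(job_table, flush=True)
        """)


print(build_queue_call(['status', 'job_name'], None))
# job_table = utils.dump_managed_job_queue(
#     statuses=None,
#     fields=['status', 'job_name'])
# print(job_table, flush=True)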
sky/schemas/api/responses.py
CHANGED
@@ -160,6 +160,8 @@ class StorageRecord(ResponseBaseModel):
 # and therefore can be non-optional.
 class ManagedJobRecord(ResponseBaseModel):
     """A single managed job record."""
+    # The job_id in the spot table
+    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
     job_id: Optional[int] = None
     task_id: Optional[int] = None
     job_name: Optional[str] = None
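The new `task_job_id` field is populated from the `_job_id` key of the payload via a pydantic alias (pydantic reserves leading-underscore names for private attributes, so an alias is the usual workaround). A minimal standalone model showing how such an alias behaves; this is a sketch, not the actual `ResponseBaseModel`:

from typing import Optional

import pydantic


class ManagedJobRecordSketch(pydantic.BaseModel):
    # Hypothetical stand-in for ManagedJobRecord: the '_job_id' key in the
    # incoming dict maps onto the `task_job_id` attribute via the alias.
    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
    job_id: Optional[int] = None


record = ManagedJobRecordSketch.model_validate({'_job_id': 7, 'job_id': 3})
print(record.task_job_id, record.job_id)  # 7 3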
sky/schemas/db/skypilot_config/001_initial_schema.py
ADDED
@@ -0,0 +1,30 @@
+"""Initial schema for sky config database
+
+Revision ID: 001
+Revises:
+Create Date: 2025-10-21
+
+"""
+# pylint: disable=invalid-name
+from alembic import op
+
+from sky.skypilot_config import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision = '001'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create initial schema for config_yaml table"""
+    with op.get_context().autocommit_block():
+        # Create all tables with their current schema
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+
+
+def downgrade():
+    """Drop all tables"""
+    Base.metadata.drop_all(bind=op.get_bind())
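The `upgrade()` step delegates to `db_utils.add_all_tables_to_db_sqlalchemy`, which presumably creates every table registered on the declarative `Base.metadata`. In plain SQLAlchemy the equivalent idea looks like the sketch below; the `ConfigYaml` model and engine URL are placeholders, not SkyPilot's real schema:

# Sketch of creating all tables registered on a declarative Base's metadata,
# roughly what a helper like add_all_tables_to_db_sqlalchemy is assumed to do.
import sqlalchemy
from sqlalchemy import orm


class Base(orm.DeclarativeBase):
    pass


class ConfigYaml(Base):
    # Hypothetical table standing in for the real config_yaml schema.
    __tablename__ = 'config_yaml'
    key: orm.Mapped[str] = orm.mapped_column(primary_key=True)
    value: orm.Mapped[str] = orm.mapped_column(default='')


engine = sqlalchemy.create_engine('sqlite:///:memory:')
# checkfirst=True keeps the call idempotent, like a re-runnable migration.
Base.metadata.create_all(bind=engine, checkfirst=True)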
sky/serve/server/server.py
CHANGED
@@ -23,7 +23,7 @@ async def up(
     request: fastapi.Request,
     up_body: payloads.ServeUpBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.up',
         request_body=up_body,
@@ -38,7 +38,7 @@ async def update(
     request: fastapi.Request,
     update_body: payloads.ServeUpdateBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.update',
         request_body=update_body,
@@ -53,7 +53,7 @@ async def down(
     request: fastapi.Request,
     down_body: payloads.ServeDownBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.down',
         request_body=down_body,
@@ -68,7 +68,7 @@ async def terminate_replica(
     request: fastapi.Request,
     terminate_replica_body: payloads.ServeTerminateReplicaBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.terminate_replica',
         request_body=terminate_replica_body,
@@ -83,7 +83,7 @@ async def status(
     request: fastapi.Request,
     status_body: payloads.ServeStatusBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.status',
         request_body=status_body,
@@ -99,7 +99,7 @@ async def tail_logs(
     background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
     executor.check_request_thread_executor_available()
-    request_task = executor.
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='serve.logs',
         request_body=log_body,
@@ -132,7 +132,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.sync_down_logs',
         request_body=download_logs_body,
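The serve endpoints now `await` the executor's async scheduling APIs instead of calling their blocking counterparts, so the FastAPI event loop is not tied up while a request is enqueued. A generic sketch of the pattern with a stand-in scheduler; the real SkyPilot `executor` API is only partially visible in this diff:

import asyncio

import fastapi

app = fastapi.FastAPI()


async def schedule_request_async(request_id: str, request_name: str,
                                 request_body: dict) -> None:
    # Stand-in for an executor that persists the request and wakes a worker.
    await asyncio.sleep(0)  # e.g. async DB write / queue put


@app.post('/serve/up')
async def up(request: fastapi.Request, up_body: dict) -> None:
    # Awaiting the coroutine keeps the event loop responsive instead of
    # blocking on synchronous scheduling.
    await schedule_request_async(
        request_id=request.headers.get('x-request-id', 'unknown'),
        request_name='serve.up',
        request_body=up_body,
    )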
sky/server/auth/oauth2_proxy.py
CHANGED
@@ -126,13 +126,10 @@ class OAuth2ProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):

     async def _authenticate(self, request: fastapi.Request, call_next,
                             session: aiohttp.ClientSession):
-        forwarded_headers =
+        forwarded_headers = {}
         auth_url = f'{self.proxy_base}/oauth2/auth'
         forwarded_headers['X-Forwarded-Uri'] = str(request.url).rstrip('/')
-
-        # to reduce the auth overhead.
-        forwarded_headers.pop('content-length', None)
-        forwarded_headers.pop('content-type', None)
+        forwarded_headers['Host'] = request.url.hostname
         logger.debug(f'authenticate request: {auth_url}, '
                      f'headers: {forwarded_headers}')

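The middleware now builds the auth subrequest headers from scratch, forwarding only the original URI and host to oauth2-proxy's `/oauth2/auth` endpoint rather than copying and pruning the incoming headers. A rough, simplified sketch of that forward-auth check using aiohttp (cookie/session forwarding omitted):

import aiohttp


async def is_authenticated(session: aiohttp.ClientSession, proxy_base: str,
                           original_url) -> bool:
    # Mirrors the header construction in _authenticate above: only the
    # original URI and host are forwarded, so request-body metadata such as
    # content-length/content-type never reaches the auth subrequest.
    forwarded_headers = {
        'X-Forwarded-Uri': str(original_url).rstrip('/'),
        'Host': original_url.hostname,
    }
    async with session.get(f'{proxy_base}/oauth2/auth',
                           headers=forwarded_headers) as resp:
        # oauth2-proxy's auth endpoint replies 202 when the session is
        # authenticated and 401 otherwise.
        return resp.status == 202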