skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +117 -17
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +11 -11
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +7 -7
- sky/server/common.py +1 -13
- sky/server/requests/executor.py +20 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +12 -19
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +34 -34
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +73 -72
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
- sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -108,6 +108,21 @@ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
     '~/.sky/.jobs_controller_consolidation_reloaded_signal')
 
+# The response fields for managed jobs that require cluster handle
+_CLUSTER_HANDLE_FIELDS = [
+    'cluster_resources',
+    'cluster_resources_full',
+    'cloud',
+    'region',
+    'zone',
+    'infra',
+    'accelerators',
+]
+
+# The response fields for managed jobs that are not stored in the database
+# These fields will be mapped to the DB fields in the `_update_fields`.
+_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
+
 
 class ManagedJobQueueResultType(enum.Enum):
     """The type of the managed job queue result."""
@@ -1313,11 +1328,85 @@ def dump_managed_job_queue(
         limit: Optional[int] = None,
         user_hashes: Optional[List[Optional[str]]] = None,
         statuses: Optional[List[str]] = None,
+        fields: Optional[List[str]] = None,
 ) -> str:
     return message_utils.encode_payload(
         get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
                               workspace_match, name_match, pool_match, page,
-                              limit, user_hashes, statuses))
+                              limit, user_hashes, statuses, fields))
+
+
+def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
+    """Update the fields list to include the necessary fields.
+
+    Args:
+        fields: The fields to update.
+
+    It will:
+    - Add the necessary dependent fields to the list.
+    - Remove the fields that are not in the DB.
+    - Determine if cluster handle is required.
+
+    Returns:
+        A tuple containing the updated fields and a boolean indicating if
+        cluster handle is required.
+    """
+    cluster_handle_required = True
+    if _cluster_handle_not_required(fields):
+        cluster_handle_required = False
+    # Copy the list to avoid modifying the original list
+    new_fields = fields.copy()
+    # status and job_id are always included
+    if 'status' not in new_fields:
+        new_fields.append('status')
+    if 'job_id' not in new_fields:
+        new_fields.append('job_id')
+    # user_hash is required if user_name is present
+    if 'user_name' in new_fields and 'user_hash' not in new_fields:
+        new_fields.append('user_hash')
+    if 'job_duration' in new_fields:
+        if 'last_recovered_at' not in new_fields:
+            new_fields.append('last_recovered_at')
+        if 'end_at' not in new_fields:
+            new_fields.append('end_at')
+    if 'job_name' in new_fields and 'task_name' not in new_fields:
+        new_fields.append('task_name')
+    if 'details' in new_fields:
+        if 'schedule_state' not in new_fields:
+            new_fields.append('schedule_state')
+        if 'priority' not in new_fields:
+            new_fields.append('priority')
+        if 'failure_reason' not in new_fields:
+            new_fields.append('failure_reason')
+    if ('user_yaml' in new_fields and
+            'original_user_yaml_path' not in new_fields):
+        new_fields.append('original_user_yaml_path')
+    if cluster_handle_required:
+        if 'task_name' not in new_fields:
+            new_fields.append('task_name')
+        if 'current_cluster_name' not in new_fields:
+            new_fields.append('current_cluster_name')
+    # Remove _NON_DB_FIELDS
+    # These fields have been mapped to the DB fields in the above code, so we
+    # don't need to include them in the updated fields.
+    for field in _NON_DB_FIELDS:
+        if field in new_fields:
+            new_fields.remove(field)
+    return new_fields, cluster_handle_required
+
+
+def _cluster_handle_not_required(fields: List[str]) -> bool:
+    """Determine if cluster handle is not required.
+
+    Args:
+        fields: The fields to check if they contain any of the cluster handle
+            fields.
+
+    Returns:
+        True if the fields do not contain any of the cluster handle fields,
+        False otherwise.
+    """
+    return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
 
 
 def get_managed_job_queue(
@@ -1331,146 +1420,154 @@ def get_managed_job_queue(
         limit: Optional[int] = None,
         user_hashes: Optional[List[Optional[str]]] = None,
         statuses: Optional[List[str]] = None,
+        fields: Optional[List[str]] = None,
 ) -> Dict[str, Any]:
-
-    # detection) requires a full view of the jobs table.
-    jobs = managed_job_state.get_managed_jobs()
+    """Get the managed job queue.
 
-
-
-
-
-
-
-
-
-
-
-
-
-                managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
-            # This job will not block others.
-            continue
-
-        priority = job.get('priority')
-        if priority is not None and priority > highest_blocking_priority:
-            highest_blocking_priority = priority
+    Args:
+        skip_finished: Whether to skip finished jobs.
+        accessible_workspaces: The accessible workspaces.
+        job_ids: The job ids.
+        workspace_match: The workspace name to match.
+        name_match: The job name to match.
+        pool_match: The pool name to match.
+        page: The page number.
+        limit: The limit number.
+        user_hashes: The user hashes.
+        statuses: The statuses.
+        fields: The fields to include in the response.
 
-
+    Returns:
+        A dictionary containing the managed job queue.
+    """
+    cluster_handle_required = True
+    updated_fields = None
+    # The caller only need to specify the fields in the
+    # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
+    # function will add the necessary dependent fields to the list, for
+    # example, if the caller specifies `['user_name']`, the `_update_fields`
+    # function will add `['user_hash']` to the list.
+    if fields:
+        updated_fields, cluster_handle_required = _update_fields(fields)
+
+    total_no_filter = managed_job_state.get_managed_jobs_total()
+
+    status_counts = managed_job_state.get_status_count_with_filters(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+    )
+
+    jobs, total = managed_job_state.get_managed_jobs_with_filters(
+        fields=updated_fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        page=page,
+        limit=limit,
+    )
+
+    if cluster_handle_required:
+        # Fetch the cluster name to handle map for managed clusters only.
+        cluster_name_to_handle = (
+            global_user_state.get_cluster_name_to_handle_map(is_managed=True))
 
-
-
-
-
-
-
-
-        if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
-        accessible_workspaces
-    ]
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(
-                lambda job: not managed_job_state.ManagedJobStatus(job[
-                    'status']).is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    jobs, total, status_counts = filter_jobs(jobs,
-                                             workspace_match,
-                                             name_match,
-                                             pool_match,
-                                             page,
-                                             limit,
-                                             statuses=statuses)
-
-    job_ids = set(job['job_id'] for job in jobs)
-    job_id_to_pool_info = (
-        managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
-    cluster_names: Dict[int, str] = {}
-    for job in jobs:
-        # pool info is (pool, cluster_name, job_id_on_pool_cluster)
-        pool_info = job_id_to_pool_info.get(job['job_id'], None)
-        if pool_info and pool_info[0]:
-            cluster_name = pool_info[1]
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        cluster_names[job['job_id']] = cluster_name
-    cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
-        set(cluster_names.values()))
+    highest_blocking_priority = constants.MIN_PRIORITY
+    if not fields or 'details' in fields:
+        # Figure out what the highest priority blocking job is. We need to know
+        # in order to determine if other jobs are blocked by a higher priority
+        # job, or just by the limited controller resources.
+        highest_blocking_priority = (
+            managed_job_state.get_managed_jobs_highest_priority())
 
     for job in jobs:
-
-
-        end_at
-
-
-
-
-
-
-
-
-
-
-
-
+        if not fields or 'job_duration' in fields:
+            end_at = job['end_at']
+            if end_at is None:
+                end_at = time.time()
+
+            job_submitted_at = job['last_recovered_at'] - job['job_duration']
+            if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
+                # When job is recovering, the duration is exact
+                # job['job_duration']
+                job_duration = job['job_duration']
+            elif job_submitted_at > 0:
+                job_duration = end_at - job_submitted_at
+            else:
+                # When job_start_at <= 0, that means the last_recovered_at
+                # is not set yet, i.e. the job is not started.
+                job_duration = 0
+            job['job_duration'] = job_duration
         job['status'] = job['status'].value
-
-
-        cluster_name = cluster_names[job['job_id']]
-        handle = cluster_name_to_handles.get(cluster_name, None)
-        if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = resources_utils.get_readable_resources_repr(
-                handle, simplify=False)
-            job['cluster_resources'] = resources_str
-            job['cluster_resources_full'] = resources_str_full
-            job['cloud'] = str(handle.launched_resources.cloud)
-            job['region'] = handle.launched_resources.region
-            job['zone'] = handle.launched_resources.zone
-            job['infra'] = infra_utils.InfraInfo(
-                str(handle.launched_resources.cloud),
-                handle.launched_resources.region,
-                handle.launched_resources.zone).formatted_str()
-            job['accelerators'] = handle.launched_resources.accelerators
+        if not fields or 'schedule_state' in fields:
+            job['schedule_state'] = job['schedule_state'].value
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            job['schedule_state'] = None
+
+        if cluster_handle_required:
+            cluster_name = job.get('current_cluster_name', None)
+            if cluster_name is None:
+                cluster_name = generate_managed_job_cluster_name(
+                    job['task_name'], job['job_id'])
+            handle = cluster_name_to_handle.get(
+                cluster_name, None) if cluster_name is not None else None
+            if isinstance(handle, backends.CloudVmRayResourceHandle):
+                resources_str = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+                resources_str_full = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=False))
+                job['cluster_resources'] = resources_str
+                job['cluster_resources_full'] = resources_str_full
+                job['cloud'] = str(handle.launched_resources.cloud)
+                job['region'] = handle.launched_resources.region
+                job['zone'] = handle.launched_resources.zone
+                job['infra'] = infra_utils.InfraInfo(
+                    str(handle.launched_resources.cloud),
+                    handle.launched_resources.region,
+                    handle.launched_resources.zone).formatted_str()
+                job['accelerators'] = handle.launched_resources.accelerators
             else:
-
-
-
-
-
-
-
-
-
-
+                # FIXME(zongheng): display the last cached values for these.
+                job['cluster_resources'] = '-'
+                job['cluster_resources_full'] = '-'
+                job['cloud'] = '-'
+                job['region'] = '-'
+                job['zone'] = '-'
+                job['infra'] = '-'
+
+        if not fields or 'details' in fields:
+            # Add details about schedule state / backoff.
+            state_details = None
+            if job['schedule_state'] == 'ALIVE_BACKOFF':
+                state_details = 'In backoff, waiting for resources'
+            elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+                priority = job.get('priority')
+                if (priority is not None and
+                        priority < highest_blocking_priority):
+                    # Job is lower priority than some other blocking job.
+                    state_details = 'Waiting for higher priority jobs to launch'
+                else:
+                    state_details = 'Waiting for other jobs to launch'
+
+            if state_details and job['failure_reason']:
+                job['details'] = f'{state_details} - {job["failure_reason"]}'
+            elif state_details:
+                job['details'] = state_details
+            elif job['failure_reason']:
+                job['details'] = f'Failure: {job["failure_reason"]}'
+            else:
+                job['details'] = None
 
     return {
         'jobs': jobs,
@@ -1581,21 +1678,14 @@ def load_managed_job_queue(
     total_no_filter = total
     result_type = ManagedJobQueueResultType.LIST
 
-
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
     for job in jobs:
+        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-
-            user_hash_to_user = global_user_state.get_users(
-                job_id_to_user_hash.values())
-
-    for job in jobs:
-        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
-        if job['job_id'] in job_id_to_user_hash:
-            user_hash = job_id_to_user_hash[job['job_id']]
-            user = user_hash_to_user.get(user_hash, None)
-            job['user_name'] = user.name if user is not None else None
+            job['user_name'] = all_users_map.get(job['user_hash'])
     return jobs, total, result_type, total_no_filter, status_counts
 
 
@@ -2014,6 +2104,7 @@ class ManagedJobCodeGen:
             limit: Optional[int] = None,
            user_hashes: Optional[List[Optional[str]]] = None,
            statuses: Optional[List[str]] = None,
+            fields: Optional[List[str]] = None,
    ) -> str:
        code = textwrap.dedent(f"""\
        if managed_job_version < 9:
@@ -2032,7 +2123,7 @@ class ManagedJobCodeGen:
                page={page!r},
                limit={limit!r},
                user_hashes={user_hashes!r})
-
+        elif managed_job_version < 12:
            job_table = utils.dump_managed_job_queue(
                skip_finished={skip_finished},
                accessible_workspaces={accessible_workspaces!r},
@@ -2044,6 +2135,19 @@ class ManagedJobCodeGen:
                limit={limit!r},
                user_hashes={user_hashes!r},
                statuses={statuses!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r},
+                fields={fields!r})
        print(job_table, flush=True)
        """)
        return cls._build(code)
sky/schemas/api/responses.py
CHANGED
@@ -160,6 +160,8 @@ class StorageRecord(ResponseBaseModel):
 # and therefore can be non-optional.
 class ManagedJobRecord(ResponseBaseModel):
     """A single managed job record."""
+    # The job_id in the spot table
+    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
     job_id: Optional[int] = None
     task_id: Optional[int] = None
     job_name: Optional[str] = None
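
A brief illustration of the new alias (illustrative only, not part of the diff; it assumes pydantic v2 alias semantics for `ResponseBaseModel` and a hypothetical payload): a record carrying the spot table's internal `_job_id` key is surfaced on the model as `task_job_id`.

    # '_job_id' is the alias; 'job_id' remains the managed-job id.
    record = ManagedJobRecord.model_validate({'_job_id': 42, 'job_id': 7})
    assert record.task_job_id == 42
    assert record.job_id == 7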
sky/schemas/db/skypilot_config/001_initial_schema.py
ADDED
@@ -0,0 +1,30 @@
+"""Initial schema for sky config database
+
+Revision ID: 001
+Revises:
+Create Date: 2025-10-21
+
+"""
+# pylint: disable=invalid-name
+from alembic import op
+
+from sky.skypilot_config import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision = '001'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create initial schema for config_yaml table"""
+    with op.get_context().autocommit_block():
+        # Create all tables with their current schema
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+
+
+def downgrade():
+    """Drop all tables"""
+    Base.metadata.drop_all(bind=op.get_bind())
sky/serve/server/server.py
CHANGED
@@ -23,7 +23,7 @@ async def up(
     request: fastapi.Request,
     up_body: payloads.ServeUpBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.up',
         request_body=up_body,
@@ -38,7 +38,7 @@ async def update(
     request: fastapi.Request,
     update_body: payloads.ServeUpdateBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.update',
         request_body=update_body,
@@ -53,7 +53,7 @@ async def down(
     request: fastapi.Request,
     down_body: payloads.ServeDownBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.down',
         request_body=down_body,
@@ -68,7 +68,7 @@ async def terminate_replica(
     request: fastapi.Request,
     terminate_replica_body: payloads.ServeTerminateReplicaBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.terminate_replica',
         request_body=terminate_replica_body,
@@ -83,7 +83,7 @@ async def status(
     request: fastapi.Request,
     status_body: payloads.ServeStatusBody,
 ) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.status',
         request_body=status_body,
@@ -99,7 +99,7 @@ async def tail_logs(
         background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
     executor.check_request_thread_executor_available()
-    request_task = executor.
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='serve.logs',
         request_body=log_body,
@@ -132,7 +132,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.sync_down_logs',
         request_body=download_logs_body,
sky/server/common.py
CHANGED
@@ -17,7 +17,6 @@ import time
 import typing
 from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
                     Tuple, TypeVar, Union)
-from urllib import parse
 import uuid
 
 import cachetools
@@ -342,18 +341,7 @@ def get_server_url(host: Optional[str] = None) -> str:
 @annotations.lru_cache(scope='global')
 def get_dashboard_url(server_url: str,
                       starting_page: Optional[str] = None) -> str:
-
-    # format of https://username:password@example.com:8080/path
-    # We need to remove the username and password and only
-    # return `https://example.com:8080/path`
-    parsed = parse.urlparse(server_url)
-    # Reconstruct the URL without credentials but keeping the scheme
-    dashboard_url = f'{parsed.scheme}://{parsed.hostname}'
-    if parsed.port:
-        dashboard_url = f'{dashboard_url}:{parsed.port}'
-    if parsed.path:
-        dashboard_url = f'{dashboard_url}{parsed.path}'
-    dashboard_url = dashboard_url.rstrip('/')
+    dashboard_url = server_url.rstrip('/')
     dashboard_url = f'{dashboard_url}/dashboard'
     if starting_page:
         dashboard_url = f'{dashboard_url}/{starting_page}'
sky/server/requests/executor.py
CHANGED
@@ -329,10 +329,7 @@ def override_request_env_and_config(
     # through the execution.
     user = models.User(id=request_body.env_vars[constants.USER_ID_ENV_VAR],
                        name=request_body.env_vars[constants.USER_ENV_VAR])
-    global_user_state.add_or_update_user(user)
-    # Refetch the user to get the latest user info, including the created_at
-    # field.
-    user = global_user_state.get_user(user.id)
+    _, user = global_user_state.add_or_update_user(user, return_user=True)
 
     # Force color to be enabled.
     os.environ['CLICOLOR_FORCE'] = '1'
@@ -689,7 +686,7 @@ async def _execute_request_coroutine(request: api_requests.Request):
         ctx.cancel()
 
 
-def prepare_request(
+async def prepare_request_async(
     request_id: str,
     request_name: str,
     request_body: payloads.RequestBody,
@@ -715,7 +712,7 @@ def prepare_request(
         user_id=user_id,
         cluster_name=request_cluster_name)
 
-    if not api_requests.
+    if not await api_requests.create_if_not_exists_async(request):
         raise exceptions.RequestAlreadyExistsError(
             f'Request {request_id} already exists.')
 
@@ -723,17 +720,18 @@
     return request
 
 
-def schedule_request(request_id: str,
-
-
-
-
-
-
-
-
-
-
+async def schedule_request_async(request_id: str,
+                                 request_name: str,
+                                 request_body: payloads.RequestBody,
+                                 func: Callable[P, Any],
+                                 request_cluster_name: Optional[str] = None,
+                                 ignore_return_value: bool = False,
+                                 schedule_type: api_requests.ScheduleType = (
+                                     api_requests.ScheduleType.LONG),
+                                 is_skypilot_system: bool = False,
+                                 precondition: Optional[
+                                     preconditions.Precondition] = None,
+                                 retryable: bool = False) -> None:
     """Enqueue a request to the request queue.
 
     Args:
@@ -754,9 +752,11 @@ def schedule_request(request_id: str,
         The precondition is waited asynchronously and does not block the
         caller.
     """
-    request_task =
-
-
+    request_task = await prepare_request_async(request_id, request_name,
+                                               request_body, func,
+                                               request_cluster_name,
+                                               schedule_type,
+                                               is_skypilot_system)
     schedule_prepared_request(request_task, ignore_return_value, precondition,
                               retryable)
 