skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/backends/backend_utils.py +11 -11
- sky/backends/cloud_vm_ray_backend.py +15 -4
- sky/client/cli/command.py +39 -10
- sky/client/cli/flags.py +4 -2
- sky/client/sdk.py +26 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/global_user_state.py +137 -37
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +21 -12
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/provision/kubernetes/network.py +9 -6
- sky/provision/provisioner.py +8 -0
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +8 -7
- sky/server/common.py +10 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +4 -2
- sky/server/requests/executor.py +30 -28
- sky/server/requests/payloads.py +5 -1
- sky/server/requests/preconditions.py +9 -4
- sky/server/requests/requests.py +130 -53
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +91 -58
- sky/server/stream_utils.py +127 -38
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/asyncio_utils.py +63 -3
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
- sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
- /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -108,6 +108,21 @@ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
     '~/.sky/.jobs_controller_consolidation_reloaded_signal')
 
+# The response fields for managed jobs that require cluster handle
+_CLUSTER_HANDLE_FIELDS = [
+    'cluster_resources',
+    'cluster_resources_full',
+    'cloud',
+    'region',
+    'zone',
+    'infra',
+    'accelerators',
+]
+
+# The response fields for managed jobs that are not stored in the database
+# These fields will be mapped to the DB fields in the `_update_fields`.
+_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
+
 
 class ManagedJobQueueResultType(enum.Enum):
     """The type of the managed job queue result."""
@@ -1313,11 +1328,85 @@ def dump_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> str:
     return message_utils.encode_payload(
         get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
                               workspace_match, name_match, pool_match, page,
-                              limit, user_hashes, statuses))
+                              limit, user_hashes, statuses, fields))
+
+
+def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
+    """Update the fields list to include the necessary fields.
+
+    Args:
+        fields: The fields to update.
+
+    It will:
+    - Add the necessary dependent fields to the list.
+    - Remove the fields that are not in the DB.
+    - Determine if cluster handle is required.
+
+    Returns:
+        A tuple containing the updated fields and a boolean indicating if
+        cluster handle is required.
+    """
+    cluster_handle_required = True
+    if _cluster_handle_not_required(fields):
+        cluster_handle_required = False
+    # Copy the list to avoid modifying the original list
+    new_fields = fields.copy()
+    # status and job_id are always included
+    if 'status' not in new_fields:
+        new_fields.append('status')
+    if 'job_id' not in new_fields:
+        new_fields.append('job_id')
+    # user_hash is required if user_name is present
+    if 'user_name' in new_fields and 'user_hash' not in new_fields:
+        new_fields.append('user_hash')
+    if 'job_duration' in new_fields:
+        if 'last_recovered_at' not in new_fields:
+            new_fields.append('last_recovered_at')
+        if 'end_at' not in new_fields:
+            new_fields.append('end_at')
+    if 'job_name' in new_fields and 'task_name' not in new_fields:
+        new_fields.append('task_name')
+    if 'details' in new_fields:
+        if 'schedule_state' not in new_fields:
+            new_fields.append('schedule_state')
+        if 'priority' not in new_fields:
+            new_fields.append('priority')
+        if 'failure_reason' not in new_fields:
+            new_fields.append('failure_reason')
+    if ('user_yaml' in new_fields and
+            'original_user_yaml_path' not in new_fields):
+        new_fields.append('original_user_yaml_path')
+    if cluster_handle_required:
+        if 'task_name' not in new_fields:
+            new_fields.append('task_name')
+        if 'current_cluster_name' not in new_fields:
+            new_fields.append('current_cluster_name')
+    # Remove _NON_DB_FIELDS
+    # These fields have been mapped to the DB fields in the above code, so we
+    # don't need to include them in the updated fields.
+    for field in _NON_DB_FIELDS:
+        if field in new_fields:
+            new_fields.remove(field)
+    return new_fields, cluster_handle_required
+
+
+def _cluster_handle_not_required(fields: List[str]) -> bool:
+    """Determine if cluster handle is not required.
+
+    Args:
+        fields: The fields to check if they contain any of the cluster handle
+            fields.
+
+    Returns:
+        True if the fields do not contain any of the cluster handle fields,
+        False otherwise.
+    """
+    return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
 
 
 def get_managed_job_queue(
@@ -1331,146 +1420,154 @@ def get_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Dict[str, Any]:
-[… removed lines not captured in the diff view …]
-    # detection) requires a full view of the jobs table.
-    jobs = managed_job_state.get_managed_jobs()
+    """Get the managed job queue.
 
-[… removed lines not captured in the diff view …]
-                managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
-            # This job will not block others.
-            continue
-
-        priority = job.get('priority')
-        if priority is not None and priority > highest_blocking_priority:
-            highest_blocking_priority = priority
+    Args:
+        skip_finished: Whether to skip finished jobs.
+        accessible_workspaces: The accessible workspaces.
+        job_ids: The job ids.
+        workspace_match: The workspace name to match.
+        name_match: The job name to match.
+        pool_match: The pool name to match.
+        page: The page number.
+        limit: The limit number.
+        user_hashes: The user hashes.
+        statuses: The statuses.
+        fields: The fields to include in the response.
 
-[… removed lines not captured in the diff view …]
+    Returns:
+        A dictionary containing the managed job queue.
+    """
+    cluster_handle_required = True
+    updated_fields = None
+    # The caller only need to specify the fields in the
+    # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
+    # function will add the necessary dependent fields to the list, for
+    # example, if the caller specifies `['user_name']`, the `_update_fields`
+    # function will add `['user_hash']` to the list.
+    if fields:
+        updated_fields, cluster_handle_required = _update_fields(fields)
+
+    total_no_filter = managed_job_state.get_managed_jobs_total()
+
+    status_counts = managed_job_state.get_status_count_with_filters(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+    )
+
+    jobs, total = managed_job_state.get_managed_jobs_with_filters(
+        fields=updated_fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        page=page,
+        limit=limit,
+    )
+
+    if cluster_handle_required:
+        # Fetch the cluster name to handle map for managed clusters only.
+        cluster_name_to_handle = (
+            global_user_state.get_cluster_name_to_handle_map(is_managed=True))
 
-[… removed lines not captured in the diff view …]
-        if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
-        accessible_workspaces
-    ]
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(
-                lambda job: not managed_job_state.ManagedJobStatus(job[
-                    'status']).is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    jobs, total, status_counts = filter_jobs(jobs,
-                                             workspace_match,
-                                             name_match,
-                                             pool_match,
-                                             page,
-                                             limit,
-                                             statuses=statuses)
-
-    job_ids = set(job['job_id'] for job in jobs)
-    job_id_to_pool_info = (
-        managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
-    cluster_names: Dict[int, str] = {}
-    for job in jobs:
-        # pool info is (pool, cluster_name, job_id_on_pool_cluster)
-        pool_info = job_id_to_pool_info.get(job['job_id'], None)
-        if pool_info and pool_info[0]:
-            cluster_name = pool_info[1]
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        cluster_names[job['job_id']] = cluster_name
-    cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
-        set(cluster_names.values()))
+    highest_blocking_priority = constants.MIN_PRIORITY
+    if not fields or 'details' in fields:
+        # Figure out what the highest priority blocking job is. We need to know
+        # in order to determine if other jobs are blocked by a higher priority
+        # job, or just by the limited controller resources.
+        highest_blocking_priority = (
+            managed_job_state.get_managed_jobs_highest_priority())
 
     for job in jobs:
-[… removed lines not captured in the diff view …]
-        end_at …
-[… removed lines not captured in the diff view …]
+        if not fields or 'job_duration' in fields:
+            end_at = job['end_at']
+            if end_at is None:
+                end_at = time.time()
+
+            job_submitted_at = job['last_recovered_at'] - job['job_duration']
+            if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
+                # When job is recovering, the duration is exact
+                # job['job_duration']
+                job_duration = job['job_duration']
+            elif job_submitted_at > 0:
+                job_duration = end_at - job_submitted_at
+            else:
+                # When job_start_at <= 0, that means the last_recovered_at
+                # is not set yet, i.e. the job is not started.
+                job_duration = 0
+            job['job_duration'] = job_duration
         job['status'] = job['status'].value
-
-        cluster_name = cluster_names[job['job_id']]
-        handle = cluster_name_to_handles.get(cluster_name, None)
-        if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = resources_utils.get_readable_resources_repr(
-                handle, simplify=False)
-            job['cluster_resources'] = resources_str
-            job['cluster_resources_full'] = resources_str_full
-            job['cloud'] = str(handle.launched_resources.cloud)
-            job['region'] = handle.launched_resources.region
-            job['zone'] = handle.launched_resources.zone
-            job['infra'] = infra_utils.InfraInfo(
-                str(handle.launched_resources.cloud),
-                handle.launched_resources.region,
-                handle.launched_resources.zone).formatted_str()
-            job['accelerators'] = handle.launched_resources.accelerators
+        if not fields or 'schedule_state' in fields:
+            job['schedule_state'] = job['schedule_state'].value
         else:
-[… removed lines not captured in the diff view …]
+            job['schedule_state'] = None
+
+        if cluster_handle_required:
+            cluster_name = job.get('current_cluster_name', None)
+            if cluster_name is None:
+                cluster_name = generate_managed_job_cluster_name(
+                    job['task_name'], job['job_id'])
+            handle = cluster_name_to_handle.get(
+                cluster_name, None) if cluster_name is not None else None
+            if isinstance(handle, backends.CloudVmRayResourceHandle):
+                resources_str = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+                resources_str_full = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=False))
+                job['cluster_resources'] = resources_str
+                job['cluster_resources_full'] = resources_str_full
+                job['cloud'] = str(handle.launched_resources.cloud)
+                job['region'] = handle.launched_resources.region
+                job['zone'] = handle.launched_resources.zone
+                job['infra'] = infra_utils.InfraInfo(
+                    str(handle.launched_resources.cloud),
+                    handle.launched_resources.region,
+                    handle.launched_resources.zone).formatted_str()
+                job['accelerators'] = handle.launched_resources.accelerators
             else:
-[… removed lines not captured in the diff view …]
+                # FIXME(zongheng): display the last cached values for these.
+                job['cluster_resources'] = '-'
+                job['cluster_resources_full'] = '-'
+                job['cloud'] = '-'
+                job['region'] = '-'
+                job['zone'] = '-'
+                job['infra'] = '-'
+
+        if not fields or 'details' in fields:
+            # Add details about schedule state / backoff.
+            state_details = None
+            if job['schedule_state'] == 'ALIVE_BACKOFF':
+                state_details = 'In backoff, waiting for resources'
+            elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+                priority = job.get('priority')
+                if (priority is not None and
+                        priority < highest_blocking_priority):
+                    # Job is lower priority than some other blocking job.
+                    state_details = 'Waiting for higher priority jobs to launch'
+                else:
+                    state_details = 'Waiting for other jobs to launch'
+
+            if state_details and job['failure_reason']:
+                job['details'] = f'{state_details} - {job["failure_reason"]}'
+            elif state_details:
+                job['details'] = state_details
+            elif job['failure_reason']:
+                job['details'] = f'Failure: {job["failure_reason"]}'
+            else:
+                job['details'] = None
 
     return {
         'jobs': jobs,
@@ -1581,21 +1678,14 @@ def load_managed_job_queue(
     total_no_filter = total
     result_type = ManagedJobQueueResultType.LIST
 
-[… removed lines not captured in the diff view …]
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
     for job in jobs:
+        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-[… removed lines not captured in the diff view …]
-    user_hash_to_user = global_user_state.get_users(
-        job_id_to_user_hash.values())
-
-    for job in jobs:
-        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
-        if job['job_id'] in job_id_to_user_hash:
-            user_hash = job_id_to_user_hash[job['job_id']]
-            user = user_hash_to_user.get(user_hash, None)
-            job['user_name'] = user.name if user is not None else None
+            job['user_name'] = all_users_map.get(job['user_hash'])
     return jobs, total, result_type, total_no_filter, status_counts
 
 
@@ -2014,6 +2104,7 @@ class ManagedJobCodeGen:
             limit: Optional[int] = None,
             user_hashes: Optional[List[Optional[str]]] = None,
             statuses: Optional[List[str]] = None,
+            fields: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
         if managed_job_version < 9:
@@ -2032,7 +2123,7 @@ class ManagedJobCodeGen:
                 page={page!r},
                 limit={limit!r},
                 user_hashes={user_hashes!r})
-[… removed lines not captured in the diff view …]
+        elif managed_job_version < 12:
             job_table = utils.dump_managed_job_queue(
                 skip_finished={skip_finished},
                 accessible_workspaces={accessible_workspaces!r},
@@ -2044,6 +2135,19 @@ class ManagedJobCodeGen:
                 limit={limit!r},
                 user_hashes={user_hashes!r},
                 statuses={statuses!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r},
+                fields={fields!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
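To illustrate the field projection introduced above: callers pass only the ManagedJobRecord field names they need, and _update_fields pulls in the DB columns those names depend on while dropping derived, non-DB fields. The standalone sketch below reproduces a subset of those rules (it omits the job_duration and user_yaml cases) and does not import SkyPilot internals; it is illustrative only, with names mirroring the diff.

# Illustrative only: mirrors part of the field-expansion rules added in
# sky/jobs/utils.py above; not an import of SkyPilot internals.
CLUSTER_HANDLE_FIELDS = [
    'cluster_resources', 'cluster_resources_full', 'cloud', 'region',
    'zone', 'infra', 'accelerators',
]
NON_DB_FIELDS = CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']


def expand_fields(fields):
    """Return (db_fields, cluster_handle_required), like _update_fields."""
    cluster_handle_required = any(f in fields for f in CLUSTER_HANDLE_FIELDS)
    new_fields = fields.copy()
    # status and job_id are always selected.
    for always in ('status', 'job_id'):
        if always not in new_fields:
            new_fields.append(always)
    # user_name is derived from the users table via user_hash.
    if 'user_name' in new_fields and 'user_hash' not in new_fields:
        new_fields.append('user_hash')
    # details is computed from scheduling state and failure info.
    if 'details' in new_fields:
        for dep in ('schedule_state', 'priority', 'failure_reason'):
            if dep not in new_fields:
                new_fields.append(dep)
    # Resolving the cluster handle needs the cluster name.
    if cluster_handle_required:
        for dep in ('task_name', 'current_cluster_name'):
            if dep not in new_fields:
                new_fields.append(dep)
    # Derived fields are filled in later from the handle, not read from the DB.
    return ([f for f in new_fields if f not in NON_DB_FIELDS],
            cluster_handle_required)


print(expand_fields(['job_name', 'user_name', 'cloud']))
# -> (['job_name', 'status', 'job_id', 'user_hash', 'task_name',
#      'current_cluster_name'], True)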
sky/provision/kubernetes/network.py
CHANGED
@@ -48,8 +48,10 @@ def _open_ports_using_loadbalancer(
     service_name = _LOADBALANCER_SERVICE_NAME.format(
         cluster_name_on_cloud=cluster_name_on_cloud)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+
     content = network_utils.fill_loadbalancer_template(
-        namespace=…
+        namespace=namespace,
         context=context,
         service_name=service_name,
         ports=ports,
@@ -103,7 +105,7 @@ def _open_ports_using_ingress(
     # To avoid this, we change ingress creation into one object containing
     # multiple rules.
     content = network_utils.fill_ingress_template(
-        namespace=…
+        namespace=namespace,
         context=context,
         service_details=service_details,
         ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
@@ -165,9 +167,10 @@ def _cleanup_ports_for_loadbalancer(
     # TODO(aylei): test coverage
     context = provider_config.get(
        'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     network_utils.delete_namespaced_service(
         context=context,
-        namespace=…
+        namespace=namespace,
         service_name=service_name,
     )
 
@@ -180,19 +183,19 @@ def _cleanup_ports_for_ingress(
     # Delete services for each port
     context = provider_config.get(
        'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     for port in ports:
         service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
         network_utils.delete_namespaced_service(
             context=context,
-            namespace=…
-            kubernetes_utils.DEFAULT_NAMESPACE),
+            namespace=namespace,
             service_name=service_name,
         )
 
     # Delete the single ingress used for all ports
     ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
     network_utils.delete_namespaced_ingress(
-        namespace=…
+        namespace=namespace,
         context=kubernetes_utils.get_context_from_config(provider_config),
         ingress_name=ingress_name,
     )
sky/provision/provisioner.py
CHANGED
@@ -442,6 +442,14 @@ def _post_provision_setup(
         cluster_name.name_on_cloud,
         provider_config=provider_config)
 
+    # Update cluster info in handle so cluster instance ids are set. This
+    # allows us to expose provision logs to debug nodes that failed during post
+    # provision setup.
+    handle = global_user_state.get_handle_from_cluster_name(
+        cluster_name.display_name)
+    handle.cached_cluster_info = cluster_info
+    global_user_state.update_cluster_handle(cluster_name.display_name, handle)
+
     if cluster_info.num_instances > 1:
         # Only worker nodes have logs in the per-instance log directory. Head
         # node's log will be redirected to the main log file.
sky/schemas/api/responses.py
CHANGED
@@ -160,6 +160,8 @@ class StorageRecord(ResponseBaseModel):
 # and therefore can be non-optional.
 class ManagedJobRecord(ResponseBaseModel):
     """A single managed job record."""
+    # The job_id in the spot table
+    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
     job_id: Optional[int] = None
     task_id: Optional[int] = None
     job_name: Optional[str] = None
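The new task_job_id field uses a pydantic alias so that a payload keyed by '_job_id' (the spot-table column name) populates the attribute. The minimal sketch below assumes pydantic v2 and a plain BaseModel; ManagedJobRecord actually inherits from ResponseBaseModel, but the alias mechanics are the same.

# Minimal sketch of the alias behavior (assumes pydantic v2).
from typing import Optional

import pydantic


class Record(pydantic.BaseModel):
    # '_job_id' in the incoming dict maps onto task_job_id.
    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
    job_id: Optional[int] = None


rec = Record.model_validate({'_job_id': 7, 'job_id': 3})
print(rec.task_job_id, rec.job_id)  # -> 7 3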
sky/schemas/db/skypilot_config/001_initial_schema.py
ADDED
@@ -0,0 +1,30 @@
+"""Initial schema for sky config database
+
+Revision ID: 001
+Revises:
+Create Date: 2025-10-21
+
+"""
+# pylint: disable=invalid-name
+from alembic import op
+
+from sky.skypilot_config import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision = '001'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create initial schema for config_yaml table"""
+    with op.get_context().autocommit_block():
+        # Create all tables with their current schema
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+
+
+def downgrade():
+    """Drop all tables"""
+    Base.metadata.drop_all(bind=op.get_bind())
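A revision like the one above is applied through Alembic's standard machinery; SkyPilot drives this via sky/utils/db/migration_utils.py and the updated sky/setup_files/alembic.ini, so the config path below is a placeholder, not the project's actual value. Generic usage:

# Generic Alembic invocation for the revision above (config path is a
# placeholder; SkyPilot wires this up through its own migration_utils).
from alembic import command
from alembic.config import Config

cfg = Config('alembic.ini')       # placeholder path to the Alembic config
command.upgrade(cfg, 'head')      # runs upgrade() for revision '001'
command.downgrade(cfg, 'base')    # reverts via downgrade()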
sky/serve/server/server.py
CHANGED
@@ -23,7 +23,7 @@ async def up(
     request: fastapi.Request,
     up_body: payloads.ServeUpBody,
 ) -> None:
-    executor.…
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.up',
         request_body=up_body,
@@ -38,7 +38,7 @@ async def update(
     request: fastapi.Request,
     update_body: payloads.ServeUpdateBody,
 ) -> None:
-    executor.…
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.update',
         request_body=update_body,
@@ -53,7 +53,7 @@ async def down(
     request: fastapi.Request,
     down_body: payloads.ServeDownBody,
 ) -> None:
-    executor.…
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.down',
         request_body=down_body,
@@ -68,7 +68,7 @@ async def terminate_replica(
     request: fastapi.Request,
     terminate_replica_body: payloads.ServeTerminateReplicaBody,
 ) -> None:
-    executor.…
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.terminate_replica',
         request_body=terminate_replica_body,
@@ -83,7 +83,7 @@ async def status(
     request: fastapi.Request,
     status_body: payloads.ServeStatusBody,
 ) -> None:
-    executor.…
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.status',
         request_body=status_body,
@@ -99,7 +99,7 @@ async def tail_logs(
     background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
     executor.check_request_thread_executor_available()
-    request_task = executor.…
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='serve.logs',
         request_body=log_body,
@@ -114,6 +114,7 @@ async def tail_logs(
         request_id=request_task.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=False,
     )
 
 
@@ -131,7 +132,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.…
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.sync_down_logs',
         request_body=download_logs_body,