skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +64 -0
  3. sky/backends/backend_utils.py +11 -11
  4. sky/backends/cloud_vm_ray_backend.py +15 -4
  5. sky/client/cli/command.py +39 -10
  6. sky/client/cli/flags.py +4 -2
  7. sky/client/sdk.py +26 -3
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  11. sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
  12. sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/storage.py +2 -2
  46. sky/global_user_state.py +137 -37
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +4 -2
  49. sky/jobs/server/server.py +21 -12
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +248 -144
  52. sky/provision/kubernetes/network.py +9 -6
  53. sky/provision/provisioner.py +8 -0
  54. sky/schemas/api/responses.py +2 -0
  55. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  56. sky/serve/server/server.py +8 -7
  57. sky/server/common.py +10 -15
  58. sky/server/constants.py +1 -1
  59. sky/server/daemons.py +4 -2
  60. sky/server/requests/executor.py +30 -28
  61. sky/server/requests/payloads.py +5 -1
  62. sky/server/requests/preconditions.py +9 -4
  63. sky/server/requests/requests.py +130 -53
  64. sky/server/requests/serializers/encoders.py +3 -3
  65. sky/server/server.py +91 -58
  66. sky/server/stream_utils.py +127 -38
  67. sky/server/uvicorn.py +18 -17
  68. sky/setup_files/alembic.ini +4 -0
  69. sky/skylet/services.py +5 -5
  70. sky/skypilot_config.py +87 -75
  71. sky/ssh_node_pools/server.py +4 -4
  72. sky/users/permission.py +4 -0
  73. sky/utils/asyncio_utils.py +63 -3
  74. sky/utils/db/db_utils.py +11 -3
  75. sky/utils/db/migration_utils.py +7 -3
  76. sky/volumes/server/server.py +3 -3
  77. sky/workspaces/server.py +6 -6
  78. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
  79. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
  80. sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  82. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  83. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  87. sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
  88. /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
  89. /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
  90. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  91. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  92. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -108,6 +108,21 @@ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
 _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
     '~/.sky/.jobs_controller_consolidation_reloaded_signal')
 
+# The response fields for managed jobs that require cluster handle
+_CLUSTER_HANDLE_FIELDS = [
+    'cluster_resources',
+    'cluster_resources_full',
+    'cloud',
+    'region',
+    'zone',
+    'infra',
+    'accelerators',
+]
+
+# The response fields for managed jobs that are not stored in the database
+# These fields will be mapped to the DB fields in the `_update_fields`.
+_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
+
 
 class ManagedJobQueueResultType(enum.Enum):
     """The type of the managed job queue result."""
@@ -1313,11 +1328,85 @@ def dump_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> str:
     return message_utils.encode_payload(
         get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
                               workspace_match, name_match, pool_match, page,
-                              limit, user_hashes, statuses))
+                              limit, user_hashes, statuses, fields))
+
+
+def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
+    """Update the fields list to include the necessary fields.
+
+    Args:
+        fields: The fields to update.
+
+    It will:
+    - Add the necessary dependent fields to the list.
+    - Remove the fields that are not in the DB.
+    - Determine if cluster handle is required.
+
+    Returns:
+        A tuple containing the updated fields and a boolean indicating if
+        cluster handle is required.
+    """
+    cluster_handle_required = True
+    if _cluster_handle_not_required(fields):
+        cluster_handle_required = False
+    # Copy the list to avoid modifying the original list
+    new_fields = fields.copy()
+    # status and job_id are always included
+    if 'status' not in new_fields:
+        new_fields.append('status')
+    if 'job_id' not in new_fields:
+        new_fields.append('job_id')
+    # user_hash is required if user_name is present
+    if 'user_name' in new_fields and 'user_hash' not in new_fields:
+        new_fields.append('user_hash')
+    if 'job_duration' in new_fields:
+        if 'last_recovered_at' not in new_fields:
+            new_fields.append('last_recovered_at')
+        if 'end_at' not in new_fields:
+            new_fields.append('end_at')
+    if 'job_name' in new_fields and 'task_name' not in new_fields:
+        new_fields.append('task_name')
+    if 'details' in new_fields:
+        if 'schedule_state' not in new_fields:
+            new_fields.append('schedule_state')
+        if 'priority' not in new_fields:
+            new_fields.append('priority')
+        if 'failure_reason' not in new_fields:
+            new_fields.append('failure_reason')
+    if ('user_yaml' in new_fields and
+            'original_user_yaml_path' not in new_fields):
+        new_fields.append('original_user_yaml_path')
+    if cluster_handle_required:
+        if 'task_name' not in new_fields:
+            new_fields.append('task_name')
+        if 'current_cluster_name' not in new_fields:
+            new_fields.append('current_cluster_name')
+    # Remove _NON_DB_FIELDS
+    # These fields have been mapped to the DB fields in the above code, so we
+    # don't need to include them in the updated fields.
+    for field in _NON_DB_FIELDS:
+        if field in new_fields:
+            new_fields.remove(field)
+    return new_fields, cluster_handle_required
+
+
+def _cluster_handle_not_required(fields: List[str]) -> bool:
+    """Determine if cluster handle is not required.
+
+    Args:
+        fields: The fields to check if they contain any of the cluster handle
+            fields.
+
+    Returns:
+        True if the fields do not contain any of the cluster handle fields,
+        False otherwise.
+    """
+    return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
 
 
 def get_managed_job_queue(
@@ -1331,146 +1420,154 @@ def get_managed_job_queue(
     limit: Optional[int] = None,
     user_hashes: Optional[List[Optional[str]]] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Dict[str, Any]:
-    # Make sure to get all jobs - some logic below (e.g. high priority job
-    # detection) requires a full view of the jobs table.
-    jobs = managed_job_state.get_managed_jobs()
+    """Get the managed job queue.
 
-    # Figure out what the highest priority blocking job is. We need to know in
-    # order to determine if other jobs are blocked by a higher priority job, or
-    # just by the limited controller resources.
-    highest_blocking_priority = constants.MIN_PRIORITY
-    for job in jobs:
-        if job['schedule_state'] not in (
-                # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
-                # lower priority.
-                managed_job_state.ManagedJobScheduleState.LAUNCHING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_BACKOFF,
-                # It's possible for a WAITING/ALIVE_WAITING job to be ready to
-                # launch, but the scheduler just hasn't run yet.
-                managed_job_state.ManagedJobScheduleState.WAITING,
-                managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
-            # This job will not block others.
-            continue
-
-        priority = job.get('priority')
-        if priority is not None and priority > highest_blocking_priority:
-            highest_blocking_priority = priority
+    Args:
+        skip_finished: Whether to skip finished jobs.
+        accessible_workspaces: The accessible workspaces.
+        job_ids: The job ids.
+        workspace_match: The workspace name to match.
+        name_match: The job name to match.
+        pool_match: The pool name to match.
+        page: The page number.
+        limit: The limit number.
+        user_hashes: The user hashes.
+        statuses: The statuses.
+        fields: The fields to include in the response.
 
-    total_no_filter = len(jobs)
+    Returns:
+        A dictionary containing the managed job queue.
+    """
+    cluster_handle_required = True
+    updated_fields = None
+    # The caller only need to specify the fields in the
+    # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
+    # function will add the necessary dependent fields to the list, for
+    # example, if the caller specifies `['user_name']`, the `_update_fields`
+    # function will add `['user_hash']` to the list.
+    if fields:
+        updated_fields, cluster_handle_required = _update_fields(fields)
+
+    total_no_filter = managed_job_state.get_managed_jobs_total()
+
+    status_counts = managed_job_state.get_status_count_with_filters(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+    )
+
+    jobs, total = managed_job_state.get_managed_jobs_with_filters(
+        fields=updated_fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        page=page,
+        limit=limit,
+    )
+
+    if cluster_handle_required:
+        # Fetch the cluster name to handle map for managed clusters only.
+        cluster_name_to_handle = (
+            global_user_state.get_cluster_name_to_handle_map(is_managed=True))
 
-    if user_hashes:
-        jobs = [
-            job for job in jobs if job.get('user_hash', None) in user_hashes
-        ]
-    if accessible_workspaces:
-        jobs = [
-            job for job in jobs
-            if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
-            accessible_workspaces
-        ]
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(
-                lambda job: not managed_job_state.ManagedJobStatus(job[
-                    'status']).is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
-    if job_ids:
-        jobs = [job for job in jobs if job['job_id'] in job_ids]
-
-    jobs, total, status_counts = filter_jobs(jobs,
-                                             workspace_match,
-                                             name_match,
-                                             pool_match,
-                                             page,
-                                             limit,
-                                             statuses=statuses)
-
-    job_ids = set(job['job_id'] for job in jobs)
-    job_id_to_pool_info = (
-        managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
-    cluster_names: Dict[int, str] = {}
-    for job in jobs:
-        # pool info is (pool, cluster_name, job_id_on_pool_cluster)
-        pool_info = job_id_to_pool_info.get(job['job_id'], None)
-        if pool_info and pool_info[0]:
-            cluster_name = pool_info[1]
-        else:
-            cluster_name = generate_managed_job_cluster_name(
-                job['task_name'], job['job_id'])
-        cluster_names[job['job_id']] = cluster_name
-    cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
-        set(cluster_names.values()))
+    highest_blocking_priority = constants.MIN_PRIORITY
+    if not fields or 'details' in fields:
+        # Figure out what the highest priority blocking job is. We need to know
+        # in order to determine if other jobs are blocked by a higher priority
+        # job, or just by the limited controller resources.
+        highest_blocking_priority = (
+            managed_job_state.get_managed_jobs_highest_priority())
 
     for job in jobs:
-        end_at = job['end_at']
-        if end_at is None:
-            end_at = time.time()
-
-        job_submitted_at = job['last_recovered_at'] - job['job_duration']
-        if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
-            # When job is recovering, the duration is exact job['job_duration']
-            job_duration = job['job_duration']
-        elif job_submitted_at > 0:
-            job_duration = end_at - job_submitted_at
-        else:
-            # When job_start_at <= 0, that means the last_recovered_at is not
-            # set yet, i.e. the job is not started.
-            job_duration = 0
-        job['job_duration'] = job_duration
+        if not fields or 'job_duration' in fields:
+            end_at = job['end_at']
+            if end_at is None:
+                end_at = time.time()
+
+            job_submitted_at = job['last_recovered_at'] - job['job_duration']
+            if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
+                # When job is recovering, the duration is exact
+                # job['job_duration']
+                job_duration = job['job_duration']
+            elif job_submitted_at > 0:
+                job_duration = end_at - job_submitted_at
+            else:
+                # When job_start_at <= 0, that means the last_recovered_at
+                # is not set yet, i.e. the job is not started.
+                job_duration = 0
+            job['job_duration'] = job_duration
         job['status'] = job['status'].value
-        job['schedule_state'] = job['schedule_state'].value
-
-        cluster_name = cluster_names[job['job_id']]
-        handle = cluster_name_to_handles.get(cluster_name, None)
-        if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = resources_utils.get_readable_resources_repr(
-                handle, simplify=False)
-            job['cluster_resources'] = resources_str
-            job['cluster_resources_full'] = resources_str_full
-            job['cloud'] = str(handle.launched_resources.cloud)
-            job['region'] = handle.launched_resources.region
-            job['zone'] = handle.launched_resources.zone
-            job['infra'] = infra_utils.InfraInfo(
-                str(handle.launched_resources.cloud),
-                handle.launched_resources.region,
-                handle.launched_resources.zone).formatted_str()
-            job['accelerators'] = handle.launched_resources.accelerators
+        if not fields or 'schedule_state' in fields:
+            job['schedule_state'] = job['schedule_state'].value
         else:
-            # FIXME(zongheng): display the last cached values for these.
-            job['cluster_resources'] = '-'
-            job['cluster_resources_full'] = '-'
-            job['cloud'] = '-'
-            job['region'] = '-'
-            job['zone'] = '-'
-            job['infra'] = '-'
-
-        # Add details about schedule state / backoff.
-        state_details = None
-        if job['schedule_state'] == 'ALIVE_BACKOFF':
-            state_details = 'In backoff, waiting for resources'
-        elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
-            priority = job.get('priority')
-            if (priority is not None and priority < highest_blocking_priority):
-                # Job is lower priority than some other blocking job.
-                state_details = 'Waiting for higher priority jobs to launch'
+            job['schedule_state'] = None
+
+        if cluster_handle_required:
+            cluster_name = job.get('current_cluster_name', None)
+            if cluster_name is None:
+                cluster_name = generate_managed_job_cluster_name(
+                    job['task_name'], job['job_id'])
+            handle = cluster_name_to_handle.get(
+                cluster_name, None) if cluster_name is not None else None
+            if isinstance(handle, backends.CloudVmRayResourceHandle):
+                resources_str = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+                resources_str_full = (
+                    resources_utils.get_readable_resources_repr(handle,
+                                                                simplify=False))
+                job['cluster_resources'] = resources_str
+                job['cluster_resources_full'] = resources_str_full
+                job['cloud'] = str(handle.launched_resources.cloud)
+                job['region'] = handle.launched_resources.region
+                job['zone'] = handle.launched_resources.zone
+                job['infra'] = infra_utils.InfraInfo(
+                    str(handle.launched_resources.cloud),
+                    handle.launched_resources.region,
+                    handle.launched_resources.zone).formatted_str()
+                job['accelerators'] = handle.launched_resources.accelerators
             else:
-                state_details = 'Waiting for other jobs to launch'
-
-        if state_details and job['failure_reason']:
-            job['details'] = f'{state_details} - {job["failure_reason"]}'
-        elif state_details:
-            job['details'] = state_details
-        elif job['failure_reason']:
-            job['details'] = f'Failure: {job["failure_reason"]}'
-        else:
-            job['details'] = None
+                # FIXME(zongheng): display the last cached values for these.
+                job['cluster_resources'] = '-'
+                job['cluster_resources_full'] = '-'
+                job['cloud'] = '-'
+                job['region'] = '-'
+                job['zone'] = '-'
+                job['infra'] = '-'
+
+        if not fields or 'details' in fields:
+            # Add details about schedule state / backoff.
+            state_details = None
+            if job['schedule_state'] == 'ALIVE_BACKOFF':
+                state_details = 'In backoff, waiting for resources'
+            elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+                priority = job.get('priority')
+                if (priority is not None and
+                        priority < highest_blocking_priority):
+                    # Job is lower priority than some other blocking job.
+                    state_details = 'Waiting for higher priority jobs to launch'
                else:
                    state_details = 'Waiting for other jobs to launch'

            if state_details and job['failure_reason']:
                job['details'] = f'{state_details} - {job["failure_reason"]}'
            elif state_details:
                job['details'] = state_details
            elif job['failure_reason']:
                job['details'] = f'Failure: {job["failure_reason"]}'
            else:
                job['details'] = None
 
     return {
         'jobs': jobs,
@@ -1581,21 +1678,14 @@ def load_managed_job_queue(
     total_no_filter = total
     result_type = ManagedJobQueueResultType.LIST
 
-    job_id_to_user_hash: Dict[int, str] = {}
+    all_users = global_user_state.get_all_users()
+    all_users_map = {user.id: user.name for user in all_users}
     for job in jobs:
+        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
         if 'user_hash' in job and job['user_hash'] is not None:
             # Skip jobs that do not have user_hash info.
             # TODO(cooperc): Remove check before 0.12.0.
-            job_id_to_user_hash[job['job_id']] = job['user_hash']
-    user_hash_to_user = global_user_state.get_users(
-        job_id_to_user_hash.values())
-
-    for job in jobs:
-        job['status'] = managed_job_state.ManagedJobStatus(job['status'])
-        if job['job_id'] in job_id_to_user_hash:
-            user_hash = job_id_to_user_hash[job['job_id']]
-            user = user_hash_to_user.get(user_hash, None)
-            job['user_name'] = user.name if user is not None else None
+            job['user_name'] = all_users_map.get(job['user_hash'])
     return jobs, total, result_type, total_no_filter, status_counts
 
 
@@ -2014,6 +2104,7 @@ class ManagedJobCodeGen:
         limit: Optional[int] = None,
         user_hashes: Optional[List[Optional[str]]] = None,
         statuses: Optional[List[str]] = None,
+        fields: Optional[List[str]] = None,
     ) -> str:
         code = textwrap.dedent(f"""\
         if managed_job_version < 9:
@@ -2032,7 +2123,7 @@ class ManagedJobCodeGen:
                 page={page!r},
                 limit={limit!r},
                 user_hashes={user_hashes!r})
-        else:
+        elif managed_job_version < 12:
             job_table = utils.dump_managed_job_queue(
                 skip_finished={skip_finished},
                 accessible_workspaces={accessible_workspaces!r},
@@ -2044,6 +2135,19 @@
                 limit={limit!r},
                 user_hashes={user_hashes!r},
                 statuses={statuses!r})
+        else:
+            job_table = utils.dump_managed_job_queue(
+                skip_finished={skip_finished},
+                accessible_workspaces={accessible_workspaces!r},
+                job_ids={job_ids!r},
+                workspace_match={workspace_match!r},
+                name_match={name_match!r},
+                pool_match={pool_match!r},
+                page={page!r},
+                limit={limit!r},
+                user_hashes={user_hashes!r},
+                statuses={statuses!r},
+                fields={fields!r})
         print(job_table, flush=True)
         """)
         return cls._build(code)
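
Note on the hunks above: the new `fields` parameter lets callers request only a subset of the columns in the `ManagedJobRecord` schema (sky/schemas/api/responses.py in the file list), and `_update_fields` quietly pulls in the dependent DB columns while stripping the computed ones. The following standalone sketch re-implements those expansion rules purely for illustration; only `_CLUSTER_HANDLE_FIELDS` and `_NON_DB_FIELDS` are copied from the hunk, and the function name and `_DEPENDENCIES` table are stand-ins, not SkyPilot code.

from typing import List, Tuple

# Copied from the hunk above, for illustration.
_CLUSTER_HANDLE_FIELDS = [
    'cluster_resources', 'cluster_resources_full', 'cloud', 'region', 'zone',
    'infra', 'accelerators',
]
_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']

# A condensed, illustrative form of the dependency rules in `_update_fields`.
_DEPENDENCIES = {
    'user_name': ['user_hash'],
    'job_duration': ['last_recovered_at', 'end_at'],
    'job_name': ['task_name'],
    'details': ['schedule_state', 'priority', 'failure_reason'],
    'user_yaml': ['original_user_yaml_path'],
}


def expand_fields(fields: List[str]) -> Tuple[List[str], bool]:
    """Mirror of the expansion logic shown above (sketch only)."""
    handle_required = any(f in fields for f in _CLUSTER_HANDLE_FIELDS)
    out = list(fields)
    # status and job_id are always queried.
    for always in ('status', 'job_id'):
        if always not in out:
            out.append(always)
    # Pull in the DB columns each requested field depends on.
    for field, deps in _DEPENDENCIES.items():
        if field in out:
            out.extend(d for d in deps if d not in out)
    if handle_required:
        out.extend(f for f in ('task_name', 'current_cluster_name')
                   if f not in out)
    # Computed (non-DB) fields are filled in later from the cluster handle,
    # so they are stripped from the DB query.
    out = [f for f in out if f not in _NON_DB_FIELDS]
    return out, handle_required


if __name__ == '__main__':
    print(expand_fields(['user_name', 'cloud']))
    # -> (['status', 'job_id', 'user_hash', 'task_name',
    #      'current_cluster_name'], True)
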
sky/provision/kubernetes/network.py CHANGED
@@ -48,8 +48,10 @@ def _open_ports_using_loadbalancer(
     service_name = _LOADBALANCER_SERVICE_NAME.format(
         cluster_name_on_cloud=cluster_name_on_cloud)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+
     content = network_utils.fill_loadbalancer_template(
-        namespace=provider_config.get('namespace', 'default'),
+        namespace=namespace,
         context=context,
         service_name=service_name,
         ports=ports,
@@ -103,7 +105,7 @@ def _open_ports_using_ingress(
     # To avoid this, we change ingress creation into one object containing
     # multiple rules.
     content = network_utils.fill_ingress_template(
-        namespace=provider_config.get('namespace', 'default'),
+        namespace=namespace,
         context=context,
         service_details=service_details,
         ingress_name=f'{cluster_name_on_cloud}-skypilot-ingress',
@@ -165,9 +167,10 @@ def _cleanup_ports_for_loadbalancer(
     # TODO(aylei): test coverage
     context = provider_config.get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     network_utils.delete_namespaced_service(
         context=context,
-        namespace=provider_config.get('namespace', 'default'),
+        namespace=namespace,
         service_name=service_name,
     )
 
@@ -180,19 +183,19 @@ def _cleanup_ports_for_ingress(
     # Delete services for each port
     context = provider_config.get(
         'context', kubernetes_utils.get_current_kube_config_context_name())
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     for port in ports:
         service_name = f'{cluster_name_on_cloud}--skypilot-svc--{port}'
         network_utils.delete_namespaced_service(
             context=context,
-            namespace=provider_config.get('namespace',
-                                          kubernetes_utils.DEFAULT_NAMESPACE),
+            namespace=namespace,
             service_name=service_name,
         )
 
     # Delete the single ingress used for all ports
     ingress_name = f'{cluster_name_on_cloud}-skypilot-ingress'
     network_utils.delete_namespaced_ingress(
-        namespace=kubernetes_utils.get_namespace_from_config(provider_config),
+        namespace=namespace,
         context=kubernetes_utils.get_context_from_config(provider_config),
         ingress_name=ingress_name,
     )
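
The hunks above replace the per-call-site namespace lookups, which used inconsistent defaults ('default' in some places, kubernetes_utils.DEFAULT_NAMESPACE in another), with a single call to kubernetes_utils.get_namespace_from_config per function. A minimal sketch of that resolve-once pattern, with the helper and default below as illustrative stand-ins rather than SkyPilot's implementation:

from typing import Any, Dict, List

DEFAULT_NAMESPACE = 'default'  # stand-in for the provider's default


def get_namespace_from_config(provider_config: Dict[str, Any]) -> str:
    # Stand-in helper: one place decides the fallback namespace.
    return provider_config.get('namespace', DEFAULT_NAMESPACE)


def cleanup_ports(provider_config: Dict[str, Any], ports: List[int]) -> None:
    namespace = get_namespace_from_config(provider_config)  # resolved once
    for port in ports:
        # Every per-port call reuses the same namespace value.
        print(f'deleting service on port {port} in namespace {namespace}')


cleanup_ports({'namespace': 'skypilot'}, [8080, 8081])
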
sky/provision/provisioner.py CHANGED
@@ -442,6 +442,14 @@
         cluster_name.name_on_cloud,
         provider_config=provider_config)
 
+    # Update cluster info in handle so cluster instance ids are set. This
+    # allows us to expose provision logs to debug nodes that failed during post
+    # provision setup.
+    handle = global_user_state.get_handle_from_cluster_name(
+        cluster_name.display_name)
+    handle.cached_cluster_info = cluster_info
+    global_user_state.update_cluster_handle(cluster_name.display_name, handle)
+
     if cluster_info.num_instances > 1:
         # Only worker nodes have logs in the per-instance log directory. Head
         # node's log will be redirected to the main log file.
sky/schemas/api/responses.py CHANGED
@@ -160,6 +160,8 @@ class StorageRecord(ResponseBaseModel):
 # and therefore can be non-optional.
 class ManagedJobRecord(ResponseBaseModel):
     """A single managed job record."""
+    # The job_id in the spot table
+    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
     job_id: Optional[int] = None
     task_id: Optional[int] = None
     job_name: Optional[str] = None
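
The `alias='_job_id'` addition above means the record can be populated from data whose key is `_job_id` while exposing the value as the `task_job_id` attribute. A minimal pydantic v2 sketch of that alias mechanism (the model below is a stand-in, not the real ManagedJobRecord):

from typing import Optional

import pydantic


class JobRecordSketch(pydantic.BaseModel):
    # Same alias pattern as the new ManagedJobRecord field: the source dict
    # uses '_job_id', the attribute is exposed as 'task_job_id'.
    task_job_id: Optional[int] = pydantic.Field(None, alias='_job_id')
    job_id: Optional[int] = None


rec = JobRecordSketch.model_validate({'_job_id': 42, 'job_id': 7})
print(rec.task_job_id)                # 42
print(rec.model_dump(by_alias=True))  # {'_job_id': 42, 'job_id': 7}
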
sky/schemas/db/skypilot_config/001_initial_schema.py ADDED
@@ -0,0 +1,30 @@
+"""Initial schema for sky config database
+
+Revision ID: 001
+Revises:
+Create Date: 2025-10-21
+
+"""
+# pylint: disable=invalid-name
+from alembic import op
+
+from sky.skypilot_config import Base
+from sky.utils.db import db_utils
+
+# revision identifiers, used by Alembic.
+revision = '001'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    """Create initial schema for config_yaml table"""
+    with op.get_context().autocommit_block():
+        # Create all tables with their current schema
+        db_utils.add_all_tables_to_db_sqlalchemy(Base.metadata, op.get_bind())
+
+
+def downgrade():
+    """Drop all tables"""
+    Base.metadata.drop_all(bind=op.get_bind())
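
This new file follows the standard Alembic revision layout (revision identifiers plus upgrade/downgrade functions). As a rough sketch of how such a revision gets applied, using the packaged sky/setup_files/alembic.ini from the file list; the paths and URL below are hypothetical, and SkyPilot's real invocation (likely in sky/utils/db/migration_utils.py, also touched in this release) may differ:

from alembic import command
from alembic.config import Config

# Hypothetical config and database location for illustration only.
cfg = Config('sky/setup_files/alembic.ini')
cfg.set_main_option('sqlalchemy.url', 'sqlite:////tmp/skypilot_config.db')

command.upgrade(cfg, 'head')    # runs upgrade() in 001_initial_schema.py
command.downgrade(cfg, 'base')  # runs downgrade(), dropping the tables
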
sky/serve/server/server.py CHANGED
@@ -23,7 +23,7 @@ async def up(
     request: fastapi.Request,
     up_body: payloads.ServeUpBody,
 ) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.up',
         request_body=up_body,
@@ -38,7 +38,7 @@ async def update(
     request: fastapi.Request,
     update_body: payloads.ServeUpdateBody,
 ) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.update',
         request_body=update_body,
@@ -53,7 +53,7 @@ async def down(
     request: fastapi.Request,
     down_body: payloads.ServeDownBody,
 ) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.down',
         request_body=down_body,
@@ -68,7 +68,7 @@ async def terminate_replica(
     request: fastapi.Request,
     terminate_replica_body: payloads.ServeTerminateReplicaBody,
 ) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.terminate_replica',
         request_body=terminate_replica_body,
@@ -83,7 +83,7 @@ async def status(
     request: fastapi.Request,
     status_body: payloads.ServeStatusBody,
 ) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.status',
         request_body=status_body,
@@ -99,7 +99,7 @@ async def tail_logs(
     background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
     executor.check_request_thread_executor_available()
-    request_task = executor.prepare_request(
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='serve.logs',
         request_body=log_body,
@@ -114,6 +114,7 @@
         request_id=request_task.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=False,
     )
 
 
@@ -131,7 +132,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='serve.sync_down_logs',
         request_body=download_logs_body,
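
The serve endpoints above switch from the blocking executor.schedule_request(...) to await executor.schedule_request_async(...), keeping the FastAPI event loop free while a request is queued. The real wrapper lives in sky/server/requests/executor.py (also changed in this release) and may be built differently; purely as a sketch, one common way to wrap a blocking call for async callers is to push it onto a worker thread:

import asyncio
import time


def schedule_request(request_id: str, request_name: str) -> None:
    """Stand-in for a blocking scheduling call (not SkyPilot's)."""
    time.sleep(0.1)  # pretend to persist the request and enqueue work
    print(f'scheduled {request_name} ({request_id})')


async def schedule_request_async(request_id: str, request_name: str) -> None:
    """Async wrapper: run the blocking call without stalling the event loop."""
    await asyncio.to_thread(schedule_request, request_id, request_name)


async def main() -> None:
    # Two requests scheduled concurrently; the event loop stays responsive.
    await asyncio.gather(
        schedule_request_async('req-1', 'serve.up'),
        schedule_request_async('req-2', 'serve.status'),
    )


asyncio.run(main())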