skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250529__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/cli.py +13 -3
  3. sky/client/cli.py +13 -3
  4. sky/client/oauth.py +82 -0
  5. sky/client/sdk.py +60 -10
  6. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/HvNkg7hqKM1p0ptAcdDcF/_buildManifest.js +1 -0
  9. sky/dashboard/out/_next/static/chunks/236-90e5498a5b00ec29.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/303-2c7b0f7af571710b.js +6 -0
  11. sky/dashboard/out/_next/static/chunks/{856-62b87c68917b08ed.js → 856-59a1760784c9e770.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/pages/config-7c48919fe030bc43.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-909f1ceb0fcf1b99.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/pages/infra-d4c6875c88771e17.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6b80e9e0c6aa16a1.js +6 -0
  16. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  17. sky/dashboard/out/clusters/[cluster].html +1 -1
  18. sky/dashboard/out/clusters.html +1 -1
  19. sky/dashboard/out/config.html +1 -1
  20. sky/dashboard/out/index.html +1 -1
  21. sky/dashboard/out/infra/[context].html +1 -0
  22. sky/dashboard/out/infra.html +1 -1
  23. sky/dashboard/out/jobs/[job].html +1 -1
  24. sky/dashboard/out/jobs.html +1 -1
  25. sky/dashboard/out/users.html +1 -1
  26. sky/dashboard/out/workspace/new.html +1 -1
  27. sky/dashboard/out/workspaces/[name].html +1 -1
  28. sky/dashboard/out/workspaces.html +1 -1
  29. sky/exceptions.py +1 -1
  30. sky/jobs/client/sdk.py +1 -0
  31. sky/jobs/constants.py +2 -0
  32. sky/jobs/controller.py +3 -5
  33. sky/jobs/recovery_strategy.py +148 -102
  34. sky/jobs/scheduler.py +23 -8
  35. sky/jobs/server/core.py +16 -0
  36. sky/jobs/state.py +130 -35
  37. sky/jobs/utils.py +30 -4
  38. sky/resources.py +16 -1
  39. sky/server/common.py +6 -2
  40. sky/server/html/token_page.html +32 -6
  41. sky/server/server.py +3 -1
  42. sky/setup_files/dependencies.py +7 -1
  43. sky/skylet/constants.py +1 -1
  44. sky/task.py +26 -0
  45. sky/templates/jobs-controller.yaml.j2 +2 -1
  46. sky/utils/schemas.py +12 -0
  47. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/METADATA +3 -1
  48. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/RECORD +53 -49
  49. sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
  50. sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
  51. sky/dashboard/out/_next/static/chunks/pages/config-41738d1896fc02fe.js +0 -6
  52. sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
  53. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
  54. /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → HvNkg7hqKM1p0ptAcdDcF}/_ssgManifest.js +0 -0
  55. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/WHEEL +0 -0
  56. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -120,7 +120,8 @@ def create_table(cursor, conn):
  dag_yaml_path TEXT,
  env_file_path TEXT,
  user_hash TEXT,
- workspace TEXT DEFAULT NULL)""")
+ workspace TEXT DEFAULT NULL,
+ priority INTEGER DEFAULT 500)""")

  db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
  'TEXT')
@@ -142,6 +143,14 @@ def create_table(cursor, conn):
  'workspace',
  'TEXT DEFAULT NULL',
  value_to_replace_existing_entries='default')
+
+ db_utils.add_column_to_table(cursor,
+ conn,
+ 'job_info',
+ 'priority',
+ 'INTEGER',
+ value_to_replace_existing_entries=500)
+
  conn.commit()


@@ -199,6 +208,7 @@ columns = [
  'env_file_path',
  'user_hash',
  'workspace',
+ 'priority',
  ]


@@ -215,7 +225,7 @@ class ManagedJobStatus(enum.Enum):
  reset to INIT or SETTING_UP multiple times (depending on the preemptions).

  However, a managed job only has one ManagedJobStatus on the jobs controller.
- ManagedJobStatus = [PENDING, SUBMITTED, STARTING, RUNNING, ...]
+ ManagedJobStatus = [PENDING, STARTING, RUNNING, ...]
  Mapping from JobStatus to ManagedJobStatus:
  INIT -> STARTING/RECOVERING
  SETTING_UP -> RUNNING
@@ -235,10 +245,14 @@ class ManagedJobStatus(enum.Enum):
  # PENDING: Waiting for the jobs controller to have a slot to run the
  # controller process.
  PENDING = 'PENDING'
+ # SUBMITTED: This state used to be briefly set before immediately changing
+ # to STARTING. Its use was removed in #5682. We keep it for backwards
+ # compatibility, so we can still parse old jobs databases that may have jobs
+ # in this state.
+ # TODO(cooperc): remove this in v0.12.0
+ DEPRECATED_SUBMITTED = 'SUBMITTED'
  # The submitted_at timestamp of the managed job in the 'spot' table will be
  # set to the time when the job controller begins running.
- # SUBMITTED: The jobs controller starts the controller process.
- SUBMITTED = 'SUBMITTED'
  # STARTING: The controller process is launching the cluster for the managed
  # job.
  STARTING = 'STARTING'
@@ -314,7 +328,6 @@ class ManagedJobStatus(enum.Enum):

  _SPOT_STATUS_TO_COLOR = {
  ManagedJobStatus.PENDING: colorama.Fore.BLUE,
- ManagedJobStatus.SUBMITTED: colorama.Fore.BLUE,
  ManagedJobStatus.STARTING: colorama.Fore.BLUE,
  ManagedJobStatus.RUNNING: colorama.Fore.GREEN,
  ManagedJobStatus.RECOVERING: colorama.Fore.CYAN,
@@ -326,6 +339,8 @@ _SPOT_STATUS_TO_COLOR = {
  ManagedJobStatus.FAILED_CONTROLLER: colorama.Fore.RED,
  ManagedJobStatus.CANCELLING: colorama.Fore.YELLOW,
  ManagedJobStatus.CANCELLED: colorama.Fore.YELLOW,
+ # TODO(cooperc): backwards compatibility, remove this in v0.12.0
+ ManagedJobStatus.DEPRECATED_SUBMITTED: colorama.Fore.BLUE,
  }


@@ -342,8 +357,12 @@ class ManagedJobScheduleState(enum.Enum):
  - LAUNCHING -> ALIVE: The launch attempt was completed. It may have
  succeeded or failed. The job controller is not allowed to sky.launch again
  without transitioning to ALIVE_WAITING and then LAUNCHING.
+ - LAUNCHING -> ALIVE_BACKOFF: The launch failed to find resources, and is
+ in backoff waiting for resources.
  - ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
  either for recovery or to launch a subsequent task.
+ - ALIVE_BACKOFF -> ALIVE_WAITING: The backoff period has ended, and the job
+ controller wants to try to launch again.
  - ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
  controller may launch again.
  - LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
@@ -357,6 +376,7 @@ class ManagedJobScheduleState(enum.Enum):
  state or vice versa. (In fact, schedule state is defined on the job and
  status on the task.)
  - INACTIVE or WAITING should only be seen when a job is PENDING.
+ - ALIVE_BACKOFF should only be seen when a job is STARTING.
  - ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
  tasks, or needs to retry launching.
  - LAUNCHING and ALIVE can be seen in many different statuses.
@@ -382,6 +402,9 @@ class ManagedJobScheduleState(enum.Enum):
  # The job is running sky.launch, or soon will, using a limited number of
  # allowed launch slots.
  LAUNCHING = 'LAUNCHING'
+ # The job is alive, but is in backoff waiting for resources - a special case
+ # of ALIVE.
+ ALIVE_BACKOFF = 'ALIVE_BACKOFF'
  # The controller for the job is running, but it's not currently launching.
  ALIVE = 'ALIVE'
  # The job is in a terminal state. (Not necessarily SUCCEEDED.)
@@ -411,11 +434,11 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
  ManagedJobStatus.PENDING.value))


- def set_submitted(job_id: int, task_id: int, run_timestamp: str,
- submit_time: float, resources_str: str,
- specs: Dict[str, Union[str,
- int]], callback_func: CallbackType):
- """Set the task to submitted.
+ def set_starting(job_id: int, task_id: int, run_timestamp: str,
+ submit_time: float, resources_str: str,
+ specs: Dict[str, Union[str,
+ int]], callback_func: CallbackType):
+ """Set the task to starting state.

  Args:
  job_id: The managed job ID.
@@ -432,6 +455,7 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
  # make it easier to find them based on one of the values.
  # Also, using the earlier timestamp should be closer to the term
  # `submit_at`, which represents the time the managed task is submitted.
+ logger.info('Launching the spot cluster...')
  with db_utils.safe_cursor(_DB_PATH) as cursor:
  cursor.execute(
  """\
@@ -445,19 +469,54 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
  task_id=(?) AND
  status=(?) AND
  end_at IS null""",
- (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
+ (resources_str, submit_time, ManagedJobStatus.STARTING.value,
  run_timestamp, json.dumps(specs), job_id, task_id,
  ManagedJobStatus.PENDING.value))
  if cursor.rowcount != 1:
  raise exceptions.ManagedJobStatusError(
- f'Failed to set the task to submitted. '
+ 'Failed to set the task to starting. '
  f'({cursor.rowcount} rows updated)')
+ # SUBMITTED is no longer used, but we keep it for backward compatibility.
+ # TODO(cooperc): remove this in v0.12.0
  callback_func('SUBMITTED')
+ callback_func('STARTING')


- def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
- """Set the task to starting state."""
- logger.info('Launching the spot cluster...')
+ def set_backoff_pending(job_id: int, task_id: int):
+ """Set the task to PENDING state if it is in backoff.
+
+ This should only be used to transition from STARTING or RECOVERING back to
+ PENDING.
+ """
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
+ cursor.execute(
+ """\
+ UPDATE spot SET status=(?)
+ WHERE spot_job_id=(?) AND
+ task_id=(?) AND
+ status IN (?, ?) AND
+ end_at IS null""", (ManagedJobStatus.PENDING.value, job_id, task_id,
+ ManagedJobStatus.STARTING.value,
+ ManagedJobStatus.RECOVERING.value))
+ logger.debug('back to PENDING')
+ if cursor.rowcount != 1:
+ raise exceptions.ManagedJobStatusError(
+ 'Failed to set the task back to pending. '
+ f'({cursor.rowcount} rows updated)')
+ # Do not call callback_func here, as we don't use the callback for PENDING.
+
+
+ def set_restarting(job_id: int, task_id: int, recovering: bool):
+ """Set the task back to STARTING or RECOVERING from PENDING.
+
+ This should not be used for the initial transition from PENDING to STARTING.
+ In that case, use set_starting instead. This function should only be used
+ after using set_backoff_pending to transition back to PENDING during
+ launch retry backoff.
+ """
+ target_status = ManagedJobStatus.STARTING.value
+ if recovering:
+ target_status = ManagedJobStatus.RECOVERING.value
  with db_utils.safe_cursor(_DB_PATH) as cursor:
  cursor.execute(
  """\
@@ -465,13 +524,15 @@ def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
  WHERE spot_job_id=(?) AND
  task_id=(?) AND
  status=(?) AND
- end_at IS null""", (ManagedJobStatus.STARTING.value, job_id,
- task_id, ManagedJobStatus.SUBMITTED.value))
+ end_at IS null""",
+ (target_status, job_id, task_id, ManagedJobStatus.PENDING.value))
+ logger.debug(f'back to {target_status}')
  if cursor.rowcount != 1:
  raise exceptions.ManagedJobStatusError(
- f'Failed to set the task to starting. '
+ f'Failed to set the task back to {target_status}. '
  f'({cursor.rowcount} rows updated)')
- callback_func('STARTING')
+ # Do not call callback_func here, as it should only be invoked for the
+ # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.


  def set_started(job_id: int, task_id: int, start_time: float,
@@ -1004,16 +1065,16 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:


  def scheduler_set_waiting(job_id: int, dag_yaml_path: str, env_file_path: str,
- user_hash: str) -> None:
+ user_hash: str, priority: int) -> None:
  """Do not call without holding the scheduler lock."""
  with db_utils.safe_cursor(_DB_PATH) as cursor:
  updated_count = cursor.execute(
  'UPDATE job_info SET '
  'schedule_state = (?), dag_yaml_path = (?), env_file_path = (?), '
- ' user_hash = (?) '
+ ' user_hash = (?), priority = (?) '
  'WHERE spot_job_id = (?) AND schedule_state = (?)',
  (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
- env_file_path, user_hash, job_id,
+ env_file_path, user_hash, priority, job_id,
  ManagedJobScheduleState.INACTIVE.value)).rowcount
  assert updated_count == 1, (job_id, updated_count)

@@ -1043,15 +1104,28 @@ def scheduler_set_alive(job_id: int) -> None:
  assert updated_count == 1, (job_id, updated_count)


- def scheduler_set_alive_waiting(job_id: int) -> None:
+ def scheduler_set_alive_backoff(job_id: int) -> None:
  """Do not call without holding the scheduler lock."""
  with db_utils.safe_cursor(_DB_PATH) as cursor:
  updated_count = cursor.execute(
  'UPDATE job_info SET '
  'schedule_state = (?) '
  'WHERE spot_job_id = (?) AND schedule_state = (?)',
+ (ManagedJobScheduleState.ALIVE_BACKOFF.value, job_id,
+ ManagedJobScheduleState.LAUNCHING.value)).rowcount
+ assert updated_count == 1, (job_id, updated_count)
+
+
+ def scheduler_set_alive_waiting(job_id: int) -> None:
+ """Do not call without holding the scheduler lock."""
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
+ updated_count = cursor.execute(
+ 'UPDATE job_info SET '
+ 'schedule_state = (?) '
+ 'WHERE spot_job_id = (?) AND schedule_state IN (?, ?)',
  (ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
- ManagedJobScheduleState.ALIVE.value)).rowcount
+ ManagedJobScheduleState.ALIVE.value,
+ ManagedJobScheduleState.ALIVE_BACKOFF.value)).rowcount
  assert updated_count == 1, (job_id, updated_count)


@@ -1099,32 +1173,53 @@ def get_num_alive_jobs() -> int:
  return cursor.execute(
  'SELECT COUNT(*) '
  'FROM job_info '
- 'WHERE schedule_state IN (?, ?, ?)',
+ 'WHERE schedule_state IN (?, ?, ?, ?)',
  (ManagedJobScheduleState.ALIVE_WAITING.value,
  ManagedJobScheduleState.LAUNCHING.value,
- ManagedJobScheduleState.ALIVE.value)).fetchone()[0]
+ ManagedJobScheduleState.ALIVE.value,
+ ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()[0]


  def get_waiting_job() -> Optional[Dict[str, Any]]:
  """Get the next job that should transition to LAUNCHING.

+ Selects the highest-priority (lowest numerical value) WAITING or
+ ALIVE_WAITING job, provided its priority value is less than or equal to any
+ currently LAUNCHING or ALIVE_BACKOFF job.
+
  Backwards compatibility note: jobs submitted before #4485 will have no
  schedule_state and will be ignored by this SQL query.
  """
  with db_utils.safe_cursor(_DB_PATH) as cursor:
- row = cursor.execute(
+ # Get the highest-priority (lowest numerical value) WAITING or
+ # ALIVE_WAITING job whose priority value is less than or equal to
+ # the highest priority (numerically smallest) LAUNCHING or
+ # ALIVE_BACKOFF job's priority.
+ waiting_job_row = cursor.execute(
  'SELECT spot_job_id, schedule_state, dag_yaml_path, env_file_path '
  'FROM job_info '
- 'WHERE schedule_state in (?, ?) '
- 'ORDER BY spot_job_id LIMIT 1',
+ 'WHERE schedule_state IN (?, ?) '
+ 'AND priority <= COALESCE('
+ ' (SELECT MIN(priority) '
+ ' FROM job_info '
+ ' WHERE schedule_state IN (?, ?)), '
+ ' 1000'
+ ')'
+ 'ORDER BY priority ASC, spot_job_id ASC LIMIT 1',
  (ManagedJobScheduleState.WAITING.value,
- ManagedJobScheduleState.ALIVE_WAITING.value)).fetchone()
+ ManagedJobScheduleState.ALIVE_WAITING.value,
+ ManagedJobScheduleState.LAUNCHING.value,
+ ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()
+
+ if waiting_job_row is None:
+ return None
+
  return {
- 'job_id': row[0],
- 'schedule_state': ManagedJobScheduleState(row[1]),
- 'dag_yaml_path': row[2],
- 'env_file_path': row[3],
- } if row is not None else None
+ 'job_id': waiting_job_row[0],
+ 'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
+ 'dag_yaml_path': waiting_job_row[2],
+ 'env_file_path': waiting_job_row[3],
+ }


  def get_workspace(job_id: int) -> str:
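To make the new priority gate in get_waiting_job() concrete, here is a minimal standalone sketch (not SkyPilot code; the table is reduced to the relevant columns and the rows are illustrative). A WAITING or ALIVE_WAITING job is only eligible if its priority value is less than or equal to the smallest priority among LAUNCHING/ALIVE_BACKOFF jobs; per the ORDER BY, numerically smaller values are scheduled first, and the schema default is 500.

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE job_info '
                 '(spot_job_id INTEGER, schedule_state TEXT, '
                 'priority INTEGER DEFAULT 500)')
    conn.executemany(
        'INSERT INTO job_info VALUES (?, ?, ?)',
        [
            (1, 'LAUNCHING', 500),      # currently holding a launch slot
            (2, 'WAITING', 700),        # numerically larger than 500 -> blocked
            (3, 'ALIVE_WAITING', 400),  # numerically smaller -> eligible
        ])

    row = conn.execute(
        'SELECT spot_job_id FROM job_info '
        "WHERE schedule_state IN ('WAITING', 'ALIVE_WAITING') "
        'AND priority <= COALESCE('
        '  (SELECT MIN(priority) FROM job_info '
        "   WHERE schedule_state IN ('LAUNCHING', 'ALIVE_BACKOFF')), 1000) "
        'ORDER BY priority ASC, spot_job_id ASC LIMIT 1').fetchone()
    print(row)  # (3,) -- job 2 stays blocked until job 1 leaves LAUNCHING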
sky/jobs/utils.py CHANGED
@@ -953,6 +953,22 @@ def dump_managed_job_queue() -> str:
  job['region'] = '-'
  job['zone'] = '-'

+ # Add details about schedule state / backoff.
+ state_details = None
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
+ state_details = 'In backoff, waiting for resources'
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
+ state_details = 'Waiting for other jobs to launch'
+
+ if state_details and job['failure_reason']:
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
+ elif state_details:
+ job['details'] = state_details
+ elif job['failure_reason']:
+ job['details'] = f'Failure: {job["failure_reason"]}'
+ else:
+ job['details'] = None
+
  return message_utils.encode_payload(jobs)


@@ -981,7 +997,7 @@ def _get_job_status_from_tasks(
  # Use the first non-succeeded status.
  if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
  # TODO(zhwu): we should not blindly use the first non-
- # succeeded as the status could be changed to SUBMITTED
+ # succeeded as the status could be changed to PENDING
  # when going from one task to the next one, which can be
  # confusing.
  break
@@ -1063,6 +1079,7 @@ def format_job_table(
  'TASK',
  *(['WORKSPACE'] if show_workspace else []),
  'NAME',
+ 'PRIORITY',
  *user_cols,
  'REQUESTED',
  'SUBMITTED',
@@ -1092,7 +1109,10 @@ def format_job_table(
  # by the task_id.
  jobs[get_hash(task)].append(task)

- def generate_details(failure_reason: Optional[str]) -> str:
+ def generate_details(details: Optional[str],
+ failure_reason: Optional[str]) -> str:
+ if details is not None:
+ return details
  if failure_reason is not None:
  return f'Failure: {failure_reason}'
  return '-'
@@ -1131,6 +1151,7 @@ def format_job_table(
  submitted_at = None
  end_at: Optional[int] = 0
  recovery_cnt = 0
+ priority = job_tasks[0].get('priority', '-')
  managed_job_status, current_task_id = _get_job_status_from_tasks(
  job_tasks)
  for task in job_tasks:
@@ -1166,6 +1187,7 @@ def format_job_table(
  '',
  *([''] if show_workspace else []),
  job_name,
+ str(priority),
  *user_values,
  '-',
  submitted,
@@ -1175,13 +1197,14 @@ def format_job_table(
  status_str,
  ]
  if show_all:
+ details = job_tasks[current_task_id].get('details')
  failure_reason = job_tasks[current_task_id]['failure_reason']
  job_values.extend([
  '-',
  '-',
  '-',
  job_tasks[0]['schedule_state'],
- generate_details(failure_reason),
+ generate_details(details, failure_reason),
  ])
  if tasks_have_k8s_user:
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1195,11 +1218,13 @@ def format_job_table(
  submitted = log_utils.readable_time_duration(task['submitted_at'])
  user_values = get_user_column_values(task)
  task_workspace = '-' if len(job_tasks) > 1 else workspace
+ priority = task.get('priority', '-')
  values = [
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
  task['task_id'] if len(job_tasks) > 1 else '-',
  *([task_workspace] if show_workspace else []),
  task['task_name'],
+ str(priority),
  *user_values,
  task['resources'],
  # SUBMITTED
@@ -1244,7 +1269,8 @@ def format_job_table(
  infra.formatted_str(),
  task['cluster_resources'],
  schedule_state,
- generate_details(task.get('details'),
+ task['failure_reason']),
  ])
  if tasks_have_k8s_user:
  values.insert(0, task.get('user', '-'))
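The DETAILS column now follows a fixed precedence: schedule-state info (combined with any failure reason), then a bare failure reason, then '-'. The following is a small standalone sketch of that precedence (not SkyPilot code; the function name and sample strings are illustrative only).

    from typing import Optional

    def job_details(schedule_state: str, failure_reason: Optional[str]) -> str:
        # Mirror dump_managed_job_queue(): derive a human-readable detail string.
        state_details = None
        if schedule_state == 'ALIVE_BACKOFF':
            state_details = 'In backoff, waiting for resources'
        elif schedule_state in ('WAITING', 'ALIVE_WAITING'):
            state_details = 'Waiting for other jobs to launch'
        if state_details and failure_reason:
            return f'{state_details} - {failure_reason}'
        if state_details:
            return state_details
        if failure_reason:
            return f'Failure: {failure_reason}'
        return '-'

    assert job_details('ALIVE_BACKOFF', None) == 'In backoff, waiting for resources'
    assert job_details('ALIVE', 'spot capacity lost') == 'Failure: spot capacity lost'
    assert job_details('ALIVE', None) == '-'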
sky/resources.py CHANGED
@@ -98,7 +98,7 @@ class Resources:
  """
  # If any fields changed, increment the version. For backward compatibility,
  # modify the __setstate__ method to handle the old version.
- _VERSION = 24
+ _VERSION = 25

  def __init__(
  self,
@@ -294,6 +294,8 @@ class Resources:
  }
  else:
  self._image_id = image_id
+ if isinstance(self._cloud, clouds.Kubernetes):
+ _maybe_add_docker_prefix_to_image_id(self._image_id)
  self._is_image_managed = _is_image_managed

  if isinstance(disk_tier, str):
@@ -2075,6 +2077,10 @@ class Resources:
  if version < 24:
  self._volumes = None

+ if version < 25:
+ if isinstance(state.get('_cloud', None), clouds.Kubernetes):
+ _maybe_add_docker_prefix_to_image_id(state['_image_id'])
+
  self.__dict__.update(state)


@@ -2111,3 +2117,12 @@ class LaunchableResources(Resources):
  """
  self.assert_launchable()
  return typing.cast(LaunchableResources, super().copy(**override))
+
+
+ def _maybe_add_docker_prefix_to_image_id(
+ image_id_dict: Optional[Dict[Optional[str], str]]) -> None:
+ if image_id_dict is None:
+ return
+ for k, v in image_id_dict.items():
+ if not v.startswith('docker:'):
+ image_id_dict[k] = f'docker:{v}'
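The new helper normalizes Kubernetes image IDs in place, both for freshly constructed Resources and for objects unpickled from versions older than 25. A standalone sketch of the behavior (not SkyPilot code; the dict contents are illustrative):

    from typing import Dict, Optional

    def add_docker_prefix(image_id: Optional[Dict[Optional[str], str]]) -> None:
        # On Kubernetes, image IDs without a 'docker:' prefix get one added in place.
        if image_id is None:
            return
        for region, image in image_id.items():
            if not image.startswith('docker:'):
                image_id[region] = f'docker:{image}'

    images = {None: 'ubuntu:22.04', 'my-context': 'docker:nvidia/cuda:12.1.0-base'}
    add_docker_prefix(images)
    print(images)
    # {None: 'docker:ubuntu:22.04', 'my-context': 'docker:nvidia/cuda:12.1.0-base'}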
sky/server/common.py CHANGED
@@ -159,7 +159,8 @@ def get_server_url(host: Optional[str] = None) -> str:


  @annotations.lru_cache(scope='global')
- def get_dashboard_url(server_url: str) -> str:
+ def get_dashboard_url(server_url: str,
+ starting_page: Optional[str] = None) -> str:
  # The server_url may include username or password with the
  # format of https://username:password@example.com:8080/path
  # We need to remove the username and password and only
@@ -172,7 +173,10 @@ def get_dashboard_url(server_url: str) -> str:
  if parsed.path:
  dashboard_url = f'{dashboard_url}{parsed.path}'
  dashboard_url = dashboard_url.rstrip('/')
- return f'{dashboard_url}/dashboard'
+ dashboard_url = f'{dashboard_url}/dashboard'
+ if starting_page:
+ dashboard_url = f'{dashboard_url}/{starting_page}'
+ return dashboard_url
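get_dashboard_url() now appends an optional starting_page after the /dashboard path. Since the credential-stripping part of the function is not shown in this hunk, the following is only a standalone sketch of the resulting URL shape, reconstructed from the comment and the lines above; the 'jobs/42' starting page is a hypothetical example.

    from typing import Optional
    from urllib.parse import urlparse

    def dashboard_url(server_url: str, starting_page: Optional[str] = None) -> str:
        parsed = urlparse(server_url)
        # Drop any username:password@ from the netloc, keep scheme, host, and port.
        netloc = parsed.hostname or ''
        if parsed.port:
            netloc = f'{netloc}:{parsed.port}'
        url = f'{parsed.scheme}://{netloc}'
        if parsed.path:
            url = f'{url}{parsed.path}'
        url = url.rstrip('/') + '/dashboard'
        if starting_page:
            url = f'{url}/{starting_page}'
        return url

    print(dashboard_url('https://user:pass@example.com:8080/sky', 'jobs/42'))
    # https://example.com:8080/sky/dashboard/jobs/42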
sky/server/html/token_page.html CHANGED
@@ -100,6 +100,9 @@
  color: #5f6368;
  margin-top: 30px;
  }
+ .local-port-info {
+ display: none;
+ }
  </style>
  </head>
  <body>
@@ -114,14 +117,18 @@
  <path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
  </svg>
  </div>
- <h1>Sign in to SkyPilot CLI</h1>
+ <h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
+ <h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
  <p class="user-identifier">USER_PLACEHOLDER</p>
- <p>You are seeing this page because a SkyPilot command requires authentication.</p>
- <p>Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
- <div id="token-box" class="code-block">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
- <button id="copy-btn" class="copy-button">Copy Token</button>
+ <!-- display token info by default -->
+ <p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
+ <p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
+ <div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
+ <button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
+ <p class="footer-text no-local-port">You can close this tab after copying the token.</p>

- <p class="footer-text">You can close this tab after copying the token.</p>
+ <!-- don't display local port info unless successful -->
+ <p class="local-port-info">You can now close this tab.</p>
  </div>

  <script>
@@ -154,6 +161,25 @@
  copyBtn.textContent = 'Copy Token';
  }, 2000);
  });
+
+ function hideTokenInfo() {
+ const noLocalPortElems = document.querySelectorAll('.no-local-port');
+ noLocalPortElems.forEach(elem => {
+ elem.style.display = 'none';
+ });
+ const localPortInfoElems = document.querySelectorAll('.local-port-info');
+ localPortInfoElems.forEach(elem => {
+ elem.classList.remove('local-port-info');
+ });
+ }
+
+ if (window.location.search.includes('local_port=')) {
+ const uri = `http://localhost:${window.location.search.split('local_port=')[1]}`;
+ fetch(uri, {
+ method: 'POST',
+ body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
+ }).then(hideTokenInfo)
+ }
  </script>
  </body>
  </html>
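When the token page is opened with ?local_port=<N>, the script above POSTs the token to http://localhost:<N> and only then switches to the "you can now close this tab" view. The CLI side of this hand-off lives in the new sky/client/oauth.py, whose contents are not shown in this diff; the snippet below is therefore only a hypothetical sketch of a local receiver, with illustrative names, not the actual client implementation.

    import http.server
    import threading

    received = {}

    class _TokenHandler(http.server.BaseHTTPRequestHandler):

        def do_POST(self):
            # The token page sends the raw token string as the request body.
            length = int(self.headers.get('Content-Length', 0))
            received['token'] = self.rfile.read(length).decode()
            self.send_response(200)
            # Allow the cross-origin fetch() from the token page to succeed.
            self.send_header('Access-Control-Allow-Origin', '*')
            self.end_headers()

        def log_message(self, *args):  # keep CLI output quiet
            pass

    server = http.server.HTTPServer(('localhost', 0), _TokenHandler)
    local_port = server.server_address[1]
    threading.Thread(target=server.serve_forever, daemon=True).start()
    # A client would now open f'{server_url}/token?local_port={local_port}' in a
    # browser and wait for received['token'], falling back to the copy/paste
    # flow on timeout.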
sky/server/server.py CHANGED
@@ -272,7 +272,9 @@ app.include_router(workspaces_rest.router,


  @app.get('/token')
- async def token(request: fastapi.Request) -> fastapi.responses.HTMLResponse:
+ async def token(request: fastapi.Request,
+ local_port: Optional[int] = None) -> fastapi.responses.Response:
+ del local_port # local_port is used by the served js, but ignored by server
  user = _get_auth_user_header(request)

  token_data = {
sky/setup_files/dependencies.py CHANGED
@@ -118,7 +118,13 @@ extras_require: Dict[str, List[str]] = {
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
  # parameter for stopping instances. Reference:
  # https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
- 'gcp': ['google-api-python-client>=2.69.0', 'google-cloud-storage'],
+ 'gcp': [
+ 'google-api-python-client>=2.69.0',
+ 'google-cloud-storage',
+ # see https://github.com/conda/conda/issues/13619
+ # see https://github.com/googleapis/google-api-python-client/issues/2554
+ 'pyopenssl >= 23.2.0, <24.3.0',
+ ],
  'ibm': [
  'ibm-cloud-sdk-core',
  'ibm-vpc',
sky/skylet/constants.py CHANGED
@@ -89,7 +89,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
  # cluster yaml is updated.
  #
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
- SKYLET_VERSION = '12'
+ SKYLET_VERSION = '13'
  # The version of the lib files that skylet/jobs use. Whenever there is an API
  # change for the job_lib or log_lib, we need to bump this version, so that the
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/task.py CHANGED
@@ -292,6 +292,8 @@ class Task:
  self.resources: Union[List[sky.Resources],
  Set[sky.Resources]] = {sky.Resources()}
  self._service: Optional[service_spec.SkyServiceSpec] = None
+ # The priority of the managed job running this task.
+ self._job_priority: Optional[int] = None
  # Resources that this task cannot run on.
  self.blocked_resources = blocked_resources

@@ -629,6 +631,10 @@
  service = service_spec.SkyServiceSpec.from_yaml_config(service)
  task.set_service(service)

+ job = config.pop('job', None)
+ if job is not None and 'priority' in job:
+ task.set_job_priority(job['priority'])
+
  assert not config, f'Invalid task args: {config.keys()}'
  return task

@@ -831,6 +837,23 @@
  self._service = service
  return self

+ @property
+ def job_priority(self) -> Optional[int]:
+ """The priority of the managed job running this task."""
+ return self._job_priority
+
+ def set_job_priority(self, priority: int) -> 'Task':
+ """Sets the job priority for this task.
+
+ Args:
+ priority: an integer between 0 and 1000.
+
+ Returns:
+ self: The current task, with job priority set.
+ """
+ self._job_priority = priority
+ return self
+
  def set_time_estimator(self, func: Callable[['sky.Resources'],
  int]) -> 'Task':
  """Sets a func mapping resources to estimated time (secs).
@@ -1274,6 +1297,9 @@
  if self.service is not None:
  add_if_not_none('service', self.service.to_yaml_config())

+ if self.job_priority is not None:
+ add_if_not_none('job', {'priority': self.job_priority})
+
  add_if_not_none('num_nodes', self.num_nodes)

  if self.inputs is not None:
sky/templates/jobs-controller.yaml.j2 CHANGED
@@ -66,7 +66,8 @@ run: |
  # managed_job_codegen.set_pending() before we get here.
  python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
  --job-id $SKYPILOT_INTERNAL_JOB_ID \
- --env-file {{remote_env_file_path}}
+ --env-file {{remote_env_file_path}} \
+ --priority {{priority}}

  envs:
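End to end, the priority flows from the task ('job: priority:' in the YAML or Task.set_job_priority), through the controller template's --priority flag, into job_info.priority, where get_waiting_job() in sky/jobs/state.py uses it for ordering. A sketch of the Task API added above (the sample name, run command, and value are illustrative; per get_waiting_job(), numerically smaller values are scheduled first and the default is 500):

    import sky

    task = sky.Task(name='train', run='python train.py')
    task.set_job_priority(100)           # 0-1000, per the set_job_priority docstring
    print(task.to_yaml_config()['job'])  # {'priority': 100}
    # sky.jobs.launch(task)              # this managed job would be scheduled ahead
                                         # of default-priority (500) jobs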