skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +99 -16
  3. sky/authentication.py +54 -7
  4. sky/backends/backend_utils.py +35 -22
  5. sky/backends/cloud_vm_ray_backend.py +30 -15
  6. sky/check.py +1 -1
  7. sky/cli.py +20 -8
  8. sky/client/cli.py +20 -8
  9. sky/client/oauth.py +82 -0
  10. sky/client/sdk.py +60 -10
  11. sky/clouds/nebius.py +55 -14
  12. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
  18. sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
  21. sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
  24. sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
  26. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
  29. sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
  35. sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
  36. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  37. sky/dashboard/out/clusters/[cluster].html +1 -1
  38. sky/dashboard/out/clusters.html +1 -1
  39. sky/dashboard/out/config.html +1 -1
  40. sky/dashboard/out/index.html +1 -1
  41. sky/dashboard/out/infra/[context].html +1 -0
  42. sky/dashboard/out/infra.html +1 -1
  43. sky/dashboard/out/jobs/[job].html +1 -1
  44. sky/dashboard/out/jobs.html +1 -1
  45. sky/dashboard/out/users.html +1 -1
  46. sky/dashboard/out/workspace/new.html +1 -1
  47. sky/dashboard/out/workspaces/[name].html +1 -1
  48. sky/dashboard/out/workspaces.html +1 -1
  49. sky/exceptions.py +11 -1
  50. sky/global_user_state.py +149 -1
  51. sky/jobs/client/sdk.py +1 -0
  52. sky/jobs/constants.py +3 -1
  53. sky/jobs/controller.py +3 -5
  54. sky/jobs/recovery_strategy.py +148 -102
  55. sky/jobs/scheduler.py +23 -8
  56. sky/jobs/server/core.py +16 -0
  57. sky/jobs/state.py +153 -39
  58. sky/jobs/utils.py +33 -5
  59. sky/provision/kubernetes/utils.py +2 -1
  60. sky/provision/provisioner.py +15 -10
  61. sky/resources.py +16 -1
  62. sky/serve/controller.py +10 -7
  63. sky/serve/replica_managers.py +22 -18
  64. sky/serve/service.py +5 -4
  65. sky/server/common.py +11 -4
  66. sky/server/html/token_page.html +32 -6
  67. sky/server/server.py +3 -1
  68. sky/server/stream_utils.py +21 -0
  69. sky/setup_files/dependencies.py +7 -1
  70. sky/skylet/constants.py +1 -1
  71. sky/task.py +26 -0
  72. sky/templates/jobs-controller.yaml.j2 +2 -1
  73. sky/templates/kubernetes-ray.yml.j2 +19 -1
  74. sky/utils/common_utils.py +66 -0
  75. sky/utils/rich_utils.py +5 -0
  76. sky/utils/schemas.py +32 -1
  77. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
  78. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
  79. sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
  81. sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
  82. sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
  83. sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
  84. sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
  90. sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
  92. sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
  93. /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
  94. /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
  95. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
  96. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
  97. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
  98. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py CHANGED
@@ -91,6 +91,7 @@ def launch(
91
91
  dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
92
92
 
93
93
  task_names = set()
94
+ priority = None
94
95
  for task_ in dag.tasks:
95
96
  if task_.name in task_names:
96
97
  with ux_utils.print_exception_no_traceback():
@@ -100,6 +101,20 @@ def launch(
100
101
  'name only and comment out the task names (so that they '
101
102
  'will be auto-generated) .')
102
103
  task_names.add(task_.name)
104
+ if task_.job_priority is not None:
105
+ if (priority is not None and priority != task_.job_priority):
106
+ with ux_utils.print_exception_no_traceback():
107
+ raise ValueError(
108
+ 'Multiple tasks in the DAG have different priorities. '
109
+ 'Either specify a priority in only one task, or set '
110
+ 'the same priority for each task.')
111
+ priority = task_.job_priority
112
+
113
+ if priority is None:
114
+ priority = managed_job_constants.DEFAULT_PRIORITY
115
+
116
+ if priority < 0 or priority > 1000:
117
+ raise ValueError(f'Priority must be between 0 and 1000, got {priority}')
103
118
 
104
119
  dag_utils.fill_default_config_in_dag_for_job_launch(dag)
105
120
 
@@ -186,6 +201,7 @@ def launch(
186
201
  service_catalog_common.get_modified_catalog_file_mounts(),
187
202
  'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
188
203
  'dashboard_user_id': common.SERVER_ID,
204
+ 'priority': priority,
189
205
  **controller_utils.shared_controller_vars_to_fill(
190
206
  controller,
191
207
  remote_user_config_path=remote_user_config_path,
sky/jobs/state.py CHANGED
@@ -120,7 +120,9 @@ def create_table(cursor, conn):
120
120
  dag_yaml_path TEXT,
121
121
  env_file_path TEXT,
122
122
  user_hash TEXT,
123
- workspace TEXT DEFAULT NULL)""")
123
+ workspace TEXT DEFAULT NULL,
124
+ priority INTEGER DEFAULT 500,
125
+ entrypoint TEXT DEFAULT NULL)""")
124
126
 
125
127
  db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
126
128
  'TEXT')
@@ -142,6 +144,15 @@ def create_table(cursor, conn):
142
144
  'workspace',
143
145
  'TEXT DEFAULT NULL',
144
146
  value_to_replace_existing_entries='default')
147
+
148
+ db_utils.add_column_to_table(cursor,
149
+ conn,
150
+ 'job_info',
151
+ 'priority',
152
+ 'INTEGER',
153
+ value_to_replace_existing_entries=500)
154
+
155
+ db_utils.add_column_to_table(cursor, conn, 'job_info', 'entrypoint', 'TEXT')
145
156
  conn.commit()
146
157
 
147
158
 
@@ -199,6 +210,8 @@ columns = [
199
210
  'env_file_path',
200
211
  'user_hash',
201
212
  'workspace',
213
+ 'priority',
214
+ 'entrypoint',
202
215
  ]
203
216
 
204
217
 
@@ -215,7 +228,7 @@ class ManagedJobStatus(enum.Enum):
215
228
  reset to INIT or SETTING_UP multiple times (depending on the preemptions).
216
229
 
217
230
  However, a managed job only has one ManagedJobStatus on the jobs controller.
218
- ManagedJobStatus = [PENDING, SUBMITTED, STARTING, RUNNING, ...]
231
+ ManagedJobStatus = [PENDING, STARTING, RUNNING, ...]
219
232
  Mapping from JobStatus to ManagedJobStatus:
220
233
  INIT -> STARTING/RECOVERING
221
234
  SETTING_UP -> RUNNING
@@ -235,10 +248,14 @@ class ManagedJobStatus(enum.Enum):
235
248
  # PENDING: Waiting for the jobs controller to have a slot to run the
236
249
  # controller process.
237
250
  PENDING = 'PENDING'
251
+ # SUBMITTED: This state used to be briefly set before immediately changing
252
+ # to STARTING. Its use was removed in #5682. We keep it for backwards
253
+ # compatibility, so we can still parse old jobs databases that may have jobs
254
+ # in this state.
255
+ # TODO(cooperc): remove this in v0.12.0
256
+ DEPRECATED_SUBMITTED = 'SUBMITTED'
238
257
  # The submitted_at timestamp of the managed job in the 'spot' table will be
239
258
  # set to the time when the job controller begins running.
240
- # SUBMITTED: The jobs controller starts the controller process.
241
- SUBMITTED = 'SUBMITTED'
242
259
  # STARTING: The controller process is launching the cluster for the managed
243
260
  # job.
244
261
  STARTING = 'STARTING'
@@ -314,7 +331,6 @@ class ManagedJobStatus(enum.Enum):
314
331
 
315
332
  _SPOT_STATUS_TO_COLOR = {
316
333
  ManagedJobStatus.PENDING: colorama.Fore.BLUE,
317
- ManagedJobStatus.SUBMITTED: colorama.Fore.BLUE,
318
334
  ManagedJobStatus.STARTING: colorama.Fore.BLUE,
319
335
  ManagedJobStatus.RUNNING: colorama.Fore.GREEN,
320
336
  ManagedJobStatus.RECOVERING: colorama.Fore.CYAN,
@@ -326,6 +342,8 @@ _SPOT_STATUS_TO_COLOR = {
326
342
  ManagedJobStatus.FAILED_CONTROLLER: colorama.Fore.RED,
327
343
  ManagedJobStatus.CANCELLING: colorama.Fore.YELLOW,
328
344
  ManagedJobStatus.CANCELLED: colorama.Fore.YELLOW,
345
+ # TODO(cooperc): backwards compatibility, remove this in v0.12.0
346
+ ManagedJobStatus.DEPRECATED_SUBMITTED: colorama.Fore.BLUE,
329
347
  }
330
348
 
331
349
 
@@ -342,8 +360,12 @@ class ManagedJobScheduleState(enum.Enum):
342
360
  - LAUNCHING -> ALIVE: The launch attempt was completed. It may have
343
361
  succeeded or failed. The job controller is not allowed to sky.launch again
344
362
  without transitioning to ALIVE_WAITING and then LAUNCHING.
363
+ - LAUNCHING -> ALIVE_BACKOFF: The launch failed to find resources, and is
364
+ in backoff waiting for resources.
345
365
  - ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
346
366
  either for recovery or to launch a subsequent task.
367
+ - ALIVE_BACKOFF -> ALIVE_WAITING: The backoff period has ended, and the job
368
+ controller wants to try to launch again.
347
369
  - ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
348
370
  controller may launch again.
349
371
  - LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
@@ -357,6 +379,7 @@ class ManagedJobScheduleState(enum.Enum):
357
379
  state or vice versa. (In fact, schedule state is defined on the job and
358
380
  status on the task.)
359
381
  - INACTIVE or WAITING should only be seen when a job is PENDING.
382
+ - ALIVE_BACKOFF should only be seen when a job is STARTING.
360
383
  - ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
361
384
  tasks, or needs to retry launching.
362
385
  - LAUNCHING and ALIVE can be seen in many different statuses.
@@ -382,6 +405,9 @@ class ManagedJobScheduleState(enum.Enum):
382
405
  # The job is running sky.launch, or soon will, using a limited number of
383
406
  # allowed launch slots.
384
407
  LAUNCHING = 'LAUNCHING'
408
+ # The job is alive, but is in backoff waiting for resources - a special case
409
+ # of ALIVE.
410
+ ALIVE_BACKOFF = 'ALIVE_BACKOFF'
385
411
  # The controller for the job is running, but it's not currently launching.
386
412
  ALIVE = 'ALIVE'
387
413
  # The job is in a terminal state. (Not necessarily SUCCEEDED.)
@@ -389,14 +415,15 @@ class ManagedJobScheduleState(enum.Enum):
389
415
 
390
416
 
391
417
  # === Status transition functions ===
392
- def set_job_info(job_id: int, name: str, workspace: str):
418
+ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
393
419
  with db_utils.safe_cursor(_DB_PATH) as cursor:
394
420
  cursor.execute(
395
421
  """\
396
422
  INSERT INTO job_info
397
- (spot_job_id, name, schedule_state, workspace)
398
- VALUES (?, ?, ?, ?)""",
399
- (job_id, name, ManagedJobScheduleState.INACTIVE.value, workspace))
423
+ (spot_job_id, name, schedule_state, workspace, entrypoint)
424
+ VALUES (?, ?, ?, ?, ?)""",
425
+ (job_id, name, ManagedJobScheduleState.INACTIVE.value, workspace,
426
+ entrypoint))
400
427
 
401
428
 
402
429
  def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
@@ -411,11 +438,11 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
411
438
  ManagedJobStatus.PENDING.value))
412
439
 
413
440
 
414
- def set_submitted(job_id: int, task_id: int, run_timestamp: str,
415
- submit_time: float, resources_str: str,
416
- specs: Dict[str, Union[str,
417
- int]], callback_func: CallbackType):
418
- """Set the task to submitted.
441
+ def set_starting(job_id: int, task_id: int, run_timestamp: str,
442
+ submit_time: float, resources_str: str,
443
+ specs: Dict[str, Union[str,
444
+ int]], callback_func: CallbackType):
445
+ """Set the task to starting state.
419
446
 
420
447
  Args:
421
448
  job_id: The managed job ID.
@@ -432,6 +459,7 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
432
459
  # make it easier to find them based on one of the values.
433
460
  # Also, using the earlier timestamp should be closer to the term
434
461
  # `submit_at`, which represents the time the managed task is submitted.
462
+ logger.info('Launching the spot cluster...')
435
463
  with db_utils.safe_cursor(_DB_PATH) as cursor:
436
464
  cursor.execute(
437
465
  """\
@@ -445,19 +473,54 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
445
473
  task_id=(?) AND
446
474
  status=(?) AND
447
475
  end_at IS null""",
448
- (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
476
+ (resources_str, submit_time, ManagedJobStatus.STARTING.value,
449
477
  run_timestamp, json.dumps(specs), job_id, task_id,
450
478
  ManagedJobStatus.PENDING.value))
451
479
  if cursor.rowcount != 1:
452
480
  raise exceptions.ManagedJobStatusError(
453
- f'Failed to set the task to submitted. '
481
+ 'Failed to set the task to starting. '
454
482
  f'({cursor.rowcount} rows updated)')
483
+ # SUBMITTED is no longer used, but we keep it for backward compatibility.
484
+ # TODO(cooperc): remove this in v0.12.0
455
485
  callback_func('SUBMITTED')
486
+ callback_func('STARTING')
456
487
 
457
488
 
458
- def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
459
- """Set the task to starting state."""
460
- logger.info('Launching the spot cluster...')
489
+ def set_backoff_pending(job_id: int, task_id: int):
490
+ """Set the task to PENDING state if it is in backoff.
491
+
492
+ This should only be used to transition from STARTING or RECOVERING back to
493
+ PENDING.
494
+ """
495
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
496
+ cursor.execute(
497
+ """\
498
+ UPDATE spot SET status=(?)
499
+ WHERE spot_job_id=(?) AND
500
+ task_id=(?) AND
501
+ status IN (?, ?) AND
502
+ end_at IS null""", (ManagedJobStatus.PENDING.value, job_id, task_id,
503
+ ManagedJobStatus.STARTING.value,
504
+ ManagedJobStatus.RECOVERING.value))
505
+ logger.debug('back to PENDING')
506
+ if cursor.rowcount != 1:
507
+ raise exceptions.ManagedJobStatusError(
508
+ 'Failed to set the task back to pending. '
509
+ f'({cursor.rowcount} rows updated)')
510
+ # Do not call callback_func here, as we don't use the callback for PENDING.
511
+
512
+
513
+ def set_restarting(job_id: int, task_id: int, recovering: bool):
514
+ """Set the task back to STARTING or RECOVERING from PENDING.
515
+
516
+ This should not be used for the initial transition from PENDING to STARTING.
517
+ In that case, use set_starting instead. This function should only be used
518
+ after using set_backoff_pending to transition back to PENDING during
519
+ launch retry backoff.
520
+ """
521
+ target_status = ManagedJobStatus.STARTING.value
522
+ if recovering:
523
+ target_status = ManagedJobStatus.RECOVERING.value
461
524
  with db_utils.safe_cursor(_DB_PATH) as cursor:
462
525
  cursor.execute(
463
526
  """\
@@ -465,13 +528,15 @@ def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
465
528
  WHERE spot_job_id=(?) AND
466
529
  task_id=(?) AND
467
530
  status=(?) AND
468
- end_at IS null""", (ManagedJobStatus.STARTING.value, job_id,
469
- task_id, ManagedJobStatus.SUBMITTED.value))
531
+ end_at IS null""",
532
+ (target_status, job_id, task_id, ManagedJobStatus.PENDING.value))
533
+ logger.debug(f'back to {target_status}')
470
534
  if cursor.rowcount != 1:
471
535
  raise exceptions.ManagedJobStatusError(
472
- f'Failed to set the task to starting. '
536
+ f'Failed to set the task back to {target_status}. '
473
537
  f'({cursor.rowcount} rows updated)')
474
- callback_func('STARTING')
538
+ # Do not call callback_func here, as it should only be invoked for the
539
+ # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
475
540
 
476
541
 
477
542
  def set_started(job_id: int, task_id: int, start_time: float,
@@ -947,6 +1012,21 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
947
1012
  job_dict['schedule_state'])
948
1013
  if job_dict['job_name'] is None:
949
1014
  job_dict['job_name'] = job_dict['task_name']
1015
+
1016
+ # Add YAML content and command for managed jobs
1017
+ dag_yaml_path = job_dict.get('dag_yaml_path')
1018
+ if dag_yaml_path:
1019
+ try:
1020
+ with open(dag_yaml_path, 'r', encoding='utf-8') as f:
1021
+ job_dict['dag_yaml'] = f.read()
1022
+ except (FileNotFoundError, IOError, OSError):
1023
+ job_dict['dag_yaml'] = None
1024
+
1025
+ # Generate a command that could be used to launch this job
1026
+ # Format: sky jobs launch <yaml_path>
1027
+ else:
1028
+ job_dict['dag_yaml'] = None
1029
+
950
1030
  jobs.append(job_dict)
951
1031
  return jobs
952
1032
 
@@ -1004,16 +1084,16 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
1004
1084
 
1005
1085
 
1006
1086
  def scheduler_set_waiting(job_id: int, dag_yaml_path: str, env_file_path: str,
1007
- user_hash: str) -> None:
1087
+ user_hash: str, priority: int) -> None:
1008
1088
  """Do not call without holding the scheduler lock."""
1009
1089
  with db_utils.safe_cursor(_DB_PATH) as cursor:
1010
1090
  updated_count = cursor.execute(
1011
1091
  'UPDATE job_info SET '
1012
1092
  'schedule_state = (?), dag_yaml_path = (?), env_file_path = (?), '
1013
- ' user_hash = (?) '
1093
+ ' user_hash = (?), priority = (?) '
1014
1094
  'WHERE spot_job_id = (?) AND schedule_state = (?)',
1015
1095
  (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
1016
- env_file_path, user_hash, job_id,
1096
+ env_file_path, user_hash, priority, job_id,
1017
1097
  ManagedJobScheduleState.INACTIVE.value)).rowcount
1018
1098
  assert updated_count == 1, (job_id, updated_count)
1019
1099
 
@@ -1043,15 +1123,28 @@ def scheduler_set_alive(job_id: int) -> None:
1043
1123
  assert updated_count == 1, (job_id, updated_count)
1044
1124
 
1045
1125
 
1046
- def scheduler_set_alive_waiting(job_id: int) -> None:
1126
+ def scheduler_set_alive_backoff(job_id: int) -> None:
1047
1127
  """Do not call without holding the scheduler lock."""
1048
1128
  with db_utils.safe_cursor(_DB_PATH) as cursor:
1049
1129
  updated_count = cursor.execute(
1050
1130
  'UPDATE job_info SET '
1051
1131
  'schedule_state = (?) '
1052
1132
  'WHERE spot_job_id = (?) AND schedule_state = (?)',
1133
+ (ManagedJobScheduleState.ALIVE_BACKOFF.value, job_id,
1134
+ ManagedJobScheduleState.LAUNCHING.value)).rowcount
1135
+ assert updated_count == 1, (job_id, updated_count)
1136
+
1137
+
1138
+ def scheduler_set_alive_waiting(job_id: int) -> None:
1139
+ """Do not call without holding the scheduler lock."""
1140
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
1141
+ updated_count = cursor.execute(
1142
+ 'UPDATE job_info SET '
1143
+ 'schedule_state = (?) '
1144
+ 'WHERE spot_job_id = (?) AND schedule_state IN (?, ?)',
1053
1145
  (ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
1054
- ManagedJobScheduleState.ALIVE.value)).rowcount
1146
+ ManagedJobScheduleState.ALIVE.value,
1147
+ ManagedJobScheduleState.ALIVE_BACKOFF.value)).rowcount
1055
1148
  assert updated_count == 1, (job_id, updated_count)
1056
1149
 
1057
1150
 
@@ -1099,32 +1192,53 @@ def get_num_alive_jobs() -> int:
1099
1192
  return cursor.execute(
1100
1193
  'SELECT COUNT(*) '
1101
1194
  'FROM job_info '
1102
- 'WHERE schedule_state IN (?, ?, ?)',
1195
+ 'WHERE schedule_state IN (?, ?, ?, ?)',
1103
1196
  (ManagedJobScheduleState.ALIVE_WAITING.value,
1104
1197
  ManagedJobScheduleState.LAUNCHING.value,
1105
- ManagedJobScheduleState.ALIVE.value)).fetchone()[0]
1198
+ ManagedJobScheduleState.ALIVE.value,
1199
+ ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()[0]
1106
1200
 
1107
1201
 
1108
1202
  def get_waiting_job() -> Optional[Dict[str, Any]]:
1109
1203
  """Get the next job that should transition to LAUNCHING.
1110
1204
 
1205
+ Selects the highest-priority (lowest numerical value) WAITING or
1206
+ ALIVE_WAITING job, provided its priority value is less than or equal to any
1207
+ currently LAUNCHING or ALIVE_BACKOFF job.
1208
+
1111
1209
  Backwards compatibility note: jobs submitted before #4485 will have no
1112
1210
  schedule_state and will be ignored by this SQL query.
1113
1211
  """
1114
1212
  with db_utils.safe_cursor(_DB_PATH) as cursor:
1115
- row = cursor.execute(
1213
+ # Get the highest-priority (lowest numerical value) WAITING or
1214
+ # ALIVE_WAITING job whose priority value is less than or equal to
1215
+ # the highest priority (numerically smallest) LAUNCHING or
1216
+ # ALIVE_BACKOFF job's priority.
1217
+ waiting_job_row = cursor.execute(
1116
1218
  'SELECT spot_job_id, schedule_state, dag_yaml_path, env_file_path '
1117
1219
  'FROM job_info '
1118
- 'WHERE schedule_state in (?, ?) '
1119
- 'ORDER BY spot_job_id LIMIT 1',
1220
+ 'WHERE schedule_state IN (?, ?) '
1221
+ 'AND priority <= COALESCE('
1222
+ ' (SELECT MIN(priority) '
1223
+ ' FROM job_info '
1224
+ ' WHERE schedule_state IN (?, ?)), '
1225
+ ' 1000'
1226
+ ')'
1227
+ 'ORDER BY priority ASC, spot_job_id ASC LIMIT 1',
1120
1228
  (ManagedJobScheduleState.WAITING.value,
1121
- ManagedJobScheduleState.ALIVE_WAITING.value)).fetchone()
1229
+ ManagedJobScheduleState.ALIVE_WAITING.value,
1230
+ ManagedJobScheduleState.LAUNCHING.value,
1231
+ ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()
1232
+
1233
+ if waiting_job_row is None:
1234
+ return None
1235
+
1122
1236
  return {
1123
- 'job_id': row[0],
1124
- 'schedule_state': ManagedJobScheduleState(row[1]),
1125
- 'dag_yaml_path': row[2],
1126
- 'env_file_path': row[3],
1127
- } if row is not None else None
1237
+ 'job_id': waiting_job_row[0],
1238
+ 'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
1239
+ 'dag_yaml_path': waiting_job_row[2],
1240
+ 'env_file_path': waiting_job_row[3],
1241
+ }
1128
1242
 
1129
1243
 
1130
1244
  def get_workspace(job_id: int) -> str:
sky/jobs/utils.py CHANGED
@@ -953,6 +953,22 @@ def dump_managed_job_queue() -> str:
953
953
  job['region'] = '-'
954
954
  job['zone'] = '-'
955
955
 
956
+ # Add details about schedule state / backoff.
957
+ state_details = None
958
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
959
+ state_details = 'In backoff, waiting for resources'
960
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
961
+ state_details = 'Waiting for other jobs to launch'
962
+
963
+ if state_details and job['failure_reason']:
964
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
965
+ elif state_details:
966
+ job['details'] = state_details
967
+ elif job['failure_reason']:
968
+ job['details'] = f'Failure: {job["failure_reason"]}'
969
+ else:
970
+ job['details'] = None
971
+
956
972
  return message_utils.encode_payload(jobs)
957
973
 
958
974
 
@@ -981,7 +997,7 @@ def _get_job_status_from_tasks(
981
997
  # Use the first non-succeeded status.
982
998
  if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
983
999
  # TODO(zhwu): we should not blindly use the first non-
984
- # succeeded as the status could be changed to SUBMITTED
1000
+ # succeeded as the status could be changed to PENDING
985
1001
  # when going from one task to the next one, which can be
986
1002
  # confusing.
987
1003
  break
@@ -1063,6 +1079,7 @@ def format_job_table(
1063
1079
  'TASK',
1064
1080
  *(['WORKSPACE'] if show_workspace else []),
1065
1081
  'NAME',
1082
+ 'PRIORITY',
1066
1083
  *user_cols,
1067
1084
  'REQUESTED',
1068
1085
  'SUBMITTED',
@@ -1092,7 +1109,10 @@ def format_job_table(
1092
1109
  # by the task_id.
1093
1110
  jobs[get_hash(task)].append(task)
1094
1111
 
1095
- def generate_details(failure_reason: Optional[str]) -> str:
1112
+ def generate_details(details: Optional[str],
1113
+ failure_reason: Optional[str]) -> str:
1114
+ if details is not None:
1115
+ return details
1096
1116
  if failure_reason is not None:
1097
1117
  return f'Failure: {failure_reason}'
1098
1118
  return '-'
@@ -1131,6 +1151,7 @@ def format_job_table(
1131
1151
  submitted_at = None
1132
1152
  end_at: Optional[int] = 0
1133
1153
  recovery_cnt = 0
1154
+ priority = job_tasks[0].get('priority', '-')
1134
1155
  managed_job_status, current_task_id = _get_job_status_from_tasks(
1135
1156
  job_tasks)
1136
1157
  for task in job_tasks:
@@ -1166,6 +1187,7 @@ def format_job_table(
1166
1187
  '',
1167
1188
  *([''] if show_workspace else []),
1168
1189
  job_name,
1190
+ str(priority),
1169
1191
  *user_values,
1170
1192
  '-',
1171
1193
  submitted,
@@ -1175,13 +1197,14 @@ def format_job_table(
1175
1197
  status_str,
1176
1198
  ]
1177
1199
  if show_all:
1200
+ details = job_tasks[current_task_id].get('details')
1178
1201
  failure_reason = job_tasks[current_task_id]['failure_reason']
1179
1202
  job_values.extend([
1180
1203
  '-',
1181
1204
  '-',
1182
1205
  '-',
1183
1206
  job_tasks[0]['schedule_state'],
1184
- generate_details(failure_reason),
1207
+ generate_details(details, failure_reason),
1185
1208
  ])
1186
1209
  if tasks_have_k8s_user:
1187
1210
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1195,11 +1218,13 @@ def format_job_table(
1195
1218
  submitted = log_utils.readable_time_duration(task['submitted_at'])
1196
1219
  user_values = get_user_column_values(task)
1197
1220
  task_workspace = '-' if len(job_tasks) > 1 else workspace
1221
+ priority = task.get('priority', '-')
1198
1222
  values = [
1199
1223
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1200
1224
  task['task_id'] if len(job_tasks) > 1 else '-',
1201
1225
  *([task_workspace] if show_workspace else []),
1202
1226
  task['task_name'],
1227
+ str(priority),
1203
1228
  *user_values,
1204
1229
  task['resources'],
1205
1230
  # SUBMITTED
@@ -1244,7 +1269,8 @@ def format_job_table(
1244
1269
  infra.formatted_str(),
1245
1270
  task['cluster_resources'],
1246
1271
  schedule_state,
1247
- generate_details(task['failure_reason']),
1272
+ generate_details(task.get('details'),
1273
+ task['failure_reason']),
1248
1274
  ])
1249
1275
  if tasks_have_k8s_user:
1250
1276
  values.insert(0, task.get('user', '-'))
@@ -1362,13 +1388,15 @@ class ManagedJobCodeGen:
1362
1388
 
1363
1389
  @classmethod
1364
1390
  def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
1365
- workspace) -> str:
1391
+ workspace: str, entrypoint: str) -> str:
1366
1392
  dag_name = managed_job_dag.name
1367
1393
  # Add the managed job to queue table.
1368
1394
  code = textwrap.dedent(f"""\
1369
1395
  set_job_info_kwargs = {{'workspace': {workspace!r}}}
1370
1396
  if managed_job_version < 4:
1371
1397
  set_job_info_kwargs = {{}}
1398
+ if managed_job_version >= 5:
1399
+ set_job_info_kwargs['entrypoint'] = {entrypoint!r}
1372
1400
  managed_job_state.set_job_info(
1373
1401
  {job_id}, {dag_name!r}, **set_job_info_kwargs)
1374
1402
  """)
sky/provision/kubernetes/utils.py CHANGED
@@ -15,6 +15,7 @@ from urllib.parse import urlparse
15
15
  import sky
16
16
  from sky import clouds
17
17
  from sky import exceptions
18
+ from sky import global_user_state
18
19
  from sky import models
19
20
  from sky import sky_logging
20
21
  from sky import skypilot_config
@@ -2810,7 +2811,7 @@ def set_autodown_annotations(handle: 'backends.CloudVmRayResourceHandle',
2810
2811
  tags = {
2811
2812
  provision_constants.TAG_RAY_CLUSTER_NAME: handle.cluster_name_on_cloud,
2812
2813
  }
2813
- ray_config = common_utils.read_yaml(handle.cluster_yaml)
2814
+ ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
2814
2815
  provider_config = ray_config['provider']
2815
2816
  namespace = get_namespace_from_config(provider_config)
2816
2817
  context = get_context_from_config(provider_config)
sky/provision/provisioner.py CHANGED
@@ -15,6 +15,7 @@ import colorama
15
15
  import sky
16
16
  from sky import clouds
17
17
  from sky import exceptions
18
+ from sky import global_user_state
18
19
  from sky import provision
19
20
  from sky import sky_logging
20
21
  from sky import skypilot_config
@@ -118,7 +119,7 @@ def bulk_provision(
118
119
  Cloud specific exceptions: If the provisioning process failed, cloud-
119
120
  specific exceptions will be raised by the cloud APIs.
120
121
  """
121
- original_config = common_utils.read_yaml(cluster_yaml)
122
+ original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
122
123
  head_node_type = original_config['head_node_type']
123
124
  bootstrap_config = provision_common.ProvisionConfig(
124
125
  provider_config=original_config['provider'],
@@ -413,9 +414,11 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
413
414
 
414
415
  def _post_provision_setup(
415
416
  cloud_name: str, cluster_name: resources_utils.ClusterName,
416
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
417
+ handle_cluster_yaml: str,
418
+ provision_record: provision_common.ProvisionRecord,
417
419
  custom_resource: Optional[str]) -> provision_common.ClusterInfo:
418
- config_from_yaml = common_utils.read_yaml(cluster_yaml)
420
+ config_from_yaml = global_user_state.get_cluster_yaml_dict(
421
+ handle_cluster_yaml)
419
422
  provider_config = config_from_yaml.get('provider')
420
423
  cluster_info = provision.get_cluster_info(cloud_name,
421
424
  provision_record.region,
@@ -446,7 +449,7 @@ def _post_provision_setup(
446
449
  # TODO(suquark): Move wheel build here in future PRs.
447
450
  # We don't set docker_user here, as we are configuring the VM itself.
448
451
  ssh_credentials = backend_utils.ssh_credential_from_yaml(
449
- cluster_yaml, ssh_user=cluster_info.ssh_user)
452
+ handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
450
453
  docker_config = config_from_yaml.get('docker', {})
451
454
 
452
455
  with rich_utils.safe_status(
@@ -657,7 +660,8 @@ def _post_provision_setup(
657
660
  @timeline.event
658
661
  def post_provision_runtime_setup(
659
662
  cloud_name: str, cluster_name: resources_utils.ClusterName,
660
- cluster_yaml: str, provision_record: provision_common.ProvisionRecord,
663
+ handle_cluster_yaml: str,
664
+ provision_record: provision_common.ProvisionRecord,
661
665
  custom_resource: Optional[str],
662
666
  log_dir: str) -> provision_common.ClusterInfo:
663
667
  """Run internal setup commands after provisioning and before user setup.
@@ -675,11 +679,12 @@ def post_provision_runtime_setup(
675
679
  with provision_logging.setup_provision_logging(log_dir):
676
680
  try:
677
681
  logger.debug(_TITLE.format('System Setup After Provision'))
678
- return _post_provision_setup(cloud_name,
679
- cluster_name,
680
- cluster_yaml=cluster_yaml,
681
- provision_record=provision_record,
682
- custom_resource=custom_resource)
682
+ return _post_provision_setup(
683
+ cloud_name,
684
+ cluster_name,
685
+ handle_cluster_yaml=handle_cluster_yaml,
686
+ provision_record=provision_record,
687
+ custom_resource=custom_resource)
683
688
  except Exception: # pylint: disable=broad-except
684
689
  logger.error(
685
690
  ux_utils.error_message(
sky/resources.py CHANGED
@@ -98,7 +98,7 @@ class Resources:
98
98
  """
99
99
  # If any fields changed, increment the version. For backward compatibility,
100
100
  # modify the __setstate__ method to handle the old version.
101
- _VERSION = 24
101
+ _VERSION = 25
102
102
 
103
103
  def __init__(
104
104
  self,
@@ -294,6 +294,8 @@ class Resources:
294
294
  }
295
295
  else:
296
296
  self._image_id = image_id
297
+ if isinstance(self._cloud, clouds.Kubernetes):
298
+ _maybe_add_docker_prefix_to_image_id(self._image_id)
297
299
  self._is_image_managed = _is_image_managed
298
300
 
299
301
  if isinstance(disk_tier, str):
@@ -2075,6 +2077,10 @@ class Resources:
2075
2077
  if version < 24:
2076
2078
  self._volumes = None
2077
2079
 
2080
+ if version < 25:
2081
+ if isinstance(state.get('_cloud', None), clouds.Kubernetes):
2082
+ _maybe_add_docker_prefix_to_image_id(state['_image_id'])
2083
+
2078
2084
  self.__dict__.update(state)
2079
2085
 
2080
2086
 
@@ -2111,3 +2117,12 @@ class LaunchableResources(Resources):
2111
2117
  """
2112
2118
  self.assert_launchable()
2113
2119
  return typing.cast(LaunchableResources, super().copy(**override))
2120
+
2121
+
2122
+ def _maybe_add_docker_prefix_to_image_id(
2123
+ image_id_dict: Optional[Dict[Optional[str], str]]) -> None:
2124
+ if image_id_dict is None:
2125
+ return
2126
+ for k, v in image_id_dict.items():
2127
+ if not v.startswith('docker:'):
2128
+ image_id_dict[k] = f'docker:{v}'