skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250529__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +13 -3
- sky/client/cli.py +13 -3
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/HvNkg7hqKM1p0ptAcdDcF/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-90e5498a5b00ec29.js +6 -0
- sky/dashboard/out/_next/static/chunks/303-2c7b0f7af571710b.js +6 -0
- sky/dashboard/out/_next/static/chunks/{856-62b87c68917b08ed.js → 856-59a1760784c9e770.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-7c48919fe030bc43.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-909f1ceb0fcf1b99.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-d4c6875c88771e17.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6b80e9e0c6aa16a1.js +6 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +1 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +130 -35
- sky/jobs/utils.py +30 -4
- sky/resources.py +16 -1
- sky/server/common.py +6 -2
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/utils/schemas.py +12 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/RECORD +53 -49
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-41738d1896fc02fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → HvNkg7hqKM1p0ptAcdDcF}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -120,7 +120,8 @@ def create_table(cursor, conn):
|
|
120
120
|
dag_yaml_path TEXT,
|
121
121
|
env_file_path TEXT,
|
122
122
|
user_hash TEXT,
|
123
|
-
workspace TEXT DEFAULT NULL
|
123
|
+
workspace TEXT DEFAULT NULL,
|
124
|
+
priority INTEGER DEFAULT 500)""")
|
124
125
|
|
125
126
|
db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
|
126
127
|
'TEXT')
|
@@ -142,6 +143,14 @@ def create_table(cursor, conn):
|
|
142
143
|
'workspace',
|
143
144
|
'TEXT DEFAULT NULL',
|
144
145
|
value_to_replace_existing_entries='default')
|
146
|
+
|
147
|
+
db_utils.add_column_to_table(cursor,
|
148
|
+
conn,
|
149
|
+
'job_info',
|
150
|
+
'priority',
|
151
|
+
'INTEGER',
|
152
|
+
value_to_replace_existing_entries=500)
|
153
|
+
|
145
154
|
conn.commit()
|
146
155
|
|
147
156
|
|
@@ -199,6 +208,7 @@ columns = [
|
|
199
208
|
'env_file_path',
|
200
209
|
'user_hash',
|
201
210
|
'workspace',
|
211
|
+
'priority',
|
202
212
|
]
|
203
213
|
|
204
214
|
|
@@ -215,7 +225,7 @@ class ManagedJobStatus(enum.Enum):
|
|
215
225
|
reset to INIT or SETTING_UP multiple times (depending on the preemptions).
|
216
226
|
|
217
227
|
However, a managed job only has one ManagedJobStatus on the jobs controller.
|
218
|
-
ManagedJobStatus = [PENDING,
|
228
|
+
ManagedJobStatus = [PENDING, STARTING, RUNNING, ...]
|
219
229
|
Mapping from JobStatus to ManagedJobStatus:
|
220
230
|
INIT -> STARTING/RECOVERING
|
221
231
|
SETTING_UP -> RUNNING
|
@@ -235,10 +245,14 @@ class ManagedJobStatus(enum.Enum):
|
|
235
245
|
# PENDING: Waiting for the jobs controller to have a slot to run the
|
236
246
|
# controller process.
|
237
247
|
PENDING = 'PENDING'
|
248
|
+
# SUBMITTED: This state used to be briefly set before immediately changing
|
249
|
+
# to STARTING. Its use was removed in #5682. We keep it for backwards
|
250
|
+
# compatibility, so we can still parse old jobs databases that may have jobs
|
251
|
+
# in this state.
|
252
|
+
# TODO(cooperc): remove this in v0.12.0
|
253
|
+
DEPRECATED_SUBMITTED = 'SUBMITTED'
|
238
254
|
# The submitted_at timestamp of the managed job in the 'spot' table will be
|
239
255
|
# set to the time when the job controller begins running.
|
240
|
-
# SUBMITTED: The jobs controller starts the controller process.
|
241
|
-
SUBMITTED = 'SUBMITTED'
|
242
256
|
# STARTING: The controller process is launching the cluster for the managed
|
243
257
|
# job.
|
244
258
|
STARTING = 'STARTING'
|
@@ -314,7 +328,6 @@ class ManagedJobStatus(enum.Enum):
|
|
314
328
|
|
315
329
|
_SPOT_STATUS_TO_COLOR = {
|
316
330
|
ManagedJobStatus.PENDING: colorama.Fore.BLUE,
|
317
|
-
ManagedJobStatus.SUBMITTED: colorama.Fore.BLUE,
|
318
331
|
ManagedJobStatus.STARTING: colorama.Fore.BLUE,
|
319
332
|
ManagedJobStatus.RUNNING: colorama.Fore.GREEN,
|
320
333
|
ManagedJobStatus.RECOVERING: colorama.Fore.CYAN,
|
@@ -326,6 +339,8 @@ _SPOT_STATUS_TO_COLOR = {
|
|
326
339
|
ManagedJobStatus.FAILED_CONTROLLER: colorama.Fore.RED,
|
327
340
|
ManagedJobStatus.CANCELLING: colorama.Fore.YELLOW,
|
328
341
|
ManagedJobStatus.CANCELLED: colorama.Fore.YELLOW,
|
342
|
+
# TODO(cooperc): backwards compatibility, remove this in v0.12.0
|
343
|
+
ManagedJobStatus.DEPRECATED_SUBMITTED: colorama.Fore.BLUE,
|
329
344
|
}
|
330
345
|
|
331
346
|
|
@@ -342,8 +357,12 @@ class ManagedJobScheduleState(enum.Enum):
|
|
342
357
|
- LAUNCHING -> ALIVE: The launch attempt was completed. It may have
|
343
358
|
succeeded or failed. The job controller is not allowed to sky.launch again
|
344
359
|
without transitioning to ALIVE_WAITING and then LAUNCHING.
|
360
|
+
- LAUNCHING -> ALIVE_BACKOFF: The launch failed to find resources, and is
|
361
|
+
in backoff waiting for resources.
|
345
362
|
- ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
|
346
363
|
either for recovery or to launch a subsequent task.
|
364
|
+
- ALIVE_BACKOFF -> ALIVE_WAITING: The backoff period has ended, and the job
|
365
|
+
controller wants to try to launch again.
|
347
366
|
- ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
|
348
367
|
controller may launch again.
|
349
368
|
- LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
|
@@ -357,6 +376,7 @@ class ManagedJobScheduleState(enum.Enum):
|
|
357
376
|
state or vice versa. (In fact, schedule state is defined on the job and
|
358
377
|
status on the task.)
|
359
378
|
- INACTIVE or WAITING should only be seen when a job is PENDING.
|
379
|
+
- ALIVE_BACKOFF should only be seen when a job is STARTING.
|
360
380
|
- ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
|
361
381
|
tasks, or needs to retry launching.
|
362
382
|
- LAUNCHING and ALIVE can be seen in many different statuses.
|
@@ -382,6 +402,9 @@ class ManagedJobScheduleState(enum.Enum):
|
|
382
402
|
# The job is running sky.launch, or soon will, using a limited number of
|
383
403
|
# allowed launch slots.
|
384
404
|
LAUNCHING = 'LAUNCHING'
|
405
|
+
# The job is alive, but is in backoff waiting for resources - a special case
|
406
|
+
# of ALIVE.
|
407
|
+
ALIVE_BACKOFF = 'ALIVE_BACKOFF'
|
385
408
|
# The controller for the job is running, but it's not currently launching.
|
386
409
|
ALIVE = 'ALIVE'
|
387
410
|
# The job is in a terminal state. (Not necessarily SUCCEEDED.)
|
@@ -411,11 +434,11 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
|
411
434
|
ManagedJobStatus.PENDING.value))
|
412
435
|
|
413
436
|
|
414
|
-
def
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
"""Set the task to
|
437
|
+
def set_starting(job_id: int, task_id: int, run_timestamp: str,
|
438
|
+
submit_time: float, resources_str: str,
|
439
|
+
specs: Dict[str, Union[str,
|
440
|
+
int]], callback_func: CallbackType):
|
441
|
+
"""Set the task to starting state.
|
419
442
|
|
420
443
|
Args:
|
421
444
|
job_id: The managed job ID.
|
@@ -432,6 +455,7 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
432
455
|
# make it easier to find them based on one of the values.
|
433
456
|
# Also, using the earlier timestamp should be closer to the term
|
434
457
|
# `submit_at`, which represents the time the managed task is submitted.
|
458
|
+
logger.info('Launching the spot cluster...')
|
435
459
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
436
460
|
cursor.execute(
|
437
461
|
"""\
|
@@ -445,19 +469,54 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
445
469
|
task_id=(?) AND
|
446
470
|
status=(?) AND
|
447
471
|
end_at IS null""",
|
448
|
-
(resources_str, submit_time, ManagedJobStatus.
|
472
|
+
(resources_str, submit_time, ManagedJobStatus.STARTING.value,
|
449
473
|
run_timestamp, json.dumps(specs), job_id, task_id,
|
450
474
|
ManagedJobStatus.PENDING.value))
|
451
475
|
if cursor.rowcount != 1:
|
452
476
|
raise exceptions.ManagedJobStatusError(
|
453
|
-
|
477
|
+
'Failed to set the task to starting. '
|
454
478
|
f'({cursor.rowcount} rows updated)')
|
479
|
+
# SUBMITTED is no longer used, but we keep it for backward compatibility.
|
480
|
+
# TODO(cooperc): remove this in v0.12.0
|
455
481
|
callback_func('SUBMITTED')
|
482
|
+
callback_func('STARTING')
|
456
483
|
|
457
484
|
|
458
|
-
def
|
459
|
-
"""Set the task to
|
460
|
-
|
485
|
+
def set_backoff_pending(job_id: int, task_id: int):
|
486
|
+
"""Set the task to PENDING state if it is in backoff.
|
487
|
+
|
488
|
+
This should only be used to transition from STARTING or RECOVERING back to
|
489
|
+
PENDING.
|
490
|
+
"""
|
491
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
492
|
+
cursor.execute(
|
493
|
+
"""\
|
494
|
+
UPDATE spot SET status=(?)
|
495
|
+
WHERE spot_job_id=(?) AND
|
496
|
+
task_id=(?) AND
|
497
|
+
status IN (?, ?) AND
|
498
|
+
end_at IS null""", (ManagedJobStatus.PENDING.value, job_id, task_id,
|
499
|
+
ManagedJobStatus.STARTING.value,
|
500
|
+
ManagedJobStatus.RECOVERING.value))
|
501
|
+
logger.debug('back to PENDING')
|
502
|
+
if cursor.rowcount != 1:
|
503
|
+
raise exceptions.ManagedJobStatusError(
|
504
|
+
'Failed to set the task back to pending. '
|
505
|
+
f'({cursor.rowcount} rows updated)')
|
506
|
+
# Do not call callback_func here, as we don't use the callback for PENDING.
|
507
|
+
|
508
|
+
|
509
|
+
def set_restarting(job_id: int, task_id: int, recovering: bool):
|
510
|
+
"""Set the task back to STARTING or RECOVERING from PENDING.
|
511
|
+
|
512
|
+
This should not be used for the initial transition from PENDING to STARTING.
|
513
|
+
In that case, use set_starting instead. This function should only be used
|
514
|
+
after using set_backoff_pending to transition back to PENDING during
|
515
|
+
launch retry backoff.
|
516
|
+
"""
|
517
|
+
target_status = ManagedJobStatus.STARTING.value
|
518
|
+
if recovering:
|
519
|
+
target_status = ManagedJobStatus.RECOVERING.value
|
461
520
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
462
521
|
cursor.execute(
|
463
522
|
"""\
|
@@ -465,13 +524,15 @@ def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
|
|
465
524
|
WHERE spot_job_id=(?) AND
|
466
525
|
task_id=(?) AND
|
467
526
|
status=(?) AND
|
468
|
-
end_at IS null""",
|
469
|
-
|
527
|
+
end_at IS null""",
|
528
|
+
(target_status, job_id, task_id, ManagedJobStatus.PENDING.value))
|
529
|
+
logger.debug(f'back to {target_status}')
|
470
530
|
if cursor.rowcount != 1:
|
471
531
|
raise exceptions.ManagedJobStatusError(
|
472
|
-
f'Failed to set the task to
|
532
|
+
f'Failed to set the task back to {target_status}. '
|
473
533
|
f'({cursor.rowcount} rows updated)')
|
474
|
-
callback_func
|
534
|
+
# Do not call callback_func here, as it should only be invoked for the
|
535
|
+
# initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
|
475
536
|
|
476
537
|
|
477
538
|
def set_started(job_id: int, task_id: int, start_time: float,
|
@@ -1004,16 +1065,16 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
|
|
1004
1065
|
|
1005
1066
|
|
1006
1067
|
def scheduler_set_waiting(job_id: int, dag_yaml_path: str, env_file_path: str,
|
1007
|
-
user_hash: str) -> None:
|
1068
|
+
user_hash: str, priority: int) -> None:
|
1008
1069
|
"""Do not call without holding the scheduler lock."""
|
1009
1070
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1010
1071
|
updated_count = cursor.execute(
|
1011
1072
|
'UPDATE job_info SET '
|
1012
1073
|
'schedule_state = (?), dag_yaml_path = (?), env_file_path = (?), '
|
1013
|
-
' user_hash = (?) '
|
1074
|
+
' user_hash = (?), priority = (?) '
|
1014
1075
|
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
1015
1076
|
(ManagedJobScheduleState.WAITING.value, dag_yaml_path,
|
1016
|
-
env_file_path, user_hash, job_id,
|
1077
|
+
env_file_path, user_hash, priority, job_id,
|
1017
1078
|
ManagedJobScheduleState.INACTIVE.value)).rowcount
|
1018
1079
|
assert updated_count == 1, (job_id, updated_count)
|
1019
1080
|
|
@@ -1043,15 +1104,28 @@ def scheduler_set_alive(job_id: int) -> None:
|
|
1043
1104
|
assert updated_count == 1, (job_id, updated_count)
|
1044
1105
|
|
1045
1106
|
|
1046
|
-
def
|
1107
|
+
def scheduler_set_alive_backoff(job_id: int) -> None:
|
1047
1108
|
"""Do not call without holding the scheduler lock."""
|
1048
1109
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1049
1110
|
updated_count = cursor.execute(
|
1050
1111
|
'UPDATE job_info SET '
|
1051
1112
|
'schedule_state = (?) '
|
1052
1113
|
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
1114
|
+
(ManagedJobScheduleState.ALIVE_BACKOFF.value, job_id,
|
1115
|
+
ManagedJobScheduleState.LAUNCHING.value)).rowcount
|
1116
|
+
assert updated_count == 1, (job_id, updated_count)
|
1117
|
+
|
1118
|
+
|
1119
|
+
def scheduler_set_alive_waiting(job_id: int) -> None:
|
1120
|
+
"""Do not call without holding the scheduler lock."""
|
1121
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1122
|
+
updated_count = cursor.execute(
|
1123
|
+
'UPDATE job_info SET '
|
1124
|
+
'schedule_state = (?) '
|
1125
|
+
'WHERE spot_job_id = (?) AND schedule_state IN (?, ?)',
|
1053
1126
|
(ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
|
1054
|
-
ManagedJobScheduleState.ALIVE.value
|
1127
|
+
ManagedJobScheduleState.ALIVE.value,
|
1128
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value)).rowcount
|
1055
1129
|
assert updated_count == 1, (job_id, updated_count)
|
1056
1130
|
|
1057
1131
|
|
@@ -1099,32 +1173,53 @@ def get_num_alive_jobs() -> int:
|
|
1099
1173
|
return cursor.execute(
|
1100
1174
|
'SELECT COUNT(*) '
|
1101
1175
|
'FROM job_info '
|
1102
|
-
'WHERE schedule_state IN (?, ?, ?)',
|
1176
|
+
'WHERE schedule_state IN (?, ?, ?, ?)',
|
1103
1177
|
(ManagedJobScheduleState.ALIVE_WAITING.value,
|
1104
1178
|
ManagedJobScheduleState.LAUNCHING.value,
|
1105
|
-
ManagedJobScheduleState.ALIVE.value
|
1179
|
+
ManagedJobScheduleState.ALIVE.value,
|
1180
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()[0]
|
1106
1181
|
|
1107
1182
|
|
1108
1183
|
def get_waiting_job() -> Optional[Dict[str, Any]]:
|
1109
1184
|
"""Get the next job that should transition to LAUNCHING.
|
1110
1185
|
|
1186
|
+
Selects the highest-priority (lowest numerical value) WAITING or
|
1187
|
+
ALIVE_WAITING job, provided its priority value is less than or equal to any
|
1188
|
+
currently LAUNCHING or ALIVE_BACKOFF job.
|
1189
|
+
|
1111
1190
|
Backwards compatibility note: jobs submitted before #4485 will have no
|
1112
1191
|
schedule_state and will be ignored by this SQL query.
|
1113
1192
|
"""
|
1114
1193
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1115
|
-
|
1194
|
+
# Get the highest-priority (lowest numerical value) WAITING or
|
1195
|
+
# ALIVE_WAITING job whose priority value is less than or equal to
|
1196
|
+
# the highest priority (numerically smallest) LAUNCHING or
|
1197
|
+
# ALIVE_BACKOFF job's priority.
|
1198
|
+
waiting_job_row = cursor.execute(
|
1116
1199
|
'SELECT spot_job_id, schedule_state, dag_yaml_path, env_file_path '
|
1117
1200
|
'FROM job_info '
|
1118
|
-
'WHERE schedule_state
|
1119
|
-
'
|
1201
|
+
'WHERE schedule_state IN (?, ?) '
|
1202
|
+
'AND priority <= COALESCE('
|
1203
|
+
' (SELECT MIN(priority) '
|
1204
|
+
' FROM job_info '
|
1205
|
+
' WHERE schedule_state IN (?, ?)), '
|
1206
|
+
' 1000'
|
1207
|
+
')'
|
1208
|
+
'ORDER BY priority ASC, spot_job_id ASC LIMIT 1',
|
1120
1209
|
(ManagedJobScheduleState.WAITING.value,
|
1121
|
-
ManagedJobScheduleState.ALIVE_WAITING.value
|
1210
|
+
ManagedJobScheduleState.ALIVE_WAITING.value,
|
1211
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
1212
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()
|
1213
|
+
|
1214
|
+
if waiting_job_row is None:
|
1215
|
+
return None
|
1216
|
+
|
1122
1217
|
return {
|
1123
|
-
'job_id':
|
1124
|
-
'schedule_state': ManagedJobScheduleState(
|
1125
|
-
'dag_yaml_path':
|
1126
|
-
'env_file_path':
|
1127
|
-
}
|
1218
|
+
'job_id': waiting_job_row[0],
|
1219
|
+
'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
|
1220
|
+
'dag_yaml_path': waiting_job_row[2],
|
1221
|
+
'env_file_path': waiting_job_row[3],
|
1222
|
+
}
|
1128
1223
|
|
1129
1224
|
|
1130
1225
|
def get_workspace(job_id: int) -> str:
|
sky/jobs/utils.py
CHANGED
@@ -953,6 +953,22 @@ def dump_managed_job_queue() -> str:
|
|
953
953
|
job['region'] = '-'
|
954
954
|
job['zone'] = '-'
|
955
955
|
|
956
|
+
# Add details about schedule state / backoff.
|
957
|
+
state_details = None
|
958
|
+
if job['schedule_state'] == 'ALIVE_BACKOFF':
|
959
|
+
state_details = 'In backoff, waiting for resources'
|
960
|
+
elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
|
961
|
+
state_details = 'Waiting for other jobs to launch'
|
962
|
+
|
963
|
+
if state_details and job['failure_reason']:
|
964
|
+
job['details'] = f'{state_details} - {job["failure_reason"]}'
|
965
|
+
elif state_details:
|
966
|
+
job['details'] = state_details
|
967
|
+
elif job['failure_reason']:
|
968
|
+
job['details'] = f'Failure: {job["failure_reason"]}'
|
969
|
+
else:
|
970
|
+
job['details'] = None
|
971
|
+
|
956
972
|
return message_utils.encode_payload(jobs)
|
957
973
|
|
958
974
|
|
@@ -981,7 +997,7 @@ def _get_job_status_from_tasks(
|
|
981
997
|
# Use the first non-succeeded status.
|
982
998
|
if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
|
983
999
|
# TODO(zhwu): we should not blindly use the first non-
|
984
|
-
# succeeded as the status could be changed to
|
1000
|
+
# succeeded as the status could be changed to PENDING
|
985
1001
|
# when going from one task to the next one, which can be
|
986
1002
|
# confusing.
|
987
1003
|
break
|
@@ -1063,6 +1079,7 @@ def format_job_table(
|
|
1063
1079
|
'TASK',
|
1064
1080
|
*(['WORKSPACE'] if show_workspace else []),
|
1065
1081
|
'NAME',
|
1082
|
+
'PRIORITY',
|
1066
1083
|
*user_cols,
|
1067
1084
|
'REQUESTED',
|
1068
1085
|
'SUBMITTED',
|
@@ -1092,7 +1109,10 @@ def format_job_table(
|
|
1092
1109
|
# by the task_id.
|
1093
1110
|
jobs[get_hash(task)].append(task)
|
1094
1111
|
|
1095
|
-
def generate_details(
|
1112
|
+
def generate_details(details: Optional[str],
|
1113
|
+
failure_reason: Optional[str]) -> str:
|
1114
|
+
if details is not None:
|
1115
|
+
return details
|
1096
1116
|
if failure_reason is not None:
|
1097
1117
|
return f'Failure: {failure_reason}'
|
1098
1118
|
return '-'
|
@@ -1131,6 +1151,7 @@ def format_job_table(
|
|
1131
1151
|
submitted_at = None
|
1132
1152
|
end_at: Optional[int] = 0
|
1133
1153
|
recovery_cnt = 0
|
1154
|
+
priority = job_tasks[0].get('priority', '-')
|
1134
1155
|
managed_job_status, current_task_id = _get_job_status_from_tasks(
|
1135
1156
|
job_tasks)
|
1136
1157
|
for task in job_tasks:
|
@@ -1166,6 +1187,7 @@ def format_job_table(
|
|
1166
1187
|
'',
|
1167
1188
|
*([''] if show_workspace else []),
|
1168
1189
|
job_name,
|
1190
|
+
str(priority),
|
1169
1191
|
*user_values,
|
1170
1192
|
'-',
|
1171
1193
|
submitted,
|
@@ -1175,13 +1197,14 @@ def format_job_table(
|
|
1175
1197
|
status_str,
|
1176
1198
|
]
|
1177
1199
|
if show_all:
|
1200
|
+
details = job_tasks[current_task_id].get('details')
|
1178
1201
|
failure_reason = job_tasks[current_task_id]['failure_reason']
|
1179
1202
|
job_values.extend([
|
1180
1203
|
'-',
|
1181
1204
|
'-',
|
1182
1205
|
'-',
|
1183
1206
|
job_tasks[0]['schedule_state'],
|
1184
|
-
generate_details(failure_reason),
|
1207
|
+
generate_details(details, failure_reason),
|
1185
1208
|
])
|
1186
1209
|
if tasks_have_k8s_user:
|
1187
1210
|
job_values.insert(0, job_tasks[0].get('user', '-'))
|
@@ -1195,11 +1218,13 @@ def format_job_table(
|
|
1195
1218
|
submitted = log_utils.readable_time_duration(task['submitted_at'])
|
1196
1219
|
user_values = get_user_column_values(task)
|
1197
1220
|
task_workspace = '-' if len(job_tasks) > 1 else workspace
|
1221
|
+
priority = task.get('priority', '-')
|
1198
1222
|
values = [
|
1199
1223
|
task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
|
1200
1224
|
task['task_id'] if len(job_tasks) > 1 else '-',
|
1201
1225
|
*([task_workspace] if show_workspace else []),
|
1202
1226
|
task['task_name'],
|
1227
|
+
str(priority),
|
1203
1228
|
*user_values,
|
1204
1229
|
task['resources'],
|
1205
1230
|
# SUBMITTED
|
@@ -1244,7 +1269,8 @@ def format_job_table(
|
|
1244
1269
|
infra.formatted_str(),
|
1245
1270
|
task['cluster_resources'],
|
1246
1271
|
schedule_state,
|
1247
|
-
generate_details(task
|
1272
|
+
generate_details(task.get('details'),
|
1273
|
+
task['failure_reason']),
|
1248
1274
|
])
|
1249
1275
|
if tasks_have_k8s_user:
|
1250
1276
|
values.insert(0, task.get('user', '-'))
|
sky/resources.py
CHANGED
@@ -98,7 +98,7 @@ class Resources:
|
|
98
98
|
"""
|
99
99
|
# If any fields changed, increment the version. For backward compatibility,
|
100
100
|
# modify the __setstate__ method to handle the old version.
|
101
|
-
_VERSION =
|
101
|
+
_VERSION = 25
|
102
102
|
|
103
103
|
def __init__(
|
104
104
|
self,
|
@@ -294,6 +294,8 @@ class Resources:
|
|
294
294
|
}
|
295
295
|
else:
|
296
296
|
self._image_id = image_id
|
297
|
+
if isinstance(self._cloud, clouds.Kubernetes):
|
298
|
+
_maybe_add_docker_prefix_to_image_id(self._image_id)
|
297
299
|
self._is_image_managed = _is_image_managed
|
298
300
|
|
299
301
|
if isinstance(disk_tier, str):
|
@@ -2075,6 +2077,10 @@ class Resources:
|
|
2075
2077
|
if version < 24:
|
2076
2078
|
self._volumes = None
|
2077
2079
|
|
2080
|
+
if version < 25:
|
2081
|
+
if isinstance(state.get('_cloud', None), clouds.Kubernetes):
|
2082
|
+
_maybe_add_docker_prefix_to_image_id(state['_image_id'])
|
2083
|
+
|
2078
2084
|
self.__dict__.update(state)
|
2079
2085
|
|
2080
2086
|
|
@@ -2111,3 +2117,12 @@ class LaunchableResources(Resources):
|
|
2111
2117
|
"""
|
2112
2118
|
self.assert_launchable()
|
2113
2119
|
return typing.cast(LaunchableResources, super().copy(**override))
|
2120
|
+
|
2121
|
+
|
2122
|
+
def _maybe_add_docker_prefix_to_image_id(
|
2123
|
+
image_id_dict: Optional[Dict[Optional[str], str]]) -> None:
|
2124
|
+
if image_id_dict is None:
|
2125
|
+
return
|
2126
|
+
for k, v in image_id_dict.items():
|
2127
|
+
if not v.startswith('docker:'):
|
2128
|
+
image_id_dict[k] = f'docker:{v}'
|
sky/server/common.py
CHANGED
@@ -159,7 +159,8 @@ def get_server_url(host: Optional[str] = None) -> str:
|
|
159
159
|
|
160
160
|
|
161
161
|
@annotations.lru_cache(scope='global')
|
162
|
-
def get_dashboard_url(server_url: str
|
162
|
+
def get_dashboard_url(server_url: str,
|
163
|
+
starting_page: Optional[str] = None) -> str:
|
163
164
|
# The server_url may include username or password with the
|
164
165
|
# format of https://username:password@example.com:8080/path
|
165
166
|
# We need to remove the username and password and only
|
@@ -172,7 +173,10 @@ def get_dashboard_url(server_url: str) -> str:
|
|
172
173
|
if parsed.path:
|
173
174
|
dashboard_url = f'{dashboard_url}{parsed.path}'
|
174
175
|
dashboard_url = dashboard_url.rstrip('/')
|
175
|
-
|
176
|
+
dashboard_url = f'{dashboard_url}/dashboard'
|
177
|
+
if starting_page:
|
178
|
+
dashboard_url = f'{dashboard_url}/{starting_page}'
|
179
|
+
return dashboard_url
|
176
180
|
|
177
181
|
|
178
182
|
@annotations.lru_cache(scope='global')
|
sky/server/html/token_page.html
CHANGED
@@ -100,6 +100,9 @@
|
|
100
100
|
color: #5f6368;
|
101
101
|
margin-top: 30px;
|
102
102
|
}
|
103
|
+
.local-port-info {
|
104
|
+
display: none;
|
105
|
+
}
|
103
106
|
</style>
|
104
107
|
</head>
|
105
108
|
<body>
|
@@ -114,14 +117,18 @@
|
|
114
117
|
<path d="M16.632 21.3918L15.2651 27.6605L21.3357 25.6091L30.3276 16.6172L16.632 21.3918Z" fill="#39A4DD"/>
|
115
118
|
</svg>
|
116
119
|
</div>
|
117
|
-
<h1>Sign in to SkyPilot CLI</h1>
|
120
|
+
<h1 class="no-local-port">Sign in to SkyPilot CLI</h1>
|
121
|
+
<h1 class="local-port-info">Successfully signed into SkyPilot CLI</h1>
|
118
122
|
<p class="user-identifier">USER_PLACEHOLDER</p>
|
119
|
-
|
120
|
-
<p>
|
121
|
-
<
|
122
|
-
<
|
123
|
+
<!-- display token info by default -->
|
124
|
+
<p class="no-local-port">You are seeing this page because a SkyPilot command requires authentication.</p>
|
125
|
+
<p class="no-local-port">Please copy the following token and paste it into your SkyPilot CLI prompt:</p>
|
126
|
+
<div id="token-box" class="code-block no-local-port">SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER</div>
|
127
|
+
<button id="copy-btn" class="copy-button no-local-port">Copy Token</button>
|
128
|
+
<p class="footer-text no-local-port">You can close this tab after copying the token.</p>
|
123
129
|
|
124
|
-
|
130
|
+
<!-- don't display local port info unless successful -->
|
131
|
+
<p class="local-port-info">You can now close this tab.</p>
|
125
132
|
</div>
|
126
133
|
|
127
134
|
<script>
|
@@ -154,6 +161,25 @@
|
|
154
161
|
copyBtn.textContent = 'Copy Token';
|
155
162
|
}, 2000);
|
156
163
|
});
|
164
|
+
|
165
|
+
function hideTokenInfo() {
|
166
|
+
const noLocalPortElems = document.querySelectorAll('.no-local-port');
|
167
|
+
noLocalPortElems.forEach(elem => {
|
168
|
+
elem.style.display = 'none';
|
169
|
+
});
|
170
|
+
const localPortInfoElems = document.querySelectorAll('.local-port-info');
|
171
|
+
localPortInfoElems.forEach(elem => {
|
172
|
+
elem.classList.remove('local-port-info');
|
173
|
+
});
|
174
|
+
}
|
175
|
+
|
176
|
+
if (window.location.search.includes('local_port=')) {
|
177
|
+
const uri = `http://localhost:${window.location.search.split('local_port=')[1]}`;
|
178
|
+
fetch(uri, {
|
179
|
+
method: 'POST',
|
180
|
+
body: 'SKYPILOT_API_SERVER_USER_TOKEN_PLACEHOLDER'
|
181
|
+
}).then(hideTokenInfo)
|
182
|
+
}
|
157
183
|
</script>
|
158
184
|
</body>
|
159
185
|
</html>
|
sky/server/server.py
CHANGED
@@ -272,7 +272,9 @@ app.include_router(workspaces_rest.router,
|
|
272
272
|
|
273
273
|
|
274
274
|
@app.get('/token')
|
275
|
-
async def token(request: fastapi.Request
|
275
|
+
async def token(request: fastapi.Request,
|
276
|
+
local_port: Optional[int] = None) -> fastapi.responses.Response:
|
277
|
+
del local_port # local_port is used by the served js, but ignored by server
|
276
278
|
user = _get_auth_user_header(request)
|
277
279
|
|
278
280
|
token_data = {
|
sky/setup_files/dependencies.py
CHANGED
@@ -118,7 +118,13 @@ extras_require: Dict[str, List[str]] = {
|
|
118
118
|
# We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
|
119
119
|
# parameter for stopping instances. Reference:
|
120
120
|
# https://github.com/googleapis/google-api-python-client/commit/f6e9d3869ed605b06f7cbf2e8cf2db25108506e6
|
121
|
-
'gcp': [
|
121
|
+
'gcp': [
|
122
|
+
'google-api-python-client>=2.69.0',
|
123
|
+
'google-cloud-storage',
|
124
|
+
# see https://github.com/conda/conda/issues/13619
|
125
|
+
# see https://github.com/googleapis/google-api-python-client/issues/2554
|
126
|
+
'pyopenssl >= 23.2.0, <24.3.0',
|
127
|
+
],
|
122
128
|
'ibm': [
|
123
129
|
'ibm-cloud-sdk-core',
|
124
130
|
'ibm-vpc',
|
sky/skylet/constants.py
CHANGED
@@ -89,7 +89,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
89
89
|
# cluster yaml is updated.
|
90
90
|
#
|
91
91
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
92
|
-
SKYLET_VERSION = '
|
92
|
+
SKYLET_VERSION = '13'
|
93
93
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
94
94
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
95
95
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
sky/task.py
CHANGED
@@ -292,6 +292,8 @@ class Task:
|
|
292
292
|
self.resources: Union[List[sky.Resources],
|
293
293
|
Set[sky.Resources]] = {sky.Resources()}
|
294
294
|
self._service: Optional[service_spec.SkyServiceSpec] = None
|
295
|
+
# The priority of the managed job running this task.
|
296
|
+
self._job_priority: Optional[int] = None
|
295
297
|
# Resources that this task cannot run on.
|
296
298
|
self.blocked_resources = blocked_resources
|
297
299
|
|
@@ -629,6 +631,10 @@ class Task:
|
|
629
631
|
service = service_spec.SkyServiceSpec.from_yaml_config(service)
|
630
632
|
task.set_service(service)
|
631
633
|
|
634
|
+
job = config.pop('job', None)
|
635
|
+
if job is not None and 'priority' in job:
|
636
|
+
task.set_job_priority(job['priority'])
|
637
|
+
|
632
638
|
assert not config, f'Invalid task args: {config.keys()}'
|
633
639
|
return task
|
634
640
|
|
@@ -831,6 +837,23 @@ class Task:
|
|
831
837
|
self._service = service
|
832
838
|
return self
|
833
839
|
|
840
|
+
@property
|
841
|
+
def job_priority(self) -> Optional[int]:
|
842
|
+
"""The priority of the managed job running this task."""
|
843
|
+
return self._job_priority
|
844
|
+
|
845
|
+
def set_job_priority(self, priority: int) -> 'Task':
|
846
|
+
"""Sets the job priority for this task.
|
847
|
+
|
848
|
+
Args:
|
849
|
+
priority: an integer between 0 and 1000.
|
850
|
+
|
851
|
+
Returns:
|
852
|
+
self: The current task, with job priority set.
|
853
|
+
"""
|
854
|
+
self._job_priority = priority
|
855
|
+
return self
|
856
|
+
|
834
857
|
def set_time_estimator(self, func: Callable[['sky.Resources'],
|
835
858
|
int]) -> 'Task':
|
836
859
|
"""Sets a func mapping resources to estimated time (secs).
|
@@ -1274,6 +1297,9 @@ class Task:
|
|
1274
1297
|
if self.service is not None:
|
1275
1298
|
add_if_not_none('service', self.service.to_yaml_config())
|
1276
1299
|
|
1300
|
+
if self.job_priority is not None:
|
1301
|
+
add_if_not_none('job', {'priority': self.job_priority})
|
1302
|
+
|
1277
1303
|
add_if_not_none('num_nodes', self.num_nodes)
|
1278
1304
|
|
1279
1305
|
if self.inputs is not None:
|
sky/templates/jobs-controller.yaml.j2
CHANGED
@@ -66,7 +66,8 @@ run: |
|
|
66
66
|
# managed_job_codegen.set_pending() before we get here.
|
67
67
|
python -u -m sky.jobs.scheduler {{remote_user_yaml_path}} \
|
68
68
|
--job-id $SKYPILOT_INTERNAL_JOB_ID \
|
69
|
-
--env-file {{remote_env_file_path}}
|
69
|
+
--env-file {{remote_env_file_path}} \
|
70
|
+
--priority {{priority}}
|
70
71
|
|
71
72
|
|
72
73
|
envs:
|