skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +99 -16
- sky/authentication.py +54 -7
- sky/backends/backend_utils.py +35 -22
- sky/backends/cloud_vm_ray_backend.py +30 -15
- sky/check.py +1 -1
- sky/cli.py +20 -8
- sky/client/cli.py +20 -8
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/nebius.py +55 -14
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
- sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
- sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
- sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
- sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -1
- sky/global_user_state.py +149 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +153 -39
- sky/jobs/utils.py +33 -5
- sky/provision/kubernetes/utils.py +2 -1
- sky/provision/provisioner.py +15 -10
- sky/resources.py +16 -1
- sky/serve/controller.py +10 -7
- sky/serve/replica_managers.py +22 -18
- sky/serve/service.py +5 -4
- sky/server/common.py +11 -4
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/server/stream_utils.py +21 -0
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/templates/kubernetes-ray.yml.j2 +19 -1
- sky/utils/common_utils.py +66 -0
- sky/utils/rich_utils.py +5 -0
- sky/utils/schemas.py +32 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
- sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
- sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
- /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/jobs/server/core.py
CHANGED
@@ -91,6 +91,7 @@ def launch(
|
|
91
91
|
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
92
92
|
|
93
93
|
task_names = set()
|
94
|
+
priority = None
|
94
95
|
for task_ in dag.tasks:
|
95
96
|
if task_.name in task_names:
|
96
97
|
with ux_utils.print_exception_no_traceback():
|
@@ -100,6 +101,20 @@ def launch(
|
|
100
101
|
'name only and comment out the task names (so that they '
|
101
102
|
'will be auto-generated) .')
|
102
103
|
task_names.add(task_.name)
|
104
|
+
if task_.job_priority is not None:
|
105
|
+
if (priority is not None and priority != task_.job_priority):
|
106
|
+
with ux_utils.print_exception_no_traceback():
|
107
|
+
raise ValueError(
|
108
|
+
'Multiple tasks in the DAG have different priorities. '
|
109
|
+
'Either specify a priority in only one task, or set '
|
110
|
+
'the same priority for each task.')
|
111
|
+
priority = task_.job_priority
|
112
|
+
|
113
|
+
if priority is None:
|
114
|
+
priority = managed_job_constants.DEFAULT_PRIORITY
|
115
|
+
|
116
|
+
if priority < 0 or priority > 1000:
|
117
|
+
raise ValueError(f'Priority must be between 0 and 1000, got {priority}')
|
103
118
|
|
104
119
|
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
105
120
|
|
@@ -186,6 +201,7 @@ def launch(
|
|
186
201
|
service_catalog_common.get_modified_catalog_file_mounts(),
|
187
202
|
'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
|
188
203
|
'dashboard_user_id': common.SERVER_ID,
|
204
|
+
'priority': priority,
|
189
205
|
**controller_utils.shared_controller_vars_to_fill(
|
190
206
|
controller,
|
191
207
|
remote_user_config_path=remote_user_config_path,
|
sky/jobs/state.py
CHANGED
@@ -120,7 +120,9 @@ def create_table(cursor, conn):
|
|
120
120
|
dag_yaml_path TEXT,
|
121
121
|
env_file_path TEXT,
|
122
122
|
user_hash TEXT,
|
123
|
-
workspace TEXT DEFAULT NULL
|
123
|
+
workspace TEXT DEFAULT NULL,
|
124
|
+
priority INTEGER DEFAULT 500,
|
125
|
+
entrypoint TEXT DEFAULT NULL)""")
|
124
126
|
|
125
127
|
db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
|
126
128
|
'TEXT')
|
@@ -142,6 +144,15 @@ def create_table(cursor, conn):
|
|
142
144
|
'workspace',
|
143
145
|
'TEXT DEFAULT NULL',
|
144
146
|
value_to_replace_existing_entries='default')
|
147
|
+
|
148
|
+
db_utils.add_column_to_table(cursor,
|
149
|
+
conn,
|
150
|
+
'job_info',
|
151
|
+
'priority',
|
152
|
+
'INTEGER',
|
153
|
+
value_to_replace_existing_entries=500)
|
154
|
+
|
155
|
+
db_utils.add_column_to_table(cursor, conn, 'job_info', 'entrypoint', 'TEXT')
|
145
156
|
conn.commit()
|
146
157
|
|
147
158
|
|
@@ -199,6 +210,8 @@ columns = [
|
|
199
210
|
'env_file_path',
|
200
211
|
'user_hash',
|
201
212
|
'workspace',
|
213
|
+
'priority',
|
214
|
+
'entrypoint',
|
202
215
|
]
|
203
216
|
|
204
217
|
|
@@ -215,7 +228,7 @@ class ManagedJobStatus(enum.Enum):
|
|
215
228
|
reset to INIT or SETTING_UP multiple times (depending on the preemptions).
|
216
229
|
|
217
230
|
However, a managed job only has one ManagedJobStatus on the jobs controller.
|
218
|
-
ManagedJobStatus = [PENDING,
|
231
|
+
ManagedJobStatus = [PENDING, STARTING, RUNNING, ...]
|
219
232
|
Mapping from JobStatus to ManagedJobStatus:
|
220
233
|
INIT -> STARTING/RECOVERING
|
221
234
|
SETTING_UP -> RUNNING
|
@@ -235,10 +248,14 @@ class ManagedJobStatus(enum.Enum):
|
|
235
248
|
# PENDING: Waiting for the jobs controller to have a slot to run the
|
236
249
|
# controller process.
|
237
250
|
PENDING = 'PENDING'
|
251
|
+
# SUBMITTED: This state used to be briefly set before immediately changing
|
252
|
+
# to STARTING. Its use was removed in #5682. We keep it for backwards
|
253
|
+
# compatibility, so we can still parse old jobs databases that may have jobs
|
254
|
+
# in this state.
|
255
|
+
# TODO(cooperc): remove this in v0.12.0
|
256
|
+
DEPRECATED_SUBMITTED = 'SUBMITTED'
|
238
257
|
# The submitted_at timestamp of the managed job in the 'spot' table will be
|
239
258
|
# set to the time when the job controller begins running.
|
240
|
-
# SUBMITTED: The jobs controller starts the controller process.
|
241
|
-
SUBMITTED = 'SUBMITTED'
|
242
259
|
# STARTING: The controller process is launching the cluster for the managed
|
243
260
|
# job.
|
244
261
|
STARTING = 'STARTING'
|
@@ -314,7 +331,6 @@ class ManagedJobStatus(enum.Enum):
|
|
314
331
|
|
315
332
|
_SPOT_STATUS_TO_COLOR = {
|
316
333
|
ManagedJobStatus.PENDING: colorama.Fore.BLUE,
|
317
|
-
ManagedJobStatus.SUBMITTED: colorama.Fore.BLUE,
|
318
334
|
ManagedJobStatus.STARTING: colorama.Fore.BLUE,
|
319
335
|
ManagedJobStatus.RUNNING: colorama.Fore.GREEN,
|
320
336
|
ManagedJobStatus.RECOVERING: colorama.Fore.CYAN,
|
@@ -326,6 +342,8 @@ _SPOT_STATUS_TO_COLOR = {
|
|
326
342
|
ManagedJobStatus.FAILED_CONTROLLER: colorama.Fore.RED,
|
327
343
|
ManagedJobStatus.CANCELLING: colorama.Fore.YELLOW,
|
328
344
|
ManagedJobStatus.CANCELLED: colorama.Fore.YELLOW,
|
345
|
+
# TODO(cooperc): backwards compatibility, remove this in v0.12.0
|
346
|
+
ManagedJobStatus.DEPRECATED_SUBMITTED: colorama.Fore.BLUE,
|
329
347
|
}
|
330
348
|
|
331
349
|
|
@@ -342,8 +360,12 @@ class ManagedJobScheduleState(enum.Enum):
|
|
342
360
|
- LAUNCHING -> ALIVE: The launch attempt was completed. It may have
|
343
361
|
succeeded or failed. The job controller is not allowed to sky.launch again
|
344
362
|
without transitioning to ALIVE_WAITING and then LAUNCHING.
|
363
|
+
- LAUNCHING -> ALIVE_BACKOFF: The launch failed to find resources, and is
|
364
|
+
in backoff waiting for resources.
|
345
365
|
- ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
|
346
366
|
either for recovery or to launch a subsequent task.
|
367
|
+
- ALIVE_BACKOFF -> ALIVE_WAITING: The backoff period has ended, and the job
|
368
|
+
controller wants to try to launch again.
|
347
369
|
- ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
|
348
370
|
controller may launch again.
|
349
371
|
- LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
|
@@ -357,6 +379,7 @@ class ManagedJobScheduleState(enum.Enum):
|
|
357
379
|
state or vice versa. (In fact, schedule state is defined on the job and
|
358
380
|
status on the task.)
|
359
381
|
- INACTIVE or WAITING should only be seen when a job is PENDING.
|
382
|
+
- ALIVE_BACKOFF should only be seen when a job is STARTING.
|
360
383
|
- ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
|
361
384
|
tasks, or needs to retry launching.
|
362
385
|
- LAUNCHING and ALIVE can be seen in many different statuses.
|
@@ -382,6 +405,9 @@ class ManagedJobScheduleState(enum.Enum):
|
|
382
405
|
# The job is running sky.launch, or soon will, using a limited number of
|
383
406
|
# allowed launch slots.
|
384
407
|
LAUNCHING = 'LAUNCHING'
|
408
|
+
# The job is alive, but is in backoff waiting for resources - a special case
|
409
|
+
# of ALIVE.
|
410
|
+
ALIVE_BACKOFF = 'ALIVE_BACKOFF'
|
385
411
|
# The controller for the job is running, but it's not currently launching.
|
386
412
|
ALIVE = 'ALIVE'
|
387
413
|
# The job is in a terminal state. (Not necessarily SUCCEEDED.)
|
@@ -389,14 +415,15 @@ class ManagedJobScheduleState(enum.Enum):
|
|
389
415
|
|
390
416
|
|
391
417
|
# === Status transition functions ===
|
392
|
-
def set_job_info(job_id: int, name: str, workspace: str):
|
418
|
+
def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
|
393
419
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
394
420
|
cursor.execute(
|
395
421
|
"""\
|
396
422
|
INSERT INTO job_info
|
397
|
-
(spot_job_id, name, schedule_state, workspace)
|
398
|
-
VALUES (?, ?, ?, ?)""",
|
399
|
-
(job_id, name, ManagedJobScheduleState.INACTIVE.value, workspace
|
423
|
+
(spot_job_id, name, schedule_state, workspace, entrypoint)
|
424
|
+
VALUES (?, ?, ?, ?, ?)""",
|
425
|
+
(job_id, name, ManagedJobScheduleState.INACTIVE.value, workspace,
|
426
|
+
entrypoint))
|
400
427
|
|
401
428
|
|
402
429
|
def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
@@ -411,11 +438,11 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
|
411
438
|
ManagedJobStatus.PENDING.value))
|
412
439
|
|
413
440
|
|
414
|
-
def
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
"""Set the task to
|
441
|
+
def set_starting(job_id: int, task_id: int, run_timestamp: str,
|
442
|
+
submit_time: float, resources_str: str,
|
443
|
+
specs: Dict[str, Union[str,
|
444
|
+
int]], callback_func: CallbackType):
|
445
|
+
"""Set the task to starting state.
|
419
446
|
|
420
447
|
Args:
|
421
448
|
job_id: The managed job ID.
|
@@ -432,6 +459,7 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
432
459
|
# make it easier to find them based on one of the values.
|
433
460
|
# Also, using the earlier timestamp should be closer to the term
|
434
461
|
# `submit_at`, which represents the time the managed task is submitted.
|
462
|
+
logger.info('Launching the spot cluster...')
|
435
463
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
436
464
|
cursor.execute(
|
437
465
|
"""\
|
@@ -445,19 +473,54 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
445
473
|
task_id=(?) AND
|
446
474
|
status=(?) AND
|
447
475
|
end_at IS null""",
|
448
|
-
(resources_str, submit_time, ManagedJobStatus.
|
476
|
+
(resources_str, submit_time, ManagedJobStatus.STARTING.value,
|
449
477
|
run_timestamp, json.dumps(specs), job_id, task_id,
|
450
478
|
ManagedJobStatus.PENDING.value))
|
451
479
|
if cursor.rowcount != 1:
|
452
480
|
raise exceptions.ManagedJobStatusError(
|
453
|
-
|
481
|
+
'Failed to set the task to starting. '
|
454
482
|
f'({cursor.rowcount} rows updated)')
|
483
|
+
# SUBMITTED is no longer used, but we keep it for backward compatibility.
|
484
|
+
# TODO(cooperc): remove this in v0.12.0
|
455
485
|
callback_func('SUBMITTED')
|
486
|
+
callback_func('STARTING')
|
456
487
|
|
457
488
|
|
458
|
-
def
|
459
|
-
"""Set the task to
|
460
|
-
|
489
|
+
def set_backoff_pending(job_id: int, task_id: int):
|
490
|
+
"""Set the task to PENDING state if it is in backoff.
|
491
|
+
|
492
|
+
This should only be used to transition from STARTING or RECOVERING back to
|
493
|
+
PENDING.
|
494
|
+
"""
|
495
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
496
|
+
cursor.execute(
|
497
|
+
"""\
|
498
|
+
UPDATE spot SET status=(?)
|
499
|
+
WHERE spot_job_id=(?) AND
|
500
|
+
task_id=(?) AND
|
501
|
+
status IN (?, ?) AND
|
502
|
+
end_at IS null""", (ManagedJobStatus.PENDING.value, job_id, task_id,
|
503
|
+
ManagedJobStatus.STARTING.value,
|
504
|
+
ManagedJobStatus.RECOVERING.value))
|
505
|
+
logger.debug('back to PENDING')
|
506
|
+
if cursor.rowcount != 1:
|
507
|
+
raise exceptions.ManagedJobStatusError(
|
508
|
+
'Failed to set the task back to pending. '
|
509
|
+
f'({cursor.rowcount} rows updated)')
|
510
|
+
# Do not call callback_func here, as we don't use the callback for PENDING.
|
511
|
+
|
512
|
+
|
513
|
+
def set_restarting(job_id: int, task_id: int, recovering: bool):
|
514
|
+
"""Set the task back to STARTING or RECOVERING from PENDING.
|
515
|
+
|
516
|
+
This should not be used for the initial transition from PENDING to STARTING.
|
517
|
+
In that case, use set_starting instead. This function should only be used
|
518
|
+
after using set_backoff_pending to transition back to PENDING during
|
519
|
+
launch retry backoff.
|
520
|
+
"""
|
521
|
+
target_status = ManagedJobStatus.STARTING.value
|
522
|
+
if recovering:
|
523
|
+
target_status = ManagedJobStatus.RECOVERING.value
|
461
524
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
462
525
|
cursor.execute(
|
463
526
|
"""\
|
@@ -465,13 +528,15 @@ def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
|
|
465
528
|
WHERE spot_job_id=(?) AND
|
466
529
|
task_id=(?) AND
|
467
530
|
status=(?) AND
|
468
|
-
end_at IS null""",
|
469
|
-
|
531
|
+
end_at IS null""",
|
532
|
+
(target_status, job_id, task_id, ManagedJobStatus.PENDING.value))
|
533
|
+
logger.debug(f'back to {target_status}')
|
470
534
|
if cursor.rowcount != 1:
|
471
535
|
raise exceptions.ManagedJobStatusError(
|
472
|
-
f'Failed to set the task to
|
536
|
+
f'Failed to set the task back to {target_status}. '
|
473
537
|
f'({cursor.rowcount} rows updated)')
|
474
|
-
callback_func
|
538
|
+
# Do not call callback_func here, as it should only be invoked for the
|
539
|
+
# initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
|
475
540
|
|
476
541
|
|
477
542
|
def set_started(job_id: int, task_id: int, start_time: float,
|
@@ -947,6 +1012,21 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
947
1012
|
job_dict['schedule_state'])
|
948
1013
|
if job_dict['job_name'] is None:
|
949
1014
|
job_dict['job_name'] = job_dict['task_name']
|
1015
|
+
|
1016
|
+
# Add YAML content and command for managed jobs
|
1017
|
+
dag_yaml_path = job_dict.get('dag_yaml_path')
|
1018
|
+
if dag_yaml_path:
|
1019
|
+
try:
|
1020
|
+
with open(dag_yaml_path, 'r', encoding='utf-8') as f:
|
1021
|
+
job_dict['dag_yaml'] = f.read()
|
1022
|
+
except (FileNotFoundError, IOError, OSError):
|
1023
|
+
job_dict['dag_yaml'] = None
|
1024
|
+
|
1025
|
+
# Generate a command that could be used to launch this job
|
1026
|
+
# Format: sky jobs launch <yaml_path>
|
1027
|
+
else:
|
1028
|
+
job_dict['dag_yaml'] = None
|
1029
|
+
|
950
1030
|
jobs.append(job_dict)
|
951
1031
|
return jobs
|
952
1032
|
|
@@ -1004,16 +1084,16 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
|
|
1004
1084
|
|
1005
1085
|
|
1006
1086
|
def scheduler_set_waiting(job_id: int, dag_yaml_path: str, env_file_path: str,
|
1007
|
-
user_hash: str) -> None:
|
1087
|
+
user_hash: str, priority: int) -> None:
|
1008
1088
|
"""Do not call without holding the scheduler lock."""
|
1009
1089
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1010
1090
|
updated_count = cursor.execute(
|
1011
1091
|
'UPDATE job_info SET '
|
1012
1092
|
'schedule_state = (?), dag_yaml_path = (?), env_file_path = (?), '
|
1013
|
-
' user_hash = (?) '
|
1093
|
+
' user_hash = (?), priority = (?) '
|
1014
1094
|
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
1015
1095
|
(ManagedJobScheduleState.WAITING.value, dag_yaml_path,
|
1016
|
-
env_file_path, user_hash, job_id,
|
1096
|
+
env_file_path, user_hash, priority, job_id,
|
1017
1097
|
ManagedJobScheduleState.INACTIVE.value)).rowcount
|
1018
1098
|
assert updated_count == 1, (job_id, updated_count)
|
1019
1099
|
|
@@ -1043,15 +1123,28 @@ def scheduler_set_alive(job_id: int) -> None:
|
|
1043
1123
|
assert updated_count == 1, (job_id, updated_count)
|
1044
1124
|
|
1045
1125
|
|
1046
|
-
def
|
1126
|
+
def scheduler_set_alive_backoff(job_id: int) -> None:
|
1047
1127
|
"""Do not call without holding the scheduler lock."""
|
1048
1128
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1049
1129
|
updated_count = cursor.execute(
|
1050
1130
|
'UPDATE job_info SET '
|
1051
1131
|
'schedule_state = (?) '
|
1052
1132
|
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
1133
|
+
(ManagedJobScheduleState.ALIVE_BACKOFF.value, job_id,
|
1134
|
+
ManagedJobScheduleState.LAUNCHING.value)).rowcount
|
1135
|
+
assert updated_count == 1, (job_id, updated_count)
|
1136
|
+
|
1137
|
+
|
1138
|
+
def scheduler_set_alive_waiting(job_id: int) -> None:
|
1139
|
+
"""Do not call without holding the scheduler lock."""
|
1140
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1141
|
+
updated_count = cursor.execute(
|
1142
|
+
'UPDATE job_info SET '
|
1143
|
+
'schedule_state = (?) '
|
1144
|
+
'WHERE spot_job_id = (?) AND schedule_state IN (?, ?)',
|
1053
1145
|
(ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
|
1054
|
-
ManagedJobScheduleState.ALIVE.value
|
1146
|
+
ManagedJobScheduleState.ALIVE.value,
|
1147
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value)).rowcount
|
1055
1148
|
assert updated_count == 1, (job_id, updated_count)
|
1056
1149
|
|
1057
1150
|
|
@@ -1099,32 +1192,53 @@ def get_num_alive_jobs() -> int:
|
|
1099
1192
|
return cursor.execute(
|
1100
1193
|
'SELECT COUNT(*) '
|
1101
1194
|
'FROM job_info '
|
1102
|
-
'WHERE schedule_state IN (?, ?, ?)',
|
1195
|
+
'WHERE schedule_state IN (?, ?, ?, ?)',
|
1103
1196
|
(ManagedJobScheduleState.ALIVE_WAITING.value,
|
1104
1197
|
ManagedJobScheduleState.LAUNCHING.value,
|
1105
|
-
ManagedJobScheduleState.ALIVE.value
|
1198
|
+
ManagedJobScheduleState.ALIVE.value,
|
1199
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()[0]
|
1106
1200
|
|
1107
1201
|
|
1108
1202
|
def get_waiting_job() -> Optional[Dict[str, Any]]:
|
1109
1203
|
"""Get the next job that should transition to LAUNCHING.
|
1110
1204
|
|
1205
|
+
Selects the highest-priority (lowest numerical value) WAITING or
|
1206
|
+
ALIVE_WAITING job, provided its priority value is less than or equal to any
|
1207
|
+
currently LAUNCHING or ALIVE_BACKOFF job.
|
1208
|
+
|
1111
1209
|
Backwards compatibility note: jobs submitted before #4485 will have no
|
1112
1210
|
schedule_state and will be ignored by this SQL query.
|
1113
1211
|
"""
|
1114
1212
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
1115
|
-
|
1213
|
+
# Get the highest-priority (lowest numerical value) WAITING or
|
1214
|
+
# ALIVE_WAITING job whose priority value is less than or equal to
|
1215
|
+
# the highest priority (numerically smallest) LAUNCHING or
|
1216
|
+
# ALIVE_BACKOFF job's priority.
|
1217
|
+
waiting_job_row = cursor.execute(
|
1116
1218
|
'SELECT spot_job_id, schedule_state, dag_yaml_path, env_file_path '
|
1117
1219
|
'FROM job_info '
|
1118
|
-
'WHERE schedule_state
|
1119
|
-
'
|
1220
|
+
'WHERE schedule_state IN (?, ?) '
|
1221
|
+
'AND priority <= COALESCE('
|
1222
|
+
' (SELECT MIN(priority) '
|
1223
|
+
' FROM job_info '
|
1224
|
+
' WHERE schedule_state IN (?, ?)), '
|
1225
|
+
' 1000'
|
1226
|
+
')'
|
1227
|
+
'ORDER BY priority ASC, spot_job_id ASC LIMIT 1',
|
1120
1228
|
(ManagedJobScheduleState.WAITING.value,
|
1121
|
-
ManagedJobScheduleState.ALIVE_WAITING.value
|
1229
|
+
ManagedJobScheduleState.ALIVE_WAITING.value,
|
1230
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
1231
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value)).fetchone()
|
1232
|
+
|
1233
|
+
if waiting_job_row is None:
|
1234
|
+
return None
|
1235
|
+
|
1122
1236
|
return {
|
1123
|
-
'job_id':
|
1124
|
-
'schedule_state': ManagedJobScheduleState(
|
1125
|
-
'dag_yaml_path':
|
1126
|
-
'env_file_path':
|
1127
|
-
}
|
1237
|
+
'job_id': waiting_job_row[0],
|
1238
|
+
'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
|
1239
|
+
'dag_yaml_path': waiting_job_row[2],
|
1240
|
+
'env_file_path': waiting_job_row[3],
|
1241
|
+
}
|
1128
1242
|
|
1129
1243
|
|
1130
1244
|
def get_workspace(job_id: int) -> str:
|
sky/jobs/utils.py
CHANGED
@@ -953,6 +953,22 @@ def dump_managed_job_queue() -> str:
|
|
953
953
|
job['region'] = '-'
|
954
954
|
job['zone'] = '-'
|
955
955
|
|
956
|
+
# Add details about schedule state / backoff.
|
957
|
+
state_details = None
|
958
|
+
if job['schedule_state'] == 'ALIVE_BACKOFF':
|
959
|
+
state_details = 'In backoff, waiting for resources'
|
960
|
+
elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
|
961
|
+
state_details = 'Waiting for other jobs to launch'
|
962
|
+
|
963
|
+
if state_details and job['failure_reason']:
|
964
|
+
job['details'] = f'{state_details} - {job["failure_reason"]}'
|
965
|
+
elif state_details:
|
966
|
+
job['details'] = state_details
|
967
|
+
elif job['failure_reason']:
|
968
|
+
job['details'] = f'Failure: {job["failure_reason"]}'
|
969
|
+
else:
|
970
|
+
job['details'] = None
|
971
|
+
|
956
972
|
return message_utils.encode_payload(jobs)
|
957
973
|
|
958
974
|
|
@@ -981,7 +997,7 @@ def _get_job_status_from_tasks(
|
|
981
997
|
# Use the first non-succeeded status.
|
982
998
|
if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
|
983
999
|
# TODO(zhwu): we should not blindly use the first non-
|
984
|
-
# succeeded as the status could be changed to
|
1000
|
+
# succeeded as the status could be changed to PENDING
|
985
1001
|
# when going from one task to the next one, which can be
|
986
1002
|
# confusing.
|
987
1003
|
break
|
@@ -1063,6 +1079,7 @@ def format_job_table(
|
|
1063
1079
|
'TASK',
|
1064
1080
|
*(['WORKSPACE'] if show_workspace else []),
|
1065
1081
|
'NAME',
|
1082
|
+
'PRIORITY',
|
1066
1083
|
*user_cols,
|
1067
1084
|
'REQUESTED',
|
1068
1085
|
'SUBMITTED',
|
@@ -1092,7 +1109,10 @@ def format_job_table(
|
|
1092
1109
|
# by the task_id.
|
1093
1110
|
jobs[get_hash(task)].append(task)
|
1094
1111
|
|
1095
|
-
def generate_details(
|
1112
|
+
def generate_details(details: Optional[str],
|
1113
|
+
failure_reason: Optional[str]) -> str:
|
1114
|
+
if details is not None:
|
1115
|
+
return details
|
1096
1116
|
if failure_reason is not None:
|
1097
1117
|
return f'Failure: {failure_reason}'
|
1098
1118
|
return '-'
|
@@ -1131,6 +1151,7 @@ def format_job_table(
|
|
1131
1151
|
submitted_at = None
|
1132
1152
|
end_at: Optional[int] = 0
|
1133
1153
|
recovery_cnt = 0
|
1154
|
+
priority = job_tasks[0].get('priority', '-')
|
1134
1155
|
managed_job_status, current_task_id = _get_job_status_from_tasks(
|
1135
1156
|
job_tasks)
|
1136
1157
|
for task in job_tasks:
|
@@ -1166,6 +1187,7 @@ def format_job_table(
|
|
1166
1187
|
'',
|
1167
1188
|
*([''] if show_workspace else []),
|
1168
1189
|
job_name,
|
1190
|
+
str(priority),
|
1169
1191
|
*user_values,
|
1170
1192
|
'-',
|
1171
1193
|
submitted,
|
@@ -1175,13 +1197,14 @@ def format_job_table(
|
|
1175
1197
|
status_str,
|
1176
1198
|
]
|
1177
1199
|
if show_all:
|
1200
|
+
details = job_tasks[current_task_id].get('details')
|
1178
1201
|
failure_reason = job_tasks[current_task_id]['failure_reason']
|
1179
1202
|
job_values.extend([
|
1180
1203
|
'-',
|
1181
1204
|
'-',
|
1182
1205
|
'-',
|
1183
1206
|
job_tasks[0]['schedule_state'],
|
1184
|
-
generate_details(failure_reason),
|
1207
|
+
generate_details(details, failure_reason),
|
1185
1208
|
])
|
1186
1209
|
if tasks_have_k8s_user:
|
1187
1210
|
job_values.insert(0, job_tasks[0].get('user', '-'))
|
@@ -1195,11 +1218,13 @@ def format_job_table(
|
|
1195
1218
|
submitted = log_utils.readable_time_duration(task['submitted_at'])
|
1196
1219
|
user_values = get_user_column_values(task)
|
1197
1220
|
task_workspace = '-' if len(job_tasks) > 1 else workspace
|
1221
|
+
priority = task.get('priority', '-')
|
1198
1222
|
values = [
|
1199
1223
|
task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
|
1200
1224
|
task['task_id'] if len(job_tasks) > 1 else '-',
|
1201
1225
|
*([task_workspace] if show_workspace else []),
|
1202
1226
|
task['task_name'],
|
1227
|
+
str(priority),
|
1203
1228
|
*user_values,
|
1204
1229
|
task['resources'],
|
1205
1230
|
# SUBMITTED
|
@@ -1244,7 +1269,8 @@ def format_job_table(
|
|
1244
1269
|
infra.formatted_str(),
|
1245
1270
|
task['cluster_resources'],
|
1246
1271
|
schedule_state,
|
1247
|
-
generate_details(task
|
1272
|
+
generate_details(task.get('details'),
|
1273
|
+
task['failure_reason']),
|
1248
1274
|
])
|
1249
1275
|
if tasks_have_k8s_user:
|
1250
1276
|
values.insert(0, task.get('user', '-'))
|
@@ -1362,13 +1388,15 @@ class ManagedJobCodeGen:
|
|
1362
1388
|
|
1363
1389
|
@classmethod
|
1364
1390
|
def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
|
1365
|
-
workspace) -> str:
|
1391
|
+
workspace: str, entrypoint: str) -> str:
|
1366
1392
|
dag_name = managed_job_dag.name
|
1367
1393
|
# Add the managed job to queue table.
|
1368
1394
|
code = textwrap.dedent(f"""\
|
1369
1395
|
set_job_info_kwargs = {{'workspace': {workspace!r}}}
|
1370
1396
|
if managed_job_version < 4:
|
1371
1397
|
set_job_info_kwargs = {{}}
|
1398
|
+
if managed_job_version >= 5:
|
1399
|
+
set_job_info_kwargs['entrypoint'] = {entrypoint!r}
|
1372
1400
|
managed_job_state.set_job_info(
|
1373
1401
|
{job_id}, {dag_name!r}, **set_job_info_kwargs)
|
1374
1402
|
""")
|
@@ -15,6 +15,7 @@ from urllib.parse import urlparse
|
|
15
15
|
import sky
|
16
16
|
from sky import clouds
|
17
17
|
from sky import exceptions
|
18
|
+
from sky import global_user_state
|
18
19
|
from sky import models
|
19
20
|
from sky import sky_logging
|
20
21
|
from sky import skypilot_config
|
@@ -2810,7 +2811,7 @@ def set_autodown_annotations(handle: 'backends.CloudVmRayResourceHandle',
|
|
2810
2811
|
tags = {
|
2811
2812
|
provision_constants.TAG_RAY_CLUSTER_NAME: handle.cluster_name_on_cloud,
|
2812
2813
|
}
|
2813
|
-
ray_config =
|
2814
|
+
ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
|
2814
2815
|
provider_config = ray_config['provider']
|
2815
2816
|
namespace = get_namespace_from_config(provider_config)
|
2816
2817
|
context = get_context_from_config(provider_config)
|
sky/provision/provisioner.py
CHANGED
@@ -15,6 +15,7 @@ import colorama
|
|
15
15
|
import sky
|
16
16
|
from sky import clouds
|
17
17
|
from sky import exceptions
|
18
|
+
from sky import global_user_state
|
18
19
|
from sky import provision
|
19
20
|
from sky import sky_logging
|
20
21
|
from sky import skypilot_config
|
@@ -118,7 +119,7 @@ def bulk_provision(
|
|
118
119
|
Cloud specific exceptions: If the provisioning process failed, cloud-
|
119
120
|
specific exceptions will be raised by the cloud APIs.
|
120
121
|
"""
|
121
|
-
original_config =
|
122
|
+
original_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
|
122
123
|
head_node_type = original_config['head_node_type']
|
123
124
|
bootstrap_config = provision_common.ProvisionConfig(
|
124
125
|
provider_config=original_config['provider'],
|
@@ -413,9 +414,11 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo,
|
|
413
414
|
|
414
415
|
def _post_provision_setup(
|
415
416
|
cloud_name: str, cluster_name: resources_utils.ClusterName,
|
416
|
-
|
417
|
+
handle_cluster_yaml: str,
|
418
|
+
provision_record: provision_common.ProvisionRecord,
|
417
419
|
custom_resource: Optional[str]) -> provision_common.ClusterInfo:
|
418
|
-
config_from_yaml =
|
420
|
+
config_from_yaml = global_user_state.get_cluster_yaml_dict(
|
421
|
+
handle_cluster_yaml)
|
419
422
|
provider_config = config_from_yaml.get('provider')
|
420
423
|
cluster_info = provision.get_cluster_info(cloud_name,
|
421
424
|
provision_record.region,
|
@@ -446,7 +449,7 @@ def _post_provision_setup(
|
|
446
449
|
# TODO(suquark): Move wheel build here in future PRs.
|
447
450
|
# We don't set docker_user here, as we are configuring the VM itself.
|
448
451
|
ssh_credentials = backend_utils.ssh_credential_from_yaml(
|
449
|
-
|
452
|
+
handle_cluster_yaml, ssh_user=cluster_info.ssh_user)
|
450
453
|
docker_config = config_from_yaml.get('docker', {})
|
451
454
|
|
452
455
|
with rich_utils.safe_status(
|
@@ -657,7 +660,8 @@ def _post_provision_setup(
|
|
657
660
|
@timeline.event
|
658
661
|
def post_provision_runtime_setup(
|
659
662
|
cloud_name: str, cluster_name: resources_utils.ClusterName,
|
660
|
-
|
663
|
+
handle_cluster_yaml: str,
|
664
|
+
provision_record: provision_common.ProvisionRecord,
|
661
665
|
custom_resource: Optional[str],
|
662
666
|
log_dir: str) -> provision_common.ClusterInfo:
|
663
667
|
"""Run internal setup commands after provisioning and before user setup.
|
@@ -675,11 +679,12 @@ def post_provision_runtime_setup(
|
|
675
679
|
with provision_logging.setup_provision_logging(log_dir):
|
676
680
|
try:
|
677
681
|
logger.debug(_TITLE.format('System Setup After Provision'))
|
678
|
-
return _post_provision_setup(
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
682
|
+
return _post_provision_setup(
|
683
|
+
cloud_name,
|
684
|
+
cluster_name,
|
685
|
+
handle_cluster_yaml=handle_cluster_yaml,
|
686
|
+
provision_record=provision_record,
|
687
|
+
custom_resource=custom_resource)
|
683
688
|
except Exception: # pylint: disable=broad-except
|
684
689
|
logger.error(
|
685
690
|
ux_utils.error_message(
|
sky/resources.py
CHANGED
@@ -98,7 +98,7 @@ class Resources:
|
|
98
98
|
"""
|
99
99
|
# If any fields changed, increment the version. For backward compatibility,
|
100
100
|
# modify the __setstate__ method to handle the old version.
|
101
|
-
_VERSION =
|
101
|
+
_VERSION = 25
|
102
102
|
|
103
103
|
def __init__(
|
104
104
|
self,
|
@@ -294,6 +294,8 @@ class Resources:
|
|
294
294
|
}
|
295
295
|
else:
|
296
296
|
self._image_id = image_id
|
297
|
+
if isinstance(self._cloud, clouds.Kubernetes):
|
298
|
+
_maybe_add_docker_prefix_to_image_id(self._image_id)
|
297
299
|
self._is_image_managed = _is_image_managed
|
298
300
|
|
299
301
|
if isinstance(disk_tier, str):
|
@@ -2075,6 +2077,10 @@ class Resources:
|
|
2075
2077
|
if version < 24:
|
2076
2078
|
self._volumes = None
|
2077
2079
|
|
2080
|
+
if version < 25:
|
2081
|
+
if isinstance(state.get('_cloud', None), clouds.Kubernetes):
|
2082
|
+
_maybe_add_docker_prefix_to_image_id(state['_image_id'])
|
2083
|
+
|
2078
2084
|
self.__dict__.update(state)
|
2079
2085
|
|
2080
2086
|
|
@@ -2111,3 +2117,12 @@ class LaunchableResources(Resources):
|
|
2111
2117
|
"""
|
2112
2118
|
self.assert_launchable()
|
2113
2119
|
return typing.cast(LaunchableResources, super().copy(**override))
|
2120
|
+
|
2121
|
+
|
2122
|
+
def _maybe_add_docker_prefix_to_image_id(
|
2123
|
+
image_id_dict: Optional[Dict[Optional[str], str]]) -> None:
|
2124
|
+
if image_id_dict is None:
|
2125
|
+
return
|
2126
|
+
for k, v in image_id_dict.items():
|
2127
|
+
if not v.startswith('docker:'):
|
2128
|
+
image_id_dict[k] = f'docker:{v}'
|