skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250529__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +13 -3
- sky/client/cli.py +13 -3
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/HvNkg7hqKM1p0ptAcdDcF/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-90e5498a5b00ec29.js +6 -0
- sky/dashboard/out/_next/static/chunks/303-2c7b0f7af571710b.js +6 -0
- sky/dashboard/out/_next/static/chunks/{856-62b87c68917b08ed.js → 856-59a1760784c9e770.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-7c48919fe030bc43.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-909f1ceb0fcf1b99.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-d4c6875c88771e17.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6b80e9e0c6aa16a1.js +6 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +1 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +130 -35
- sky/jobs/utils.py +30 -4
- sky/resources.py +16 -1
- sky/server/common.py +6 -2
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/utils/schemas.py +12 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/RECORD +53 -49
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-41738d1896fc02fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → HvNkg7hqKM1p0ptAcdDcF}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -18,6 +18,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -49,7 +50,7 @@ class StrategyExecutor:
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -65,11 +66,13 @@ class StrategyExecutor:
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
         self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
+             task: 'task_lib.Task', job_id: int,
+             task_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -100,7 +103,7 @@ class StrategyExecutor:
                 from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id)
+                                     max_restarts_on_errors, job_id, task_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -235,7 +238,8 @@ class StrategyExecutor:
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -275,98 +279,134 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-    [old lines 278-307: content not captured in this diff view]
-                               exceptions.ResourcesUnavailableError)
-                    for err in e.failover_history):
-                # _launch() (this function) should fail/exit directly,
-                # if none of the failover reasons were because of
-                # resource unavailability or no failover was attempted
-                # (the optimizer cannot find feasible resources for
-                # requested resources), i.e., e.failover_history is
-                # empty. Failing directly avoids the infinite loop of
-                # retrying the launch when, e.g., an invalid cluster
-                # name is used and --retry-until-up is specified.
-                reasons = (e.failover_history
-                           if e.failover_history else [e])
-                reasons_str = '; '.join(
-                    common_utils.format_exception(err)
-                    for err in reasons)
-                logger.error(
-                    'Failure happened before provisioning. Failover '
-                    f'reasons: {reasons_str}')
+            try:
+                with scheduler.scheduled_launch(self.job_id):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        state.set_restarting(self.job_id, self.task_id,
+                                             recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as the
+                            # job is finished. However, in case the controller
+                            # dies, set autodown to try and avoid a resource
+                            # leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                        logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                            raise exceptions.ProvisionPrechecksError(
-    [old line 328: content not captured in this diff view]
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f' Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may
-                # be UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors '
-                    'or the cluster being preempted during job submission.')
-
-            # If we get here, the launch did not succeed. Tear down the
-            # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch '
-                            f'clusters after {max_retry} retries.')
-                else:
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-    [old lines 364-369: content not captured in this diff view]
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f' Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    managed_job_utils.terminate_cluster(self.cluster_name)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                state.set_backoff_pending(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = backoff.current_backoff()
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -389,9 +429,9 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id)
+                         job_id, task_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -400,8 +440,10 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
-    [old line 404: content not captured in this diff view]
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure,
+                                           recovery)
         if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
@@ -436,7 +478,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False)
+                job_submitted_at = self._launch(raise_on_failure=False,
+                                                recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -452,7 +495,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -524,7 +568,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                     region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -535,7 +580,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
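The key behavioral change above is that each launch attempt now runs inside scheduler.scheduled_launch() and signals retry backoff by raising exceptions.NoClusterLaunchedError, so the job releases its launching slot (schedule state ALIVE_BACKOFF, job status PENDING) while it sleeps. The following is a minimal, self-contained sketch of that control flow; scheduled_launch, NoClusterLaunchedError, and the printed states here are simplified stand-ins, not the SkyPilot implementations.

import contextlib
import time


class NoClusterLaunchedError(Exception):
    """Stand-in for sky.exceptions.NoClusterLaunchedError."""


@contextlib.contextmanager
def scheduled_launch(job_id: int):
    # Stand-in for sky.jobs.scheduler.scheduled_launch(): the real context
    # manager moves the job's schedule state to ALIVE_BACKOFF (instead of
    # ALIVE) when NoClusterLaunchedError escapes the body.
    print(f'job {job_id}: schedule state LAUNCHING')
    try:
        yield
    except NoClusterLaunchedError:
        print(f'job {job_id}: schedule state ALIVE_BACKOFF')
        raise
    else:
        print(f'job {job_id}: schedule state ALIVE')


def launch_with_backoff(job_id: int, attempts_needed: int) -> float:
    """Retry loop mirroring the new shape of StrategyExecutor._launch()."""
    retry_cnt = 0
    gap_seconds = 1.0
    while True:
        retry_cnt += 1
        try:
            with scheduled_launch(job_id):
                if retry_cnt < attempts_needed:
                    # Simulated provisioning failure: raise so the
                    # scheduled_launch context is exited and the job does not
                    # hold a launching slot while it backs off.
                    raise NoClusterLaunchedError()
                return time.time()  # stands in for job_submitted_at
        except NoClusterLaunchedError:
            # In the real code the job status is also set back to PENDING here
            # (state.set_backoff_pending).
            print(f'retrying in {gap_seconds:.1f}s')
            time.sleep(gap_seconds)
            gap_seconds *= 2  # exponential backoff


if __name__ == '__main__':
    launch_with_backoff(job_id=1, attempts_needed=3)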
sky/jobs/scheduler.py
CHANGED
@@ -45,6 +45,7 @@ import typing
 
 import filelock
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
@@ -190,7 +191,8 @@ def maybe_schedule_next_jobs() -> None:
         pass
 
 
-def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
+def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str,
+               priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -202,7 +204,7 @@ def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
     """
     with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
-                                   common_utils.get_user_hash())
+                                   common_utils.get_user_hash(), priority)
    maybe_schedule_next_jobs()
 
 
@@ -240,11 +242,19 @@ def scheduled_launch(job_id: int):
            state.ManagedJobScheduleState.LAUNCHING):
        time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
 
-    [old lines 243-247: content not captured in this diff view]
+    try:
+        yield
+    except exceptions.NoClusterLaunchedError:
+        # NoClusterLaunchedError is indicates that the job is in retry backoff.
+        # We should transition to ALIVE_BACKOFF instead of ALIVE.
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive_backoff(job_id)
+        raise
+    else:
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive(job_id)
+    finally:
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -309,5 +319,10 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument(
+        '--priority',
+        type=int,
+        default=500,
+        help='Job priority (0-1000, lower is higher). Default: 500.')
     args = parser.parse_args()
-    submit_job(args.job_id, args.dag_yaml, args.env_file)
+    submit_job(args.job_id, args.dag_yaml, args.env_file, args.priority)
sky/jobs/server/core.py
CHANGED
@@ -91,6 +91,7 @@ def launch(
     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
 
     task_names = set()
+    priority = None
     for task_ in dag.tasks:
         if task_.name in task_names:
             with ux_utils.print_exception_no_traceback():
@@ -100,6 +101,20 @@ def launch(
                     'name only and comment out the task names (so that they '
                     'will be auto-generated) .')
         task_names.add(task_.name)
+        if task_.job_priority is not None:
+            if (priority is not None and priority != task_.job_priority):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Multiple tasks in the DAG have different priorities. '
+                        'Either specify a priority in only one task, or set '
+                        'the same priority for each task.')
+            priority = task_.job_priority
+
+    if priority is None:
+        priority = managed_job_constants.DEFAULT_PRIORITY
+
+    if priority < 0 or priority > 1000:
+        raise ValueError(f'Priority must be between 0 and 1000, got {priority}')
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
@@ -186,6 +201,7 @@ def launch(
             service_catalog_common.get_modified_catalog_file_mounts(),
         'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
         'dashboard_user_id': common.SERVER_ID,
+        'priority': priority,
         **controller_utils.shared_controller_vars_to_fill(
             controller,
             remote_user_config_path=remote_user_config_path,
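The new block in jobs.server.core.launch() resolves a single priority for the whole DAG: each task may set job_priority, all set values must agree, a missing value falls back to managed_job_constants.DEFAULT_PRIORITY, and the result must lie in [0, 1000] before being passed to the controller template as 'priority'. Here is a small sketch of that resolution logic in isolation; DEFAULT_PRIORITY = 500 is an assumption (it matches the scheduler's --priority default, but the constant's actual value is not part of this diff).

from typing import Iterable, Optional

# Assumed stand-in for managed_job_constants.DEFAULT_PRIORITY.
DEFAULT_PRIORITY = 500


def resolve_dag_priority(task_priorities: Iterable[Optional[int]]) -> int:
    """Sketch of the priority resolution added to jobs.server.core.launch()."""
    priority: Optional[int] = None
    for task_priority in task_priorities:
        if task_priority is None:
            continue
        if priority is not None and priority != task_priority:
            raise ValueError(
                'Multiple tasks in the DAG have different priorities. '
                'Either specify a priority in only one task, or set '
                'the same priority for each task.')
        priority = task_priority

    if priority is None:
        # No task set a priority: use the default.
        priority = DEFAULT_PRIORITY

    if priority < 0 or priority > 1000:
        raise ValueError(f'Priority must be between 0 and 1000, got {priority}')
    return priority


if __name__ == '__main__':
    print(resolve_dag_priority([None, 300, None]))  # -> 300
    print(resolve_dag_priority([None, None]))       # -> 500 (default)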