skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -29
- sky/cli.py +11 -34
- sky/core.py +8 -5
- sky/data/storage.py +16 -7
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +14 -16
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +251 -17
- sky/jobs/utils.py +287 -64
- sky/provision/kubernetes/instance.py +1 -1
- sky/resources.py +1 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +2 -26
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/utils/resources_utils.py +25 -21
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
```diff
@@ -17,6 +17,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky import status_lib
 from sky.backends import backend_utils
+from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -42,45 +43,20 @@ MAX_JOB_CHECKING_RETRY = 10
 _AUTODOWN_MINUTES = 5
 
 
-def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
-    """Terminate the cluster."""
-    retry_cnt = 0
-    while True:
-        try:
-            usage_lib.messages.usage.set_internal()
-            sky.down(cluster_name)
-            return
-        except exceptions.ClusterDoesNotExist:
-            # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
-            return
-        except Exception as e:  # pylint: disable=broad-except
-            retry_cnt += 1
-            if retry_cnt >= max_retry:
-                raise RuntimeError(
-                    f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
-                f'Failed to terminate the cluster {cluster_name}. Retrying.'
-                f'Details: {common_utils.format_exception(e)}')
-            with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
-
-
 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""
 
     RETRY_INIT_GAP_SECONDS = 60
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task',
-
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
             cluster_name: The name of the cluster.
             backend: The backend to use. Only CloudVMRayBackend is supported.
             task: The task to execute.
-            retry_until_up: Whether to retry until the cluster is up.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -88,8 +64,8 @@ class StrategyExecutor:
         self.dag.add(task)
         self.cluster_name = cluster_name
         self.backend = backend
-        self.retry_until_up = retry_until_up
         self.max_restarts_on_errors = max_restarts_on_errors
+        self.job_id = job_id
         self.restart_cnt_on_failure = 0
 
     def __init_subclass__(cls, name: str, default: bool = False):
@@ -102,7 +78,7 @@ class StrategyExecutor:
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task',
+             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -127,8 +103,9 @@ class StrategyExecutor:
             job_recovery_name = job_recovery
             max_restarts_on_errors = 0
         return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
-                                                      task,
-                                                      max_restarts_on_errors
+                                                      task,
+                                                      max_restarts_on_errors,
+                                                      job_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -142,10 +119,7 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-
-            job_submit_at = self._launch(max_retry=None)
-        else:
-            job_submit_at = self._launch()
+        job_submit_at = self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
@@ -195,7 +169,7 @@ class StrategyExecutor:
                        f'{common_utils.format_exception(e)}\n'
                        'Terminating the cluster explicitly to ensure no '
                        'remaining job process interferes with recovery.')
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)
 
    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
        """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -304,89 +278,96 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    raise exceptions.ProvisionPrechecksError(reasons=[e])
-                return None
-            except exceptions.ResourcesUnavailableError as e:
-                # This is raised when the launch fails due to prechecks or
-                # after failing over through all the candidates.
-                # Please refer to the docstring of `sky.launch` for more
-                # details of how the exception will be structured.
-                if not any(
-                        isinstance(err, exceptions.ResourcesUnavailableError)
-                        for err in e.failover_history):
-                    # _launch() (this function) should fail/exit directly, if
-                    # none of the failover reasons were because of resource
-                    # unavailability or no failover was attempted (the optimizer
-                    # cannot find feasible resources for requested resources),
-                    # i.e., e.failover_history is empty.
-                    # Failing directly avoids the infinite loop of retrying
-                    # the launch when, e.g., an invalid cluster name is used
-                    # and --retry-until-up is specified.
-                    reasons = (e.failover_history
-                               if e.failover_history else [e])
-                    reasons_str = '; '.join(
-                        common_utils.format_exception(err) for err in reasons)
-                    logger.error(
-                        'Failure happened before provisioning. Failover '
-                        f'reasons: {reasons_str}')
+            with scheduler.scheduled_launch(self.job_id):
+                try:
+                    usage_lib.messages.usage.set_internal()
+                    # Detach setup, so that the setup failure can be detected
+                    # by the controller process (job_status -> FAILED_SETUP).
+                    sky.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as the job
+                        # is finished. However, in case the controller dies, set
+                        # autodown to try and avoid a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        detach_setup=True,
+                        detach_run=True,
+                        _is_launched_by_jobs_controller=True)
+                    logger.info('Managed job cluster launched.')
+                except (exceptions.InvalidClusterNameError,
+                        exceptions.NoCloudAccessError,
+                        exceptions.ResourcesMismatchError) as e:
+                    logger.error('Failure happened before provisioning. '
+                                 f'{common_utils.format_exception(e)}')
                     if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(reasons)
-                    return None
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f'  Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may be
-                # UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors or '
-                    'the cluster being preempted during job submission.')
-
-                terminate_cluster(self.cluster_name)
-                if max_retry is not None and retry_cnt >= max_retry:
-                    # Retry forever if max_retry is None.
-                    if raise_on_failure:
-                        with ux_utils.print_exception_no_traceback():
-                            raise exceptions.ManagedJobReachedMaxRetriesError(
-                                'Resources unavailable: failed to launch clusters '
-                                f'after {max_retry} retries.')
-                    else:
+                        raise exceptions.ProvisionPrechecksError(reasons=[e])
                     return None
+                except exceptions.ResourcesUnavailableError as e:
+                    # This is raised when the launch fails due to prechecks or
+                    # after failing over through all the candidates.
+                    # Please refer to the docstring of `sky.launch` for more
+                    # details of how the exception will be structured.
+                    if not any(
+                            isinstance(err,
+                                       exceptions.ResourcesUnavailableError)
+                            for err in e.failover_history):
+                        # _launch() (this function) should fail/exit directly,
+                        # if none of the failover reasons were because of
+                        # resource unavailability or no failover was attempted
+                        # (the optimizer cannot find feasible resources for
+                        # requested resources), i.e., e.failover_history is
+                        # empty. Failing directly avoids the infinite loop of
+                        # retrying the launch when, e.g., an invalid cluster
+                        # name is used and --retry-until-up is specified.
+                        reasons = (e.failover_history
+                                   if e.failover_history else [e])
+                        reasons_str = '; '.join(
+                            common_utils.format_exception(err)
+                            for err in reasons)
+                        logger.error(
+                            'Failure happened before provisioning. Failover '
+                            f'reasons: {reasons_str}')
+                        if raise_on_failure:
+                            raise exceptions.ProvisionPrechecksError(reasons)
+                        return None
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                except Exception as e:  # pylint: disable=broad-except
+                    # If the launch fails, it will be recovered by the following
+                    # code.
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                    with ux_utils.enable_traceback():
+                        logger.info(f'  Traceback: {traceback.format_exc()}')
+                else:  # No exception, the launch succeeds.
+                    # At this point, a sky.launch() has succeeded. Cluster may
+                    # be UP (no preemption since) or DOWN (newly preempted).
+                    job_submitted_at = self._wait_until_job_starts_on_cluster()
+                    if job_submitted_at is not None:
+                        return job_submitted_at
+                    # The job fails to start on the cluster, retry the launch.
+                    # TODO(zhwu): log the unexpected error to usage collection
+                    # for future debugging.
+                    logger.info(
+                        'Failed to successfully submit the job to the '
+                        'launched cluster, due to unexpected submission errors '
+                        'or the cluster being preempted during job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    managed_job_utils.terminate_cluster(self.cluster_name)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise exceptions.ManagedJobReachedMaxRetriesError(
+                                    'Resources unavailable: failed to launch '
+                                    f'clusters after {max_retry} retries.')
+                        else:
+                            return None
+            # Exit the scheduled_launch context so that the scheulde state is
+            # ALIVE during the backoff. This allows other jobs to launch.
             gap_seconds = backoff.current_backoff()
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
@@ -411,10 +392,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task',
-
-        super().__init__(cluster_name, backend, task,
-
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
+        super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+                         job_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -468,7 +449,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
         # Step 2
         logger.debug('Terminating unhealthy cluster and reset cloud '
                      'region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)
 
         # Step 3
         logger.debug('Relaunch the cluster without constraining to prior '
@@ -478,16 +459,11 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
                                             raise_on_failure=False)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
-
-
-
-
-
-                    continue
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesUnavailableError(
-                        f'Failed to recover the cluster after retrying '
-                        f'{self._MAX_RETRY_CNT} times.')
+                gap_seconds = self.RETRY_INIT_GAP_SECONDS
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
 
         return job_submitted_at
 
@@ -531,7 +507,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -566,15 +542,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
                                             raise_on_failure=False)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
-
-
-
-
-
-                    continue
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesUnavailableError(
-                        f'Failed to recover the cluster after retrying '
-                        f'{self._MAX_RETRY_CNT} times.')
+                gap_seconds = self.RETRY_INIT_GAP_SECONDS
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
 
         return job_submitted_at
```
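The upshot of these changes: the module-level `terminate_cluster` helper now lives in `sky.jobs.utils`, the `retry_until_up` flag is gone, and the executor carries the managed job's `job_id` so that every `sky.launch` is gated by the new `scheduler.scheduled_launch` context. A minimal sketch of the new call shape follows; the wrapper function and cluster name are placeholders for illustration, not code from this diff.

```python
from sky import backends
from sky.jobs import recovery_strategy


def launch_job_cluster(task, job_id: int) -> float:
    """Illustrative wrapper (not part of SkyPilot); `task` is a sky.Task."""
    # job_id ties the executor to the scheduler's schedule_state machine.
    strategy = recovery_strategy.StrategyExecutor.make(
        cluster_name=f'example-cluster-{job_id}',  # placeholder name
        backend=backends.CloudVmRayBackend(),
        task=task,
        job_id=job_id)
    # launch() now always calls self._launch(max_retry=None), retrying
    # internally instead of consulting the removed retry_until_up flag.
    return strategy.launch()
```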
sky/jobs/scheduler.py
ADDED
@@ -0,0 +1,283 @@
```python
"""Scheduler for managed jobs.

Once managed jobs are submitted via submit_job, the scheduler is responsible for
the business logic of deciding when they are allowed to start, and choosing the
right one to start. The scheduler will also schedule jobs that are already live
but waiting to launch a new task or recover.

The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
be called from any code running on the managed jobs controller instance to
trigger scheduling of new jobs if possible. This function should be called
immediately after any state change that could result in jobs newly being able to
be scheduled.

The scheduling logic limits the number of running jobs according to two limits:
1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
   once, based on the number of CPUs. (See _get_launch_parallelism.) This the
   most compute-intensive part of the job lifecycle, which is why we have an
   additional limit.
2. The number of jobs that can be running at any given time, based on the amount
   of memory. (See _get_job_parallelism.) Since the job controller is doing very
   little once a job starts (just checking its status periodically), the most
   significant resource it consumes is memory.

The state of the scheduler is entirely determined by the schedule_state column
of all the jobs in the job_info table. This column should only be modified via
the functions defined in this file. We will always hold the lock while modifying
this state. See state.ManagedJobScheduleState.

Nomenclature:
- job: same as managed job (may include multiple tasks)
- launch/launching: launching a cluster (sky.launch) as part of a job
- start/run: create the job controller process for a job
- schedule: transition a job to the LAUNCHING state, whether a new job or a job
  that is already alive
- alive: a job controller exists (includes multiple schedule_states: ALIVE,
  ALIVE_WAITING, LAUNCHING)
"""

from argparse import ArgumentParser
import contextlib
from functools import lru_cache
import os
import time

import filelock
import psutil

from sky import sky_logging
from sky.jobs import constants as managed_job_constants
from sky.jobs import state
from sky.skylet import constants
from sky.utils import subprocess_utils

logger = sky_logging.init_logger('sky.jobs.controller')

# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
# parallelism control or updating the schedule_state of any job.
# Any code that takes this lock must conclude by calling
# maybe_schedule_next_jobs.
_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
_ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5


@lru_cache(maxsize=1)
def _get_lock_path() -> str:
    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path


def maybe_schedule_next_jobs() -> None:
    """Determine if any managed jobs can be scheduled, and if so, schedule them.

    Here, "schedule" means to select job that is waiting, and allow it to
    proceed. It does NOT mean to submit a job to the scheduler.

    For newly submitted jobs, scheduling means updating the state of the jobs,
    and starting the job controller process. For jobs that are already alive but
    are waiting to launch a new task or recover, just update the state of the
    job to indicate that the launch can proceed.

    This function transitions jobs into LAUNCHING on a best-effort basis. That
    is, if we can start any jobs, we will, but if not, we will exit (almost)
    immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
    be started now (either because the lock is held, or because there are not
    enough resources), another call to this function will be made whenever that
    situation is resolved. (If the lock is held, the lock holder should start
    the jobs. If there aren't enough resources, the next controller to exit and
    free up resources should start the jobs.)

    If this function obtains the lock, it will launch as many jobs as possible
    before releasing the lock. This is what allows other calls to exit
    immediately if the lock is held, while ensuring that all jobs are started as
    soon as possible.

    This uses subprocess_utils.launch_new_process_tree() to start the controller
    processes, which should be safe to call from pretty much any code running on
    the jobs controller instance. New job controller processes will be detached
    from the current process and there will not be a parent/child relationship.
    See launch_new_process_tree for more.
    """
    try:
        # We must use a global lock rather than a per-job lock to ensure correct
        # parallelism control. If we cannot obtain the lock, exit immediately.
        # The current lock holder is expected to launch any jobs it can before
        # releasing the lock.
        with filelock.FileLock(_get_lock_path(), blocking=False):
            while True:
                maybe_next_job = state.get_waiting_job()
                if maybe_next_job is None:
                    # Nothing left to start, break from scheduling loop
                    break

                current_state = maybe_next_job['schedule_state']

                assert current_state in (
                    state.ManagedJobScheduleState.ALIVE_WAITING,
                    state.ManagedJobScheduleState.WAITING), maybe_next_job

                # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
                # since they will have been submitted and therefore started
                # first. The requirements to launch in an alive job are more
                # lenient, so there is no way that we wouldn't be able to launch
                # an ALIVE_WAITING job, but we would be able to launch a WAITING
                # job.
                if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
                    if not _can_lauch_in_alive_job():
                        # Can't schedule anything, break from scheduling loop.
                        break
                elif current_state == state.ManagedJobScheduleState.WAITING:
                    if not _can_start_new_job():
                        # Can't schedule anything, break from scheduling loop.
                        break

                logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
                state.scheduler_set_launching(maybe_next_job['job_id'],
                                              current_state)

                if current_state == state.ManagedJobScheduleState.WAITING:
                    # The job controller has not been started yet. We must start
                    # it.

                    job_id = maybe_next_job['job_id']
                    dag_yaml_path = maybe_next_job['dag_yaml_path']

                    # If the command line here is changed, please also update
                    # utils._controller_process_alive. `--job-id X` should be at
                    # the end.
                    run_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};'
                               'python -u -m sky.jobs.controller '
                               f'{dag_yaml_path} --job-id {job_id}')

                    logs_dir = os.path.expanduser(
                        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
                    os.makedirs(logs_dir, exist_ok=True)
                    log_path = os.path.join(logs_dir, f'{job_id}.log')

                    pid = subprocess_utils.launch_new_process_tree(
                        run_cmd, log_output=log_path)
                    state.set_job_controller_pid(job_id, pid)

                    logger.debug(f'Job {job_id} started with pid {pid}')

    except filelock.Timeout:
        # If we can't get the lock, just exit. The process holding the lock
        # should launch any pending jobs.
        pass


def submit_job(job_id: int, dag_yaml_path: str) -> None:
    """Submit an existing job to the scheduler.

    This should be called after a job is created in the `spot` table as
    PENDING. It will tell the scheduler to try and start the job controller, if
    there are resources available. It may block to acquire the lock, so it
    should not be on the critical path for `sky jobs launch -d`.
    """
    with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_waiting(job_id, dag_yaml_path)
    maybe_schedule_next_jobs()


@contextlib.contextmanager
def scheduled_launch(job_id: int):
    """Launch as part of an ongoing job.

    A newly started job will already be LAUNCHING, and this will immediately
    enter the context.

    If a job is ongoing (ALIVE schedule_state), there are two scenarios where we
    may need to call sky.launch again during the course of a job controller:
    - for tasks after the first task
    - for recovery

    This function will mark the job as ALIVE_WAITING, which indicates to the
    scheduler that it wants to transition back to LAUNCHING. Then, it will wait
    until the scheduler transitions the job state, before entering the context.

    On exiting the context, the job will transition to ALIVE.

    This should only be used within the job controller for the given job_id. If
    multiple uses of this context are nested, behavior is undefined. Don't do
    that.
    """

    # If we're already in LAUNCHING schedule_state, we don't need to wait.
    # This may be the case for the first launch of a job.
    if (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
        # Since we aren't LAUNCHING, we need to wait to be scheduled.
        _set_alive_waiting(job_id)

        while (state.get_job_schedule_state(job_id) !=
               state.ManagedJobScheduleState.LAUNCHING):
            time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)

    yield

    with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_alive(job_id)
    maybe_schedule_next_jobs()


def job_done(job_id: int, idempotent: bool = False) -> None:
    """Transition a job to DONE.

    If idempotent is True, this will not raise an error if the job is already
    DONE.

    The job could be in any terminal ManagedJobStatus. However, once DONE, it
    should never transition back to another state.
    """
    if idempotent and (state.get_job_schedule_state(job_id)
                       == state.ManagedJobScheduleState.DONE):
        return

    with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_done(job_id, idempotent)
    maybe_schedule_next_jobs()


def _set_alive_waiting(job_id: int) -> None:
    """Should use wait_until_launch_okay() to transition to this state."""
    with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_alive_waiting(job_id)
    maybe_schedule_next_jobs()


def _get_job_parallelism() -> int:
    # Assume a running job uses 350MB memory.
    # We observe 230-300 in practice.
    job_memory = 350 * 1024 * 1024
    return max(psutil.virtual_memory().total // job_memory, 1)


def _get_launch_parallelism() -> int:
    cpus = os.cpu_count()
    return cpus * 4 if cpus is not None else 1


def _can_start_new_job() -> bool:
    launching_jobs = state.get_num_launching_jobs()
    alive_jobs = state.get_num_alive_jobs()
    return launching_jobs < _get_launch_parallelism(
    ) and alive_jobs < _get_job_parallelism()


def _can_lauch_in_alive_job() -> bool:
    launching_jobs = state.get_num_launching_jobs()
    return launching_jobs < _get_launch_parallelism()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--job-id',
                        required=True,
                        type=int,
                        help='Job id for the controller job.')
    parser.add_argument('dag_yaml',
                        type=str,
                        help='The path to the user job yaml file.')
    args = parser.parse_args()
    submit_job(args.job_id, args.dag_yaml)
```
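Read together with the recovery_strategy.py diff above, each managed job moves through a small schedule_state machine (WAITING → LAUNCHING → ALIVE → ALIVE_WAITING → LAUNCHING → ... → DONE) driven entirely by the entry points in this new module. As a rough worked example of the two limits: on an 8-CPU, 32 GiB controller, `_get_launch_parallelism` allows 8 × 4 = 32 concurrent launches, and `_get_job_parallelism` allows roughly 32 GiB / 350 MiB ≈ 93 alive jobs. The sketch below is illustrative only; the wrapper function and the bare `pass` stand in for the real callers spread across sky/jobs (controller, utils, recovery_strategy), and it simply shows the order in which one job hits the scheduler entry points.

```python
from sky.jobs import scheduler


def example_job_lifecycle(job_id: int, dag_yaml_path: str) -> None:
    """Illustrative only: not part of SkyPilot."""
    # 1. Submission marks the job WAITING; the scheduler starts its controller
    #    process once the launch/parallelism limits allow.
    scheduler.submit_job(job_id, dag_yaml_path)

    # 2. Inside the controller process, every cluster launch (first task,
    #    later tasks, recovery) waits for a LAUNCHING slot.
    with scheduler.scheduled_launch(job_id):
        pass  # sky.launch(...) runs here in recovery_strategy._launch

    # 3. When the job reaches a terminal status, free its slot so other
    #    WAITING jobs can be scheduled.
    scheduler.job_done(job_id, idempotent=True)
```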