skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250118__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,6 +17,7 @@ from sky import global_user_state
  from sky import sky_logging
  from sky import status_lib
  from sky.backends import backend_utils
+ from sky.jobs import scheduler
  from sky.jobs import utils as managed_job_utils
  from sky.skylet import job_lib
  from sky.usage import usage_lib
@@ -42,45 +43,20 @@ MAX_JOB_CHECKING_RETRY = 10
  _AUTODOWN_MINUTES = 5


- def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
- """Terminate the cluster."""
- retry_cnt = 0
- while True:
- try:
- usage_lib.messages.usage.set_internal()
- sky.down(cluster_name)
- return
- except exceptions.ClusterDoesNotExist:
- # The cluster is already down.
- logger.debug(f'The cluster {cluster_name} is already down.')
- return
- except Exception as e: # pylint: disable=broad-except
- retry_cnt += 1
- if retry_cnt >= max_retry:
- raise RuntimeError(
- f'Failed to terminate the cluster {cluster_name}.') from e
- logger.error(
- f'Failed to terminate the cluster {cluster_name}. Retrying.'
- f'Details: {common_utils.format_exception(e)}')
- with ux_utils.enable_traceback():
- logger.error(f' Traceback: {traceback.format_exc()}')
-
-
  class StrategyExecutor:
  """Handle the launching, recovery and termination of managed job clusters"""

  RETRY_INIT_GAP_SECONDS = 60

  def __init__(self, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool,
- max_restarts_on_errors: int) -> None:
+ task: 'task_lib.Task', max_restarts_on_errors: int,
+ job_id: int) -> None:
  """Initialize the strategy executor.

  Args:
  cluster_name: The name of the cluster.
  backend: The backend to use. Only CloudVMRayBackend is supported.
  task: The task to execute.
- retry_until_up: Whether to retry until the cluster is up.
  """
  assert isinstance(backend, backends.CloudVmRayBackend), (
  'Only CloudVMRayBackend is supported.')
@@ -88,8 +64,8 @@ class StrategyExecutor:
  self.dag.add(task)
  self.cluster_name = cluster_name
  self.backend = backend
- self.retry_until_up = retry_until_up
  self.max_restarts_on_errors = max_restarts_on_errors
+ self.job_id = job_id
  self.restart_cnt_on_failure = 0

  def __init_subclass__(cls, name: str, default: bool = False):
@@ -102,7 +78,7 @@ class StrategyExecutor:

  @classmethod
  def make(cls, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool) -> 'StrategyExecutor':
+ task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
  """Create a strategy from a task."""

  resource_list = list(task.resources)
@@ -127,8 +103,9 @@ class StrategyExecutor:
  job_recovery_name = job_recovery
  max_restarts_on_errors = 0
  return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
- task, retry_until_up,
- max_restarts_on_errors)
+ task,
+ max_restarts_on_errors,
+ job_id)

  def launch(self) -> float:
  """Launch the cluster for the first time.
@@ -142,10 +119,7 @@ class StrategyExecutor:
  Raises: Please refer to the docstring of self._launch().
  """

- if self.retry_until_up:
- job_submit_at = self._launch(max_retry=None)
- else:
- job_submit_at = self._launch()
+ job_submit_at = self._launch(max_retry=None)
  assert job_submit_at is not None
  return job_submit_at

@@ -195,7 +169,7 @@ class StrategyExecutor:
  f'{common_utils.format_exception(e)}\n'
  'Terminating the cluster explicitly to ensure no '
  'remaining job process interferes with recovery.')
- terminate_cluster(self.cluster_name)
+ managed_job_utils.terminate_cluster(self.cluster_name)

  def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
  """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -304,89 +278,96 @@ class StrategyExecutor:
  backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
  while True:
  retry_cnt += 1
- try:
- usage_lib.messages.usage.set_internal()
- # Detach setup, so that the setup failure can be detected
- # by the controller process (job_status -> FAILED_SETUP).
- sky.launch(
- self.dag,
- cluster_name=self.cluster_name,
- # We expect to tear down the cluster as soon as the job is
- # finished. However, in case the controller dies, set
- # autodown to try and avoid a resource leak.
- idle_minutes_to_autostop=_AUTODOWN_MINUTES,
- down=True,
- detach_setup=True,
- detach_run=True,
- _is_launched_by_jobs_controller=True)
- logger.info('Managed job cluster launched.')
- except (exceptions.InvalidClusterNameError,
- exceptions.NoCloudAccessError,
- exceptions.ResourcesMismatchError) as e:
- logger.error('Failure happened before provisioning. '
- f'{common_utils.format_exception(e)}')
- if raise_on_failure:
- raise exceptions.ProvisionPrechecksError(reasons=[e])
- return None
- except exceptions.ResourcesUnavailableError as e:
- # This is raised when the launch fails due to prechecks or
- # after failing over through all the candidates.
- # Please refer to the docstring of `sky.launch` for more
- # details of how the exception will be structured.
- if not any(
- isinstance(err, exceptions.ResourcesUnavailableError)
- for err in e.failover_history):
- # _launch() (this function) should fail/exit directly, if
- # none of the failover reasons were because of resource
- # unavailability or no failover was attempted (the optimizer
- # cannot find feasible resources for requested resources),
- # i.e., e.failover_history is empty.
- # Failing directly avoids the infinite loop of retrying
- # the launch when, e.g., an invalid cluster name is used
- # and --retry-until-up is specified.
- reasons = (e.failover_history
- if e.failover_history else [e])
- reasons_str = '; '.join(
- common_utils.format_exception(err) for err in reasons)
- logger.error(
- 'Failure happened before provisioning. Failover '
- f'reasons: {reasons_str}')
+ with scheduler.scheduled_launch(self.job_id):
+ try:
+ usage_lib.messages.usage.set_internal()
+ # Detach setup, so that the setup failure can be detected
+ # by the controller process (job_status -> FAILED_SETUP).
+ sky.launch(
+ self.dag,
+ cluster_name=self.cluster_name,
+ # We expect to tear down the cluster as soon as the job
+ # is finished. However, in case the controller dies, set
+ # autodown to try and avoid a resource leak.
+ idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+ down=True,
+ detach_setup=True,
+ detach_run=True,
+ _is_launched_by_jobs_controller=True)
+ logger.info('Managed job cluster launched.')
+ except (exceptions.InvalidClusterNameError,
+ exceptions.NoCloudAccessError,
+ exceptions.ResourcesMismatchError) as e:
+ logger.error('Failure happened before provisioning. '
+ f'{common_utils.format_exception(e)}')
  if raise_on_failure:
- raise exceptions.ProvisionPrechecksError(reasons)
- return None
- logger.info('Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
- except Exception as e: # pylint: disable=broad-except
- # If the launch fails, it will be recovered by the following
- # code.
- logger.info('Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
- with ux_utils.enable_traceback():
- logger.info(f' Traceback: {traceback.format_exc()}')
- else: # No exception, the launch succeeds.
- # At this point, a sky.launch() has succeeded. Cluster may be
- # UP (no preemption since) or DOWN (newly preempted).
- job_submitted_at = self._wait_until_job_starts_on_cluster()
- if job_submitted_at is not None:
- return job_submitted_at
- # The job fails to start on the cluster, retry the launch.
- # TODO(zhwu): log the unexpected error to usage collection
- # for future debugging.
- logger.info(
- 'Failed to successfully submit the job to the '
- 'launched cluster, due to unexpected submission errors or '
- 'the cluster being preempted during job submission.')
-
- terminate_cluster(self.cluster_name)
- if max_retry is not None and retry_cnt >= max_retry:
- # Retry forever if max_retry is None.
- if raise_on_failure:
- with ux_utils.print_exception_no_traceback():
- raise exceptions.ManagedJobReachedMaxRetriesError(
- 'Resources unavailable: failed to launch clusters '
- f'after {max_retry} retries.')
- else:
+ raise exceptions.ProvisionPrechecksError(reasons=[e])
  return None
+ except exceptions.ResourcesUnavailableError as e:
+ # This is raised when the launch fails due to prechecks or
+ # after failing over through all the candidates.
+ # Please refer to the docstring of `sky.launch` for more
+ # details of how the exception will be structured.
+ if not any(
+ isinstance(err,
+ exceptions.ResourcesUnavailableError)
+ for err in e.failover_history):
+ # _launch() (this function) should fail/exit directly,
+ # if none of the failover reasons were because of
+ # resource unavailability or no failover was attempted
+ # (the optimizer cannot find feasible resources for
+ # requested resources), i.e., e.failover_history is
+ # empty. Failing directly avoids the infinite loop of
+ # retrying the launch when, e.g., an invalid cluster
+ # name is used and --retry-until-up is specified.
+ reasons = (e.failover_history
+ if e.failover_history else [e])
+ reasons_str = '; '.join(
+ common_utils.format_exception(err)
+ for err in reasons)
+ logger.error(
+ 'Failure happened before provisioning. Failover '
+ f'reasons: {reasons_str}')
+ if raise_on_failure:
+ raise exceptions.ProvisionPrechecksError(reasons)
+ return None
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
+ except Exception as e: # pylint: disable=broad-except
+ # If the launch fails, it will be recovered by the following
+ # code.
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
+ with ux_utils.enable_traceback():
+ logger.info(f' Traceback: {traceback.format_exc()}')
+ else: # No exception, the launch succeeds.
+ # At this point, a sky.launch() has succeeded. Cluster may
+ # be UP (no preemption since) or DOWN (newly preempted).
+ job_submitted_at = self._wait_until_job_starts_on_cluster()
+ if job_submitted_at is not None:
+ return job_submitted_at
+ # The job fails to start on the cluster, retry the launch.
+ # TODO(zhwu): log the unexpected error to usage collection
+ # for future debugging.
+ logger.info(
+ 'Failed to successfully submit the job to the '
+ 'launched cluster, due to unexpected submission errors '
+ 'or the cluster being preempted during job submission.')
+
+ # If we get here, the launch did not succeed. Tear down the
+ # cluster and retry.
+ managed_job_utils.terminate_cluster(self.cluster_name)
+ if max_retry is not None and retry_cnt >= max_retry:
+ # Retry forever if max_retry is None.
+ if raise_on_failure:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.ManagedJobReachedMaxRetriesError(
+ 'Resources unavailable: failed to launch '
+ f'clusters after {max_retry} retries.')
+ else:
+ return None
+ # Exit the scheduled_launch context so that the schedule state is
+ # ALIVE during the backoff. This allows other jobs to launch.
  gap_seconds = backoff.current_backoff()
  logger.info('Retrying to launch the cluster in '
  f'{gap_seconds:.1f} seconds.')
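
The shape of this reworked loop is easier to see without the diff markers. A simplified sketch (illustrative only, not the package code; `attempt` is a placeholder for the try/except block above):

    import time

    from sky.jobs import scheduler
    from sky.utils import common_utils

    def launch_loop(executor, attempt) -> float:
        # `attempt` stands in for one sky.launch try plus the wait for the job
        # to start; it returns a submission timestamp on success, else None.
        backoff = common_utils.Backoff(executor.RETRY_INIT_GAP_SECONDS)
        while True:
            with scheduler.scheduled_launch(executor.job_id):
                job_submitted_at = attempt()
                if job_submitted_at is not None:
                    return job_submitted_at
            # Outside the context the schedule state is back to ALIVE, so
            # other jobs may launch while this one backs off before retrying.
            time.sleep(backoff.current_backoff())
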
@@ -411,10 +392,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.

  def __init__(self, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool,
- max_restarts_on_errors: int) -> None:
- super().__init__(cluster_name, backend, task, retry_until_up,
- max_restarts_on_errors)
+ task: 'task_lib.Task', max_restarts_on_errors: int,
+ job_id: int) -> None:
+ super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+ job_id)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is
@@ -468,7 +449,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  # Step 2
  logger.debug('Terminating unhealthy cluster and reset cloud '
  'region.')
- terminate_cluster(self.cluster_name)
+ managed_job_utils.terminate_cluster(self.cluster_name)

  # Step 3
  logger.debug('Relaunch the cluster without constraining to prior '
@@ -478,16 +459,11 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  raise_on_failure=False)
  if job_submitted_at is None:
  # Failed to launch the cluster.
- if self.retry_until_up:
- gap_seconds = self.RETRY_INIT_GAP_SECONDS
- logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
- continue
- with ux_utils.print_exception_no_traceback():
- raise exceptions.ResourcesUnavailableError(
- f'Failed to recover the cluster after retrying '
- f'{self._MAX_RETRY_CNT} times.')
+ gap_seconds = self.RETRY_INIT_GAP_SECONDS
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ time.sleep(gap_seconds)
+ continue

  return job_submitted_at

@@ -531,7 +507,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,

  # Step 1
  logger.debug('Terminating unhealthy cluster and reset cloud region.')
- terminate_cluster(self.cluster_name)
+ managed_job_utils.terminate_cluster(self.cluster_name)

  # Step 2
  logger.debug('Relaunch the cluster skipping the previously launched '
@@ -566,15 +542,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
  raise_on_failure=False)
  if job_submitted_at is None:
  # Failed to launch the cluster.
- if self.retry_until_up:
- gap_seconds = self.RETRY_INIT_GAP_SECONDS
- logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
- continue
- with ux_utils.print_exception_no_traceback():
- raise exceptions.ResourcesUnavailableError(
- f'Failed to recover the cluster after retrying '
- f'{self._MAX_RETRY_CNT} times.')
+ gap_seconds = self.RETRY_INIT_GAP_SECONDS
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ time.sleep(gap_seconds)
+ continue

  return job_submitted_at
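
Both failover strategies now share the same recovery tail: with retry_until_up removed, a failed relaunch always waits and retries instead of raising ResourcesUnavailableError. Distilled into a standalone sketch (illustrative only; `relaunch` is a placeholder for self._launch(..., raise_on_failure=False)):

    import time

    RETRY_INIT_GAP_SECONDS = 60  # same value as StrategyExecutor's constant

    def recover_until_up(relaunch) -> float:
        while True:
            job_submitted_at = relaunch()
            if job_submitted_at is None:
                # Failed to launch the cluster; wait and try again.
                time.sleep(RETRY_INIT_GAP_SECONDS)
                continue
            return job_submitted_at
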
sky/jobs/scheduler.py ADDED
@@ -0,0 +1,283 @@
+ """Scheduler for managed jobs.
+
+ Once managed jobs are submitted via submit_job, the scheduler is responsible for
+ the business logic of deciding when they are allowed to start, and choosing the
+ right one to start. The scheduler will also schedule jobs that are already live
+ but waiting to launch a new task or recover.
+
+ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
+ be called from any code running on the managed jobs controller instance to
+ trigger scheduling of new jobs if possible. This function should be called
+ immediately after any state change that could result in jobs newly being able to
+ be scheduled.
+
+ The scheduling logic limits the number of running jobs according to two limits:
+ 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
+ once, based on the number of CPUs. (See _get_launch_parallelism.) This is the
+ most compute-intensive part of the job lifecycle, which is why we have an
+ additional limit.
+ 2. The number of jobs that can be running at any given time, based on the amount
+ of memory. (See _get_job_parallelism.) Since the job controller is doing very
+ little once a job starts (just checking its status periodically), the most
+ significant resource it consumes is memory.
+
+ The state of the scheduler is entirely determined by the schedule_state column
+ of all the jobs in the job_info table. This column should only be modified via
+ the functions defined in this file. We will always hold the lock while modifying
+ this state. See state.ManagedJobScheduleState.
+
+ Nomenclature:
+ - job: same as managed job (may include multiple tasks)
+ - launch/launching: launching a cluster (sky.launch) as part of a job
+ - start/run: create the job controller process for a job
+ - schedule: transition a job to the LAUNCHING state, whether a new job or a job
+ that is already alive
+ - alive: a job controller exists (includes multiple schedule_states: ALIVE,
+ ALIVE_WAITING, LAUNCHING)
+ """
+
+ from argparse import ArgumentParser
+ import contextlib
+ from functools import lru_cache
+ import os
+ import time
+
+ import filelock
+ import psutil
+
+ from sky import sky_logging
+ from sky.jobs import constants as managed_job_constants
+ from sky.jobs import state
+ from sky.skylet import constants
+ from sky.utils import subprocess_utils
+
+ logger = sky_logging.init_logger('sky.jobs.controller')
+
+ # The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
+ # parallelism control or updating the schedule_state of any job.
+ # Any code that takes this lock must conclude by calling
+ # maybe_schedule_next_jobs.
+ _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
+ _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
+
+
+ @lru_cache(maxsize=1)
+ def _get_lock_path() -> str:
+ path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+ return path
+
+
+ def maybe_schedule_next_jobs() -> None:
+ """Determine if any managed jobs can be scheduled, and if so, schedule them.
+
+ Here, "schedule" means to select a job that is waiting, and allow it to
+ proceed. It does NOT mean to submit a job to the scheduler.
+
+ For newly submitted jobs, scheduling means updating the state of the jobs,
+ and starting the job controller process. For jobs that are already alive but
+ are waiting to launch a new task or recover, just update the state of the
+ job to indicate that the launch can proceed.
+
+ This function transitions jobs into LAUNCHING on a best-effort basis. That
+ is, if we can start any jobs, we will, but if not, we will exit (almost)
+ immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
+ be started now (either because the lock is held, or because there are not
+ enough resources), another call to this function will be made whenever that
+ situation is resolved. (If the lock is held, the lock holder should start
+ the jobs. If there aren't enough resources, the next controller to exit and
+ free up resources should start the jobs.)
+
+ If this function obtains the lock, it will launch as many jobs as possible
+ before releasing the lock. This is what allows other calls to exit
+ immediately if the lock is held, while ensuring that all jobs are started as
+ soon as possible.
+
+ This uses subprocess_utils.launch_new_process_tree() to start the controller
+ processes, which should be safe to call from pretty much any code running on
+ the jobs controller instance. New job controller processes will be detached
+ from the current process and there will not be a parent/child relationship.
+ See launch_new_process_tree for more.
+ """
+ try:
+ # We must use a global lock rather than a per-job lock to ensure correct
+ # parallelism control. If we cannot obtain the lock, exit immediately.
+ # The current lock holder is expected to launch any jobs it can before
+ # releasing the lock.
+ with filelock.FileLock(_get_lock_path(), blocking=False):
+ while True:
+ maybe_next_job = state.get_waiting_job()
+ if maybe_next_job is None:
+ # Nothing left to start, break from scheduling loop
+ break
+
+ current_state = maybe_next_job['schedule_state']
+
+ assert current_state in (
+ state.ManagedJobScheduleState.ALIVE_WAITING,
+ state.ManagedJobScheduleState.WAITING), maybe_next_job
+
+ # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
+ # since they will have been submitted and therefore started
+ # first. The requirements to launch in an alive job are more
+ # lenient, so there is no way that we wouldn't be able to launch
+ # an ALIVE_WAITING job, but we would be able to launch a WAITING
+ # job.
+ if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
+ if not _can_lauch_in_alive_job():
+ # Can't schedule anything, break from scheduling loop.
+ break
+ elif current_state == state.ManagedJobScheduleState.WAITING:
+ if not _can_start_new_job():
+ # Can't schedule anything, break from scheduling loop.
+ break
+
+ logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
+ state.scheduler_set_launching(maybe_next_job['job_id'],
+ current_state)
+
+ if current_state == state.ManagedJobScheduleState.WAITING:
+ # The job controller has not been started yet. We must start
+ # it.
+
+ job_id = maybe_next_job['job_id']
+ dag_yaml_path = maybe_next_job['dag_yaml_path']
+
+ # If the command line here is changed, please also update
+ # utils._controller_process_alive. `--job-id X` should be at
+ # the end.
+ run_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};'
+ 'python -u -m sky.jobs.controller '
+ f'{dag_yaml_path} --job-id {job_id}')
+
+ logs_dir = os.path.expanduser(
+ managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+ os.makedirs(logs_dir, exist_ok=True)
+ log_path = os.path.join(logs_dir, f'{job_id}.log')
+
+ pid = subprocess_utils.launch_new_process_tree(
+ run_cmd, log_output=log_path)
+ state.set_job_controller_pid(job_id, pid)
+
+ logger.debug(f'Job {job_id} started with pid {pid}')
+
+ except filelock.Timeout:
+ # If we can't get the lock, just exit. The process holding the lock
+ # should launch any pending jobs.
+ pass
+
+
+ def submit_job(job_id: int, dag_yaml_path: str) -> None:
+ """Submit an existing job to the scheduler.
+
+ This should be called after a job is created in the `spot` table as
+ PENDING. It will tell the scheduler to try and start the job controller, if
+ there are resources available. It may block to acquire the lock, so it
+ should not be on the critical path for `sky jobs launch -d`.
+ """
+ with filelock.FileLock(_get_lock_path()):
+ state.scheduler_set_waiting(job_id, dag_yaml_path)
+ maybe_schedule_next_jobs()
+
+
+ @contextlib.contextmanager
+ def scheduled_launch(job_id: int):
+ """Launch as part of an ongoing job.
+
+ A newly started job will already be LAUNCHING, and this will immediately
+ enter the context.
+
+ If a job is ongoing (ALIVE schedule_state), there are two scenarios where we
+ may need to call sky.launch again during the course of a job controller:
+ - for tasks after the first task
+ - for recovery
+
+ This function will mark the job as ALIVE_WAITING, which indicates to the
+ scheduler that it wants to transition back to LAUNCHING. Then, it will wait
+ until the scheduler transitions the job state, before entering the context.
+
+ On exiting the context, the job will transition to ALIVE.
+
+ This should only be used within the job controller for the given job_id. If
+ multiple uses of this context are nested, behavior is undefined. Don't do
+ that.
+ """
+
+ # If we're already in LAUNCHING schedule_state, we don't need to wait.
+ # This may be the case for the first launch of a job.
+ if (state.get_job_schedule_state(job_id) !=
+ state.ManagedJobScheduleState.LAUNCHING):
+ # Since we aren't LAUNCHING, we need to wait to be scheduled.
+ _set_alive_waiting(job_id)
+
+ while (state.get_job_schedule_state(job_id) !=
+ state.ManagedJobScheduleState.LAUNCHING):
+ time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+
+ yield
+
+ with filelock.FileLock(_get_lock_path()):
+ state.scheduler_set_alive(job_id)
+ maybe_schedule_next_jobs()
+
+
+ def job_done(job_id: int, idempotent: bool = False) -> None:
+ """Transition a job to DONE.
+
+ If idempotent is True, this will not raise an error if the job is already
+ DONE.
+
+ The job could be in any terminal ManagedJobStatus. However, once DONE, it
+ should never transition back to another state.
+ """
+ if idempotent and (state.get_job_schedule_state(job_id)
+ == state.ManagedJobScheduleState.DONE):
+ return
+
+ with filelock.FileLock(_get_lock_path()):
+ state.scheduler_set_done(job_id, idempotent)
+ maybe_schedule_next_jobs()
+
+
+ def _set_alive_waiting(job_id: int) -> None:
+ """Should use wait_until_launch_okay() to transition to this state."""
+ with filelock.FileLock(_get_lock_path()):
+ state.scheduler_set_alive_waiting(job_id)
+ maybe_schedule_next_jobs()
+
+
+ def _get_job_parallelism() -> int:
+ # Assume a running job uses 350MB memory.
+ # We observe 230-300 in practice.
+ job_memory = 350 * 1024 * 1024
+ return max(psutil.virtual_memory().total // job_memory, 1)
+
+
+ def _get_launch_parallelism() -> int:
+ cpus = os.cpu_count()
+ return cpus * 4 if cpus is not None else 1
+
+
+ def _can_start_new_job() -> bool:
+ launching_jobs = state.get_num_launching_jobs()
+ alive_jobs = state.get_num_alive_jobs()
+ return launching_jobs < _get_launch_parallelism(
+ ) and alive_jobs < _get_job_parallelism()
+
+
+ def _can_lauch_in_alive_job() -> bool:
+ launching_jobs = state.get_num_launching_jobs()
+ return launching_jobs < _get_launch_parallelism()
+
+
+ if __name__ == '__main__':
+ parser = ArgumentParser()
+ parser.add_argument('--job-id',
+ required=True,
+ type=int,
+ help='Job id for the controller job.')
+ parser.add_argument('dag_yaml',
+ type=str,
+ help='The path to the user job yaml file.')
+ args = parser.parse_args()
+ submit_job(args.job_id, args.dag_yaml)
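
Taken together, the new module exposes three entry points. The sketch below (illustrative only; job_id and dag_yaml_path are placeholders, and the two functions run in different processes on the jobs controller instance) shows roughly how they are meant to be used:

    from sky.jobs import scheduler

    def on_job_submission(job_id: int, dag_yaml_path: str) -> None:
        # Called once the job row exists as PENDING: marks the job WAITING
        # and, if a launch slot (4 x CPU count) and an alive-job slot
        # (roughly total memory / 350 MB) are free, starts its controller
        # process.
        scheduler.submit_job(job_id, dag_yaml_path)

    def inside_job_controller(job_id: int) -> None:
        # Every sky.launch (first launch, later tasks, recovery) is gated so
        # that the parallelism limits above are respected.
        with scheduler.scheduled_launch(job_id):
            pass  # sky.launch(...) runs here in the real controller
        # Once the job reaches a terminal ManagedJobStatus, release its slot
        # so that waiting jobs can be scheduled.
        scheduler.job_done(job_id)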