skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250118__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
sky/jobs/utils.py CHANGED
@@ -13,23 +13,28 @@ import shlex
13
13
  import shutil
14
14
  import textwrap
15
15
  import time
16
+ import traceback
16
17
  import typing
17
18
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
18
19
 
19
20
  import colorama
20
21
  import filelock
22
+ import psutil
21
23
  from typing_extensions import Literal
22
24
 
25
+ import sky
23
26
  from sky import backends
24
27
  from sky import exceptions
25
28
  from sky import global_user_state
26
29
  from sky import sky_logging
27
30
  from sky.backends import backend_utils
28
31
  from sky.jobs import constants as managed_job_constants
32
+ from sky.jobs import scheduler
29
33
  from sky.jobs import state as managed_job_state
30
34
  from sky.skylet import constants
31
35
  from sky.skylet import job_lib
32
36
  from sky.skylet import log_lib
37
+ from sky.usage import usage_lib
33
38
  from sky.utils import common_utils
34
39
  from sky.utils import log_utils
35
40
  from sky.utils import rich_utils
@@ -37,7 +42,6 @@ from sky.utils import subprocess_utils
37
42
  from sky.utils import ux_utils
38
43
 
39
44
  if typing.TYPE_CHECKING:
40
- import sky
41
45
  from sky import dag as dag_lib
42
46
 
43
47
  logger = sky_logging.init_logger(__name__)
@@ -69,8 +73,10 @@ _JOB_CANCELLED_MESSAGE = (
69
73
  # The maximum time to wait for the managed job status to transition to terminal
70
74
  # state, after the job finished. This is a safeguard to avoid the case where
71
75
  # the managed job status fails to be updated and keep the `sky jobs logs`
72
- # blocking for a long time.
73
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
76
+ # blocking for a long time. This should be significantly longer than the
77
+ # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
78
+ # update the state.
79
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
74
80
 
75
81
 
76
82
  class UserSignal(enum.Enum):
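
Note on the bump from 25 to 40 seconds: a timeout that guards a polled state transition has to comfortably exceed the polling interval, or the waiter can give up between two status checks. Below is a minimal sketch of that wait pattern, assuming a get_status callable that mirrors the managed job status API (illustrative only, not code from this package):

    import time

    def wait_for_terminal_status(get_status, timeout_seconds=40, poll_seconds=5):
        """Poll get_status() until it reports a terminal state or we time out."""
        deadline = time.time() + timeout_seconds
        while time.time() < deadline:
            status = get_status()
            if status is not None and status.is_terminal():
                return status
            time.sleep(poll_seconds)
        # Timed out: the caller decides whether to warn, retry, or give up.
        return None
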
@@ -81,6 +87,43 @@ class UserSignal(enum.Enum):
81
87
 
82
88
 
83
89
  # ====== internal functions ======
90
+ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
91
+ """Terminate the cluster."""
92
+ retry_cnt = 0
93
+ # In some cases, e.g. botocore.exceptions.NoCredentialsError due to AWS
94
+ # metadata service throttling, the failed sky.down attempt can take 10-11
95
+ # seconds. In this case, we need the backoff to significantly reduce the
96
+ # rate of requests - that is, significantly increase the time between
97
+ # requests. We set the initial backoff to 15 seconds, so that once it grows
98
+ # exponentially it will quickly dominate the 10-11 seconds that we already
99
+ # see between requests. We set the max backoff very high, since it's
100
+ # generally much more important to eventually succeed than to fail fast.
101
+ backoff = common_utils.Backoff(
102
+ initial_backoff=15,
103
+ # 1.6 ** 5 = 10.48576 < 20, so we won't hit this with default max_retry
104
+ max_backoff_factor=20)
105
+ while True:
106
+ try:
107
+ usage_lib.messages.usage.set_internal()
108
+ sky.down(cluster_name)
109
+ return
110
+ except exceptions.ClusterDoesNotExist:
111
+ # The cluster is already down.
112
+ logger.debug(f'The cluster {cluster_name} is already down.')
113
+ return
114
+ except Exception as e: # pylint: disable=broad-except
115
+ retry_cnt += 1
116
+ if retry_cnt >= max_retry:
117
+ raise RuntimeError(
118
+ f'Failed to terminate the cluster {cluster_name}.') from e
119
+ logger.error(
120
+ f'Failed to terminate the cluster {cluster_name}. Retrying. '
121
+ f'Details: {common_utils.format_exception(e)}')
122
+ with ux_utils.enable_traceback():
123
+ logger.error(f' Traceback: {traceback.format_exc()}')
124
+ time.sleep(backoff.current_backoff())
125
+
126
+
84
127
  def get_job_status(backend: 'backends.CloudVmRayBackend',
85
128
  cluster_name: str) -> Optional['job_lib.JobStatus']:
86
129
  """Check the status of the job running on a managed job cluster.
@@ -105,57 +148,145 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
105
148
  return status
106
149
 
107
150
 
151
+ def _controller_process_alive(pid: int, job_id: int) -> bool:
152
+ """Check if the controller process is alive."""
153
+ try:
154
+ process = psutil.Process(pid)
155
+ # The last two args of the command line should be --job-id <id>
156
+ job_args = process.cmdline()[-2:]
157
+ return process.is_running() and job_args == ['--job-id', str(job_id)]
158
+ except psutil.NoSuchProcess:
159
+ return False
160
+
161
+
108
162
  def update_managed_job_status(job_id: Optional[int] = None):
109
- """Update managed job status if the controller job failed abnormally.
163
+ """Update managed job status if the controller process failed abnormally.
164
+
165
+ Check the status of the controller process. If it is not running, it must
166
+ have exited abnormally, and we should set the job status to
167
+ FAILED_CONTROLLER. `end_at` will be set to the current timestamp for the job
168
+ when the above happens, which may be inaccurate depending on how often this
169
+ function is called.
110
170
 
111
- Check the status of the controller job. If it is not running, it must have
112
- exited abnormally, and we should set the job status to FAILED_CONTROLLER.
113
- `end_at` will be set to the current timestamp for the job when above
114
- happens, which could be not accurate based on the frequency this function
115
- is called.
171
+ Note: we expect that job_id, if provided, refers to a nonterminal job.
116
172
  """
173
+
117
174
  if job_id is None:
175
+ # Warning: it's totally possible for the managed job to transition to
176
+ # a terminal status during the course of this function. The set_failed()
177
+ # called below will not update the state for jobs that already have a
178
+ # terminal status, so it should be fine.
118
179
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
119
180
  else:
120
181
  job_ids = [job_id]
121
182
  for job_id_ in job_ids:
122
- controller_status = job_lib.get_status(job_id_)
123
- if controller_status is None or controller_status.is_terminal():
124
- logger.error(f'Controller for job {job_id_} has exited abnormally. '
125
- 'Setting the job status to FAILED_CONTROLLER.')
126
- tasks = managed_job_state.get_managed_jobs(job_id_)
127
- for task in tasks:
128
- task_name = task['job_name']
129
- # Tear down the abnormal cluster to avoid resource leakage.
130
- cluster_name = generate_managed_job_cluster_name(
131
- task_name, job_id_)
132
- handle = global_user_state.get_handle_from_cluster_name(
133
- cluster_name)
134
- if handle is not None:
135
- backend = backend_utils.get_backend_from_handle(handle)
136
- max_retry = 3
137
- for retry_cnt in range(max_retry):
138
- try:
139
- backend.teardown(handle, terminate=True)
140
- break
141
- except RuntimeError:
142
- logger.error('Failed to tear down the cluster '
143
- f'{cluster_name!r}. Retrying '
144
- f'[{retry_cnt}/{max_retry}].')
145
-
146
- # The controller job for this managed job is not running: it must
147
- # have exited abnormally, and we should set the job status to
148
- # FAILED_CONTROLLER.
149
- # The `set_failed` will only update the task's status if the
150
- # status is non-terminal.
151
- managed_job_state.set_failed(
152
- job_id_,
153
- task_id=None,
154
- failure_type=managed_job_state.ManagedJobStatus.
155
- FAILED_CONTROLLER,
156
- failure_reason=
157
- 'Controller process has exited abnormally. For more details,'
158
- f' run: sky jobs logs --controller {job_id_}')
183
+
184
+ failure_reason = None
185
+
186
+ tasks = managed_job_state.get_managed_jobs(job_id_)
187
+ schedule_state = tasks[0]['schedule_state']
188
+ if schedule_state is None:
189
+ # Backwards compatibility: this job was submitted when ray was still
190
+ # used for managing the parallelism of job controllers.
191
+ # TODO(cooperc): Remove before 0.11.0.
192
+ controller_status = job_lib.get_status(job_id_)
193
+ if controller_status is None or controller_status.is_terminal():
194
+ logger.error(f'Controller process for legacy job {job_id_} is '
195
+ 'in an unexpected state.')
196
+ failure_reason = 'Legacy job is in an unexpected state'
197
+
198
+ # Continue to mark the job as failed.
199
+ else:
200
+ # Still running.
201
+ continue
202
+ else:
203
+ pid = tasks[0]['controller_pid']
204
+ if pid is None:
205
+ if schedule_state in (
206
+ managed_job_state.ManagedJobScheduleState.INACTIVE,
207
+ managed_job_state.ManagedJobScheduleState.WAITING):
208
+ # Job has not been scheduled yet.
209
+ continue
210
+ elif (schedule_state ==
211
+ managed_job_state.ManagedJobScheduleState.LAUNCHING):
212
+ # This should only be the case for a very short period of
213
+ # time between marking the job as submitted and writing the
214
+ # launched controller process pid back to the database (see
215
+ # scheduler.maybe_schedule_next_jobs).
216
+ # TODO(cooperc): Find a way to detect if we get stuck in
217
+ # this state.
218
+ logger.info(f'Job {job_id_} is in LAUNCHING state, '
219
+ 'but controller process hasn\'t started yet.')
220
+ continue
221
+ # All other statuses are unexpected. Proceed to mark as failed.
222
+ logger.error(f'Expected to find a controller pid for state '
223
+ f'{schedule_state.value} but found none.')
224
+ failure_reason = ('No controller pid set for '
225
+ f'{schedule_state.value}')
226
+ else:
227
+ logger.debug(f'Checking controller pid {pid}')
228
+ if _controller_process_alive(pid, job_id_):
229
+ # The controller is still running.
230
+ continue
231
+ # Otherwise, proceed to mark the job as failed.
232
+ logger.error(f'Controller process for {job_id_} seems to be '
233
+ 'dead.')
234
+ failure_reason = 'Controller process is dead'
235
+
236
+ logger.error(f'Controller process for job {job_id_} has exited '
237
+ 'abnormally. Setting the job status to FAILED_CONTROLLER.')
238
+ for task in tasks:
239
+ task_name = task['job_name']
240
+ # Tear down the abnormal cluster to avoid resource leakage.
241
+ cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
242
+ handle = global_user_state.get_handle_from_cluster_name(
243
+ cluster_name)
244
+ # If the cluster exists, terminate it.
245
+ if handle is not None:
246
+ terminate_cluster(cluster_name)
247
+
248
+ # The controller process for this managed job is not running: it must
249
+ # have exited abnormally, and we should set the job status to
250
+ # FAILED_CONTROLLER.
251
+ # The `set_failed` will only update the task's status if the
252
+ # status is non-terminal.
253
+ managed_job_state.set_failed(
254
+ job_id_,
255
+ task_id=None,
256
+ failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
257
+ failure_reason=
258
+ f'Controller process has exited abnormally ({failure_reason}). For '
259
+ f'more details, run: sky jobs logs --controller {job_id_}')
260
+ scheduler.job_done(job_id_, idempotent=True)
261
+
262
+ # Some jobs may be in a terminal status, but are not yet DONE. For instance,
263
+ # they may be still cleaning up resources, etc. Such jobs won't be captured
264
+ # by the above check, which only looks at nonterminal jobs. So, check the
265
+ # controller liveness of all jobs that should have live controller
266
+ # processes.
267
+ for job_info in managed_job_state.get_schedule_live_jobs(job_id):
268
+ if not job_info['controller_pid']:
269
+ # Technically, a job with no controller process but in LAUNCHING
270
+ # schedule state can happen very briefly after the job is set to
271
+ # LAUNCHING but before the controller process is actually spawned.
272
+ # However, if we observe any state other than LAUNCHING, something
273
+ # is clearly wrong.
274
+ if (job_info['schedule_state'] !=
275
+ managed_job_state.ManagedJobScheduleState.LAUNCHING):
276
+ logger.error(
277
+ f'Missing controller PID for {job_info["job_id"]}. '
278
+ 'Setting to DONE.')
279
+ scheduler.job_done(job_info['job_id'])
280
+ else:
281
+ logger.info(f'LAUNCHING job {job_info["job_id"]} has no '
282
+ 'controller process yet. Skipping.')
283
+
284
+ elif not _controller_process_alive(job_info['controller_pid'],
285
+ job_info['job_id']):
286
+ logger.error(
287
+ f'Controller process for job {job_info["job_id"]} is not '
288
+ 'alive. Marking the job as DONE.')
289
+ scheduler.job_done(job_info['job_id'])
159
290
 
160
291
 
161
292
  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
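
The liveness check above deliberately inspects the process command line rather than trusting the PID alone: PIDs are recycled by the OS, so a live process at the recorded PID may no longer be the job's controller. A standalone sketch of the guard, assuming (as the code above does) that the controller was launched with `--job-id <id>` as its final two arguments:

    import psutil

    def controller_process_alive(pid, job_id):
        """True only if pid is running AND still looks like this job's controller."""
        try:
            proc = psutil.Process(pid)
            # cmdline() returns the argv list; compare its tail to guard
            # against an unrelated process that has reused the PID.
            return proc.is_running() and proc.cmdline()[-2:] == ['--job-id', str(job_id)]
        except psutil.NoSuchProcess:
            return False
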
@@ -546,15 +677,75 @@ def stream_logs(job_id: Optional[int],
546
677
  'instead.')
547
678
  job_id = managed_job_ids.pop()
548
679
  assert job_id is not None, (job_id, job_name)
549
- # TODO: keep the following code sync with
550
- # job_lib.JobLibCodeGen.tail_logs, we do not directly call that function
551
- # as the following code need to be run in the current machine, instead
552
- # of running remotely.
553
- run_timestamp = job_lib.get_run_timestamp(job_id)
554
- if run_timestamp is None:
555
- return f'No managed job contrller log found with job_id {job_id}.'
556
- log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
557
- log_lib.tail_logs(job_id=job_id, log_dir=log_dir, follow=follow)
680
+
681
+ controller_log_path = os.path.join(
682
+ os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
683
+ f'{job_id}.log')
684
+ job_status = None
685
+
686
+ # Wait for the log file to be written
687
+ while not os.path.exists(controller_log_path):
688
+ if not follow:
689
+ # Assume that the log file hasn't been written yet. Since we
690
+ # aren't following, just return.
691
+ return ''
692
+
693
+ job_status = managed_job_state.get_status(job_id)
694
+ if job_status is None:
695
+ with ux_utils.print_exception_no_traceback():
696
+ raise ValueError(f'Job {job_id} not found.')
697
+ # We shouldn't count CANCELLING as terminal here, the controller is
698
+ # still cleaning up.
699
+ if (job_status.is_terminal() and job_status !=
700
+ managed_job_state.ManagedJobStatus.CANCELLING):
701
+ # Don't keep waiting. If the log file is not created by this
702
+ # point, it never will be. This job may have been submitted
703
+ # using an old version that did not create the log file, so this
704
+ # is not considered an exceptional case.
705
+ return ''
706
+
707
+ time.sleep(log_lib.SKY_LOG_WAITING_GAP_SECONDS)
708
+
709
+ # This code is based on log_lib.tail_logs. We can't use that code
710
+ # exactly because state works differently between managed jobs and
711
+ # normal jobs.
712
+ with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
713
+ # Note: we do not need to care about start_stream_at here, since
714
+ # that should be in the job log printed above.
715
+ for line in f:
716
+ print(line, end='')
717
+ # Flush.
718
+ print(end='', flush=True)
719
+
720
+ if follow:
721
+ while True:
722
+ # Print all new lines, if there are any.
723
+ line = f.readline()
724
+ while line is not None and line != '':
725
+ print(line, end='')
726
+ line = f.readline()
727
+
728
+ # Flush.
729
+ print(end='', flush=True)
730
+
731
+ # Check if the job is finished.
732
+ job_status = managed_job_state.get_status(job_id)
733
+ assert job_status is not None, (job_id, job_name)
734
+ if job_status.is_terminal():
735
+ break
736
+
737
+ time.sleep(log_lib.SKY_LOG_TAILING_GAP_SECONDS)
738
+
739
+ # Wait for final logs to be written.
740
+ time.sleep(1 + log_lib.SKY_LOG_TAILING_GAP_SECONDS)
741
+
742
+ # Print any remaining logs including incomplete line.
743
+ print(f.read(), end='', flush=True)
744
+
745
+ if follow:
746
+ return ux_utils.finishing_message(
747
+ f'Job finished (status: {job_status}).')
748
+
558
749
  return ''
559
750
 
560
751
  if job_id is None:
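
The streaming logic above is essentially `tail -f` in Python: drain what is already in the file, then poll readline() with a short sleep until the job reaches a terminal state, and finally read once more to catch lines written just before the status flipped. A framework-free sketch of that loop (is_finished stands in for the managed job status check):

    import time

    def follow_log(path, is_finished, poll_seconds=0.2):
        """Print a log file and keep tailing it until is_finished() is True."""
        with open(path, 'r', newline='', encoding='utf-8') as f:
            for line in f:                 # Drain everything written so far.
                print(line, end='')
            while not is_finished():
                line = f.readline()        # '' means no new data yet.
                if line:
                    print(line, end='')
                else:
                    time.sleep(poll_seconds)
            # Catch anything written between the last poll and the state change.
            print(f.read(), end='', flush=True)
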
@@ -590,6 +781,7 @@ def dump_managed_job_queue() -> str:
590
781
  job_duration = 0
591
782
  job['job_duration'] = job_duration
592
783
  job['status'] = job['status'].value
784
+ job['schedule_state'] = job['schedule_state'].value
593
785
 
594
786
  cluster_name = generate_managed_job_cluster_name(
595
787
  job['task_name'], job['job_id'])
@@ -691,11 +883,18 @@ def format_job_table(
691
883
  status_counts[managed_job_status.value] += 1
692
884
 
693
885
  columns = [
694
- 'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
695
- 'JOB DURATION', '#RECOVERIES', 'STATUS'
886
+ 'ID',
887
+ 'TASK',
888
+ 'NAME',
889
+ 'RESOURCES',
890
+ 'SUBMITTED',
891
+ 'TOT. DURATION',
892
+ 'JOB DURATION',
893
+ '#RECOVERIES',
894
+ 'STATUS',
696
895
  ]
697
896
  if show_all:
698
- columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
897
+ columns += ['STARTED', 'CLUSTER', 'REGION', 'DESCRIPTION']
699
898
  if tasks_have_user:
700
899
  columns.insert(0, 'USER')
701
900
  job_table = log_utils.create_table(columns)
@@ -714,7 +913,25 @@ def format_job_table(
714
913
  # by the task_id.
715
914
  jobs[get_hash(task)].append(task)
716
915
 
916
+ def generate_description(failure_reason: Optional[str],
917
+ schedule_state: Optional[str]) -> str:
918
+ description = ''
919
+ if schedule_state is not None:
920
+ description += f'Scheduler: {schedule_state}'
921
+ if failure_reason is not None:
922
+ description += ', '
923
+ if failure_reason is not None:
924
+ description += f'Failure: {failure_reason}'
925
+
926
+ if description == '':
927
+ return '-'
928
+
929
+ return description
930
+
717
931
  for job_hash, job_tasks in jobs.items():
932
+ if show_all:
933
+ schedule_state = job_tasks[0]['schedule_state']
934
+
718
935
  if len(job_tasks) > 1:
719
936
  # Aggregate the tasks into a new row in the table.
720
937
  job_name = job_tasks[0]['job_name']
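
For reference, the new DESCRIPTION column combines the scheduler state with any failure reason. Illustrative inputs and outputs for the helper above (argument order is failure_reason, then schedule_state; the example values are made up):

    # generate_description(None, None)               -> '-'
    # generate_description(None, 'LAUNCHING')        -> 'Scheduler: LAUNCHING'
    # generate_description('Out of quota', None)     -> 'Failure: Out of quota'
    # generate_description('Out of quota', 'ALIVE')  -> 'Scheduler: ALIVE, Failure: Out of quota'
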
@@ -737,7 +954,6 @@ def format_job_table(
737
954
  end_at = None
738
955
  recovery_cnt += task['recovery_count']
739
956
 
740
- failure_reason = job_tasks[current_task_id]['failure_reason']
741
957
  job_duration = log_utils.readable_time_duration(0,
742
958
  job_duration,
743
959
  absolute=True)
@@ -763,11 +979,13 @@ def format_job_table(
763
979
  status_str,
764
980
  ]
765
981
  if show_all:
982
+ schedule_state = job_tasks[0]['schedule_state']
983
+ failure_reason = job_tasks[current_task_id]['failure_reason']
766
984
  job_values.extend([
767
985
  '-',
768
986
  '-',
769
987
  '-',
770
- failure_reason if failure_reason is not None else '-',
988
+ generate_description(failure_reason, schedule_state),
771
989
  ])
772
990
  if tasks_have_user:
773
991
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -795,13 +1013,17 @@ def format_job_table(
795
1013
  task['status'].colored_str(),
796
1014
  ]
797
1015
  if show_all:
1016
+ # schedule_state is only set at the job level, so if we have
1017
+ # more than one task, only display on the aggregated row.
1018
+ schedule_state = (task['schedule_state']
1019
+ if len(job_tasks) == 1 else None)
798
1020
  values.extend([
799
1021
  # STARTED
800
1022
  log_utils.readable_time_duration(task['start_at']),
801
1023
  task['cluster_resources'],
802
1024
  task['region'],
803
- task['failure_reason']
804
- if task['failure_reason'] is not None else '-',
1025
+ generate_description(task['failure_reason'],
1026
+ schedule_state),
805
1027
  ])
806
1028
  if tasks_have_user:
807
1029
  values.insert(0, task.get('user', '-'))
@@ -875,7 +1097,7 @@ class ManagedJobCodeGen:
875
1097
  return cls._build(code)
876
1098
 
877
1099
  @classmethod
878
- def get_all_job_ids_by_name(cls, job_name: str) -> str:
1100
+ def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
879
1101
  code = textwrap.dedent(f"""\
880
1102
  from sky.utils import common_utils
881
1103
  job_id = managed_job_state.get_all_job_ids_by_name({job_name!r})
@@ -896,6 +1118,7 @@ class ManagedJobCodeGen:
896
1118
  # should be removed in v0.8.0.
897
1119
  code = textwrap.dedent("""\
898
1120
  import os
1121
+ import time
899
1122
 
900
1123
  from sky.skylet import job_lib, log_lib
901
1124
  from sky.skylet import constants
@@ -920,7 +1143,7 @@ class ManagedJobCodeGen:
920
1143
  dag_name = managed_job_dag.name
921
1144
  # Add the managed job to queue table.
922
1145
  code = textwrap.dedent(f"""\
923
- managed_job_state.set_job_name({job_id}, {dag_name!r})
1146
+ managed_job_state.set_job_info({job_id}, {dag_name!r})
924
1147
  """)
925
1148
  for task_id, task in enumerate(managed_job_dag.tasks):
926
1149
  resources_str = backend_utils.get_task_resources_str(
@@ -976,7 +976,7 @@ def terminate_instances(
976
976
  _terminate_node(namespace, context, pod_name)
977
977
 
978
978
  # Run pod termination in parallel
979
- subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
979
+ subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
980
980
  _NUM_THREADS)
981
981
 
982
982
 
sky/resources.py CHANGED
@@ -540,7 +540,7 @@ class Resources:
540
540
  if memory_gb <= 0:
541
541
  with ux_utils.print_exception_no_traceback():
542
542
  raise ValueError(
543
- f'The "cpus" field should be positive. Found: {memory!r}')
543
+ f'The "memory" field should be positive. Found: {memory!r}')
544
544
 
545
545
  def _set_accelerators(
546
546
  self,
sky/skylet/constants.py CHANGED
@@ -86,7 +86,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
86
86
  # cluster yaml is updated.
87
87
  #
88
88
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
89
- SKYLET_VERSION = '9'
89
+ SKYLET_VERSION = '10'
90
90
  # The version of the lib files that skylet/jobs use. Whenever there is an API
91
91
  # change for the job_lib or log_lib, we need to bump this version, so that the
92
92
  # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py CHANGED
@@ -13,6 +13,8 @@ from sky import clouds
13
13
  from sky import sky_logging
14
14
  from sky.backends import cloud_vm_ray_backend
15
15
  from sky.clouds import cloud_registry
16
+ from sky.jobs import scheduler as managed_job_scheduler
17
+ from sky.jobs import state as managed_job_state
16
18
  from sky.jobs import utils as managed_job_utils
17
19
  from sky.serve import serve_utils
18
20
  from sky.skylet import autostop_lib
@@ -67,12 +69,13 @@ class JobSchedulerEvent(SkyletEvent):
67
69
  job_lib.scheduler.schedule_step(force_update_jobs=True)
68
70
 
69
71
 
70
- class ManagedJobUpdateEvent(SkyletEvent):
71
- """Skylet event for updating managed job status."""
72
+ class ManagedJobEvent(SkyletEvent):
73
+ """Skylet event for updating and scheduling managed jobs."""
72
74
  EVENT_INTERVAL_SECONDS = 300
73
75
 
74
76
  def _run(self):
75
77
  managed_job_utils.update_managed_job_status()
78
+ managed_job_scheduler.maybe_schedule_next_jobs()
76
79
 
77
80
 
78
81
  class ServiceUpdateEvent(SkyletEvent):
@@ -116,7 +119,8 @@ class AutostopEvent(SkyletEvent):
116
119
  logger.debug('autostop_config not set. Skipped.')
117
120
  return
118
121
 
119
- if job_lib.is_cluster_idle():
122
+ if (job_lib.is_cluster_idle() and
123
+ not managed_job_state.get_num_alive_jobs()):
120
124
  idle_minutes = (time.time() -
121
125
  autostop_lib.get_last_active_time()) // 60
122
126
  logger.debug(
sky/skylet/job_lib.py CHANGED
@@ -10,7 +10,6 @@ import pathlib
10
10
  import shlex
11
11
  import signal
12
12
  import sqlite3
13
- import subprocess
14
13
  import time
15
14
  from typing import Any, Dict, List, Optional, Sequence
16
15
 
@@ -23,6 +22,7 @@ from sky.skylet import constants
23
22
  from sky.utils import common_utils
24
23
  from sky.utils import db_utils
25
24
  from sky.utils import log_utils
25
+ from sky.utils import subprocess_utils
26
26
 
27
27
  logger = sky_logging.init_logger(__name__)
28
28
 
@@ -209,31 +209,7 @@ class JobScheduler:
209
209
  _CURSOR.execute((f'UPDATE pending_jobs SET submit={int(time.time())} '
210
210
  f'WHERE job_id={job_id!r}'))
211
211
  _CONN.commit()
212
- # Use nohup to ensure the job driver process is a separate process tree,
213
- # instead of being a child of the current process. This is important to
214
- # avoid a chain of driver processes (job driver can call schedule_step()
215
- # to submit new jobs, and the new job can also call schedule_step()
216
- # recursively).
217
- #
218
- # echo $! will output the PID of the last background process started
219
- # in the current shell, so we can retrieve it and record in the DB.
220
- #
221
- # TODO(zhwu): A more elegant solution is to use another daemon process
222
- # to be in charge of starting these driver processes, instead of
223
- # starting them in the current process.
224
- wrapped_cmd = (f'nohup bash -c {shlex.quote(run_cmd)} '
225
- '</dev/null >/dev/null 2>&1 & echo $!')
226
- proc = subprocess.run(wrapped_cmd,
227
- stdout=subprocess.PIPE,
228
- stderr=subprocess.PIPE,
229
- stdin=subprocess.DEVNULL,
230
- start_new_session=True,
231
- check=True,
232
- shell=True,
233
- text=True)
234
- # Get the PID of the detached process
235
- pid = int(proc.stdout.strip())
236
-
212
+ pid = subprocess_utils.launch_new_process_tree(run_cmd)
237
213
  # TODO(zhwu): Backward compatibility, remove this check after 0.10.0.
238
214
  # This is for the case where the job is submitted with SkyPilot older
239
215
  # than #4318, using ray job submit.
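
The block removed above is the classic detach-and-record-PID pattern that the new subprocess_utils.launch_new_process_tree call now encapsulates. For readers who want the mechanics, here is the same pattern as a standalone helper, reconstructed from the removed lines (the new helper's actual implementation is not shown in this diff and may differ):

    import shlex
    import subprocess

    def launch_detached(run_cmd):
        """Start run_cmd in its own process tree and return its PID."""
        # nohup + '&' detaches the child from this shell; 'echo $!' prints the
        # PID of the last background process so the caller can record it.
        wrapped_cmd = (f'nohup bash -c {shlex.quote(run_cmd)} '
                       '</dev/null >/dev/null 2>&1 & echo $!')
        proc = subprocess.run(wrapped_cmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdin=subprocess.DEVNULL,
                              start_new_session=True,
                              check=True,
                              shell=True,
                              text=True)
        return int(proc.stdout.strip())
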
sky/skylet/log_lib.py CHANGED
@@ -25,9 +25,9 @@ from sky.utils import log_utils
25
25
  from sky.utils import subprocess_utils
26
26
  from sky.utils import ux_utils
27
27
 
28
- _SKY_LOG_WAITING_GAP_SECONDS = 1
29
- _SKY_LOG_WAITING_MAX_RETRY = 5
30
- _SKY_LOG_TAILING_GAP_SECONDS = 0.2
28
+ SKY_LOG_WAITING_GAP_SECONDS = 1
29
+ SKY_LOG_WAITING_MAX_RETRY = 5
30
+ SKY_LOG_TAILING_GAP_SECONDS = 0.2
31
31
  # Peek the head of the lines to check if we need to start
32
32
  # streaming when tail > 0.
33
33
  PEEK_HEAD_LINES_FOR_START_STREAM = 20
@@ -336,7 +336,7 @@ def _follow_job_logs(file,
336
336
  ]:
337
337
  if wait_last_logs:
338
338
  # Wait all the logs are printed before exit.
339
- time.sleep(1 + _SKY_LOG_TAILING_GAP_SECONDS)
339
+ time.sleep(1 + SKY_LOG_TAILING_GAP_SECONDS)
340
340
  wait_last_logs = False
341
341
  continue
342
342
  status_str = status.value if status is not None else 'None'
@@ -345,7 +345,7 @@ def _follow_job_logs(file,
345
345
  f'Job finished (status: {status_str}).'))
346
346
  return
347
347
 
348
- time.sleep(_SKY_LOG_TAILING_GAP_SECONDS)
348
+ time.sleep(SKY_LOG_TAILING_GAP_SECONDS)
349
349
  status = job_lib.get_status_no_lock(job_id)
350
350
 
351
351
 
@@ -426,15 +426,15 @@ def tail_logs(job_id: Optional[int],
426
426
  retry_cnt += 1
427
427
  if os.path.exists(log_path) and status != job_lib.JobStatus.INIT:
428
428
  break
429
- if retry_cnt >= _SKY_LOG_WAITING_MAX_RETRY:
429
+ if retry_cnt >= SKY_LOG_WAITING_MAX_RETRY:
430
430
  print(
431
431
  f'{colorama.Fore.RED}ERROR: Logs for '
432
432
  f'{job_str} (status: {status.value}) does not exist '
433
433
  f'after retrying {retry_cnt} times.{colorama.Style.RESET_ALL}')
434
434
  return
435
- print(f'INFO: Waiting {_SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
435
+ print(f'INFO: Waiting {SKY_LOG_WAITING_GAP_SECONDS}s for the logs '
436
436
  'to be written...')
437
- time.sleep(_SKY_LOG_WAITING_GAP_SECONDS)
437
+ time.sleep(SKY_LOG_WAITING_GAP_SECONDS)
438
438
  status = job_lib.update_job_status([job_id], silent=True)[0]
439
439
 
440
440
  start_stream_at = LOG_FILE_START_STREAMING_AT
sky/skylet/log_lib.pyi CHANGED
@@ -13,6 +13,9 @@ from sky.skylet import constants as constants
13
13
  from sky.skylet import job_lib as job_lib
14
14
  from sky.utils import log_utils as log_utils
15
15
 
16
+ SKY_LOG_WAITING_GAP_SECONDS: int = ...
17
+ SKY_LOG_WAITING_MAX_RETRY: int = ...
18
+ SKY_LOG_TAILING_GAP_SECONDS: float = ...
16
19
  LOG_FILE_START_STREAMING_AT: str = ...
17
20
 
18
21