skypilot-nightly 1.0.0.dev20241029__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '47ebae73e972c65de6e87aa7556220e515f2fc5e'
8
+ _SKYPILOT_COMMIT_SHA = '9d50f192b262d5f6cc74b5b6644f3a9e3ea31f2f'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241029'
38
+ __version__ = '1.0.0.dev20241030'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -3175,9 +3175,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3175
3175
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3176
3176
  if returncode == 255:
3177
3177
  is_message_too_long = False
3178
- with open(setup_log_path, 'r', encoding='utf-8') as f:
3179
- if 'too long' in f.read():
3180
- is_message_too_long = True
3178
+ try:
3179
+ with open(os.path.expanduser(setup_log_path),
3180
+ 'r',
3181
+ encoding='utf-8') as f:
3182
+ if 'too long' in f.read():
3183
+ is_message_too_long = True
3184
+ except Exception as e: # pylint: disable=broad-except
3185
+ # We don't crash the setup if we cannot read the log file.
3186
+ # Instead, we should retry the setup with dumping the script
3187
+ # to a file to be safe.
3188
+ logger.debug('Failed to read setup log file '
3189
+ f'{setup_log_path}: {e}')
3190
+ is_message_too_long = True
3181
3191
 
3182
3192
  if is_message_too_long:
3183
3193
  # If the setup script is too long, we retry it with dumping
sky/execution.py CHANGED
@@ -171,10 +171,11 @@ def _execute(
171
171
  task = dag.tasks[0]
172
172
 
173
173
  if any(r.job_recovery is not None for r in task.resources):
174
- with ux_utils.print_exception_no_traceback():
175
- raise ValueError(
176
- 'Job recovery is specified in the task. To launch a '
177
- 'managed job, please use: sky jobs launch')
174
+ logger.warning(
175
+ f'{colorama.Style.DIM}The task has `job_recovery` specified, '
176
+ 'but is launched as an unmanaged job. It will be ignored.'
177
+ 'To enable job recovery, use managed jobs: sky jobs launch.'
178
+ f'{colorama.Style.RESET_ALL}')
178
179
 
179
180
  cluster_exists = False
180
181
  if cluster_name is not None:
sky/jobs/controller.py CHANGED
@@ -160,6 +160,11 @@ class JobsController:
160
160
  if task_id == 0:
161
161
  submitted_at = backend_utils.get_timestamp_from_run_timestamp(
162
162
  self._backend.run_timestamp)
163
+ assert task.name is not None, task
164
+ cluster_name = managed_job_utils.generate_managed_job_cluster_name(
165
+ task.name, self._job_id)
166
+ self._strategy_executor = recovery_strategy.StrategyExecutor.make(
167
+ cluster_name, self._backend, task, self._retry_until_up)
163
168
  managed_job_state.set_submitted(
164
169
  self._job_id,
165
170
  task_id,
@@ -167,15 +172,14 @@ class JobsController:
167
172
  submitted_at,
168
173
  resources_str=backend_utils.get_task_resources_str(
169
174
  task, is_managed_job=True),
175
+ specs={
176
+ 'max_restarts_on_errors':
177
+ self._strategy_executor.max_restarts_on_errors
178
+ },
170
179
  callback_func=callback_func)
171
180
  logger.info(
172
181
  f'Submitted managed job {self._job_id} (task: {task_id}, name: '
173
182
  f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
174
- assert task.name is not None, task
175
- cluster_name = managed_job_utils.generate_managed_job_cluster_name(
176
- task.name, self._job_id)
177
- self._strategy_executor = recovery_strategy.StrategyExecutor.make(
178
- cluster_name, self._backend, task, self._retry_until_up)
179
183
 
180
184
  logger.info('Started monitoring.')
181
185
  managed_job_state.set_starting(job_id=self._job_id,
@@ -283,23 +287,35 @@ class JobsController:
283
287
  failure_reason = (
284
288
  'To see the details, run: '
285
289
  f'sky jobs logs --controller {self._job_id}')
286
-
287
- managed_job_state.set_failed(
288
- self._job_id,
289
- task_id,
290
- failure_type=managed_job_status,
291
- failure_reason=failure_reason,
292
- end_time=end_time,
293
- callback_func=callback_func)
294
- return False
295
- # Although the cluster is healthy, we fail to access the
296
- # job status. Try to recover the job (will not restart the
297
- # cluster, if the cluster is healthy).
298
- assert job_status is None, job_status
299
- logger.info('Failed to fetch the job status while the '
300
- 'cluster is healthy. Try to recover the job '
301
- '(the cluster will not be restarted).')
302
-
290
+ should_restart_on_failure = (
291
+ self._strategy_executor.should_restart_on_failure())
292
+ if should_restart_on_failure:
293
+ max_restarts = (
294
+ self._strategy_executor.max_restarts_on_errors)
295
+ logger.info(
296
+ f'User program crashed '
297
+ f'({managed_job_status.value}). '
298
+ f'Retry the job as max_restarts_on_errors is '
299
+ f'set to {max_restarts}. '
300
+ f'[{self._strategy_executor.restart_cnt_on_failure}'
301
+ f'/{max_restarts}]')
302
+ else:
303
+ managed_job_state.set_failed(
304
+ self._job_id,
305
+ task_id,
306
+ failure_type=managed_job_status,
307
+ failure_reason=failure_reason,
308
+ end_time=end_time,
309
+ callback_func=callback_func)
310
+ return False
311
+ else:
312
+ # Although the cluster is healthy, we fail to access the
313
+ # job status. Try to recover the job (will not restart the
314
+ # cluster, if the cluster is healthy).
315
+ assert job_status is None, job_status
316
+ logger.info('Failed to fetch the job status while the '
317
+ 'cluster is healthy. Try to recover the job '
318
+ '(the cluster will not be restarted).')
303
319
  # When the handle is None, the cluster should be cleaned up already.
304
320
  if handle is not None:
305
321
  resources = handle.launched_resources
@@ -66,7 +66,8 @@ class StrategyExecutor:
66
66
  RETRY_INIT_GAP_SECONDS = 60
67
67
 
68
68
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
69
- task: 'task_lib.Task', retry_until_up: bool) -> None:
69
+ task: 'task_lib.Task', retry_until_up: bool,
70
+ max_restarts_on_errors: int) -> None:
70
71
  """Initialize the strategy executor.
71
72
 
72
73
  Args:
@@ -82,6 +83,8 @@ class StrategyExecutor:
82
83
  self.cluster_name = cluster_name
83
84
  self.backend = backend
84
85
  self.retry_until_up = retry_until_up
86
+ self.max_restarts_on_errors = max_restarts_on_errors
87
+ self.restart_cnt_on_failure = 0
85
88
 
86
89
  def __init_subclass__(cls, name: str, default: bool = False):
87
90
  RECOVERY_STRATEGIES[name] = cls
@@ -109,8 +112,17 @@ class StrategyExecutor:
109
112
  # set the new_task_resources to be the same type (list or set) as the
110
113
  # original task.resources
111
114
  task.set_resources(type(task.resources)(new_resources_list))
112
- return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
113
- retry_until_up)
115
+ if isinstance(job_recovery, dict):
116
+ job_recovery_name = job_recovery.pop('strategy',
117
+ DEFAULT_RECOVERY_STRATEGY)
118
+ max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
119
+ 0)
120
+ else:
121
+ job_recovery_name = job_recovery
122
+ max_restarts_on_errors = 0
123
+ return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
124
+ task, retry_until_up,
125
+ max_restarts_on_errors)
114
126
 
115
127
  def launch(self) -> float:
116
128
  """Launch the cluster for the first time.
@@ -368,6 +380,17 @@ class StrategyExecutor:
368
380
  f'{gap_seconds:.1f} seconds.')
369
381
  time.sleep(gap_seconds)
370
382
 
383
+ def should_restart_on_failure(self) -> bool:
384
+ """Increments counter & checks if job should be restarted on a failure.
385
+
386
+ Returns:
387
+ True if the job should be restarted, otherwise False.
388
+ """
389
+ self.restart_cnt_on_failure += 1
390
+ if self.restart_cnt_on_failure > self.max_restarts_on_errors:
391
+ return False
392
+ return True
393
+
371
394
 
372
395
  class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
373
396
  default=False):
@@ -376,8 +399,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
376
399
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
377
400
 
378
401
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
379
- task: 'task_lib.Task', retry_until_up: bool) -> None:
380
- super().__init__(cluster_name, backend, task, retry_until_up)
402
+ task: 'task_lib.Task', retry_until_up: bool,
403
+ max_restarts_on_errors: int) -> None:
404
+ super().__init__(cluster_name, backend, task, retry_until_up,
405
+ max_restarts_on_errors)
381
406
  # Note down the cloud/region of the launched cluster, so that we can
382
407
  # first retry in the same cloud/region. (Inside recover() we may not
383
408
  # rely on cluster handle, as it can be None if the cluster is
sky/jobs/state.py CHANGED
@@ -2,6 +2,7 @@
2
2
  # TODO(zhwu): maybe use file based status instead of database, so
3
3
  # that we can easily switch to a s3-based storage.
4
4
  import enum
5
+ import json
5
6
  import pathlib
6
7
  import sqlite3
7
8
  import time
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
65
66
  failure_reason TEXT,
66
67
  spot_job_id INTEGER,
67
68
  task_id INTEGER DEFAULT 0,
68
- task_name TEXT)""")
69
+ task_name TEXT,
70
+ specs TEXT)""")
69
71
  _CONN.commit()
70
72
 
71
73
  db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
92
94
  'TEXT',
93
95
  copy_from='job_name')
94
96
 
97
+ # Specs is some useful information about the task, e.g., the
98
+ # max_restarts_on_errors value. It is stored in JSON format.
99
+ db_utils.add_column_to_table(_CURSOR,
100
+ _CONN,
101
+ 'spot',
102
+ 'specs',
103
+ 'TEXT',
104
+ value_to_replace_existing_entries=json.dumps({
105
+ 'max_restarts_on_errors': 0,
106
+ }))
107
+
95
108
  # `job_info` contains the mapping from job_id to the job_name.
96
109
  # In the future, it may contain more information about each job.
97
110
  _CURSOR.execute("""\
@@ -130,7 +143,8 @@ columns = [
130
143
  'task_name',
131
144
  # columns from the job_info table
132
145
  '_job_info_job_id', # This should be the same as job_id
133
- 'job_name'
146
+ 'job_name',
147
+ 'specs',
134
148
  ]
135
149
 
136
150
 
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
283
297
 
284
298
  def set_submitted(job_id: int, task_id: int, run_timestamp: str,
285
299
  submit_time: float, resources_str: str,
286
- callback_func: CallbackType):
300
+ specs: Dict[str, Union[str,
301
+ int]], callback_func: CallbackType):
287
302
  """Set the task to submitted.
288
303
 
289
304
  Args:
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
293
308
  determine the log directory of the managed task.
294
309
  submit_time: The time when the managed task is submitted.
295
310
  resources_str: The resources string of the managed task.
311
+ specs: The specs of the managed task.
312
+ callback_func: The callback function.
296
313
  """
297
314
  # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
298
315
  # the log directory and submission time align with each other, so as to
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
306
323
  resources=(?),
307
324
  submitted_at=(?),
308
325
  status=(?),
309
- run_timestamp=(?)
326
+ run_timestamp=(?),
327
+ specs=(?)
310
328
  WHERE spot_job_id=(?) AND
311
329
  task_id=(?)""",
312
330
  (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
313
- run_timestamp, job_id, task_id))
331
+ run_timestamp, json.dumps(specs), job_id, task_id))
314
332
  callback_func('SUBMITTED')
315
333
 
316
334
 
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
619
637
  for (job_id,) in rows:
620
638
  return job_id
621
639
  return None
640
+
641
+
642
+ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
643
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
644
+ task_specs = cursor.execute(
645
+ """\
646
+ SELECT specs FROM spot
647
+ WHERE spot_job_id=(?) AND task_id=(?)""",
648
+ (job_id, task_id)).fetchone()
649
+ return json.loads(task_specs[0])
sky/jobs/utils.py CHANGED
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
70
70
  # state, after the job finished. This is a safeguard to avoid the case where
71
71
  # the managed job status fails to be updated and keep the `sky jobs logs`
72
72
  # blocking for a long time.
73
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
73
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
74
74
 
75
75
 
76
76
  class UserSignal(enum.Enum):
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
392
392
  f'INFO: Log for the current task ({task_id}) '
393
393
  'is finished. Waiting for the next task\'s log '
394
394
  'to be started.')
395
- status_display.update('Waiting for the next task: '
396
- f'{task_id + 1}.')
395
+ # Add a newline to avoid the status display below
396
+ # removing the last line of the task output.
397
+ print()
398
+ status_display.update(
399
+ ux_utils.spinner_message(
400
+ f'Waiting for the next task: {task_id + 1}'))
397
401
  status_display.start()
398
402
  original_task_id = task_id
399
403
  while True:
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
405
409
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
406
410
  continue
407
411
  else:
408
- break
412
+ task_specs = managed_job_state.get_task_specs(
413
+ job_id, task_id)
414
+ if task_specs.get('max_restarts_on_errors', 0) == 0:
415
+ # We don't need to wait for the managed job status
416
+ # update, as the job is guaranteed to be in terminal
417
+ # state afterwards.
418
+ break
419
+ print()
420
+ status_display.update(
421
+ ux_utils.spinner_message(
422
+ 'Waiting for next restart for the failed task'))
423
+ status_display.start()
424
+ while True:
425
+ _, managed_job_status = (
426
+ managed_job_state.get_latest_task_id_status(
427
+ job_id))
428
+ if (managed_job_status !=
429
+ managed_job_state.ManagedJobStatus.RUNNING):
430
+ break
431
+ time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
432
+ continue
409
433
  # The job can be cancelled by the user or the controller (when
410
434
  # the cluster is partially preempted).
411
435
  logger.debug(
sky/resources.py CHANGED
@@ -55,7 +55,7 @@ class Resources:
55
55
  accelerators: Union[None, str, Dict[str, int]] = None,
56
56
  accelerator_args: Optional[Dict[str, str]] = None,
57
57
  use_spot: Optional[bool] = None,
58
- job_recovery: Optional[str] = None,
58
+ job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
59
59
  region: Optional[str] = None,
60
60
  zone: Optional[str] = None,
61
61
  image_id: Union[Dict[str, str], str, None] = None,
@@ -111,6 +111,12 @@ class Resources:
111
111
  job to recover the cluster from preemption. Refer to
112
112
  `recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
113
113
  for more details.
114
+ When a dict is provided, it can have the following fields:
115
+
116
+ - strategy: the recovery strategy to use.
117
+ - max_restarts_on_errors: the max number of restarts on user code
118
+ errors.
119
+
114
120
  region: the region to use.
115
121
  zone: the zone to use.
116
122
  image_id: the image ID to use. If a str, must be a string
@@ -161,10 +167,20 @@ class Resources:
161
167
 
162
168
  self._use_spot_specified = use_spot is not None
163
169
  self._use_spot = use_spot if use_spot is not None else False
164
- self._job_recovery = None
170
+ self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
165
171
  if job_recovery is not None:
166
- if job_recovery.strip().lower() != 'none':
167
- self._job_recovery = job_recovery.upper()
172
+ if isinstance(job_recovery, str):
173
+ job_recovery = {'strategy': job_recovery}
174
+ if 'strategy' not in job_recovery:
175
+ job_recovery['strategy'] = None
176
+
177
+ strategy_name = job_recovery['strategy']
178
+ if strategy_name == 'none':
179
+ self._job_recovery = None
180
+ else:
181
+ if strategy_name is not None:
182
+ job_recovery['strategy'] = strategy_name.upper()
183
+ self._job_recovery = job_recovery
168
184
 
169
185
  if disk_size is not None:
170
186
  if round(disk_size) != disk_size:
@@ -419,7 +435,7 @@ class Resources:
419
435
  return self._use_spot_specified
420
436
 
421
437
  @property
422
- def job_recovery(self) -> Optional[str]:
438
+ def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
423
439
  return self._job_recovery
424
440
 
425
441
  @property
@@ -814,12 +830,13 @@ class Resources:
814
830
  Raises:
815
831
  ValueError: if the attributes are invalid.
816
832
  """
817
- if self._job_recovery is None:
833
+ if self._job_recovery is None or self._job_recovery['strategy'] is None:
818
834
  return
819
- if self._job_recovery not in managed_jobs.RECOVERY_STRATEGIES:
835
+ if (self._job_recovery['strategy']
836
+ not in managed_jobs.RECOVERY_STRATEGIES):
820
837
  with ux_utils.print_exception_no_traceback():
821
838
  raise ValueError(
822
- f'Spot recovery strategy {self._job_recovery} '
839
+ f'Spot recovery strategy {self._job_recovery["strategy"]} '
823
840
  'is not supported. The strategy should be among '
824
841
  f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')
825
842
 
sky/setup_files/setup.py CHANGED
@@ -153,7 +153,7 @@ install_requires = [
153
153
  'tabulate',
154
154
  # Light weight requirement, can be replaced with "typing" once
155
155
  # we deprecate Python 3.7 (this will take a while).
156
- "typing_extensions",
156
+ 'typing_extensions',
157
157
  'filelock >= 3.6.0',
158
158
  'packaging',
159
159
  'psutil',
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
216
216
  # We need azure-identity>=1.13.0 to enable the customization of the
217
217
  # timeout of AzureCliCredential.
218
218
  'azure': [
219
- 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0',
220
- 'azure-mgmt-network', 'azure-storage-blob', 'msgraph-sdk'
219
+ 'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
220
+ 'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
221
+ 'azure-storage-blob>=12.23.1', 'msgraph-sdk'
221
222
  ] + local_ray,
222
223
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
223
224
  # parameter for stopping instances.
sky/skylet/job_lib.py CHANGED
@@ -512,16 +512,13 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
512
512
  return records
513
513
 
514
514
 
515
- def _get_pending_jobs():
516
- rows = _CURSOR.execute(
517
- 'SELECT job_id, created_time, submit FROM pending_jobs')
518
- rows = list(rows)
519
- return {
520
- job_id: {
521
- 'created_time': created_time,
522
- 'submit': submit
523
- } for job_id, created_time, submit in rows
524
- }
515
+ def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
516
+ rows = _CURSOR.execute('SELECT created_time, submit FROM pending_jobs '
517
+ f'WHERE job_id={job_id!r}')
518
+ for row in rows:
519
+ created_time, submit = row
520
+ return {'created_time': created_time, 'submit': submit}
521
+ return None
525
522
 
526
523
 
527
524
  def update_job_status(job_ids: List[int],
@@ -535,7 +532,7 @@ def update_job_status(job_ids: List[int],
535
532
  during job cancelling, we still need this to handle the staleness problem,
536
533
  caused by instance restarting and other corner cases (if any).
537
534
 
538
- This function should only be run on the remote instance with ray==2.4.0.
535
+ This function should only be run on the remote instance with ray>=2.4.0.
539
536
  """
540
537
  if len(job_ids) == 0:
541
538
  return []
@@ -547,50 +544,45 @@ def update_job_status(job_ids: List[int],
547
544
 
548
545
  # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
549
546
  # which contains the job status (str) and submission_id (str).
547
+ ray_job_query_time = time.time()
550
548
  job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
551
549
 
552
- pending_jobs = _get_pending_jobs()
553
550
  job_details = {}
554
551
  ray_job_ids_set = set(ray_job_ids)
555
552
  for job_detail in job_detail_lists:
556
553
  if job_detail.submission_id in ray_job_ids_set:
557
554
  job_details[job_detail.submission_id] = job_detail
558
- job_statuses: List[Optional[JobStatus]] = [None] * len(ray_job_ids)
559
- for i, ray_job_id in enumerate(ray_job_ids):
560
- job_id = job_ids[i]
561
- if ray_job_id in job_details:
562
- ray_status = job_details[ray_job_id].status
563
- job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
564
- if job_id in pending_jobs:
565
- if pending_jobs[job_id]['created_time'] < psutil.boot_time():
566
- logger.info(
567
- f'Job {job_id} is stale, setting to FAILED: '
568
- f'created_time={pending_jobs[job_id]["created_time"]}, '
569
- f'boot_time={psutil.boot_time()}')
570
- # The job is stale as it is created before the instance
571
- # is booted, e.g. the instance is rebooted.
572
- job_statuses[i] = JobStatus.FAILED
573
- # Gives a 60 second grace period between job being submit from
574
- # the pending table until appearing in ray jobs.
575
- if (pending_jobs[job_id]['submit'] > 0 and
576
- pending_jobs[job_id]['submit'] <
577
- time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
578
- # For jobs submitted outside of the grace period, we will
579
- # consider the ray job status.
580
- continue
581
- else:
582
- # Reset the job status to PENDING even though it may not appear
583
- # in the ray jobs, so that it will not be considered as stale.
584
- job_statuses[i] = JobStatus.PENDING
585
-
586
- assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
587
555
 
588
556
  statuses = []
589
- for job_id, status in zip(job_ids, job_statuses):
557
+ for job_id, ray_job_id in zip(job_ids, ray_job_ids):
590
558
  # Per-job status lock is required because between the job status
591
559
  # query and the job status update, the job status in the databse
592
560
  # can be modified by the generated ray program.
593
561
  with filelock.FileLock(_get_lock_path(job_id)):
562
+ status = None
563
+ if ray_job_id in job_details:
564
+ ray_status = job_details[ray_job_id].status
565
+ status = _RAY_TO_JOB_STATUS_MAP[ray_status]
566
+ pending_job = _get_pending_job(job_id)
567
+ if pending_job is not None:
568
+ if pending_job['created_time'] < psutil.boot_time():
569
+ logger.info(f'Job {job_id} is stale, setting to FAILED: '
570
+ f'created_time={pending_job["created_time"]}, '
571
+ f'boot_time={psutil.boot_time()}')
572
+ # The job is stale as it is created before the instance
573
+ # is booted, e.g. the instance is rebooted.
574
+ status = JobStatus.FAILED
575
+ # Gives a 60 second grace period between job being submit from
576
+ # the pending table until appearing in ray jobs. For jobs
577
+ # submitted outside of the grace period, we will consider the
578
+ # ray job status.
579
+ if not (pending_job['submit'] > 0 and pending_job['submit'] <
580
+ ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
581
+ # Reset the job status to PENDING even though it may not
582
+ # appear in the ray jobs, so that it will not be considered
583
+ # as stale.
584
+ status = JobStatus.PENDING
585
+
594
586
  original_status = get_status_no_lock(job_id)
595
587
  assert original_status is not None, (job_id, status)
596
588
  if status is None:
sky/utils/dag_utils.py CHANGED
@@ -143,11 +143,21 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
143
143
  for task_ in dag.tasks:
144
144
 
145
145
  new_resources_list = []
146
+ default_strategy = jobs.DEFAULT_RECOVERY_STRATEGY
147
+ assert default_strategy is not None
146
148
  for resources in list(task_.resources):
147
- change_default_value: Dict[str, Any] = {}
148
- if resources.job_recovery is None:
149
- change_default_value[
150
- 'job_recovery'] = jobs.DEFAULT_RECOVERY_STRATEGY
149
+ original_job_recovery = resources.job_recovery
150
+ job_recovery = {'strategy': default_strategy}
151
+ if isinstance(original_job_recovery, str):
152
+ job_recovery['strategy'] = original_job_recovery
153
+ elif isinstance(original_job_recovery, dict):
154
+ job_recovery.update(original_job_recovery)
155
+ strategy = job_recovery.get('strategy')
156
+ if strategy is None:
157
+ job_recovery['strategy'] = default_strategy
158
+ change_default_value: Dict[str, Any] = {
159
+ 'job_recovery': job_recovery
160
+ }
151
161
 
152
162
  new_resources = resources.copy(**change_default_value)
153
163
  new_resources_list.append(new_resources)
sky/utils/schemas.py CHANGED
@@ -92,7 +92,27 @@ def _get_single_resources_schema():
92
92
  'type': 'string',
93
93
  },
94
94
  'job_recovery': {
95
- 'type': 'string',
95
+ # Either a string or a dict.
96
+ 'anyOf': [{
97
+ 'type': 'string',
98
+ }, {
99
+ 'type': 'object',
100
+ 'required': [],
101
+ 'additionalProperties': False,
102
+ 'properties': {
103
+ 'strategy': {
104
+ 'anyOf': [{
105
+ 'type': 'string',
106
+ }, {
107
+ 'type': 'null',
108
+ }],
109
+ },
110
+ 'max_restarts_on_errors': {
111
+ 'type': 'integer',
112
+ 'minimum': 0,
113
+ },
114
+ }
115
+ }],
96
116
  },
97
117
  'disk_size': {
98
118
  'type': 'integer',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241029
3
+ Version: 1.0.0.dev20241030
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -46,11 +46,12 @@ Requires-Dist: awscli>=1.27.10; extra == "all"
46
46
  Requires-Dist: botocore>=1.29.10; extra == "all"
47
47
  Requires-Dist: boto3>=1.26.1; extra == "all"
48
48
  Requires-Dist: colorama<0.4.5; extra == "all"
49
- Requires-Dist: azure-cli>=2.31.0; extra == "all"
50
- Requires-Dist: azure-core; extra == "all"
51
- Requires-Dist: azure-identity>=1.13.0; extra == "all"
52
- Requires-Dist: azure-mgmt-network; extra == "all"
53
- Requires-Dist: azure-storage-blob; extra == "all"
49
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
50
+ Requires-Dist: azure-core>=1.31.0; extra == "all"
51
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
52
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
53
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
54
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
54
55
  Requires-Dist: msgraph-sdk; extra == "all"
55
56
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
56
57
  Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
@@ -78,11 +79,12 @@ Requires-Dist: botocore>=1.29.10; extra == "aws"
78
79
  Requires-Dist: boto3>=1.26.1; extra == "aws"
79
80
  Requires-Dist: colorama<0.4.5; extra == "aws"
80
81
  Provides-Extra: azure
81
- Requires-Dist: azure-cli>=2.31.0; extra == "azure"
82
- Requires-Dist: azure-core; extra == "azure"
83
- Requires-Dist: azure-identity>=1.13.0; extra == "azure"
84
- Requires-Dist: azure-mgmt-network; extra == "azure"
85
- Requires-Dist: azure-storage-blob; extra == "azure"
82
+ Requires-Dist: azure-cli>=2.65.0; extra == "azure"
83
+ Requires-Dist: azure-core>=1.31.0; extra == "azure"
84
+ Requires-Dist: azure-identity>=1.19.0; extra == "azure"
85
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "azure"
86
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
87
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
86
88
  Requires-Dist: msgraph-sdk; extra == "azure"
87
89
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
88
90
  Provides-Extra: cloudflare
@@ -1,4 +1,4 @@
1
- sky/__init__.py,sha256=BxGwYNzkF-X3QWAkY2mXeidbCMkcagQQqkJ-gwBlJiI,5882
1
+ sky/__init__.py,sha256=WwnJbF2ubaAJEJkUGPJ7jK5mh3QD1r487evpncErtC8,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
7
  sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
8
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
9
9
  sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
10
- sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
10
+ sky/execution.py,sha256=tDK6JhF_405cjqxRpbdLbHZyxrKTD5oa0UkKDvPJ_9Q,24751
11
11
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
12
12
  sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
13
- sky/resources.py,sha256=bm004Ms2qlBqEr0N_TEUybDOXJVhLF8yOwkhoqb1t9c,67478
13
+ sky/resources.py,sha256=7kVpLRfy3DFFgmEji0_Xz6FbrvBDUSXC6K0bsRIK3hA,68290
14
14
  sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
15
15
  sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
16
16
  sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
31
31
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
32
32
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
33
33
  sky/backends/backend_utils.py,sha256=LmLsaLiPuuUyGebOXykdvwZpUY-8sB7n4o2AnmwNmdQ,121714
34
- sky/backends/cloud_vm_ray_backend.py,sha256=WX93AnMR_E6e8L0hvXc5eWFdajQo-Sbwfv8Z8lidy9U,232598
34
+ sky/backends/cloud_vm_ray_backend.py,sha256=ZWAzdmKzSf3qalDoKfmLGaO3PywjLtIA5Q3AeeHhvHA,233158
35
35
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
36
36
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
37
37
  sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -95,11 +95,11 @@ sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
95
95
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
96
96
  sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
97
97
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
98
- sky/jobs/controller.py,sha256=zSdawmXg-9SZ91jJg5_OSFVlntu9xupLs-CiPwG1QdQ,26412
98
+ sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
99
99
  sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
100
- sky/jobs/recovery_strategy.py,sha256=UOEaVGSpRbCnCzlD8cgyjhCPIBIeBeCXCutoSic5aiA,25545
101
- sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
102
- sky/jobs/utils.py,sha256=lYfWkEAPVnYcj2nT6VYdM6PCaWKUH6_AD4TAV_sVCkY,36376
100
+ sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
101
+ sky/jobs/state.py,sha256=TV1G12vEMQJRgwWXsAjb3lmkJqkZmAOUUOja2QQPrg8,24307
102
+ sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
103
103
  sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
104
104
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
105
105
  sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
@@ -184,7 +184,7 @@ sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,3943
184
184
  sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
185
185
  sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
186
186
  sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
187
- sky/setup_files/setup.py,sha256=o4IgiwFoTB6Sdn3MmOirUIS0OSkoh6qo_0vrgcmrYA4,12093
187
+ sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
188
188
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
189
189
  sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
190
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
192
192
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
193
193
  sky/skylet/constants.py,sha256=OsuJcQp6UgkQ9Yfml6f_raXXbHS7-_h-v4QNv92y0Gw,14642
194
194
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
195
- sky/skylet/job_lib.py,sha256=Nfvefaa3N5IwxfhhOz1XE7ps46l3LY-db6VWF2pC3HQ,35335
195
+ sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
196
196
  sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
197
197
  sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
198
198
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -249,14 +249,14 @@ sky/utils/command_runner.py,sha256=3CDcqRXEmoe3C-t2P58McgcRg6p9m5haUWYj1rOLuqM,3
249
249
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
250
250
  sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
251
251
  sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
252
- sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
252
+ sky/utils/dag_utils.py,sha256=pVX3lGDDcYTcGoH_1jEWzl9767Y4mwlIEYIzoyHO6gM,6105
253
253
  sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
254
254
  sky/utils/env_options.py,sha256=3oAaUPxowL6vI2XmxXrH56V7Myj9IJWsL-MXFmRFVdI,1294
255
255
  sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
256
256
  sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
257
257
  sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
258
258
  sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
259
- sky/utils/schemas.py,sha256=MTjGcxmc4aAz9QzqZY2pO87uNuWhJ3ss1N9rXcCNYGQ,28357
259
+ sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
260
260
  sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
261
261
  sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
262
262
  sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
274
274
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
275
275
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
276
276
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
277
- skypilot_nightly-1.0.0.dev20241029.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
- skypilot_nightly-1.0.0.dev20241029.dist-info/METADATA,sha256=UAIFfOVp0n7QbIlx-vP21aRhzERPIIoEGbE4RcLzR5U,19540
279
- skypilot_nightly-1.0.0.dev20241029.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
- skypilot_nightly-1.0.0.dev20241029.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
- skypilot_nightly-1.0.0.dev20241029.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
- skypilot_nightly-1.0.0.dev20241029.dist-info/RECORD,,
277
+ skypilot_nightly-1.0.0.dev20241030.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
+ skypilot_nightly-1.0.0.dev20241030.dist-info/METADATA,sha256=bwgfsg4Zzl63yZYrUfZIBNeMitC8bOcgqKucALPDnbk,19708
279
+ skypilot_nightly-1.0.0.dev20241030.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
+ skypilot_nightly-1.0.0.dev20241030.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
+ skypilot_nightly-1.0.0.dev20241030.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
+ skypilot_nightly-1.0.0.dev20241030.dist-info/RECORD,,