skypilot-nightly 1.0.0.dev20250303__py3-none-any.whl → 1.0.0.dev20250305__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '50bbf9162d3516d5eb4e59e071822a4f3c38622c'
8
+ _SKYPILOT_COMMIT_SHA = '296a22e868b9bdf1faccbe3effbfb858a5a05905'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250303'
38
+ __version__ = '1.0.0.dev20250305'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -3823,6 +3823,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3823
3823
  follow: Whether to follow the logs.
3824
3824
  tail: The number of lines to display from the end of the
3825
3825
  log file. If 0, print all lines.
3826
+
3827
+ Returns:
3828
+ The exit code of the tail command. Returns code 100 if the job has
3829
+ failed. See exceptions.JobExitCode for possible return codes.
3826
3830
  """
3827
3831
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
3828
3832
  managed_job_id=managed_job_id,
@@ -3856,7 +3860,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3856
3860
  job_id: Optional[int] = None,
3857
3861
  job_name: Optional[str] = None,
3858
3862
  controller: bool = False,
3859
- follow: bool = True) -> None:
3863
+ follow: bool = True) -> int:
3860
3864
  # if job_name is not None, job_id should be None
3861
3865
  assert job_name is None or job_id is None, (job_name, job_id)
3862
3866
  code = managed_jobs.ManagedJobCodeGen.stream_logs(
@@ -3869,13 +3873,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3869
3873
  signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
3870
3874
 
3871
3875
  # Refer to the notes in tail_logs.
3872
- self.run_on_head(
3873
- handle,
3874
- code,
3875
- stream_logs=True,
3876
- process_stream=False,
3877
- ssh_mode=command_runner.SshMode.INTERACTIVE,
3878
- )
3876
+ try:
3877
+ returncode = self.run_on_head(
3878
+ handle,
3879
+ code,
3880
+ stream_logs=True,
3881
+ process_stream=False,
3882
+ ssh_mode=command_runner.SshMode.INTERACTIVE,
3883
+ )
3884
+ except SystemExit as e:
3885
+ returncode = e.code
3886
+ return returncode
3879
3887
 
3880
3888
  def sync_down_managed_job_logs(
3881
3889
  self,
sky/cli.py CHANGED
@@ -1227,11 +1227,15 @@ def launch(
1227
1227
  clusters=[handle.get_cluster_name()])
1228
1228
  # job_id will be None if no job was submitted (e.g. no entrypoint
1229
1229
  # provided)
1230
+ returncode = 0
1230
1231
  if not detach_run and job_id is not None:
1231
- sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
1232
+ returncode = sdk.tail_logs(handle.get_cluster_name(),
1233
+ job_id,
1234
+ follow=True)
1232
1235
  click.secho(
1233
1236
  ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
1234
1237
  job_id, handle.get_cluster_name()))
1238
+ sys.exit(returncode)
1235
1239
 
1236
1240
 
1237
1241
  @cli.command(cls=_DocumentedCodeCommand)
@@ -1377,7 +1381,8 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
1377
1381
  job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
1378
1382
  if not async_call and not detach_run:
1379
1383
  job_id, _ = job_id_handle
1380
- sdk.tail_logs(cluster, job_id, follow=True)
1384
+ returncode = sdk.tail_logs(cluster, job_id, follow=True)
1385
+ sys.exit(returncode)
1381
1386
 
1382
1387
 
1383
1388
  def _handle_jobs_queue_request(
@@ -2121,12 +2126,20 @@ def logs(
2121
2126
  one job_id can be provided.
2122
2127
 
2123
2128
  2. If ``--status`` is specified, print the status of the job and exit with
2124
- returncode 0 if the job succeeded, or 1 otherwise. At most one job_id can
2125
- be specified.
2129
+ returncode 0 if the job succeeded. At most one job_id can
2130
+ be specified. Other possible return codes:
2131
+
2132
+ - 100: job failed.
2133
+ - 101: job not finished.
2134
+ - 102: job not found.
2135
+ - 103: job was cancelled by the user.
2126
2136
 
2127
2137
  3. If ``--sync-down`` is specified, the logs of the job will be downloaded
2128
2138
  from the cluster and saved to the local machine under
2129
- ``~/sky_logs``. Mulitple job_ids can be specified.
2139
+ ``~/sky_logs``. Multiple job_ids can be specified.
2140
+
2141
+ 4. If the job fails or fetching the logs fails, the command will exit with
2142
+ a non-zero return code.
2130
2143
  """
2131
2144
  if sync_down and status:
2132
2145
  raise click.UsageError(
@@ -2174,17 +2187,18 @@ def logs(
2174
2187
  # it will return {None: None}.
2175
2188
  if job_id is None:
2176
2189
  click.secho(f'No job found on cluster {cluster!r}.', fg='red')
2177
- sys.exit(1)
2190
+ sys.exit(exceptions.JobExitCode.NOT_FOUND)
2178
2191
  job_status = list(job_statuses.values())[0]
2179
2192
  job_status_str = job_status.value if job_status is not None else 'None'
2180
2193
  click.echo(f'Job {job_id}: {job_status_str}')
2181
2194
  if job_status == job_lib.JobStatus.SUCCEEDED:
2182
2195
  return
2183
2196
  else:
2197
+ returncode = exceptions.JobExitCode.from_job_status(job_status)
2184
2198
  if job_status is None:
2185
2199
  id_str = '' if job_id is None else f'{job_id} '
2186
2200
  click.secho(f'Job {id_str}not found', fg='red')
2187
- sys.exit(1)
2201
+ sys.exit(returncode)
2188
2202
 
2189
2203
  job_str = f'job {job_id}'
2190
2204
  if job_id is None:
@@ -2194,7 +2208,8 @@ def logs(
2194
2208
  f'{colorama.Style.RESET_ALL}')
2195
2209
 
2196
2210
  # Stream logs from the server.
2197
- sdk.tail_logs(cluster, job_id, follow, tail=tail)
2211
+ returncode = sdk.tail_logs(cluster, job_id, follow, tail=tail)
2212
+ sys.exit(returncode)
2198
2213
 
2199
2214
 
2200
2215
  @cli.command()
@@ -3019,7 +3034,7 @@ def _down_or_stop_clusters(
3019
3034
  # with the termination.
3020
3035
  hint_or_raise(controller_name, purge)
3021
3036
  except (exceptions.ClusterOwnerIdentityMismatchError,
3022
- RuntimeError) as e:
3037
+ exceptions.NotSupportedError, RuntimeError) as e:
3023
3038
  if purge:
3024
3039
  click.echo(common_utils.format_exception(e))
3025
3040
  else:
@@ -3729,6 +3744,7 @@ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
3729
3744
  if not storages:
3730
3745
  click.echo('No storage(s) to delete.')
3731
3746
  return
3747
+ names = [storage['name'] for storage in storages]
3732
3748
  else:
3733
3749
  names = _get_glob_storages(names)
3734
3750
  if names:
@@ -3893,10 +3909,11 @@ def jobs_launch(
3893
3909
  'sky.jobs.launch')
3894
3910
  if not async_call and not detach_run:
3895
3911
  job_id = job_id_handle[0]
3896
- managed_jobs.tail_logs(name=None,
3897
- job_id=job_id,
3898
- follow=True,
3899
- controller=False)
3912
+ returncode = managed_jobs.tail_logs(name=None,
3913
+ job_id=job_id,
3914
+ follow=True,
3915
+ controller=False)
3916
+ sys.exit(returncode)
3900
3917
 
3901
3918
 
3902
3919
  @jobs.command('queue', cls=_DocumentedCodeCommand)
@@ -4127,11 +4144,12 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
4127
4144
  logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
4128
4145
  f'{log_local_path}{style.RESET_ALL}')
4129
4146
  else:
4130
- managed_jobs.tail_logs(name=name,
4131
- job_id=job_id,
4132
- follow=follow,
4133
- controller=controller,
4134
- refresh=refresh)
4147
+ returncode = managed_jobs.tail_logs(name=name,
4148
+ job_id=job_id,
4149
+ follow=follow,
4150
+ controller=controller,
4151
+ refresh=refresh)
4152
+ sys.exit(returncode)
4135
4153
  except exceptions.ClusterNotUpError:
4136
4154
  with ux_utils.print_exception_no_traceback():
4137
4155
  raise
sky/client/cli.py CHANGED
@@ -1227,11 +1227,15 @@ def launch(
1227
1227
  clusters=[handle.get_cluster_name()])
1228
1228
  # job_id will be None if no job was submitted (e.g. no entrypoint
1229
1229
  # provided)
1230
+ returncode = 0
1230
1231
  if not detach_run and job_id is not None:
1231
- sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
1232
+ returncode = sdk.tail_logs(handle.get_cluster_name(),
1233
+ job_id,
1234
+ follow=True)
1232
1235
  click.secho(
1233
1236
  ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
1234
1237
  job_id, handle.get_cluster_name()))
1238
+ sys.exit(returncode)
1235
1239
 
1236
1240
 
1237
1241
  @cli.command(cls=_DocumentedCodeCommand)
@@ -1377,7 +1381,8 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
1377
1381
  job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
1378
1382
  if not async_call and not detach_run:
1379
1383
  job_id, _ = job_id_handle
1380
- sdk.tail_logs(cluster, job_id, follow=True)
1384
+ returncode = sdk.tail_logs(cluster, job_id, follow=True)
1385
+ sys.exit(returncode)
1381
1386
 
1382
1387
 
1383
1388
  def _handle_jobs_queue_request(
@@ -2121,12 +2126,20 @@ def logs(
2121
2126
  one job_id can be provided.
2122
2127
 
2123
2128
  2. If ``--status`` is specified, print the status of the job and exit with
2124
- returncode 0 if the job succeeded, or 1 otherwise. At most one job_id can
2125
- be specified.
2129
+ returncode 0 if the job succeeded. At most one job_id can
2130
+ be specified. Other possible return codes:
2131
+
2132
+ - 100: job failed.
2133
+ - 101: job not finished.
2134
+ - 102: job not found.
2135
+ - 103: job was cancelled by the user.
2126
2136
 
2127
2137
  3. If ``--sync-down`` is specified, the logs of the job will be downloaded
2128
2138
  from the cluster and saved to the local machine under
2129
- ``~/sky_logs``. Mulitple job_ids can be specified.
2139
+ ``~/sky_logs``. Multiple job_ids can be specified.
2140
+
2141
+ 4. If the job fails or fetching the logs fails, the command will exit with
2142
+ a non-zero return code.
2130
2143
  """
2131
2144
  if sync_down and status:
2132
2145
  raise click.UsageError(
@@ -2174,17 +2187,18 @@ def logs(
2174
2187
  # it will return {None: None}.
2175
2188
  if job_id is None:
2176
2189
  click.secho(f'No job found on cluster {cluster!r}.', fg='red')
2177
- sys.exit(1)
2190
+ sys.exit(exceptions.JobExitCode.NOT_FOUND)
2178
2191
  job_status = list(job_statuses.values())[0]
2179
2192
  job_status_str = job_status.value if job_status is not None else 'None'
2180
2193
  click.echo(f'Job {job_id}: {job_status_str}')
2181
2194
  if job_status == job_lib.JobStatus.SUCCEEDED:
2182
2195
  return
2183
2196
  else:
2197
+ returncode = exceptions.JobExitCode.from_job_status(job_status)
2184
2198
  if job_status is None:
2185
2199
  id_str = '' if job_id is None else f'{job_id} '
2186
2200
  click.secho(f'Job {id_str}not found', fg='red')
2187
- sys.exit(1)
2201
+ sys.exit(returncode)
2188
2202
 
2189
2203
  job_str = f'job {job_id}'
2190
2204
  if job_id is None:
@@ -2194,7 +2208,8 @@ def logs(
2194
2208
  f'{colorama.Style.RESET_ALL}')
2195
2209
 
2196
2210
  # Stream logs from the server.
2197
- sdk.tail_logs(cluster, job_id, follow, tail=tail)
2211
+ returncode = sdk.tail_logs(cluster, job_id, follow, tail=tail)
2212
+ sys.exit(returncode)
2198
2213
 
2199
2214
 
2200
2215
  @cli.command()
@@ -3019,7 +3034,7 @@ def _down_or_stop_clusters(
3019
3034
  # with the termination.
3020
3035
  hint_or_raise(controller_name, purge)
3021
3036
  except (exceptions.ClusterOwnerIdentityMismatchError,
3022
- RuntimeError) as e:
3037
+ exceptions.NotSupportedError, RuntimeError) as e:
3023
3038
  if purge:
3024
3039
  click.echo(common_utils.format_exception(e))
3025
3040
  else:
@@ -3729,6 +3744,7 @@ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
3729
3744
  if not storages:
3730
3745
  click.echo('No storage(s) to delete.')
3731
3746
  return
3747
+ names = [storage['name'] for storage in storages]
3732
3748
  else:
3733
3749
  names = _get_glob_storages(names)
3734
3750
  if names:
@@ -3893,10 +3909,11 @@ def jobs_launch(
3893
3909
  'sky.jobs.launch')
3894
3910
  if not async_call and not detach_run:
3895
3911
  job_id = job_id_handle[0]
3896
- managed_jobs.tail_logs(name=None,
3897
- job_id=job_id,
3898
- follow=True,
3899
- controller=False)
3912
+ returncode = managed_jobs.tail_logs(name=None,
3913
+ job_id=job_id,
3914
+ follow=True,
3915
+ controller=False)
3916
+ sys.exit(returncode)
3900
3917
 
3901
3918
 
3902
3919
  @jobs.command('queue', cls=_DocumentedCodeCommand)
@@ -4127,11 +4144,12 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
4127
4144
  logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
4128
4145
  f'{log_local_path}{style.RESET_ALL}')
4129
4146
  else:
4130
- managed_jobs.tail_logs(name=name,
4131
- job_id=job_id,
4132
- follow=follow,
4133
- controller=controller,
4134
- refresh=refresh)
4147
+ returncode = managed_jobs.tail_logs(name=name,
4148
+ job_id=job_id,
4149
+ follow=follow,
4150
+ controller=controller,
4151
+ refresh=refresh)
4152
+ sys.exit(returncode)
4135
4153
  except exceptions.ClusterNotUpError:
4136
4154
  with ux_utils.print_exception_no_traceback():
4137
4155
  raise
sky/client/sdk.py CHANGED
@@ -25,6 +25,7 @@ import filelock
25
25
  import psutil
26
26
  import requests
27
27
 
28
+ from sky import admin_policy
28
29
  from sky import backends
29
30
  from sky import exceptions
30
31
  from sky import sky_logging
@@ -212,13 +213,17 @@ def list_accelerator_counts(
212
213
  @annotations.client_api
213
214
  def optimize(
214
215
  dag: 'sky.Dag',
215
- minimize: common.OptimizeTarget = common.OptimizeTarget.COST
216
+ minimize: common.OptimizeTarget = common.OptimizeTarget.COST,
217
+ admin_policy_request_options: Optional[admin_policy.RequestOptions] = None
216
218
  ) -> server_common.RequestId:
217
219
  """Finds the best execution plan for the given DAG.
218
220
 
219
221
  Args:
220
222
  dag: the DAG to optimize.
221
223
  minimize: whether to minimize cost or time.
224
+ admin_policy_request_options: Request options used for admin policy
225
+ validation. This is only required when an admin policy is in use,
226
+ see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
222
227
 
223
228
  Returns:
224
229
  The request ID of the optimize request.
@@ -233,7 +238,9 @@ def optimize(
233
238
  """
234
239
  dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
235
240
 
236
- body = payloads.OptimizeBody(dag=dag_str, minimize=minimize)
241
+ body = payloads.OptimizeBody(dag=dag_str,
242
+ minimize=minimize,
243
+ request_options=admin_policy_request_options)
237
244
  response = requests.post(f'{server_common.get_server_url()}/optimize',
238
245
  json=json.loads(body.model_dump_json()))
239
246
  return server_common.get_request_id(response)
@@ -242,7 +249,11 @@ def optimize(
242
249
  @usage_lib.entrypoint
243
250
  @server_common.check_server_healthy_or_start
244
251
  @annotations.client_api
245
- def validate(dag: 'sky.Dag', workdir_only: bool = False) -> None:
252
+ def validate(
253
+ dag: 'sky.Dag',
254
+ workdir_only: bool = False,
255
+ admin_policy_request_options: Optional[admin_policy.RequestOptions] = None
256
+ ) -> None:
246
257
  """Validates the tasks.
247
258
 
248
259
  The file paths (workdir and file_mounts) are validated on the client side
@@ -254,13 +265,17 @@ def validate(dag: 'sky.Dag', workdir_only: bool = False) -> None:
254
265
  dag: the DAG to validate.
255
266
  workdir_only: whether to only validate the workdir. This is used for
256
267
  `exec` as it does not need other files/folders in file_mounts.
268
+ admin_policy_request_options: Request options used for admin policy
269
+ validation. This is only required when an admin policy is in use,
270
+ see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
257
271
  """
258
272
  for task in dag.tasks:
259
273
  task.expand_and_validate_workdir()
260
274
  if not workdir_only:
261
275
  task.expand_and_validate_file_mounts()
262
276
  dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
263
- body = payloads.ValidateBody(dag=dag_str)
277
+ body = payloads.ValidateBody(dag=dag_str,
278
+ request_options=admin_policy_request_options)
264
279
  response = requests.post(f'{server_common.get_server_url()}/validate',
265
280
  json=json.loads(body.model_dump_json()))
266
281
  if response.status_code == 400:
@@ -386,7 +401,12 @@ def launch(
386
401
  'Please contact the SkyPilot team if you '
387
402
  'need this feature at slack.skypilot.co.')
388
403
  dag = dag_utils.convert_entrypoint_to_dag(task)
389
- validate(dag)
404
+ request_options = admin_policy.RequestOptions(
405
+ cluster_name=cluster_name,
406
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
407
+ down=down,
408
+ dryrun=dryrun)
409
+ validate(dag, admin_policy_request_options=request_options)
390
410
 
391
411
  confirm_shown = False
392
412
  if _need_confirmation:
@@ -400,7 +420,8 @@ def launch(
400
420
  if not clusters:
401
421
  # Show the optimize log before the prompt if the cluster does not
402
422
  # exist.
403
- request_id = optimize(dag)
423
+ request_id = optimize(dag,
424
+ admin_policy_request_options=request_options)
404
425
  stream_and_get(request_id)
405
426
  else:
406
427
  cluster_record = clusters[0]
@@ -562,7 +583,7 @@ def tail_logs(cluster_name: str,
562
583
  job_id: Optional[int],
563
584
  follow: bool,
564
585
  tail: int = 0,
565
- output_stream: Optional['io.TextIOBase'] = None) -> None:
586
+ output_stream: Optional['io.TextIOBase'] = None) -> int:
566
587
  """Tails the logs of a job.
567
588
 
568
589
  Args:
@@ -575,7 +596,9 @@ def tail_logs(cluster_name: str,
575
596
  console.
576
597
 
577
598
  Returns:
578
- None
599
+ Exit code based on success or failure of the job. 0 if success,
600
+ 100 if the job failed. See exceptions.JobExitCode for possible exit
601
+ codes.
579
602
 
580
603
  Request Raises:
581
604
  ValueError: if arguments are invalid or the cluster is not supported.
@@ -601,7 +624,7 @@ def tail_logs(cluster_name: str,
601
624
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
602
625
  None))
603
626
  request_id = server_common.get_request_id(response)
604
- stream_response(request_id, response, output_stream)
627
+ return stream_response(request_id, response, output_stream)
605
628
 
606
629
 
607
630
  @usage_lib.entrypoint
sky/core.py CHANGED
@@ -6,16 +6,18 @@ from typing import Any, Dict, List, Optional, Tuple, Union
6
6
 
7
7
  import colorama
8
8
 
9
+ from sky import admin_policy
9
10
  from sky import backends
10
11
  from sky import check as sky_check
11
12
  from sky import clouds
12
- from sky import dag
13
+ from sky import dag as dag_lib
13
14
  from sky import data
14
15
  from sky import exceptions
15
16
  from sky import global_user_state
16
17
  from sky import models
18
+ from sky import optimizer
17
19
  from sky import sky_logging
18
- from sky import task
20
+ from sky import task as task_lib
19
21
  from sky.backends import backend_utils
20
22
  from sky.clouds import service_catalog
21
23
  from sky.jobs.server import core as managed_jobs_core
@@ -25,6 +27,7 @@ from sky.skylet import constants
25
27
  from sky.skylet import job_lib
26
28
  from sky.skylet import log_lib
27
29
  from sky.usage import usage_lib
30
+ from sky.utils import admin_policy_utils
28
31
  from sky.utils import common
29
32
  from sky.utils import common_utils
30
33
  from sky.utils import controller_utils
@@ -44,6 +47,46 @@ logger = sky_logging.init_logger(__name__)
44
47
  # ======================
45
48
 
46
49
 
50
+ @usage_lib.entrypoint
51
+ def optimize(
52
+ dag: 'dag_lib.Dag',
53
+ minimize: common.OptimizeTarget = common.OptimizeTarget.COST,
54
+ blocked_resources: Optional[List['resources_lib.Resources']] = None,
55
+ quiet: bool = False,
56
+ request_options: Optional[admin_policy.RequestOptions] = None
57
+ ) -> 'dag_lib.Dag':
58
+ """Finds the best execution plan for the given DAG.
59
+
60
+ Args:
61
+ dag: the DAG to optimize.
62
+ minimize: whether to minimize cost or time.
63
+ blocked_resources: a list of resources that should not be used.
64
+ quiet: whether to suppress logging.
65
+ request_options: Request options used in enforcing admin policies.
66
+ This is only required when an admin policy is in use,
67
+ see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
68
+ Returns:
69
+ The optimized DAG.
70
+
71
+ Raises:
72
+ exceptions.ResourcesUnavailableError: if no resources are available
73
+ for a task.
74
+ exceptions.NoCloudAccessError: if no public clouds are enabled.
75
+ """
76
+ # TODO: We apply the admin policy only on the first DAG optimization which
77
+ # is shown on `sky launch`. The optimizer is also invoked during failover,
78
+ # but we do not apply the admin policy there. We should apply the admin
79
+ # policy in the optimizer, but that will require some refactoring.
80
+ dag, _ = admin_policy_utils.apply(
81
+ dag,
82
+ use_mutated_config_in_current_request=True,
83
+ request_options=request_options)
84
+ return optimizer.Optimizer.optimize(dag=dag,
85
+ minimize=minimize,
86
+ blocked_resources=blocked_resources,
87
+ quiet=quiet)
88
+
89
+
47
90
  @usage_lib.entrypoint
48
91
  def status(
49
92
  cluster_names: Optional[Union[str, List[str]]] = None,
@@ -325,8 +368,8 @@ def _start(
325
368
 
326
369
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
327
370
 
328
- with dag.Dag():
329
- dummy_task = task.Task().set_resources(handle.launched_resources)
371
+ with dag_lib.Dag():
372
+ dummy_task = task_lib.Task().set_resources(handle.launched_resources)
330
373
  dummy_task.num_nodes = handle.launched_nodes
331
374
  handle = backend.provision(dummy_task,
332
375
  to_provision=handle.launched_resources,
@@ -783,7 +826,7 @@ def cancel(
783
826
  def tail_logs(cluster_name: str,
784
827
  job_id: Optional[int],
785
828
  follow: bool = True,
786
- tail: int = 0) -> None:
829
+ tail: int = 0) -> int:
787
830
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
788
831
  """Tails the logs of a job.
789
832
 
@@ -799,6 +842,12 @@ def tail_logs(cluster_name: str,
799
842
  not the same as the user who created the cluster.
800
843
  sky.exceptions.CloudUserIdentityError: if we fail to get the current
801
844
  user identity.
845
+
846
+ Returns:
847
+ Return code based on success or failure of the job. 0 if success,
848
+ 100 if the job failed. Note: This is not the return code of the job
849
+ script.
850
+
802
851
  """
803
852
  # Check the status of the cluster.
804
853
  handle = backend_utils.check_cluster_available(
@@ -808,7 +857,7 @@ def tail_logs(cluster_name: str,
808
857
  backend = backend_utils.get_backend_from_handle(handle)
809
858
 
810
859
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
811
- backend.tail_logs(handle, job_id, follow=follow, tail=tail)
860
+ return backend.tail_logs(handle, job_id, follow=follow, tail=tail)
812
861
 
813
862
 
814
863
  @usage_lib.entrypoint
sky/exceptions.py CHANGED
@@ -9,7 +9,9 @@ from typing import Any, Dict, List, Optional, Sequence
9
9
  from sky.utils import env_options
10
10
 
11
11
  if typing.TYPE_CHECKING:
12
+ from sky import jobs as managed_jobs
12
13
  from sky.backends import backend
14
+ from sky.skylet import job_lib
13
15
  from sky.utils import status_lib
14
16
 
15
17
  # Return code for keyboard interruption and SIGTSTP
@@ -236,7 +238,7 @@ class CommandError(SkyPilotExcludeArgsBaseException):
236
238
  else:
237
239
  if (len(command) > 100 and
238
240
  not env_options.Options.SHOW_DEBUG_INFO.get()):
239
- # Chunck the command to avoid overflow.
241
+ # Chunk the command to avoid overflow.
240
242
  command = command[:100] + '...'
241
243
  message = (f'Command {command} failed with return code '
242
244
  f'{returncode}.\n{error_msg}')
@@ -449,3 +451,80 @@ class ApiServerConnectionError(RuntimeError):
449
451
  f'Could not connect to SkyPilot API server at {server_url}. '
450
452
  f'Please ensure that the server is running. '
451
453
  f'Try: curl {server_url}/api/health')
454
+
455
+
456
+ class JobExitCode(enum.IntEnum):
457
+ """Job exit code enum.
458
+
459
+ These codes are used as return codes for job-related operations and as
460
+ process exit codes to indicate job status.
461
+ """
462
+
463
+ SUCCEEDED = 0
464
+ """The job completed successfully"""
465
+
466
+ FAILED = 100
467
+ """The job failed (due to user code, setup, or driver failure)"""
468
+
469
+ NOT_FINISHED = 101
470
+ """The job has not finished yet"""
471
+
472
+ NOT_FOUND = 102
473
+ """The job was not found"""
474
+
475
+ CANCELLED = 103
476
+ """The job was cancelled by the user"""
477
+
478
+ @classmethod
479
+ def from_job_status(cls,
480
+ status: Optional['job_lib.JobStatus']) -> 'JobExitCode':
481
+ """Convert a job status to an exit code."""
482
+ # Import here to avoid circular imports
483
+ # pylint: disable=import-outside-toplevel
484
+ from sky.skylet import job_lib
485
+
486
+ if status is None:
487
+ return cls.NOT_FOUND
488
+
489
+ if not status.is_terminal():
490
+ return cls.NOT_FINISHED
491
+
492
+ if status == job_lib.JobStatus.SUCCEEDED:
493
+ return cls.SUCCEEDED
494
+
495
+ if status == job_lib.JobStatus.CANCELLED:
496
+ return cls.CANCELLED
497
+
498
+ if status in job_lib.JobStatus.user_code_failure_states(
499
+ ) or status == job_lib.JobStatus.FAILED_DRIVER:
500
+ return cls.FAILED
501
+
502
+ # Should not hit this case, but included to avoid errors
503
+ return cls.FAILED
504
+
505
+ @classmethod
506
+ def from_managed_job_status(
507
+ cls,
508
+ status: Optional['managed_jobs.ManagedJobStatus']) -> 'JobExitCode':
509
+ """Convert a managed job status to an exit code."""
510
+ # Import here to avoid circular imports
511
+ # pylint: disable=import-outside-toplevel
512
+ from sky import jobs as managed_jobs
513
+
514
+ if status is None:
515
+ return cls.NOT_FOUND
516
+
517
+ if not status.is_terminal():
518
+ return cls.NOT_FINISHED
519
+
520
+ if status == managed_jobs.ManagedJobStatus.SUCCEEDED:
521
+ return cls.SUCCEEDED
522
+
523
+ if status == managed_jobs.ManagedJobStatus.CANCELLED:
524
+ return cls.CANCELLED
525
+
526
+ if status.is_failed():
527
+ return cls.FAILED
528
+
529
+ # Should not hit this case, but included to avoid errors
530
+ return cls.FAILED