skypilot-nightly 1.0.0.dev20250303__py3-none-any.whl → 1.0.0.dev20250305__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +16 -8
- sky/cli.py +36 -18
- sky/client/cli.py +36 -18
- sky/client/sdk.py +32 -9
- sky/core.py +55 -6
- sky/exceptions.py +80 -1
- sky/jobs/client/sdk.py +7 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/dashboard/dashboard.py +15 -25
- sky/jobs/dashboard/templates/index.html +100 -3
- sky/jobs/server/core.py +11 -6
- sky/jobs/utils.py +51 -21
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +16 -8
- sky/server/server.py +15 -2
- sky/skylet/constants.py +1 -1
- sky/skylet/job_lib.py +10 -1
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/METADATA +28 -41
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/RECORD +24 -24
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250303.dist-info → skypilot_nightly-1.0.0.dev20250305.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '296a22e868b9bdf1faccbe3effbfb858a5a05905'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250303'
|
38
|
+
__version__ = '1.0.0.dev20250305'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -3823,6 +3823,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3823
3823
|
follow: Whether to follow the logs.
|
3824
3824
|
tail: The number of lines to display from the end of the
|
3825
3825
|
log file. If 0, print all lines.
|
3826
|
+
|
3827
|
+
Returns:
|
3828
|
+
The exit code of the tail command. Returns code 100 if the job has
|
3829
|
+
failed. See exceptions.JobExitCode for possible return codes.
|
3826
3830
|
"""
|
3827
3831
|
code = job_lib.JobLibCodeGen.tail_logs(job_id,
|
3828
3832
|
managed_job_id=managed_job_id,
|
@@ -3856,7 +3860,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3856
3860
|
job_id: Optional[int] = None,
|
3857
3861
|
job_name: Optional[str] = None,
|
3858
3862
|
controller: bool = False,
|
3859
|
-
follow: bool = True) ->
|
3863
|
+
follow: bool = True) -> int:
|
3860
3864
|
# if job_name is not None, job_id should be None
|
3861
3865
|
assert job_name is None or job_id is None, (job_name, job_id)
|
3862
3866
|
code = managed_jobs.ManagedJobCodeGen.stream_logs(
|
@@ -3869,13 +3873,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3869
3873
|
signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
|
3870
3874
|
|
3871
3875
|
# Refer to the notes in tail_logs.
|
3872
|
-
|
3873
|
-
|
3874
|
-
|
3875
|
-
|
3876
|
-
|
3877
|
-
|
3878
|
-
|
3876
|
+
try:
|
3877
|
+
returncode = self.run_on_head(
|
3878
|
+
handle,
|
3879
|
+
code,
|
3880
|
+
stream_logs=True,
|
3881
|
+
process_stream=False,
|
3882
|
+
ssh_mode=command_runner.SshMode.INTERACTIVE,
|
3883
|
+
)
|
3884
|
+
except SystemExit as e:
|
3885
|
+
returncode = e.code
|
3886
|
+
return returncode
|
3879
3887
|
|
3880
3888
|
def sync_down_managed_job_logs(
|
3881
3889
|
self,
|
sky/cli.py
CHANGED
@@ -1227,11 +1227,15 @@ def launch(
|
|
1227
1227
|
clusters=[handle.get_cluster_name()])
|
1228
1228
|
# job_id will be None if no job was submitted (e.g. no entrypoint
|
1229
1229
|
# provided)
|
1230
|
+
returncode = 0
|
1230
1231
|
if not detach_run and job_id is not None:
|
1231
|
-
sdk.tail_logs(handle.get_cluster_name(),
|
1232
|
+
returncode = sdk.tail_logs(handle.get_cluster_name(),
|
1233
|
+
job_id,
|
1234
|
+
follow=True)
|
1232
1235
|
click.secho(
|
1233
1236
|
ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
|
1234
1237
|
job_id, handle.get_cluster_name()))
|
1238
|
+
sys.exit(returncode)
|
1235
1239
|
|
1236
1240
|
|
1237
1241
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -1377,7 +1381,8 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
|
|
1377
1381
|
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
|
1378
1382
|
if not async_call and not detach_run:
|
1379
1383
|
job_id, _ = job_id_handle
|
1380
|
-
sdk.tail_logs(cluster, job_id, follow=True)
|
1384
|
+
returncode = sdk.tail_logs(cluster, job_id, follow=True)
|
1385
|
+
sys.exit(returncode)
|
1381
1386
|
|
1382
1387
|
|
1383
1388
|
def _handle_jobs_queue_request(
|
@@ -2121,12 +2126,20 @@ def logs(
|
|
2121
2126
|
one job_id can be provided.
|
2122
2127
|
|
2123
2128
|
2. If ``--status`` is specified, print the status of the job and exit with
|
2124
|
-
returncode 0 if the job succeeded
|
2125
|
-
be specified.
|
2129
|
+
returncode 0 if the job succeeded. At most one job_id can
|
2130
|
+
be specified. Other possible return codes:
|
2131
|
+
|
2132
|
+
- 100: job failed.
|
2133
|
+
- 101: job not finished.
|
2134
|
+
- 102: job not found.
|
2135
|
+
- 103: job was cancelled by the user.
|
2126
2136
|
|
2127
2137
|
3. If ``--sync-down`` is specified, the logs of the job will be downloaded
|
2128
2138
|
from the cluster and saved to the local machine under
|
2129
|
-
``~/sky_logs``.
|
2139
|
+
``~/sky_logs``. Multiple job_ids can be specified.
|
2140
|
+
|
2141
|
+
4. If the job fails or fetching the logs fails, the command will exit with
|
2142
|
+
a non-zero return code.
|
2130
2143
|
"""
|
2131
2144
|
if sync_down and status:
|
2132
2145
|
raise click.UsageError(
|
@@ -2174,17 +2187,18 @@ def logs(
|
|
2174
2187
|
# it will return {None: None}.
|
2175
2188
|
if job_id is None:
|
2176
2189
|
click.secho(f'No job found on cluster {cluster!r}.', fg='red')
|
2177
|
-
sys.exit(
|
2190
|
+
sys.exit(exceptions.JobExitCode.NOT_FOUND)
|
2178
2191
|
job_status = list(job_statuses.values())[0]
|
2179
2192
|
job_status_str = job_status.value if job_status is not None else 'None'
|
2180
2193
|
click.echo(f'Job {job_id}: {job_status_str}')
|
2181
2194
|
if job_status == job_lib.JobStatus.SUCCEEDED:
|
2182
2195
|
return
|
2183
2196
|
else:
|
2197
|
+
returncode = exceptions.JobExitCode.from_job_status(job_status)
|
2184
2198
|
if job_status is None:
|
2185
2199
|
id_str = '' if job_id is None else f'{job_id} '
|
2186
2200
|
click.secho(f'Job {id_str}not found', fg='red')
|
2187
|
-
sys.exit(
|
2201
|
+
sys.exit(returncode)
|
2188
2202
|
|
2189
2203
|
job_str = f'job {job_id}'
|
2190
2204
|
if job_id is None:
|
@@ -2194,7 +2208,8 @@ def logs(
|
|
2194
2208
|
f'{colorama.Style.RESET_ALL}')
|
2195
2209
|
|
2196
2210
|
# Stream logs from the server.
|
2197
|
-
sdk.tail_logs(cluster, job_id, follow, tail=tail)
|
2211
|
+
returncode = sdk.tail_logs(cluster, job_id, follow, tail=tail)
|
2212
|
+
sys.exit(returncode)
|
2198
2213
|
|
2199
2214
|
|
2200
2215
|
@cli.command()
|
@@ -3019,7 +3034,7 @@ def _down_or_stop_clusters(
|
|
3019
3034
|
# with the termination.
|
3020
3035
|
hint_or_raise(controller_name, purge)
|
3021
3036
|
except (exceptions.ClusterOwnerIdentityMismatchError,
|
3022
|
-
RuntimeError) as e:
|
3037
|
+
exceptions.NotSupportedError, RuntimeError) as e:
|
3023
3038
|
if purge:
|
3024
3039
|
click.echo(common_utils.format_exception(e))
|
3025
3040
|
else:
|
@@ -3729,6 +3744,7 @@ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
|
|
3729
3744
|
if not storages:
|
3730
3745
|
click.echo('No storage(s) to delete.')
|
3731
3746
|
return
|
3747
|
+
names = [storage['name'] for storage in storages]
|
3732
3748
|
else:
|
3733
3749
|
names = _get_glob_storages(names)
|
3734
3750
|
if names:
|
@@ -3893,10 +3909,11 @@ def jobs_launch(
|
|
3893
3909
|
'sky.jobs.launch')
|
3894
3910
|
if not async_call and not detach_run:
|
3895
3911
|
job_id = job_id_handle[0]
|
3896
|
-
managed_jobs.tail_logs(name=None,
|
3897
|
-
|
3898
|
-
|
3899
|
-
|
3912
|
+
returncode = managed_jobs.tail_logs(name=None,
|
3913
|
+
job_id=job_id,
|
3914
|
+
follow=True,
|
3915
|
+
controller=False)
|
3916
|
+
sys.exit(returncode)
|
3900
3917
|
|
3901
3918
|
|
3902
3919
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
@@ -4127,11 +4144,12 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
|
4127
4144
|
logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
|
4128
4145
|
f'{log_local_path}{style.RESET_ALL}')
|
4129
4146
|
else:
|
4130
|
-
managed_jobs.tail_logs(name=name,
|
4131
|
-
|
4132
|
-
|
4133
|
-
|
4134
|
-
|
4147
|
+
returncode = managed_jobs.tail_logs(name=name,
|
4148
|
+
job_id=job_id,
|
4149
|
+
follow=follow,
|
4150
|
+
controller=controller,
|
4151
|
+
refresh=refresh)
|
4152
|
+
sys.exit(returncode)
|
4135
4153
|
except exceptions.ClusterNotUpError:
|
4136
4154
|
with ux_utils.print_exception_no_traceback():
|
4137
4155
|
raise
|
sky/client/cli.py
CHANGED
@@ -1227,11 +1227,15 @@ def launch(
|
|
1227
1227
|
clusters=[handle.get_cluster_name()])
|
1228
1228
|
# job_id will be None if no job was submitted (e.g. no entrypoint
|
1229
1229
|
# provided)
|
1230
|
+
returncode = 0
|
1230
1231
|
if not detach_run and job_id is not None:
|
1231
|
-
sdk.tail_logs(handle.get_cluster_name(),
|
1232
|
+
returncode = sdk.tail_logs(handle.get_cluster_name(),
|
1233
|
+
job_id,
|
1234
|
+
follow=True)
|
1232
1235
|
click.secho(
|
1233
1236
|
ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
|
1234
1237
|
job_id, handle.get_cluster_name()))
|
1238
|
+
sys.exit(returncode)
|
1235
1239
|
|
1236
1240
|
|
1237
1241
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -1377,7 +1381,8 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
|
|
1377
1381
|
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
|
1378
1382
|
if not async_call and not detach_run:
|
1379
1383
|
job_id, _ = job_id_handle
|
1380
|
-
sdk.tail_logs(cluster, job_id, follow=True)
|
1384
|
+
returncode = sdk.tail_logs(cluster, job_id, follow=True)
|
1385
|
+
sys.exit(returncode)
|
1381
1386
|
|
1382
1387
|
|
1383
1388
|
def _handle_jobs_queue_request(
|
@@ -2121,12 +2126,20 @@ def logs(
|
|
2121
2126
|
one job_id can be provided.
|
2122
2127
|
|
2123
2128
|
2. If ``--status`` is specified, print the status of the job and exit with
|
2124
|
-
returncode 0 if the job succeeded
|
2125
|
-
be specified.
|
2129
|
+
returncode 0 if the job succeeded. At most one job_id can
|
2130
|
+
be specified. Other possible return codes:
|
2131
|
+
|
2132
|
+
- 100: job failed.
|
2133
|
+
- 101: job not finished.
|
2134
|
+
- 102: job not found.
|
2135
|
+
- 103: job was cancelled by the user.
|
2126
2136
|
|
2127
2137
|
3. If ``--sync-down`` is specified, the logs of the job will be downloaded
|
2128
2138
|
from the cluster and saved to the local machine under
|
2129
|
-
``~/sky_logs``.
|
2139
|
+
``~/sky_logs``. Multiple job_ids can be specified.
|
2140
|
+
|
2141
|
+
4. If the job fails or fetching the logs fails, the command will exit with
|
2142
|
+
a non-zero return code.
|
2130
2143
|
"""
|
2131
2144
|
if sync_down and status:
|
2132
2145
|
raise click.UsageError(
|
@@ -2174,17 +2187,18 @@ def logs(
|
|
2174
2187
|
# it will return {None: None}.
|
2175
2188
|
if job_id is None:
|
2176
2189
|
click.secho(f'No job found on cluster {cluster!r}.', fg='red')
|
2177
|
-
sys.exit(
|
2190
|
+
sys.exit(exceptions.JobExitCode.NOT_FOUND)
|
2178
2191
|
job_status = list(job_statuses.values())[0]
|
2179
2192
|
job_status_str = job_status.value if job_status is not None else 'None'
|
2180
2193
|
click.echo(f'Job {job_id}: {job_status_str}')
|
2181
2194
|
if job_status == job_lib.JobStatus.SUCCEEDED:
|
2182
2195
|
return
|
2183
2196
|
else:
|
2197
|
+
returncode = exceptions.JobExitCode.from_job_status(job_status)
|
2184
2198
|
if job_status is None:
|
2185
2199
|
id_str = '' if job_id is None else f'{job_id} '
|
2186
2200
|
click.secho(f'Job {id_str}not found', fg='red')
|
2187
|
-
sys.exit(
|
2201
|
+
sys.exit(returncode)
|
2188
2202
|
|
2189
2203
|
job_str = f'job {job_id}'
|
2190
2204
|
if job_id is None:
|
@@ -2194,7 +2208,8 @@ def logs(
|
|
2194
2208
|
f'{colorama.Style.RESET_ALL}')
|
2195
2209
|
|
2196
2210
|
# Stream logs from the server.
|
2197
|
-
sdk.tail_logs(cluster, job_id, follow, tail=tail)
|
2211
|
+
returncode = sdk.tail_logs(cluster, job_id, follow, tail=tail)
|
2212
|
+
sys.exit(returncode)
|
2198
2213
|
|
2199
2214
|
|
2200
2215
|
@cli.command()
|
@@ -3019,7 +3034,7 @@ def _down_or_stop_clusters(
|
|
3019
3034
|
# with the termination.
|
3020
3035
|
hint_or_raise(controller_name, purge)
|
3021
3036
|
except (exceptions.ClusterOwnerIdentityMismatchError,
|
3022
|
-
RuntimeError) as e:
|
3037
|
+
exceptions.NotSupportedError, RuntimeError) as e:
|
3023
3038
|
if purge:
|
3024
3039
|
click.echo(common_utils.format_exception(e))
|
3025
3040
|
else:
|
@@ -3729,6 +3744,7 @@ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
|
|
3729
3744
|
if not storages:
|
3730
3745
|
click.echo('No storage(s) to delete.')
|
3731
3746
|
return
|
3747
|
+
names = [storage['name'] for storage in storages]
|
3732
3748
|
else:
|
3733
3749
|
names = _get_glob_storages(names)
|
3734
3750
|
if names:
|
@@ -3893,10 +3909,11 @@ def jobs_launch(
|
|
3893
3909
|
'sky.jobs.launch')
|
3894
3910
|
if not async_call and not detach_run:
|
3895
3911
|
job_id = job_id_handle[0]
|
3896
|
-
managed_jobs.tail_logs(name=None,
|
3897
|
-
|
3898
|
-
|
3899
|
-
|
3912
|
+
returncode = managed_jobs.tail_logs(name=None,
|
3913
|
+
job_id=job_id,
|
3914
|
+
follow=True,
|
3915
|
+
controller=False)
|
3916
|
+
sys.exit(returncode)
|
3900
3917
|
|
3901
3918
|
|
3902
3919
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
@@ -4127,11 +4144,12 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
|
4127
4144
|
logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
|
4128
4145
|
f'{log_local_path}{style.RESET_ALL}')
|
4129
4146
|
else:
|
4130
|
-
managed_jobs.tail_logs(name=name,
|
4131
|
-
|
4132
|
-
|
4133
|
-
|
4134
|
-
|
4147
|
+
returncode = managed_jobs.tail_logs(name=name,
|
4148
|
+
job_id=job_id,
|
4149
|
+
follow=follow,
|
4150
|
+
controller=controller,
|
4151
|
+
refresh=refresh)
|
4152
|
+
sys.exit(returncode)
|
4135
4153
|
except exceptions.ClusterNotUpError:
|
4136
4154
|
with ux_utils.print_exception_no_traceback():
|
4137
4155
|
raise
|
sky/client/sdk.py
CHANGED
@@ -25,6 +25,7 @@ import filelock
|
|
25
25
|
import psutil
|
26
26
|
import requests
|
27
27
|
|
28
|
+
from sky import admin_policy
|
28
29
|
from sky import backends
|
29
30
|
from sky import exceptions
|
30
31
|
from sky import sky_logging
|
@@ -212,13 +213,17 @@ def list_accelerator_counts(
|
|
212
213
|
@annotations.client_api
|
213
214
|
def optimize(
|
214
215
|
dag: 'sky.Dag',
|
215
|
-
minimize: common.OptimizeTarget = common.OptimizeTarget.COST
|
216
|
+
minimize: common.OptimizeTarget = common.OptimizeTarget.COST,
|
217
|
+
admin_policy_request_options: Optional[admin_policy.RequestOptions] = None
|
216
218
|
) -> server_common.RequestId:
|
217
219
|
"""Finds the best execution plan for the given DAG.
|
218
220
|
|
219
221
|
Args:
|
220
222
|
dag: the DAG to optimize.
|
221
223
|
minimize: whether to minimize cost or time.
|
224
|
+
admin_policy_request_options: Request options used for admin policy
|
225
|
+
validation. This is only required when an admin policy is in use,
|
226
|
+
see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
|
222
227
|
|
223
228
|
Returns:
|
224
229
|
The request ID of the optimize request.
|
@@ -233,7 +238,9 @@ def optimize(
|
|
233
238
|
"""
|
234
239
|
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
235
240
|
|
236
|
-
body = payloads.OptimizeBody(dag=dag_str,
|
241
|
+
body = payloads.OptimizeBody(dag=dag_str,
|
242
|
+
minimize=minimize,
|
243
|
+
request_options=admin_policy_request_options)
|
237
244
|
response = requests.post(f'{server_common.get_server_url()}/optimize',
|
238
245
|
json=json.loads(body.model_dump_json()))
|
239
246
|
return server_common.get_request_id(response)
|
@@ -242,7 +249,11 @@ def optimize(
|
|
242
249
|
@usage_lib.entrypoint
|
243
250
|
@server_common.check_server_healthy_or_start
|
244
251
|
@annotations.client_api
|
245
|
-
def validate(
|
252
|
+
def validate(
|
253
|
+
dag: 'sky.Dag',
|
254
|
+
workdir_only: bool = False,
|
255
|
+
admin_policy_request_options: Optional[admin_policy.RequestOptions] = None
|
256
|
+
) -> None:
|
246
257
|
"""Validates the tasks.
|
247
258
|
|
248
259
|
The file paths (workdir and file_mounts) are validated on the client side
|
@@ -254,13 +265,17 @@ def validate(dag: 'sky.Dag', workdir_only: bool = False) -> None:
|
|
254
265
|
dag: the DAG to validate.
|
255
266
|
workdir_only: whether to only validate the workdir. This is used for
|
256
267
|
`exec` as it does not need other files/folders in file_mounts.
|
268
|
+
admin_policy_request_options: Request options used for admin policy
|
269
|
+
validation. This is only required when an admin policy is in use,
|
270
|
+
see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
|
257
271
|
"""
|
258
272
|
for task in dag.tasks:
|
259
273
|
task.expand_and_validate_workdir()
|
260
274
|
if not workdir_only:
|
261
275
|
task.expand_and_validate_file_mounts()
|
262
276
|
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
263
|
-
body = payloads.ValidateBody(dag=dag_str
|
277
|
+
body = payloads.ValidateBody(dag=dag_str,
|
278
|
+
request_options=admin_policy_request_options)
|
264
279
|
response = requests.post(f'{server_common.get_server_url()}/validate',
|
265
280
|
json=json.loads(body.model_dump_json()))
|
266
281
|
if response.status_code == 400:
|
@@ -386,7 +401,12 @@ def launch(
|
|
386
401
|
'Please contact the SkyPilot team if you '
|
387
402
|
'need this feature at slack.skypilot.co.')
|
388
403
|
dag = dag_utils.convert_entrypoint_to_dag(task)
|
389
|
-
|
404
|
+
request_options = admin_policy.RequestOptions(
|
405
|
+
cluster_name=cluster_name,
|
406
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
407
|
+
down=down,
|
408
|
+
dryrun=dryrun)
|
409
|
+
validate(dag, admin_policy_request_options=request_options)
|
390
410
|
|
391
411
|
confirm_shown = False
|
392
412
|
if _need_confirmation:
|
@@ -400,7 +420,8 @@ def launch(
|
|
400
420
|
if not clusters:
|
401
421
|
# Show the optimize log before the prompt if the cluster does not
|
402
422
|
# exist.
|
403
|
-
request_id = optimize(dag
|
423
|
+
request_id = optimize(dag,
|
424
|
+
admin_policy_request_options=request_options)
|
404
425
|
stream_and_get(request_id)
|
405
426
|
else:
|
406
427
|
cluster_record = clusters[0]
|
@@ -562,7 +583,7 @@ def tail_logs(cluster_name: str,
|
|
562
583
|
job_id: Optional[int],
|
563
584
|
follow: bool,
|
564
585
|
tail: int = 0,
|
565
|
-
output_stream: Optional['io.TextIOBase'] = None) ->
|
586
|
+
output_stream: Optional['io.TextIOBase'] = None) -> int:
|
566
587
|
"""Tails the logs of a job.
|
567
588
|
|
568
589
|
Args:
|
@@ -575,7 +596,9 @@ def tail_logs(cluster_name: str,
|
|
575
596
|
console.
|
576
597
|
|
577
598
|
Returns:
|
578
|
-
|
599
|
+
Exit code based on success or failure of the job. 0 if success,
|
600
|
+
100 if the job failed. See exceptions.JobExitCode for possible exit
|
601
|
+
codes.
|
579
602
|
|
580
603
|
Request Raises:
|
581
604
|
ValueError: if arguments are invalid or the cluster is not supported.
|
@@ -601,7 +624,7 @@ def tail_logs(cluster_name: str,
|
|
601
624
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
602
625
|
None))
|
603
626
|
request_id = server_common.get_request_id(response)
|
604
|
-
stream_response(request_id, response, output_stream)
|
627
|
+
return stream_response(request_id, response, output_stream)
|
605
628
|
|
606
629
|
|
607
630
|
@usage_lib.entrypoint
|
sky/core.py
CHANGED
@@ -6,16 +6,18 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
6
|
|
7
7
|
import colorama
|
8
8
|
|
9
|
+
from sky import admin_policy
|
9
10
|
from sky import backends
|
10
11
|
from sky import check as sky_check
|
11
12
|
from sky import clouds
|
12
|
-
from sky import dag
|
13
|
+
from sky import dag as dag_lib
|
13
14
|
from sky import data
|
14
15
|
from sky import exceptions
|
15
16
|
from sky import global_user_state
|
16
17
|
from sky import models
|
18
|
+
from sky import optimizer
|
17
19
|
from sky import sky_logging
|
18
|
-
from sky import task
|
20
|
+
from sky import task as task_lib
|
19
21
|
from sky.backends import backend_utils
|
20
22
|
from sky.clouds import service_catalog
|
21
23
|
from sky.jobs.server import core as managed_jobs_core
|
@@ -25,6 +27,7 @@ from sky.skylet import constants
|
|
25
27
|
from sky.skylet import job_lib
|
26
28
|
from sky.skylet import log_lib
|
27
29
|
from sky.usage import usage_lib
|
30
|
+
from sky.utils import admin_policy_utils
|
28
31
|
from sky.utils import common
|
29
32
|
from sky.utils import common_utils
|
30
33
|
from sky.utils import controller_utils
|
@@ -44,6 +47,46 @@ logger = sky_logging.init_logger(__name__)
|
|
44
47
|
# ======================
|
45
48
|
|
46
49
|
|
50
|
+
@usage_lib.entrypoint
|
51
|
+
def optimize(
|
52
|
+
dag: 'dag_lib.Dag',
|
53
|
+
minimize: common.OptimizeTarget = common.OptimizeTarget.COST,
|
54
|
+
blocked_resources: Optional[List['resources_lib.Resources']] = None,
|
55
|
+
quiet: bool = False,
|
56
|
+
request_options: Optional[admin_policy.RequestOptions] = None
|
57
|
+
) -> 'dag_lib.Dag':
|
58
|
+
"""Finds the best execution plan for the given DAG.
|
59
|
+
|
60
|
+
Args:
|
61
|
+
dag: the DAG to optimize.
|
62
|
+
minimize: whether to minimize cost or time.
|
63
|
+
blocked_resources: a list of resources that should not be used.
|
64
|
+
quiet: whether to suppress logging.
|
65
|
+
request_options: Request options used in enforcing admin policies.
|
66
|
+
This is only required when an admin policy is in use,
|
67
|
+
see: https://docs.skypilot.co/en/latest/cloud-setup/policy.html
|
68
|
+
Returns:
|
69
|
+
The optimized DAG.
|
70
|
+
|
71
|
+
Raises:
|
72
|
+
exceptions.ResourcesUnavailableError: if no resources are available
|
73
|
+
for a task.
|
74
|
+
exceptions.NoCloudAccessError: if no public clouds are enabled.
|
75
|
+
"""
|
76
|
+
# TODO: We apply the admin policy only on the first DAG optimization which
|
77
|
+
# is shown on `sky launch`. The optimizer is also invoked during failover,
|
78
|
+
# but we do not apply the admin policy there. We should apply the admin
|
79
|
+
# policy in the optimizer, but that will require some refactoring.
|
80
|
+
dag, _ = admin_policy_utils.apply(
|
81
|
+
dag,
|
82
|
+
use_mutated_config_in_current_request=True,
|
83
|
+
request_options=request_options)
|
84
|
+
return optimizer.Optimizer.optimize(dag=dag,
|
85
|
+
minimize=minimize,
|
86
|
+
blocked_resources=blocked_resources,
|
87
|
+
quiet=quiet)
|
88
|
+
|
89
|
+
|
47
90
|
@usage_lib.entrypoint
|
48
91
|
def status(
|
49
92
|
cluster_names: Optional[Union[str, List[str]]] = None,
|
@@ -325,8 +368,8 @@ def _start(
|
|
325
368
|
|
326
369
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
327
370
|
|
328
|
-
with
|
329
|
-
dummy_task =
|
371
|
+
with dag_lib.Dag():
|
372
|
+
dummy_task = task_lib.Task().set_resources(handle.launched_resources)
|
330
373
|
dummy_task.num_nodes = handle.launched_nodes
|
331
374
|
handle = backend.provision(dummy_task,
|
332
375
|
to_provision=handle.launched_resources,
|
@@ -783,7 +826,7 @@ def cancel(
|
|
783
826
|
def tail_logs(cluster_name: str,
|
784
827
|
job_id: Optional[int],
|
785
828
|
follow: bool = True,
|
786
|
-
tail: int = 0) ->
|
829
|
+
tail: int = 0) -> int:
|
787
830
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
788
831
|
"""Tails the logs of a job.
|
789
832
|
|
@@ -799,6 +842,12 @@ def tail_logs(cluster_name: str,
|
|
799
842
|
not the same as the user who created the cluster.
|
800
843
|
sky.exceptions.CloudUserIdentityError: if we fail to get the current
|
801
844
|
user identity.
|
845
|
+
|
846
|
+
Returns:
|
847
|
+
Return code based on success or failure of the job. 0 if success,
|
848
|
+
100 if the job failed. Note: This is not the return code of the job
|
849
|
+
script.
|
850
|
+
|
802
851
|
"""
|
803
852
|
# Check the status of the cluster.
|
804
853
|
handle = backend_utils.check_cluster_available(
|
@@ -808,7 +857,7 @@ def tail_logs(cluster_name: str,
|
|
808
857
|
backend = backend_utils.get_backend_from_handle(handle)
|
809
858
|
|
810
859
|
usage_lib.record_cluster_name_for_current_operation(cluster_name)
|
811
|
-
backend.tail_logs(handle, job_id, follow=follow, tail=tail)
|
860
|
+
return backend.tail_logs(handle, job_id, follow=follow, tail=tail)
|
812
861
|
|
813
862
|
|
814
863
|
@usage_lib.entrypoint
|
sky/exceptions.py
CHANGED
@@ -9,7 +9,9 @@ from typing import Any, Dict, List, Optional, Sequence
|
|
9
9
|
from sky.utils import env_options
|
10
10
|
|
11
11
|
if typing.TYPE_CHECKING:
|
12
|
+
from sky import jobs as managed_jobs
|
12
13
|
from sky.backends import backend
|
14
|
+
from sky.skylet import job_lib
|
13
15
|
from sky.utils import status_lib
|
14
16
|
|
15
17
|
# Return code for keyboard interruption and SIGTSTP
|
@@ -236,7 +238,7 @@ class CommandError(SkyPilotExcludeArgsBaseException):
|
|
236
238
|
else:
|
237
239
|
if (len(command) > 100 and
|
238
240
|
not env_options.Options.SHOW_DEBUG_INFO.get()):
|
239
|
-
#
|
241
|
+
# Chunk the command to avoid overflow.
|
240
242
|
command = command[:100] + '...'
|
241
243
|
message = (f'Command {command} failed with return code '
|
242
244
|
f'{returncode}.\n{error_msg}')
|
@@ -449,3 +451,80 @@ class ApiServerConnectionError(RuntimeError):
|
|
449
451
|
f'Could not connect to SkyPilot API server at {server_url}. '
|
450
452
|
f'Please ensure that the server is running. '
|
451
453
|
f'Try: curl {server_url}/api/health')
|
454
|
+
|
455
|
+
|
456
|
+
class JobExitCode(enum.IntEnum):
|
457
|
+
"""Job exit code enum.
|
458
|
+
|
459
|
+
These codes are used as return codes for job-related operations and as
|
460
|
+
process exit codes to indicate job status.
|
461
|
+
"""
|
462
|
+
|
463
|
+
SUCCEEDED = 0
|
464
|
+
"""The job completed successfully"""
|
465
|
+
|
466
|
+
FAILED = 100
|
467
|
+
"""The job failed (due to user code, setup, or driver failure)"""
|
468
|
+
|
469
|
+
NOT_FINISHED = 101
|
470
|
+
"""The job has not finished yet"""
|
471
|
+
|
472
|
+
NOT_FOUND = 102
|
473
|
+
"""The job was not found"""
|
474
|
+
|
475
|
+
CANCELLED = 103
|
476
|
+
"""The job was cancelled by the user"""
|
477
|
+
|
478
|
+
@classmethod
|
479
|
+
def from_job_status(cls,
|
480
|
+
status: Optional['job_lib.JobStatus']) -> 'JobExitCode':
|
481
|
+
"""Convert a job status to an exit code."""
|
482
|
+
# Import here to avoid circular imports
|
483
|
+
# pylint: disable=import-outside-toplevel
|
484
|
+
from sky.skylet import job_lib
|
485
|
+
|
486
|
+
if status is None:
|
487
|
+
return cls.NOT_FOUND
|
488
|
+
|
489
|
+
if not status.is_terminal():
|
490
|
+
return cls.NOT_FINISHED
|
491
|
+
|
492
|
+
if status == job_lib.JobStatus.SUCCEEDED:
|
493
|
+
return cls.SUCCEEDED
|
494
|
+
|
495
|
+
if status == job_lib.JobStatus.CANCELLED:
|
496
|
+
return cls.CANCELLED
|
497
|
+
|
498
|
+
if status in job_lib.JobStatus.user_code_failure_states(
|
499
|
+
) or status == job_lib.JobStatus.FAILED_DRIVER:
|
500
|
+
return cls.FAILED
|
501
|
+
|
502
|
+
# Should not hit this case, but included to avoid errors
|
503
|
+
return cls.FAILED
|
504
|
+
|
505
|
+
@classmethod
|
506
|
+
def from_managed_job_status(
|
507
|
+
cls,
|
508
|
+
status: Optional['managed_jobs.ManagedJobStatus']) -> 'JobExitCode':
|
509
|
+
"""Convert a managed job status to an exit code."""
|
510
|
+
# Import here to avoid circular imports
|
511
|
+
# pylint: disable=import-outside-toplevel
|
512
|
+
from sky import jobs as managed_jobs
|
513
|
+
|
514
|
+
if status is None:
|
515
|
+
return cls.NOT_FOUND
|
516
|
+
|
517
|
+
if not status.is_terminal():
|
518
|
+
return cls.NOT_FINISHED
|
519
|
+
|
520
|
+
if status == managed_jobs.ManagedJobStatus.SUCCEEDED:
|
521
|
+
return cls.SUCCEEDED
|
522
|
+
|
523
|
+
if status == managed_jobs.ManagedJobStatus.CANCELLED:
|
524
|
+
return cls.CANCELLED
|
525
|
+
|
526
|
+
if status.is_failed():
|
527
|
+
return cls.FAILED
|
528
|
+
|
529
|
+
# Should not hit this case, but included to avoid errors
|
530
|
+
return cls.FAILED
|