skypilot-nightly 1.0.0.dev20250225__py3-none-any.whl → 1.0.0.dev20250227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +4 -6
- sky/cli.py +51 -17
- sky/client/cli.py +51 -17
- sky/jobs/client/sdk.py +10 -4
- sky/jobs/constants.py +1 -1
- sky/jobs/dashboard/dashboard.py +4 -1
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +33 -9
- sky/jobs/server/server.py +9 -0
- sky/jobs/state.py +30 -10
- sky/jobs/utils.py +57 -12
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +2 -0
- sky/templates/jobs-controller.yaml.j2 +8 -1
- sky/utils/common_utils.py +94 -14
- {skypilot_nightly-1.0.0.dev20250225.dist-info → skypilot_nightly-1.0.0.dev20250227.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20250225.dist-info → skypilot_nightly-1.0.0.dev20250227.dist-info}/RECORD +22 -22
- {skypilot_nightly-1.0.0.dev20250225.dist-info → skypilot_nightly-1.0.0.dev20250227.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250225.dist-info → skypilot_nightly-1.0.0.dev20250227.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250225.dist-info → skypilot_nightly-1.0.0.dev20250227.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250225.dist-info → skypilot_nightly-1.0.0.dev20250227.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '2c4849b6f73499740f495f84a29ac4af98d25073'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250227'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/adaptors/kubernetes.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Any, Callable, Optional, Set
|
|
6
6
|
from sky.adaptors import common
|
7
7
|
from sky.sky_logging import set_logging_level
|
8
8
|
from sky.utils import annotations
|
9
|
-
from sky.utils import
|
9
|
+
from sky.utils import common_utils
|
10
10
|
from sky.utils import ux_utils
|
11
11
|
|
12
12
|
_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
|
@@ -69,22 +69,20 @@ def _load_config(context: Optional[str] = None):
|
|
69
69
|
try:
|
70
70
|
kubernetes.config.load_kube_config(context=context)
|
71
71
|
except kubernetes.config.config_exception.ConfigException as e:
|
72
|
-
suffix =
|
73
|
-
if env_options.Options.SHOW_DEBUG_INFO.get():
|
74
|
-
suffix += f' Error: {str(e)}'
|
72
|
+
suffix = common_utils.format_exception(e, use_bracket=True)
|
75
73
|
# Check if exception was due to no current-context
|
76
74
|
if 'Expected key current-context' in str(e):
|
77
75
|
err_str = (
|
78
76
|
f'Failed to load Kubernetes configuration for {context!r}. '
|
79
77
|
'Kubeconfig does not contain any valid context(s).'
|
80
|
-
f'{suffix}\n'
|
78
|
+
f'\n{suffix}\n'
|
81
79
|
' If you were running a local Kubernetes '
|
82
80
|
'cluster, run `sky local up` to start the cluster.')
|
83
81
|
else:
|
84
82
|
err_str = (
|
85
83
|
f'Failed to load Kubernetes configuration for {context!r}. '
|
86
84
|
'Please check if your kubeconfig file exists at '
|
87
|
-
f'~/.kube/config and is valid
|
85
|
+
f'~/.kube/config and is valid.\n{suffix}')
|
88
86
|
err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
|
89
87
|
with ux_utils.print_exception_no_traceback():
|
90
88
|
raise ValueError(err_str) from None
|
sky/cli.py
CHANGED
@@ -1379,12 +1379,14 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
|
|
1379
1379
|
def _handle_jobs_queue_request(
|
1380
1380
|
request_id: str,
|
1381
1381
|
show_all: bool,
|
1382
|
+
show_user: bool,
|
1382
1383
|
limit_num_jobs_to_show: bool = False,
|
1383
1384
|
is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
|
1384
1385
|
"""Get the in-progress managed jobs.
|
1385
1386
|
|
1386
1387
|
Args:
|
1387
1388
|
show_all: Show all information of each job (e.g., region, price).
|
1389
|
+
show_user: Show the user who submitted the job.
|
1388
1390
|
limit_num_jobs_to_show: If True, limit the number of jobs to show to
|
1389
1391
|
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
|
1390
1392
|
`sky status`.
|
@@ -1452,6 +1454,7 @@ def _handle_jobs_queue_request(
|
|
1452
1454
|
if limit_num_jobs_to_show else None)
|
1453
1455
|
msg = managed_jobs.format_job_table(managed_jobs_,
|
1454
1456
|
show_all=show_all,
|
1457
|
+
show_user=show_user,
|
1455
1458
|
max_jobs=max_jobs_to_show)
|
1456
1459
|
return num_in_progress_jobs, msg
|
1457
1460
|
|
@@ -1561,7 +1564,9 @@ def _status_kubernetes(show_all: bool):
|
|
1561
1564
|
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1562
1565
|
f'Managed jobs'
|
1563
1566
|
f'{colorama.Style.RESET_ALL}')
|
1564
|
-
msg = managed_jobs.format_job_table(all_jobs,
|
1567
|
+
msg = managed_jobs.format_job_table(all_jobs,
|
1568
|
+
show_all=show_all,
|
1569
|
+
show_user=False)
|
1565
1570
|
click.echo(msg)
|
1566
1571
|
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
1567
1572
|
# TODO: Parse serve controllers and show services separately.
|
@@ -1779,7 +1784,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1779
1784
|
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
|
1780
1785
|
if show_managed_jobs:
|
1781
1786
|
managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
|
1782
|
-
skip_finished=True
|
1787
|
+
skip_finished=True,
|
1788
|
+
all_users=all_users)
|
1783
1789
|
show_endpoints = endpoints or endpoint is not None
|
1784
1790
|
show_single_endpoint = endpoint is not None
|
1785
1791
|
show_services = show_services and not any([clusters, ip, endpoints])
|
@@ -1859,6 +1865,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1859
1865
|
num_in_progress_jobs, msg = _handle_jobs_queue_request(
|
1860
1866
|
managed_jobs_queue_request_id,
|
1861
1867
|
show_all=False,
|
1868
|
+
show_user=False,
|
1862
1869
|
limit_num_jobs_to_show=not all,
|
1863
1870
|
is_called_by_user=False)
|
1864
1871
|
except KeyboardInterrupt:
|
@@ -2751,7 +2758,7 @@ def start(
|
|
2751
2758
|
def down(
|
2752
2759
|
clusters: List[str],
|
2753
2760
|
all: bool, # pylint: disable=redefined-builtin
|
2754
|
-
all_users: bool,
|
2761
|
+
all_users: bool,
|
2755
2762
|
yes: bool,
|
2756
2763
|
purge: bool,
|
2757
2764
|
async_call: bool,
|
@@ -2812,7 +2819,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2812
2819
|
with rich_utils.client_status(
|
2813
2820
|
'[bold cyan]Checking for in-progress managed jobs[/]'):
|
2814
2821
|
try:
|
2815
|
-
request_id = managed_jobs.queue(refresh=False,
|
2822
|
+
request_id = managed_jobs.queue(refresh=False,
|
2823
|
+
skip_finished=True,
|
2824
|
+
all_users=True)
|
2816
2825
|
managed_jobs_ = sdk.stream_and_get(request_id)
|
2817
2826
|
except exceptions.ClusterNotUpError as e:
|
2818
2827
|
if controller.value.connection_error_hint in str(e):
|
@@ -2836,7 +2845,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2836
2845
|
'jobs (output of `sky jobs queue`) will be lost.')
|
2837
2846
|
click.echo(msg)
|
2838
2847
|
if managed_jobs_:
|
2839
|
-
job_table = managed_jobs.format_job_table(managed_jobs_,
|
2848
|
+
job_table = managed_jobs.format_job_table(managed_jobs_,
|
2849
|
+
show_all=False,
|
2850
|
+
show_user=True)
|
2840
2851
|
msg = controller.value.decline_down_for_dirty_controller_hint
|
2841
2852
|
# Add prefix to each line to align with the bullet point.
|
2842
2853
|
msg += '\n'.join(
|
@@ -3905,9 +3916,16 @@ def jobs_launch(
|
|
3905
3916
|
is_flag=True,
|
3906
3917
|
required=False,
|
3907
3918
|
help='Show only pending/running jobs\' information.')
|
3919
|
+
@click.option('--all-users',
|
3920
|
+
'-u',
|
3921
|
+
default=False,
|
3922
|
+
is_flag=True,
|
3923
|
+
required=False,
|
3924
|
+
help='Show jobs from all users.')
|
3908
3925
|
@usage_lib.entrypoint
|
3909
3926
|
# pylint: disable=redefined-builtin
|
3910
|
-
def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool
|
3927
|
+
def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
|
3928
|
+
all_users: bool):
|
3911
3929
|
"""Show statuses of managed jobs.
|
3912
3930
|
|
3913
3931
|
Each managed jobs can have one of the following statuses:
|
@@ -3964,9 +3982,10 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
|
|
3964
3982
|
click.secho('Fetching managed job statuses...', fg='cyan')
|
3965
3983
|
with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
|
3966
3984
|
managed_jobs_request_id = managed_jobs.queue(
|
3967
|
-
refresh=refresh, skip_finished=skip_finished)
|
3985
|
+
refresh=refresh, skip_finished=skip_finished, all_users=all_users)
|
3968
3986
|
_, msg = _handle_jobs_queue_request(managed_jobs_request_id,
|
3969
3987
|
show_all=verbose,
|
3988
|
+
show_user=all_users,
|
3970
3989
|
is_called_by_user=True)
|
3971
3990
|
if not skip_finished:
|
3972
3991
|
in_progress_only_hint = ''
|
@@ -3989,16 +4008,23 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
|
|
3989
4008
|
is_flag=True,
|
3990
4009
|
default=False,
|
3991
4010
|
required=False,
|
3992
|
-
help='Cancel all managed jobs.')
|
4011
|
+
help='Cancel all managed jobs for the current user.')
|
3993
4012
|
@click.option('--yes',
|
3994
4013
|
'-y',
|
3995
4014
|
is_flag=True,
|
3996
4015
|
default=False,
|
3997
4016
|
required=False,
|
3998
4017
|
help='Skip confirmation prompt.')
|
4018
|
+
@click.option('--all-users',
|
4019
|
+
'-u',
|
4020
|
+
is_flag=True,
|
4021
|
+
default=False,
|
4022
|
+
required=False,
|
4023
|
+
help='Cancel all managed jobs from all users.')
|
3999
4024
|
@usage_lib.entrypoint
|
4000
4025
|
# pylint: disable=redefined-builtin
|
4001
|
-
def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool
|
4026
|
+
def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
4027
|
+
all_users: bool):
|
4002
4028
|
"""Cancel managed jobs.
|
4003
4029
|
|
4004
4030
|
You can provide either a job name or a list of job IDs to be cancelled.
|
@@ -4015,25 +4041,33 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
4015
4041
|
$ sky jobs cancel 1 2 3
|
4016
4042
|
"""
|
4017
4043
|
job_id_str = ','.join(map(str, job_ids))
|
4018
|
-
if sum([bool(job_ids), name is not None, all]) != 1:
|
4019
|
-
|
4020
|
-
|
4021
|
-
|
4044
|
+
if sum([bool(job_ids), name is not None, all or all_users]) != 1:
|
4045
|
+
arguments = []
|
4046
|
+
arguments += [f'--job-ids {job_id_str}'] if job_ids else []
|
4047
|
+
arguments += [f'--name {name}'] if name is not None else []
|
4048
|
+
arguments += ['--all'] if all else []
|
4049
|
+
arguments += ['--all-users'] if all_users else []
|
4022
4050
|
raise click.UsageError(
|
4023
|
-
'Can only specify one of JOB_IDS
|
4024
|
-
f'Provided {
|
4051
|
+
'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
|
4052
|
+
f'Provided {" ".join(arguments)!r}.')
|
4025
4053
|
|
4026
4054
|
if not yes:
|
4027
4055
|
job_identity_str = (f'managed jobs with IDs {job_id_str}'
|
4028
4056
|
if job_ids else repr(name))
|
4029
|
-
if
|
4057
|
+
if all_users:
|
4058
|
+
job_identity_str = 'all managed jobs FOR ALL USERS'
|
4059
|
+
elif all:
|
4030
4060
|
job_identity_str = 'all managed jobs'
|
4031
4061
|
click.confirm(f'Cancelling {job_identity_str}. Proceed?',
|
4032
4062
|
default=True,
|
4033
4063
|
abort=True,
|
4034
4064
|
show_default=True)
|
4035
4065
|
|
4036
|
-
sdk.stream_and_get(
|
4066
|
+
sdk.stream_and_get(
|
4067
|
+
managed_jobs.cancel(job_ids=job_ids,
|
4068
|
+
name=name,
|
4069
|
+
all=all,
|
4070
|
+
all_users=all_users))
|
4037
4071
|
|
4038
4072
|
|
4039
4073
|
@jobs.command('logs', cls=_DocumentedCodeCommand)
|
sky/client/cli.py
CHANGED
@@ -1379,12 +1379,14 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
|
|
1379
1379
|
def _handle_jobs_queue_request(
|
1380
1380
|
request_id: str,
|
1381
1381
|
show_all: bool,
|
1382
|
+
show_user: bool,
|
1382
1383
|
limit_num_jobs_to_show: bool = False,
|
1383
1384
|
is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
|
1384
1385
|
"""Get the in-progress managed jobs.
|
1385
1386
|
|
1386
1387
|
Args:
|
1387
1388
|
show_all: Show all information of each job (e.g., region, price).
|
1389
|
+
show_user: Show the user who submitted the job.
|
1388
1390
|
limit_num_jobs_to_show: If True, limit the number of jobs to show to
|
1389
1391
|
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
|
1390
1392
|
`sky status`.
|
@@ -1452,6 +1454,7 @@ def _handle_jobs_queue_request(
|
|
1452
1454
|
if limit_num_jobs_to_show else None)
|
1453
1455
|
msg = managed_jobs.format_job_table(managed_jobs_,
|
1454
1456
|
show_all=show_all,
|
1457
|
+
show_user=show_user,
|
1455
1458
|
max_jobs=max_jobs_to_show)
|
1456
1459
|
return num_in_progress_jobs, msg
|
1457
1460
|
|
@@ -1561,7 +1564,9 @@ def _status_kubernetes(show_all: bool):
|
|
1561
1564
|
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1562
1565
|
f'Managed jobs'
|
1563
1566
|
f'{colorama.Style.RESET_ALL}')
|
1564
|
-
msg = managed_jobs.format_job_table(all_jobs,
|
1567
|
+
msg = managed_jobs.format_job_table(all_jobs,
|
1568
|
+
show_all=show_all,
|
1569
|
+
show_user=False)
|
1565
1570
|
click.echo(msg)
|
1566
1571
|
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
1567
1572
|
# TODO: Parse serve controllers and show services separately.
|
@@ -1779,7 +1784,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1779
1784
|
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
|
1780
1785
|
if show_managed_jobs:
|
1781
1786
|
managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
|
1782
|
-
skip_finished=True
|
1787
|
+
skip_finished=True,
|
1788
|
+
all_users=all_users)
|
1783
1789
|
show_endpoints = endpoints or endpoint is not None
|
1784
1790
|
show_single_endpoint = endpoint is not None
|
1785
1791
|
show_services = show_services and not any([clusters, ip, endpoints])
|
@@ -1859,6 +1865,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1859
1865
|
num_in_progress_jobs, msg = _handle_jobs_queue_request(
|
1860
1866
|
managed_jobs_queue_request_id,
|
1861
1867
|
show_all=False,
|
1868
|
+
show_user=False,
|
1862
1869
|
limit_num_jobs_to_show=not all,
|
1863
1870
|
is_called_by_user=False)
|
1864
1871
|
except KeyboardInterrupt:
|
@@ -2751,7 +2758,7 @@ def start(
|
|
2751
2758
|
def down(
|
2752
2759
|
clusters: List[str],
|
2753
2760
|
all: bool, # pylint: disable=redefined-builtin
|
2754
|
-
all_users: bool,
|
2761
|
+
all_users: bool,
|
2755
2762
|
yes: bool,
|
2756
2763
|
purge: bool,
|
2757
2764
|
async_call: bool,
|
@@ -2812,7 +2819,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2812
2819
|
with rich_utils.client_status(
|
2813
2820
|
'[bold cyan]Checking for in-progress managed jobs[/]'):
|
2814
2821
|
try:
|
2815
|
-
request_id = managed_jobs.queue(refresh=False,
|
2822
|
+
request_id = managed_jobs.queue(refresh=False,
|
2823
|
+
skip_finished=True,
|
2824
|
+
all_users=True)
|
2816
2825
|
managed_jobs_ = sdk.stream_and_get(request_id)
|
2817
2826
|
except exceptions.ClusterNotUpError as e:
|
2818
2827
|
if controller.value.connection_error_hint in str(e):
|
@@ -2836,7 +2845,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
|
2836
2845
|
'jobs (output of `sky jobs queue`) will be lost.')
|
2837
2846
|
click.echo(msg)
|
2838
2847
|
if managed_jobs_:
|
2839
|
-
job_table = managed_jobs.format_job_table(managed_jobs_,
|
2848
|
+
job_table = managed_jobs.format_job_table(managed_jobs_,
|
2849
|
+
show_all=False,
|
2850
|
+
show_user=True)
|
2840
2851
|
msg = controller.value.decline_down_for_dirty_controller_hint
|
2841
2852
|
# Add prefix to each line to align with the bullet point.
|
2842
2853
|
msg += '\n'.join(
|
@@ -3905,9 +3916,16 @@ def jobs_launch(
|
|
3905
3916
|
is_flag=True,
|
3906
3917
|
required=False,
|
3907
3918
|
help='Show only pending/running jobs\' information.')
|
3919
|
+
@click.option('--all-users',
|
3920
|
+
'-u',
|
3921
|
+
default=False,
|
3922
|
+
is_flag=True,
|
3923
|
+
required=False,
|
3924
|
+
help='Show jobs from all users.')
|
3908
3925
|
@usage_lib.entrypoint
|
3909
3926
|
# pylint: disable=redefined-builtin
|
3910
|
-
def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool
|
3927
|
+
def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
|
3928
|
+
all_users: bool):
|
3911
3929
|
"""Show statuses of managed jobs.
|
3912
3930
|
|
3913
3931
|
Each managed jobs can have one of the following statuses:
|
@@ -3964,9 +3982,10 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
|
|
3964
3982
|
click.secho('Fetching managed job statuses...', fg='cyan')
|
3965
3983
|
with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
|
3966
3984
|
managed_jobs_request_id = managed_jobs.queue(
|
3967
|
-
refresh=refresh, skip_finished=skip_finished)
|
3985
|
+
refresh=refresh, skip_finished=skip_finished, all_users=all_users)
|
3968
3986
|
_, msg = _handle_jobs_queue_request(managed_jobs_request_id,
|
3969
3987
|
show_all=verbose,
|
3988
|
+
show_user=all_users,
|
3970
3989
|
is_called_by_user=True)
|
3971
3990
|
if not skip_finished:
|
3972
3991
|
in_progress_only_hint = ''
|
@@ -3989,16 +4008,23 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
|
|
3989
4008
|
is_flag=True,
|
3990
4009
|
default=False,
|
3991
4010
|
required=False,
|
3992
|
-
help='Cancel all managed jobs.')
|
4011
|
+
help='Cancel all managed jobs for the current user.')
|
3993
4012
|
@click.option('--yes',
|
3994
4013
|
'-y',
|
3995
4014
|
is_flag=True,
|
3996
4015
|
default=False,
|
3997
4016
|
required=False,
|
3998
4017
|
help='Skip confirmation prompt.')
|
4018
|
+
@click.option('--all-users',
|
4019
|
+
'-u',
|
4020
|
+
is_flag=True,
|
4021
|
+
default=False,
|
4022
|
+
required=False,
|
4023
|
+
help='Cancel all managed jobs from all users.')
|
3999
4024
|
@usage_lib.entrypoint
|
4000
4025
|
# pylint: disable=redefined-builtin
|
4001
|
-
def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool
|
4026
|
+
def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
4027
|
+
all_users: bool):
|
4002
4028
|
"""Cancel managed jobs.
|
4003
4029
|
|
4004
4030
|
You can provide either a job name or a list of job IDs to be cancelled.
|
@@ -4015,25 +4041,33 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
4015
4041
|
$ sky jobs cancel 1 2 3
|
4016
4042
|
"""
|
4017
4043
|
job_id_str = ','.join(map(str, job_ids))
|
4018
|
-
if sum([bool(job_ids), name is not None, all]) != 1:
|
4019
|
-
|
4020
|
-
|
4021
|
-
|
4044
|
+
if sum([bool(job_ids), name is not None, all or all_users]) != 1:
|
4045
|
+
arguments = []
|
4046
|
+
arguments += [f'--job-ids {job_id_str}'] if job_ids else []
|
4047
|
+
arguments += [f'--name {name}'] if name is not None else []
|
4048
|
+
arguments += ['--all'] if all else []
|
4049
|
+
arguments += ['--all-users'] if all_users else []
|
4022
4050
|
raise click.UsageError(
|
4023
|
-
'Can only specify one of JOB_IDS
|
4024
|
-
f'Provided {
|
4051
|
+
'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
|
4052
|
+
f'Provided {" ".join(arguments)!r}.')
|
4025
4053
|
|
4026
4054
|
if not yes:
|
4027
4055
|
job_identity_str = (f'managed jobs with IDs {job_id_str}'
|
4028
4056
|
if job_ids else repr(name))
|
4029
|
-
if
|
4057
|
+
if all_users:
|
4058
|
+
job_identity_str = 'all managed jobs FOR ALL USERS'
|
4059
|
+
elif all:
|
4030
4060
|
job_identity_str = 'all managed jobs'
|
4031
4061
|
click.confirm(f'Cancelling {job_identity_str}. Proceed?',
|
4032
4062
|
default=True,
|
4033
4063
|
abort=True,
|
4034
4064
|
show_default=True)
|
4035
4065
|
|
4036
|
-
sdk.stream_and_get(
|
4066
|
+
sdk.stream_and_get(
|
4067
|
+
managed_jobs.cancel(job_ids=job_ids,
|
4068
|
+
name=name,
|
4069
|
+
all=all,
|
4070
|
+
all_users=all_users))
|
4037
4071
|
|
4038
4072
|
|
4039
4073
|
@jobs.command('logs', cls=_DocumentedCodeCommand)
|
sky/jobs/client/sdk.py
CHANGED
@@ -85,7 +85,8 @@ def launch(
|
|
85
85
|
@usage_lib.entrypoint
|
86
86
|
@server_common.check_server_healthy_or_start
|
87
87
|
def queue(refresh: bool,
|
88
|
-
skip_finished: bool = False
|
88
|
+
skip_finished: bool = False,
|
89
|
+
all_users: bool = False) -> server_common.RequestId:
|
89
90
|
"""Gets statuses of managed jobs.
|
90
91
|
|
91
92
|
Please refer to sky.cli.job_queue for documentation.
|
@@ -93,6 +94,7 @@ def queue(refresh: bool,
|
|
93
94
|
Args:
|
94
95
|
refresh: Whether to restart the jobs controller if it is stopped.
|
95
96
|
skip_finished: Whether to skip finished jobs.
|
97
|
+
all_users: Whether to show all users' jobs.
|
96
98
|
|
97
99
|
Returns:
|
98
100
|
The request ID of the queue request.
|
@@ -126,6 +128,7 @@ def queue(refresh: bool,
|
|
126
128
|
body = payloads.JobsQueueBody(
|
127
129
|
refresh=refresh,
|
128
130
|
skip_finished=skip_finished,
|
131
|
+
all_users=all_users,
|
129
132
|
)
|
130
133
|
response = requests.post(
|
131
134
|
f'{server_common.get_server_url()}/jobs/queue',
|
@@ -138,9 +141,10 @@ def queue(refresh: bool,
|
|
138
141
|
@usage_lib.entrypoint
|
139
142
|
@server_common.check_server_healthy_or_start
|
140
143
|
def cancel(
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
+
name: Optional[str] = None,
|
145
|
+
job_ids: Optional[List[int]] = None,
|
146
|
+
all: bool = False, # pylint: disable=redefined-builtin
|
147
|
+
all_users: bool = False,
|
144
148
|
) -> server_common.RequestId:
|
145
149
|
"""Cancels managed jobs.
|
146
150
|
|
@@ -150,6 +154,7 @@ def cancel(
|
|
150
154
|
name: Name of the managed job to cancel.
|
151
155
|
job_ids: IDs of the managed jobs to cancel.
|
152
156
|
all: Whether to cancel all managed jobs.
|
157
|
+
all_users: Whether to cancel all managed jobs from all users.
|
153
158
|
|
154
159
|
Returns:
|
155
160
|
The request ID of the cancel request.
|
@@ -162,6 +167,7 @@ def cancel(
|
|
162
167
|
name=name,
|
163
168
|
job_ids=job_ids,
|
164
169
|
all=all,
|
170
|
+
all_users=all_users,
|
165
171
|
)
|
166
172
|
response = requests.post(
|
167
173
|
f'{server_common.get_server_url()}/jobs/cancel',
|
sky/jobs/constants.py
CHANGED
@@ -40,7 +40,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
|
|
40
40
|
# The version of the lib files that jobs/utils use. Whenever there is an API
|
41
41
|
# change for the jobs/utils, we need to bump this version and update
|
42
42
|
# job.utils.ManagedJobCodeGen to handle the version update.
|
43
|
-
MANAGED_JOBS_VERSION =
|
43
|
+
MANAGED_JOBS_VERSION = 2
|
44
44
|
|
45
45
|
# The command for setting up the jobs dashboard on the controller. It firstly
|
46
46
|
# checks if the systemd services are available, and if not (e.g., Kubernetes
|
sky/jobs/dashboard/dashboard.py
CHANGED
@@ -16,6 +16,7 @@ import flask
|
|
16
16
|
import yaml
|
17
17
|
|
18
18
|
from sky import jobs as managed_jobs
|
19
|
+
from sky.client import sdk
|
19
20
|
from sky.jobs import constants as managed_job_constants
|
20
21
|
from sky.utils import common_utils
|
21
22
|
from sky.utils import controller_utils
|
@@ -134,7 +135,8 @@ def _extract_launch_history(log_content: str) -> str:
|
|
134
135
|
def home():
|
135
136
|
if not _is_running_on_jobs_controller():
|
136
137
|
# Experimental: run on laptop (refresh is very slow).
|
137
|
-
|
138
|
+
request_id = managed_jobs.queue(refresh=True, skip_finished=False)
|
139
|
+
all_managed_jobs = sdk.get(request_id)
|
138
140
|
else:
|
139
141
|
job_table = managed_jobs.dump_managed_job_queue()
|
140
142
|
all_managed_jobs = managed_jobs.load_managed_job_queue(job_table)
|
@@ -142,6 +144,7 @@ def home():
|
|
142
144
|
timestamp = datetime.datetime.now(datetime.timezone.utc)
|
143
145
|
rows = managed_jobs.format_job_table(all_managed_jobs,
|
144
146
|
show_all=True,
|
147
|
+
show_user=False,
|
145
148
|
return_rows=True)
|
146
149
|
|
147
150
|
status_counts = collections.defaultdict(int)
|
sky/jobs/scheduler.py
CHANGED
@@ -49,6 +49,7 @@ from sky import sky_logging
|
|
49
49
|
from sky.jobs import constants as managed_job_constants
|
50
50
|
from sky.jobs import state
|
51
51
|
from sky.skylet import constants
|
52
|
+
from sky.utils import common_utils
|
52
53
|
from sky.utils import subprocess_utils
|
53
54
|
|
54
55
|
logger = sky_logging.init_logger('sky.jobs.controller')
|
@@ -151,12 +152,20 @@ def maybe_schedule_next_jobs() -> None:
|
|
151
152
|
job_id = maybe_next_job['job_id']
|
152
153
|
dag_yaml_path = maybe_next_job['dag_yaml_path']
|
153
154
|
|
155
|
+
activate_python_env_cmd = (
|
156
|
+
f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
|
157
|
+
env_file = maybe_next_job['env_file_path']
|
158
|
+
source_environment_cmd = (f'source {env_file};'
|
159
|
+
if env_file else '')
|
160
|
+
run_controller_cmd = ('python -u -m sky.jobs.controller '
|
161
|
+
f'{dag_yaml_path} --job-id {job_id};')
|
162
|
+
|
154
163
|
# If the command line here is changed, please also update
|
155
164
|
# utils._controller_process_alive. `--job-id X` should be at
|
156
165
|
# the end.
|
157
|
-
run_cmd = (f'{
|
158
|
-
'
|
159
|
-
f'{
|
166
|
+
run_cmd = (f'{activate_python_env_cmd}'
|
167
|
+
f'{source_environment_cmd}'
|
168
|
+
f'{run_controller_cmd}')
|
160
169
|
|
161
170
|
logs_dir = os.path.expanduser(
|
162
171
|
managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
|
@@ -175,16 +184,19 @@ def maybe_schedule_next_jobs() -> None:
|
|
175
184
|
pass
|
176
185
|
|
177
186
|
|
178
|
-
def submit_job(job_id: int, dag_yaml_path: str) -> None:
|
187
|
+
def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
|
179
188
|
"""Submit an existing job to the scheduler.
|
180
189
|
|
181
190
|
This should be called after a job is created in the `spot` table as
|
182
191
|
PENDING. It will tell the scheduler to try and start the job controller, if
|
183
192
|
there are resources available. It may block to acquire the lock, so it
|
184
193
|
should not be on the critical path for `sky jobs launch -d`.
|
194
|
+
|
195
|
+
The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
|
185
196
|
"""
|
186
197
|
with filelock.FileLock(_get_lock_path()):
|
187
|
-
state.scheduler_set_waiting(job_id, dag_yaml_path
|
198
|
+
state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
|
199
|
+
common_utils.get_user_hash())
|
188
200
|
maybe_schedule_next_jobs()
|
189
201
|
|
190
202
|
|
@@ -281,12 +293,15 @@ def _can_lauch_in_alive_job() -> bool:
|
|
281
293
|
|
282
294
|
if __name__ == '__main__':
|
283
295
|
parser = ArgumentParser()
|
296
|
+
parser.add_argument('dag_yaml',
|
297
|
+
type=str,
|
298
|
+
help='The path to the user job yaml file.')
|
284
299
|
parser.add_argument('--job-id',
|
285
300
|
required=True,
|
286
301
|
type=int,
|
287
302
|
help='Job id for the controller job.')
|
288
|
-
parser.add_argument('
|
303
|
+
parser.add_argument('--env-file',
|
289
304
|
type=str,
|
290
|
-
help='The path to the
|
305
|
+
help='The path to the controller env file.')
|
291
306
|
args = parser.parse_args()
|
292
|
-
submit_job(args.job_id, args.dag_yaml)
|
307
|
+
submit_job(args.job_id, args.dag_yaml, args.env_file)
|