skypilot-nightly 1.0.0.dev20250225__py3-none-any.whl → 1.0.0.dev20250227__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '546c0864e0f1e32e3d1080d9b7a5fdf293bc1ad9'
8
+ _SKYPILOT_COMMIT_SHA = '2c4849b6f73499740f495f84a29ac4af98d25073'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250225'
38
+ __version__ = '1.0.0.dev20250227'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -6,7 +6,7 @@ from typing import Any, Callable, Optional, Set
6
6
  from sky.adaptors import common
7
7
  from sky.sky_logging import set_logging_level
8
8
  from sky.utils import annotations
9
- from sky.utils import env_options
9
+ from sky.utils import common_utils
10
10
  from sky.utils import ux_utils
11
11
 
12
12
  _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Kubernetes. '
@@ -69,22 +69,20 @@ def _load_config(context: Optional[str] = None):
69
69
  try:
70
70
  kubernetes.config.load_kube_config(context=context)
71
71
  except kubernetes.config.config_exception.ConfigException as e:
72
- suffix = ''
73
- if env_options.Options.SHOW_DEBUG_INFO.get():
74
- suffix += f' Error: {str(e)}'
72
+ suffix = common_utils.format_exception(e, use_bracket=True)
75
73
  # Check if exception was due to no current-context
76
74
  if 'Expected key current-context' in str(e):
77
75
  err_str = (
78
76
  f'Failed to load Kubernetes configuration for {context!r}. '
79
77
  'Kubeconfig does not contain any valid context(s).'
80
- f'{suffix}\n'
78
+ f'\n{suffix}\n'
81
79
  ' If you were running a local Kubernetes '
82
80
  'cluster, run `sky local up` to start the cluster.')
83
81
  else:
84
82
  err_str = (
85
83
  f'Failed to load Kubernetes configuration for {context!r}. '
86
84
  'Please check if your kubeconfig file exists at '
87
- f'~/.kube/config and is valid.{suffix}')
85
+ f'~/.kube/config and is valid.\n{suffix}')
88
86
  err_str += '\nTo disable Kubernetes for SkyPilot: run `sky check`.'
89
87
  with ux_utils.print_exception_no_traceback():
90
88
  raise ValueError(err_str) from None
sky/cli.py CHANGED
@@ -1379,12 +1379,14 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
1379
1379
  def _handle_jobs_queue_request(
1380
1380
  request_id: str,
1381
1381
  show_all: bool,
1382
+ show_user: bool,
1382
1383
  limit_num_jobs_to_show: bool = False,
1383
1384
  is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1384
1385
  """Get the in-progress managed jobs.
1385
1386
 
1386
1387
  Args:
1387
1388
  show_all: Show all information of each job (e.g., region, price).
1389
+ show_user: Show the user who submitted the job.
1388
1390
  limit_num_jobs_to_show: If True, limit the number of jobs to show to
1389
1391
  _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
1390
1392
  `sky status`.
@@ -1452,6 +1454,7 @@ def _handle_jobs_queue_request(
1452
1454
  if limit_num_jobs_to_show else None)
1453
1455
  msg = managed_jobs.format_job_table(managed_jobs_,
1454
1456
  show_all=show_all,
1457
+ show_user=show_user,
1455
1458
  max_jobs=max_jobs_to_show)
1456
1459
  return num_in_progress_jobs, msg
1457
1460
 
@@ -1561,7 +1564,9 @@ def _status_kubernetes(show_all: bool):
1561
1564
  click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1562
1565
  f'Managed jobs'
1563
1566
  f'{colorama.Style.RESET_ALL}')
1564
- msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
1567
+ msg = managed_jobs.format_job_table(all_jobs,
1568
+ show_all=show_all,
1569
+ show_user=False)
1565
1570
  click.echo(msg)
1566
1571
  if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
1567
1572
  # TODO: Parse serve controllers and show services separately.
@@ -1779,7 +1784,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1779
1784
  show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
1780
1785
  if show_managed_jobs:
1781
1786
  managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
1782
- skip_finished=True)
1787
+ skip_finished=True,
1788
+ all_users=all_users)
1783
1789
  show_endpoints = endpoints or endpoint is not None
1784
1790
  show_single_endpoint = endpoint is not None
1785
1791
  show_services = show_services and not any([clusters, ip, endpoints])
@@ -1859,6 +1865,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1859
1865
  num_in_progress_jobs, msg = _handle_jobs_queue_request(
1860
1866
  managed_jobs_queue_request_id,
1861
1867
  show_all=False,
1868
+ show_user=False,
1862
1869
  limit_num_jobs_to_show=not all,
1863
1870
  is_called_by_user=False)
1864
1871
  except KeyboardInterrupt:
@@ -2751,7 +2758,7 @@ def start(
2751
2758
  def down(
2752
2759
  clusters: List[str],
2753
2760
  all: bool, # pylint: disable=redefined-builtin
2754
- all_users: bool, # pylint: disable=redefined-builtin
2761
+ all_users: bool,
2755
2762
  yes: bool,
2756
2763
  purge: bool,
2757
2764
  async_call: bool,
@@ -2812,7 +2819,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2812
2819
  with rich_utils.client_status(
2813
2820
  '[bold cyan]Checking for in-progress managed jobs[/]'):
2814
2821
  try:
2815
- request_id = managed_jobs.queue(refresh=False, skip_finished=True)
2822
+ request_id = managed_jobs.queue(refresh=False,
2823
+ skip_finished=True,
2824
+ all_users=True)
2816
2825
  managed_jobs_ = sdk.stream_and_get(request_id)
2817
2826
  except exceptions.ClusterNotUpError as e:
2818
2827
  if controller.value.connection_error_hint in str(e):
@@ -2836,7 +2845,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2836
2845
  'jobs (output of `sky jobs queue`) will be lost.')
2837
2846
  click.echo(msg)
2838
2847
  if managed_jobs_:
2839
- job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False)
2848
+ job_table = managed_jobs.format_job_table(managed_jobs_,
2849
+ show_all=False,
2850
+ show_user=True)
2840
2851
  msg = controller.value.decline_down_for_dirty_controller_hint
2841
2852
  # Add prefix to each line to align with the bullet point.
2842
2853
  msg += '\n'.join(
@@ -3905,9 +3916,16 @@ def jobs_launch(
3905
3916
  is_flag=True,
3906
3917
  required=False,
3907
3918
  help='Show only pending/running jobs\' information.')
3919
+ @click.option('--all-users',
3920
+ '-u',
3921
+ default=False,
3922
+ is_flag=True,
3923
+ required=False,
3924
+ help='Show jobs from all users.')
3908
3925
  @usage_lib.entrypoint
3909
3926
  # pylint: disable=redefined-builtin
3910
- def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3927
+ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
3928
+ all_users: bool):
3911
3929
  """Show statuses of managed jobs.
3912
3930
 
3913
3931
  Each managed jobs can have one of the following statuses:
@@ -3964,9 +3982,10 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3964
3982
  click.secho('Fetching managed job statuses...', fg='cyan')
3965
3983
  with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
3966
3984
  managed_jobs_request_id = managed_jobs.queue(
3967
- refresh=refresh, skip_finished=skip_finished)
3985
+ refresh=refresh, skip_finished=skip_finished, all_users=all_users)
3968
3986
  _, msg = _handle_jobs_queue_request(managed_jobs_request_id,
3969
3987
  show_all=verbose,
3988
+ show_user=all_users,
3970
3989
  is_called_by_user=True)
3971
3990
  if not skip_finished:
3972
3991
  in_progress_only_hint = ''
@@ -3989,16 +4008,23 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3989
4008
  is_flag=True,
3990
4009
  default=False,
3991
4010
  required=False,
3992
- help='Cancel all managed jobs.')
4011
+ help='Cancel all managed jobs for the current user.')
3993
4012
  @click.option('--yes',
3994
4013
  '-y',
3995
4014
  is_flag=True,
3996
4015
  default=False,
3997
4016
  required=False,
3998
4017
  help='Skip confirmation prompt.')
4018
+ @click.option('--all-users',
4019
+ '-u',
4020
+ is_flag=True,
4021
+ default=False,
4022
+ required=False,
4023
+ help='Cancel all managed jobs from all users.')
3999
4024
  @usage_lib.entrypoint
4000
4025
  # pylint: disable=redefined-builtin
4001
- def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
4026
+ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4027
+ all_users: bool):
4002
4028
  """Cancel managed jobs.
4003
4029
 
4004
4030
  You can provide either a job name or a list of job IDs to be cancelled.
@@ -4015,25 +4041,33 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
4015
4041
  $ sky jobs cancel 1 2 3
4016
4042
  """
4017
4043
  job_id_str = ','.join(map(str, job_ids))
4018
- if sum([bool(job_ids), name is not None, all]) != 1:
4019
- argument_str = f'--job-ids {job_id_str}' if job_ids else ''
4020
- argument_str += f' --name {name}' if name is not None else ''
4021
- argument_str += ' --all' if all else ''
4044
+ if sum([bool(job_ids), name is not None, all or all_users]) != 1:
4045
+ arguments = []
4046
+ arguments += [f'--job-ids {job_id_str}'] if job_ids else []
4047
+ arguments += [f'--name {name}'] if name is not None else []
4048
+ arguments += ['--all'] if all else []
4049
+ arguments += ['--all-users'] if all_users else []
4022
4050
  raise click.UsageError(
4023
- 'Can only specify one of JOB_IDS or --name or --all. '
4024
- f'Provided {argument_str!r}.')
4051
+ 'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
4052
+ f'Provided {" ".join(arguments)!r}.')
4025
4053
 
4026
4054
  if not yes:
4027
4055
  job_identity_str = (f'managed jobs with IDs {job_id_str}'
4028
4056
  if job_ids else repr(name))
4029
- if all:
4057
+ if all_users:
4058
+ job_identity_str = 'all managed jobs FOR ALL USERS'
4059
+ elif all:
4030
4060
  job_identity_str = 'all managed jobs'
4031
4061
  click.confirm(f'Cancelling {job_identity_str}. Proceed?',
4032
4062
  default=True,
4033
4063
  abort=True,
4034
4064
  show_default=True)
4035
4065
 
4036
- sdk.stream_and_get(managed_jobs.cancel(job_ids=job_ids, name=name, all=all))
4066
+ sdk.stream_and_get(
4067
+ managed_jobs.cancel(job_ids=job_ids,
4068
+ name=name,
4069
+ all=all,
4070
+ all_users=all_users))
4037
4071
 
4038
4072
 
4039
4073
  @jobs.command('logs', cls=_DocumentedCodeCommand)
sky/client/cli.py CHANGED
@@ -1379,12 +1379,14 @@ def exec(cluster: Optional[str], cluster_option: Optional[str],
1379
1379
  def _handle_jobs_queue_request(
1380
1380
  request_id: str,
1381
1381
  show_all: bool,
1382
+ show_user: bool,
1382
1383
  limit_num_jobs_to_show: bool = False,
1383
1384
  is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1384
1385
  """Get the in-progress managed jobs.
1385
1386
 
1386
1387
  Args:
1387
1388
  show_all: Show all information of each job (e.g., region, price).
1389
+ show_user: Show the user who submitted the job.
1388
1390
  limit_num_jobs_to_show: If True, limit the number of jobs to show to
1389
1391
  _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
1390
1392
  `sky status`.
@@ -1452,6 +1454,7 @@ def _handle_jobs_queue_request(
1452
1454
  if limit_num_jobs_to_show else None)
1453
1455
  msg = managed_jobs.format_job_table(managed_jobs_,
1454
1456
  show_all=show_all,
1457
+ show_user=show_user,
1455
1458
  max_jobs=max_jobs_to_show)
1456
1459
  return num_in_progress_jobs, msg
1457
1460
 
@@ -1561,7 +1564,9 @@ def _status_kubernetes(show_all: bool):
1561
1564
  click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1562
1565
  f'Managed jobs'
1563
1566
  f'{colorama.Style.RESET_ALL}')
1564
- msg = managed_jobs.format_job_table(all_jobs, show_all=show_all)
1567
+ msg = managed_jobs.format_job_table(all_jobs,
1568
+ show_all=show_all,
1569
+ show_user=False)
1565
1570
  click.echo(msg)
1566
1571
  if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
1567
1572
  # TODO: Parse serve controllers and show services separately.
@@ -1779,7 +1784,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1779
1784
  show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
1780
1785
  if show_managed_jobs:
1781
1786
  managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
1782
- skip_finished=True)
1787
+ skip_finished=True,
1788
+ all_users=all_users)
1783
1789
  show_endpoints = endpoints or endpoint is not None
1784
1790
  show_single_endpoint = endpoint is not None
1785
1791
  show_services = show_services and not any([clusters, ip, endpoints])
@@ -1859,6 +1865,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1859
1865
  num_in_progress_jobs, msg = _handle_jobs_queue_request(
1860
1866
  managed_jobs_queue_request_id,
1861
1867
  show_all=False,
1868
+ show_user=False,
1862
1869
  limit_num_jobs_to_show=not all,
1863
1870
  is_called_by_user=False)
1864
1871
  except KeyboardInterrupt:
@@ -2751,7 +2758,7 @@ def start(
2751
2758
  def down(
2752
2759
  clusters: List[str],
2753
2760
  all: bool, # pylint: disable=redefined-builtin
2754
- all_users: bool, # pylint: disable=redefined-builtin
2761
+ all_users: bool,
2755
2762
  yes: bool,
2756
2763
  purge: bool,
2757
2764
  async_call: bool,
@@ -2812,7 +2819,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2812
2819
  with rich_utils.client_status(
2813
2820
  '[bold cyan]Checking for in-progress managed jobs[/]'):
2814
2821
  try:
2815
- request_id = managed_jobs.queue(refresh=False, skip_finished=True)
2822
+ request_id = managed_jobs.queue(refresh=False,
2823
+ skip_finished=True,
2824
+ all_users=True)
2816
2825
  managed_jobs_ = sdk.stream_and_get(request_id)
2817
2826
  except exceptions.ClusterNotUpError as e:
2818
2827
  if controller.value.connection_error_hint in str(e):
@@ -2836,7 +2845,9 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2836
2845
  'jobs (output of `sky jobs queue`) will be lost.')
2837
2846
  click.echo(msg)
2838
2847
  if managed_jobs_:
2839
- job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False)
2848
+ job_table = managed_jobs.format_job_table(managed_jobs_,
2849
+ show_all=False,
2850
+ show_user=True)
2840
2851
  msg = controller.value.decline_down_for_dirty_controller_hint
2841
2852
  # Add prefix to each line to align with the bullet point.
2842
2853
  msg += '\n'.join(
@@ -3905,9 +3916,16 @@ def jobs_launch(
3905
3916
  is_flag=True,
3906
3917
  required=False,
3907
3918
  help='Show only pending/running jobs\' information.')
3919
+ @click.option('--all-users',
3920
+ '-u',
3921
+ default=False,
3922
+ is_flag=True,
3923
+ required=False,
3924
+ help='Show jobs from all users.')
3908
3925
  @usage_lib.entrypoint
3909
3926
  # pylint: disable=redefined-builtin
3910
- def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3927
+ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
3928
+ all_users: bool):
3911
3929
  """Show statuses of managed jobs.
3912
3930
 
3913
3931
  Each managed jobs can have one of the following statuses:
@@ -3964,9 +3982,10 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3964
3982
  click.secho('Fetching managed job statuses...', fg='cyan')
3965
3983
  with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
3966
3984
  managed_jobs_request_id = managed_jobs.queue(
3967
- refresh=refresh, skip_finished=skip_finished)
3985
+ refresh=refresh, skip_finished=skip_finished, all_users=all_users)
3968
3986
  _, msg = _handle_jobs_queue_request(managed_jobs_request_id,
3969
3987
  show_all=verbose,
3988
+ show_user=all_users,
3970
3989
  is_called_by_user=True)
3971
3990
  if not skip_finished:
3972
3991
  in_progress_only_hint = ''
@@ -3989,16 +4008,23 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3989
4008
  is_flag=True,
3990
4009
  default=False,
3991
4010
  required=False,
3992
- help='Cancel all managed jobs.')
4011
+ help='Cancel all managed jobs for the current user.')
3993
4012
  @click.option('--yes',
3994
4013
  '-y',
3995
4014
  is_flag=True,
3996
4015
  default=False,
3997
4016
  required=False,
3998
4017
  help='Skip confirmation prompt.')
4018
+ @click.option('--all-users',
4019
+ '-u',
4020
+ is_flag=True,
4021
+ default=False,
4022
+ required=False,
4023
+ help='Cancel all managed jobs from all users.')
3999
4024
  @usage_lib.entrypoint
4000
4025
  # pylint: disable=redefined-builtin
4001
- def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
4026
+ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4027
+ all_users: bool):
4002
4028
  """Cancel managed jobs.
4003
4029
 
4004
4030
  You can provide either a job name or a list of job IDs to be cancelled.
@@ -4015,25 +4041,33 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
4015
4041
  $ sky jobs cancel 1 2 3
4016
4042
  """
4017
4043
  job_id_str = ','.join(map(str, job_ids))
4018
- if sum([bool(job_ids), name is not None, all]) != 1:
4019
- argument_str = f'--job-ids {job_id_str}' if job_ids else ''
4020
- argument_str += f' --name {name}' if name is not None else ''
4021
- argument_str += ' --all' if all else ''
4044
+ if sum([bool(job_ids), name is not None, all or all_users]) != 1:
4045
+ arguments = []
4046
+ arguments += [f'--job-ids {job_id_str}'] if job_ids else []
4047
+ arguments += [f'--name {name}'] if name is not None else []
4048
+ arguments += ['--all'] if all else []
4049
+ arguments += ['--all-users'] if all_users else []
4022
4050
  raise click.UsageError(
4023
- 'Can only specify one of JOB_IDS or --name or --all. '
4024
- f'Provided {argument_str!r}.')
4051
+ 'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
4052
+ f'Provided {" ".join(arguments)!r}.')
4025
4053
 
4026
4054
  if not yes:
4027
4055
  job_identity_str = (f'managed jobs with IDs {job_id_str}'
4028
4056
  if job_ids else repr(name))
4029
- if all:
4057
+ if all_users:
4058
+ job_identity_str = 'all managed jobs FOR ALL USERS'
4059
+ elif all:
4030
4060
  job_identity_str = 'all managed jobs'
4031
4061
  click.confirm(f'Cancelling {job_identity_str}. Proceed?',
4032
4062
  default=True,
4033
4063
  abort=True,
4034
4064
  show_default=True)
4035
4065
 
4036
- sdk.stream_and_get(managed_jobs.cancel(job_ids=job_ids, name=name, all=all))
4066
+ sdk.stream_and_get(
4067
+ managed_jobs.cancel(job_ids=job_ids,
4068
+ name=name,
4069
+ all=all,
4070
+ all_users=all_users))
4037
4071
 
4038
4072
 
4039
4073
  @jobs.command('logs', cls=_DocumentedCodeCommand)
sky/jobs/client/sdk.py CHANGED
@@ -85,7 +85,8 @@ def launch(
85
85
  @usage_lib.entrypoint
86
86
  @server_common.check_server_healthy_or_start
87
87
  def queue(refresh: bool,
88
- skip_finished: bool = False) -> server_common.RequestId:
88
+ skip_finished: bool = False,
89
+ all_users: bool = False) -> server_common.RequestId:
89
90
  """Gets statuses of managed jobs.
90
91
 
91
92
  Please refer to sky.cli.job_queue for documentation.
@@ -93,6 +94,7 @@ def queue(refresh: bool,
93
94
  Args:
94
95
  refresh: Whether to restart the jobs controller if it is stopped.
95
96
  skip_finished: Whether to skip finished jobs.
97
+ all_users: Whether to show all users' jobs.
96
98
 
97
99
  Returns:
98
100
  The request ID of the queue request.
@@ -126,6 +128,7 @@ def queue(refresh: bool,
126
128
  body = payloads.JobsQueueBody(
127
129
  refresh=refresh,
128
130
  skip_finished=skip_finished,
131
+ all_users=all_users,
129
132
  )
130
133
  response = requests.post(
131
134
  f'{server_common.get_server_url()}/jobs/queue',
@@ -138,9 +141,10 @@ def queue(refresh: bool,
138
141
  @usage_lib.entrypoint
139
142
  @server_common.check_server_healthy_or_start
140
143
  def cancel(
141
- name: Optional[str] = None,
142
- job_ids: Optional[List[int]] = None,
143
- all: bool = False, # pylint: disable=redefined-builtin
144
+ name: Optional[str] = None,
145
+ job_ids: Optional[List[int]] = None,
146
+ all: bool = False, # pylint: disable=redefined-builtin
147
+ all_users: bool = False,
144
148
  ) -> server_common.RequestId:
145
149
  """Cancels managed jobs.
146
150
 
@@ -150,6 +154,7 @@ def cancel(
150
154
  name: Name of the managed job to cancel.
151
155
  job_ids: IDs of the managed jobs to cancel.
152
156
  all: Whether to cancel all managed jobs.
157
+ all_users: Whether to cancel all managed jobs from all users.
153
158
 
154
159
  Returns:
155
160
  The request ID of the cancel request.
@@ -162,6 +167,7 @@ def cancel(
162
167
  name=name,
163
168
  job_ids=job_ids,
164
169
  all=all,
170
+ all_users=all_users,
165
171
  )
166
172
  response = requests.post(
167
173
  f'{server_common.get_server_url()}/jobs/cancel',
sky/jobs/constants.py CHANGED
@@ -40,7 +40,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
40
40
  # The version of the lib files that jobs/utils use. Whenever there is an API
41
41
  # change for the jobs/utils, we need to bump this version and update
42
42
  # job.utils.ManagedJobCodeGen to handle the version update.
43
- MANAGED_JOBS_VERSION = 1
43
+ MANAGED_JOBS_VERSION = 2
44
44
 
45
45
  # The command for setting up the jobs dashboard on the controller. It firstly
46
46
  # checks if the systemd services are available, and if not (e.g., Kubernetes
@@ -16,6 +16,7 @@ import flask
16
16
  import yaml
17
17
 
18
18
  from sky import jobs as managed_jobs
19
+ from sky.client import sdk
19
20
  from sky.jobs import constants as managed_job_constants
20
21
  from sky.utils import common_utils
21
22
  from sky.utils import controller_utils
@@ -134,7 +135,8 @@ def _extract_launch_history(log_content: str) -> str:
134
135
  def home():
135
136
  if not _is_running_on_jobs_controller():
136
137
  # Experimental: run on laptop (refresh is very slow).
137
- all_managed_jobs = managed_jobs.queue(refresh=True, skip_finished=False)
138
+ request_id = managed_jobs.queue(refresh=True, skip_finished=False)
139
+ all_managed_jobs = sdk.get(request_id)
138
140
  else:
139
141
  job_table = managed_jobs.dump_managed_job_queue()
140
142
  all_managed_jobs = managed_jobs.load_managed_job_queue(job_table)
@@ -142,6 +144,7 @@ def home():
142
144
  timestamp = datetime.datetime.now(datetime.timezone.utc)
143
145
  rows = managed_jobs.format_job_table(all_managed_jobs,
144
146
  show_all=True,
147
+ show_user=False,
145
148
  return_rows=True)
146
149
 
147
150
  status_counts = collections.defaultdict(int)
sky/jobs/scheduler.py CHANGED
@@ -49,6 +49,7 @@ from sky import sky_logging
49
49
  from sky.jobs import constants as managed_job_constants
50
50
  from sky.jobs import state
51
51
  from sky.skylet import constants
52
+ from sky.utils import common_utils
52
53
  from sky.utils import subprocess_utils
53
54
 
54
55
  logger = sky_logging.init_logger('sky.jobs.controller')
@@ -151,12 +152,20 @@ def maybe_schedule_next_jobs() -> None:
151
152
  job_id = maybe_next_job['job_id']
152
153
  dag_yaml_path = maybe_next_job['dag_yaml_path']
153
154
 
155
+ activate_python_env_cmd = (
156
+ f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
157
+ env_file = maybe_next_job['env_file_path']
158
+ source_environment_cmd = (f'source {env_file};'
159
+ if env_file else '')
160
+ run_controller_cmd = ('python -u -m sky.jobs.controller '
161
+ f'{dag_yaml_path} --job-id {job_id};')
162
+
154
163
  # If the command line here is changed, please also update
155
164
  # utils._controller_process_alive. `--job-id X` should be at
156
165
  # the end.
157
- run_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};'
158
- 'python -u -m sky.jobs.controller '
159
- f'{dag_yaml_path} --job-id {job_id}')
166
+ run_cmd = (f'{activate_python_env_cmd}'
167
+ f'{source_environment_cmd}'
168
+ f'{run_controller_cmd}')
160
169
 
161
170
  logs_dir = os.path.expanduser(
162
171
  managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
@@ -175,16 +184,19 @@ def maybe_schedule_next_jobs() -> None:
175
184
  pass
176
185
 
177
186
 
178
- def submit_job(job_id: int, dag_yaml_path: str) -> None:
187
+ def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
179
188
  """Submit an existing job to the scheduler.
180
189
 
181
190
  This should be called after a job is created in the `spot` table as
182
191
  PENDING. It will tell the scheduler to try and start the job controller, if
183
192
  there are resources available. It may block to acquire the lock, so it
184
193
  should not be on the critical path for `sky jobs launch -d`.
194
+
195
+ The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
185
196
  """
186
197
  with filelock.FileLock(_get_lock_path()):
187
- state.scheduler_set_waiting(job_id, dag_yaml_path)
198
+ state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
199
+ common_utils.get_user_hash())
188
200
  maybe_schedule_next_jobs()
189
201
 
190
202
 
@@ -281,12 +293,15 @@ def _can_lauch_in_alive_job() -> bool:
281
293
 
282
294
  if __name__ == '__main__':
283
295
  parser = ArgumentParser()
296
+ parser.add_argument('dag_yaml',
297
+ type=str,
298
+ help='The path to the user job yaml file.')
284
299
  parser.add_argument('--job-id',
285
300
  required=True,
286
301
  type=int,
287
302
  help='Job id for the controller job.')
288
- parser.add_argument('dag_yaml',
303
+ parser.add_argument('--env-file',
289
304
  type=str,
290
- help='The path to the user job yaml file.')
305
+ help='The path to the controller env file.')
291
306
  args = parser.parse_args()
292
- submit_job(args.job_id, args.dag_yaml)
307
+ submit_job(args.job_id, args.dag_yaml, args.env_file)