skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +50 -67
  3. sky/check.py +31 -1
  4. sky/cli.py +11 -34
  5. sky/clouds/kubernetes.py +3 -3
  6. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  7. sky/core.py +8 -5
  8. sky/data/storage.py +66 -14
  9. sky/global_user_state.py +1 -1
  10. sky/jobs/constants.py +8 -7
  11. sky/jobs/controller.py +19 -22
  12. sky/jobs/core.py +0 -2
  13. sky/jobs/recovery_strategy.py +114 -143
  14. sky/jobs/scheduler.py +283 -0
  15. sky/jobs/state.py +263 -21
  16. sky/jobs/utils.py +338 -96
  17. sky/provision/aws/config.py +48 -26
  18. sky/provision/gcp/instance_utils.py +15 -9
  19. sky/provision/kubernetes/instance.py +1 -1
  20. sky/provision/kubernetes/utils.py +76 -18
  21. sky/resources.py +1 -1
  22. sky/serve/autoscalers.py +359 -301
  23. sky/serve/controller.py +10 -8
  24. sky/serve/core.py +84 -7
  25. sky/serve/load_balancer.py +27 -10
  26. sky/serve/replica_managers.py +1 -3
  27. sky/serve/serve_state.py +10 -5
  28. sky/serve/serve_utils.py +28 -1
  29. sky/serve/service.py +4 -3
  30. sky/serve/service_spec.py +31 -0
  31. sky/skylet/constants.py +1 -1
  32. sky/skylet/events.py +7 -3
  33. sky/skylet/job_lib.py +10 -30
  34. sky/skylet/log_lib.py +8 -8
  35. sky/skylet/log_lib.pyi +3 -0
  36. sky/skylet/skylet.py +1 -1
  37. sky/templates/jobs-controller.yaml.j2 +7 -3
  38. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  39. sky/utils/db_utils.py +18 -4
  40. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  41. sky/utils/resources_utils.py +25 -21
  42. sky/utils/schemas.py +13 -0
  43. sky/utils/subprocess_utils.py +48 -9
  44. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
  45. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
  46. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
  48. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  49. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '35f0cf4cf8fee06aadcac639740d25c7493b5534'
+_SKYPILOT_COMMIT_SHA = '1c94d0f001ed6519873a59a7b46681d64dd696d2'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250114'
+__version__ = '1.0.0.dev20250124'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -35,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -45,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -155,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -1997,6 +2001,7 @@ class RetryingVmProvisioner(object):
                 skip_unnecessary_provisioning else None)
 
         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
         # If the user is using local credentials which may expire, the
         # controller may leak resources if the credentials expire while a job
         # is running. Here we check the enabled clouds and expiring credentials
@@ -2088,6 +2093,8 @@ class RetryingVmProvisioner(object):
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2132,7 +2139,14 @@
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-                raise e.with_failover_history(failover_history)
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2895,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3910,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        # if job_name is not None, job_id should be None
+        # if job_name and job_id should not both be specified
         assert job_name is None or job_id is None, (job_name, job_id)
-        if job_id is None and job_name is not None:
+
+        if job_id is None:
             # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
             code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
                 job_name=job_name)
-            returncode, run_timestamps, stderr = self.run_on_head(
-                handle,
-                code,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
             subprocess_utils.handle_returncode(returncode, code,
                                                'Failed to sync down logs.',
                                                stderr)
-            job_ids = common_utils.decode_payload(run_timestamps)
+            job_ids = common_utils.decode_payload(job_ids)
            if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
                             f'{colorama.Style.RESET_ALL}')
                 return {}
             elif len(job_ids) > 1:
-                logger.info(
-                    f'{colorama.Fore.YELLOW}'
-                    f'Multiple jobs IDs found under the name {job_name}. '
-                    'Downloading the latest job logs.'
-                    f'{colorama.Style.RESET_ALL}')
-                job_ids = [job_ids[0]]  # descending order
-        else:
-            job_ids = [job_id]
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+            # list should aready be in descending order
+            job_id = job_ids[0]
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
@@ -3964,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id = list(run_timestamps.keys())[0]
         local_log_dir = ''
         if controller:  # download controller logs
-            remote_log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                          run_timestamp)
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
             local_log_dir = os.path.expanduser(
                 os.path.join(local_dir, run_timestamp))
 
             logger.info(f'{colorama.Fore.CYAN}'
-                        f'Job {job_ids} local logs: {local_log_dir}'
+                        f'Job {job_id} local logs: {local_log_dir}'
                         f'{colorama.Style.RESET_ALL}')
 
             runners = handle.get_command_runners()
@@ -3981,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 Args:
                     args: A tuple of (runner, local_log_dir, remote_log_dir)
                 """
-                (runner, local_log_dir, remote_log_dir) = args
+                (runner, local_log_dir, remote_log) = args
                 try:
                     os.makedirs(local_log_dir, exist_ok=True)
                     runner.rsync(
-                        source=f'{remote_log_dir}/',
-                        target=local_log_dir,
+                        source=remote_log,
+                        target=f'{local_log_dir}/controller.log',
                         up=False,
                         stream_logs=False,
                     )
@@ -3999,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     else:
                         raise
 
-            parallel_args = [[runner, *item]
-                             for item in zip([local_log_dir], [remote_log_dir])
-                             for runner in runners]
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
             subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
         else:  # download job logs
             local_log_dir = os.path.expanduser(
@@ -4037,43 +4057,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         f'{colorama.Style.RESET_ALL}')
         return {str(job_id): local_log_dir}
 
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
-
-        Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
-
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-        self.run_on_head(
-            handle,
-            code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
-
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
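
Note on the failover change above: instead of re-raising only the last exception, the provisioner now keeps the final exception per attempted resource in resource_exceptions and renders them as a two-column table before raising ResourcesUnavailableError. A rough standalone sketch of the summary it produces is below; sky.utils.log_utils.create_table is assumed to wrap PrettyTable, and the resource string and reason text are invented for illustration.

import shutil

import prettytable

# One representative (last) failover exception per resource that was tried.
resource_exceptions = {
    'AWS(m6i.2xlarge)': 'ResourcesUnavailableError: no capacity in us-east-1.',
}
table = prettytable.PrettyTable(['Resource', 'Reason'])
for resource, reason in resource_exceptions.items():
    table.add_row([resource, reason])
# Wrap the table to the current terminal width, as the backend does.
table.max_table_width = shutil.get_terminal_size().columns
print('Reasons for provision failures (for details, please check the log above):')
print(table.get_string())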
sky/check.py CHANGED
@@ -155,7 +155,8 @@ def check(
     # Pretty print for UX.
     if not quiet:
         enabled_clouds_str = '\n  :heavy_check_mark: '.join(
-            [''] + sorted(all_enabled_clouds))
+            [''] +
+            [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
         rich.print('\n[green]:tada: Enabled clouds :tada:'
                    f'{enabled_clouds_str}[/green]')
 
@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
         r2_credential_mounts = cloudflare.get_credential_file_mounts()
         file_mounts.update(r2_credential_mounts)
     return file_mounts
+
+
+def _format_enabled_cloud(cloud_name: str) -> str:
+    if cloud_name == repr(sky_clouds.Kubernetes()):
+        # Get enabled contexts for Kubernetes
+        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+        if not existing_contexts:
+            return cloud_name
+
+        # Check if allowed_contexts is explicitly set in config
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        # Format the context info with consistent styling
+        if allowed_contexts is not None:
+            contexts_formatted = []
+            for i, context in enumerate(existing_contexts):
+                # TODO: We should use ux_utils.INDENT_SYMBOL and
+                # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+                # here we are using rich. We should migrate this file to
+                # use colorama as we do in the rest of the codebase.
+                symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+                contexts_formatted.append(f'\n    {symbol}{context}')
+            context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+        else:
+            context_info = f'Active context: {existing_contexts[0]}'
+
+        return f'{cloud_name}[/green][dim]\n  └── {context_info}[/dim][green]'
+    return cloud_name
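
The helper above decorates the Kubernetes entry of `sky check` with its contexts. Because the whole "Enabled clouds" block is printed inside a [green]...[/green] rich tag, the helper closes green, switches to [dim] for the per-cloud detail line, and then reopens green. A minimal illustration of that markup (the context name is made up):

import rich

cloud_line = ('Kubernetes[/green][dim]\n'
              '  └── Active context: kind-skypilot[/dim][green]')
rich.print(f'[green]:heavy_check_mark: {cloud_line}[/green]')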
sky/cli.py CHANGED
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
     if sum([bool(names), all]) != 1:
         raise click.UsageError('Either --all or a name must be specified.')
     if all:
-        storages = sky.storage_ls()
-        if not storages:
+        # Use '*' to get all storages.
+        names = global_user_state.get_glob_storage_name(storage_name='*')
+        if not names:
             click.echo('No storage(s) to delete.')
             return
-        names = [s['name'] for s in storages]
     else:
         names = _get_glob_storages(names)
     if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
                       abort=True,
                       show_default=True)
 
-    subprocess_utils.run_in_parallel(sky.storage_delete, names)
+    def delete_storage(name: str) -> None:
+        try:
+            sky.storage_delete(name)
+        except Exception as e:  # pylint: disable=broad-except
+            click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+    subprocess_utils.run_in_parallel(delete_storage, names)
 
 
 @cli.group(cls=_NaturalOrderGroup)
@@ -3588,18 +3594,6 @@ def jobs():
              is_flag=True,
              help=('If True, as soon as a job is submitted, return from this call '
                    'and do not stream execution logs.'))
-@click.option(
-    '--retry-until-up/--no-retry-until-up',
-    '-r/-no-r',
-    default=None,
-    is_flag=True,
-    required=False,
-    help=(
-        '(Default: True; this flag is deprecated and will be removed in a '
-        'future release.) Whether to retry provisioning infinitely until the '
-        'cluster is up, if unavailability errors are encountered. This '  # pylint: disable=bad-docstring-quotes
-        'applies to launching all managed jobs (both the initial and '
-        'any recovery attempts), not the jobs controller.'))
 @click.option('--yes',
               '-y',
               is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
     disk_tier: Optional[str],
     ports: Tuple[str],
     detach_run: bool,
-    retry_until_up: Optional[bool],
     yes: bool,
     fast: bool,
 ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
         ports=ports,
         job_recovery=job_recovery,
     )
-    # Deprecation. We set the default behavior to be retry until up, and the
-    # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
-    if retry_until_up is not None:
-        flag_str = '--retry-until-up'
-        if not retry_until_up:
-            flag_str = '--no-retry-until-up'
-        click.secho(
-            f'Flag {flag_str} is deprecated and will be removed in a '
-            'future release (managed jobs will always be retried). '
-            'Please file an issue if this does not work for you.',
-            fg='yellow')
-    else:
-        retry_until_up = True
 
     # Deprecation. The default behavior is fast, and the flag will be removed.
     # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(
 
     common_utils.check_cluster_name_is_valid(name)
 
-    managed_jobs.launch(dag,
-                        name,
-                        detach_run=detach_run,
-                        retry_until_up=retry_until_up)
+    managed_jobs.launch(dag, name, detach_run=detach_run)
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
sky/clouds/kubernetes.py CHANGED
@@ -131,7 +131,7 @@ class Kubernetes(clouds.Cloud):
                            'Ignoring these contexts.')
 
     @classmethod
-    def _existing_allowed_contexts(cls) -> List[str]:
+    def existing_allowed_contexts(cls) -> List[str]:
         """Get existing allowed contexts.
 
         If None is returned in the list, it means that we are running in a pod
@@ -175,7 +175,7 @@ class Kubernetes(clouds.Cloud):
                              use_spot: bool, region: Optional[str],
                              zone: Optional[str]) -> List[clouds.Region]:
         del accelerators, zone, use_spot  # unused
-        existing_contexts = cls._existing_allowed_contexts()
+        existing_contexts = cls.existing_allowed_contexts()
 
         regions = []
         for context in existing_contexts:
@@ -591,7 +591,7 @@ class Kubernetes(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         # Test using python API
         try:
-            existing_allowed_contexts = cls._existing_allowed_contexts()
+            existing_allowed_contexts = cls.existing_allowed_contexts()
         except ImportError as e:
             return (False,
                     f'{common_utils.format_exception(e, use_bracket=True)}')
sky/clouds/service_catalog/kubernetes_catalog.py CHANGED
@@ -115,6 +115,16 @@ def _list_accelerators(
 
     If the user does not have sufficient permissions to list pods in all
     namespaces, the function will return free GPUs as -1.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
+          objects with quantity information.
+        - total_accelerators_capacity: Dict mapping accelerator names to their
+          total capacity in the cluster.
+        - total_accelerators_available: Dict mapping accelerator names to their
+          current availability. Returns -1 for each accelerator if
+          realtime=False or if insufficient permissions.
     """
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
@@ -243,6 +253,10 @@ def _list_accelerators(
 
         accelerators_available = accelerator_count - allocated_qty
 
+        # Initialize the entry if it doesn't exist yet
+        if accelerator_name not in total_accelerators_available:
+            total_accelerators_available[accelerator_name] = 0
+
         if accelerators_available >= min_quantity_filter:
             quantized_availability = min_quantity_filter * (
                 accelerators_available // min_quantity_filter)
sky/core.py CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
-    else:
-        storage_object = data.Storage(name=handle.storage_name,
-                                      source=handle.source,
-                                      sync_on_reconstruction=False)
-        storage_object.delete()
+
+    assert handle.storage_name == name, (
+        f'In global_user_state, storage name {name!r} does not match '
+        f'handle.storage_name {handle.storage_name!r}')
+    storage_object = data.Storage(name=handle.storage_name,
+                                  source=handle.source,
+                                  sync_on_reconstruction=False)
+    storage_object.delete()
sky/data/storage.py CHANGED
@@ -1083,18 +1083,16 @@ class Storage(object):
         if not self.stores:
             logger.info('No backing stores found. Deleting storage.')
             global_user_state.remove_storage(self.name)
-        if store_type:
+        if store_type is not None:
             store = self.stores[store_type]
-            is_sky_managed = store.is_sky_managed
             # We delete a store from the cloud if it's sky managed. Else just
             # remove handle and return
-            if is_sky_managed:
+            if store.is_sky_managed:
                 self.handle.remove_store(store)
                 store.delete()
             # Check remaining stores - if none is sky managed, remove
             # the storage from global_user_state.
-            delete = all(
-                s.is_sky_managed is False for s in self.stores.values())
+            delete = all(not s.is_sky_managed for s in self.stores.values())
             if delete:
                 global_user_state.remove_storage(self.name)
             else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
+            PermissionError: If the bucket is external and the user is not
+                allowed to delete it.
         """
         if _bucket_sub_path is not None:
             command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):
 
     def _delete_cos_bucket_objects(self,
                                    bucket: Any,
-                                   prefix: Optional[str] = None):
+                                   prefix: Optional[str] = None) -> None:
         bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
         if bucket_versioning.status == 'Enabled':
             if prefix is not None:
@@ -3947,7 +3956,7 @@
             res = list(bucket.objects.delete())
         logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')
 
-    def _delete_cos_bucket(self):
+    def _delete_cos_bucket(self) -> None:
         bucket = self.s3_resource.Bucket(self.name)
         try:
             self._delete_cos_bucket_objects(bucket)
@@ -3968,7 +3977,7 @@ class OciStore(AbstractStore):
 
     def __init__(self,
                  name: str,
-                 source: str,
+                 source: Optional[SourceType],
                  region: Optional[str] = None,
                  is_sky_managed: Optional[bool] = None,
                  sync_on_reconstruction: Optional[bool] = True,
@@ -3980,13 +3989,53 @@
         self.compartment: str
         self.namespace: str
 
-        # Bucket region should be consistence with the OCI config file
-        region = oci.get_oci_config()['region']
+        # Region is from the specified name in <bucket>@<region> format.
+        # Another case is name can also be set by the source, for example:
+        # /datasets-storage:
+        #     source: oci://RAGData@us-sanjose-1
+        # The name in above mount will be set to RAGData@us-sanjose-1
+        region_in_name = None
+        if name is not None and '@' in name:
+            self._validate_bucket_expr(name)
+            name, region_in_name = name.split('@')
+
+        # Region is from the specified source in oci://<bucket>@<region> format
+        region_in_source = None
+        if isinstance(source,
+                      str) and source.startswith('oci://') and '@' in source:
+            self._validate_bucket_expr(source)
+            source, region_in_source = source.split('@')
+
+        if region_in_name is not None and region_in_source is not None:
+            # This should never happen because name and source will never be
+            # the remote bucket at the same time.
+            assert region_in_name == region_in_source, (
+                f'Mismatch region specified. Region in name {region_in_name}, '
+                f'but region in source is {region_in_source}')
+
+        if region_in_name is not None:
+            region = region_in_name
+        elif region_in_source is not None:
+            region = region_in_source
+
+        # Default region set to what specified in oci config.
+        if region is None:
+            region = oci.get_oci_config()['region']
+
+        # So far from now on, the name and source are canonical, means there
+        # is no region (@<region> suffix) associated with them anymore.
 
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
         # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands
 
+    def _validate_bucket_expr(self, bucket_expr: str):
+        pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'
+        if not re.match(pattern, bucket_expr):
+            raise ValueError(
+                'The format for the bucket portion is <bucket>@<region> '
+                'when specify a region with a bucket.')
+
     def _validate(self):
         if self.source is not None and isinstance(self.source, str):
             if self.source.startswith('oci://'):
@@ -4137,7 +4186,8 @@
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--src-dir "{base_dir_path}" {includes}')
+            f'--region {self.region} --src-dir "{base_dir_path}" '
+            f'{includes}')
 
         return sync_command
 
@@ -4157,8 +4207,8 @@
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--object-prefix "{dest_dir_name}" --src-dir "{src_dir_path}" '
-            f'{excludes} ')
+            f'--region {self.region} --object-prefix "{dest_dir_name}" '
+            f'--src-dir "{src_dir_path}" {excludes}')
 
         return sync_command
 
@@ -4289,7 +4339,8 @@
         def get_file_download_command(remote_path, local_path):
             download_command = (f'oci os object get --bucket-name {self.name} '
                                 f'--namespace-name {self.namespace} '
-                                f'--name {remote_path} --file {local_path}')
+                                f'--region {self.region} --name {remote_path} '
+                                f'--file {local_path}')
 
             return download_command
 
@@ -4346,6 +4397,7 @@
         @oci.with_oci_env
         def get_bucket_delete_command(bucket_name):
             remove_command = (f'oci os bucket delete --bucket-name '
+                              f'--region {self.region} '
                               f'{bucket_name} --empty --force')
 
             return remove_command
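
With the OciStore changes above, a bucket can carry its region as an @<region> suffix in either the storage name or an oci:// source, falling back to the region in the OCI config file. A small standalone sketch of the same parsing and validation (bucket and region names are made up; OciStore performs this inside __init__ via _validate_bucket_expr):

import re

# Same pattern the new _validate_bucket_expr uses for '<bucket>@<region>'.
_BUCKET_WITH_REGION = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'


def split_region(expr: str):
    """Split a bucket expression into (bucket_or_source, region or None)."""
    if '@' not in expr:
        return expr, None
    if not re.match(_BUCKET_WITH_REGION, expr):
        raise ValueError('The bucket portion must be <bucket>@<region> when '
                         'a region is specified.')
    bucket, region = expr.split('@')
    return bucket, region


print(split_region('oci://RAGData@us-sanjose-1'))  # ('oci://RAGData', 'us-sanjose-1')
print(split_region('my-bucket'))                   # ('my-bucket', None)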
sky/global_user_state.py CHANGED
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
 
 
 def get_storage() -> List[Dict[str, Any]]:
-    rows = _DB.cursor.execute('select * from storage')
+    rows = _DB.cursor.execute('SELECT * FROM storage')
     records = []
     for name, launched_at, handle, last_use, status in rows:
         # TODO: use namedtuple instead of dict
sky/jobs/constants.py CHANGED
@@ -2,18 +2,19 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
 # Resources as a dict for the jobs controller.
-# Use default CPU instance type for jobs controller with >= 24GB, i.e.
-# m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB)
-# for Azure, and n1-standard-8 (8 vCPUs, 32 GB) for GCP, etc.
-# Based on profiling, memory should be at least 3x (in GB) as num vCPUs to avoid
-# OOM (each vCPU can have 4 jobs controller processes as we set the CPU
-# requirement to 0.25, and 3 GB is barely enough for 4 job processes).
+# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
+# parallelism limit, and memory / 350MB is the limit to concurrently running
+# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '8+', 'memory': '3x', 'disk_size': 50}
+CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
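
The new comment refers to launch and job parallelism limits computed in sky/jobs/scheduler.py. A rough sketch of the arithmetic it describes follows; the function names mirror the ones mentioned in the comment, the 350 MB-per-job budget comes from the comment itself, and the real implementation may differ in detail.

import os

import psutil


def _get_launch_parallelism() -> int:
    # Allow up to 4 concurrent launches per vCPU on the controller.
    cpus = os.cpu_count()
    return cpus * 4 if cpus is not None else 1


def _get_job_parallelism() -> int:
    # Budget roughly 350 MB of controller memory per running managed job.
    job_memory = 350 * 1024 * 1024
    return max(psutil.virtual_memory().total // job_memory, 1)


# Example: a 4 vCPU / 32 GB controller allows 16 concurrent launches and ~93
# concurrently running jobs, which matches the r6i.xlarge-class sizing above.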