skypilot-nightly 1.0.0.dev20250115__py3-none-any.whl → 1.0.0.dev20250117__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/cli.py +11 -34
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +257 -21
- sky/jobs/utils.py +338 -96
- sky/provision/kubernetes/instance.py +1 -1
- sky/resources.py +1 -1
- sky/serve/core.py +30 -5
- sky/serve/replica_managers.py +1 -3
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/RECORD +33 -32
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '9e1b4ddc5fb1cb3fd6c00c106555b919e449e2c9'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250115'
+__version__ = '1.0.0.dev20250117'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -35,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -45,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -155,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -1997,6 +2001,7 @@ class RetryingVmProvisioner(object):
             skip_unnecessary_provisioning else None)
 
         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
         # If the user is using local credentials which may expire, the
         # controller may leak resources if the credentials expire while a job
         # is running. Here we check the enabled clouds and expiring credentials
@@ -2088,6 +2093,8 @@ class RetryingVmProvisioner(object):
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2132,7 +2139,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2895,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3910,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        # if job_name
+        # if job_name and job_id should not both be specified
         assert job_name is None or job_id is None, (job_name, job_id)
-
+
+        if job_id is None:
             # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
             code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
                 job_name=job_name)
-            returncode,
-
-
-
-
-                                                           separate_stderr=True)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
             subprocess_utils.handle_returncode(returncode, code,
                                                'Failed to sync down logs.',
                                                stderr)
-            job_ids = common_utils.decode_payload(
+            job_ids = common_utils.decode_payload(job_ids)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
                             f'{colorama.Style.RESET_ALL}')
                 return {}
             elif len(job_ids) > 1:
-
-
-
-
-
-
-
-
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+                # list should aready be in descending order
+                job_id = job_ids[0]
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
@@ -3964,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id = list(run_timestamps.keys())[0]
         local_log_dir = ''
         if controller: # download controller logs
-
-
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
             local_log_dir = os.path.expanduser(
                 os.path.join(local_dir, run_timestamp))
 
             logger.info(f'{colorama.Fore.CYAN}'
-                        f'Job {
+                        f'Job {job_id} local logs: {local_log_dir}'
                         f'{colorama.Style.RESET_ALL}')
 
             runners = handle.get_command_runners()
@@ -3981,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             Args:
                 args: A tuple of (runner, local_log_dir, remote_log_dir)
             """
-            (runner, local_log_dir,
+            (runner, local_log_dir, remote_log) = args
             try:
                 os.makedirs(local_log_dir, exist_ok=True)
                 runner.rsync(
-                    source=
-                    target=local_log_dir,
+                    source=remote_log,
+                    target=f'{local_log_dir}/controller.log',
                     up=False,
                     stream_logs=False,
                 )
@@ -3999,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 else:
                     raise
 
-            parallel_args = [
-
-
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
             subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
         else: # download job logs
             local_log_dir = os.path.expanduser(
@@ -4037,43 +4057,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         f'{colorama.Style.RESET_ALL}')
         return {str(job_id): local_log_dir}
 
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
-
-        Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
-
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-        self.run_on_head(
-            handle,
-            code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
-
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
sky/cli.py
CHANGED
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
     if sum([bool(names), all]) != 1:
         raise click.UsageError('Either --all or a name must be specified.')
     if all:
-
-
+        # Use '*' to get all storages.
+        names = global_user_state.get_glob_storage_name(storage_name='*')
+        if not names:
            click.echo('No storage(s) to delete.')
            return
-        names = [s['name'] for s in storages]
     else:
         names = _get_glob_storages(names)
     if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
                 abort=True,
                 show_default=True)
 
-
+    def delete_storage(name: str) -> None:
+        try:
+            sky.storage_delete(name)
+        except Exception as e:  # pylint: disable=broad-except
+            click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+    subprocess_utils.run_in_parallel(delete_storage, names)
 
 
 @cli.group(cls=_NaturalOrderGroup)
@@ -3588,18 +3594,6 @@ def jobs():
              is_flag=True,
              help=('If True, as soon as a job is submitted, return from this call '
                    'and do not stream execution logs.'))
-@click.option(
-    '--retry-until-up/--no-retry-until-up',
-    '-r/-no-r',
-    default=None,
-    is_flag=True,
-    required=False,
-    help=(
-        '(Default: True; this flag is deprecated and will be removed in a '
-        'future release.) Whether to retry provisioning infinitely until the '
-        'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
-        'applies to launching all managed jobs (both the initial and '
-        'any recovery attempts), not the jobs controller.'))
 @click.option('--yes',
              '-y',
              is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
     disk_tier: Optional[str],
     ports: Tuple[str],
     detach_run: bool,
-    retry_until_up: Optional[bool],
     yes: bool,
     fast: bool,
 ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
         ports=ports,
         job_recovery=job_recovery,
     )
-    # Deprecation. We set the default behavior to be retry until up, and the
-    # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
-    if retry_until_up is not None:
-        flag_str = '--retry-until-up'
-        if not retry_until_up:
-            flag_str = '--no-retry-until-up'
-        click.secho(
-            f'Flag {flag_str} is deprecated and will be removed in a '
-            'future release (managed jobs will always be retried). '
-            'Please file an issue if this does not work for you.',
-            fg='yellow')
-    else:
-        retry_until_up = True
 
     # Deprecation. The default behavior is fast, and the flag will be removed.
     # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(
 
     common_utils.check_cluster_name_is_valid(name)
 
-    managed_jobs.launch(dag,
-                        name,
-                        detach_run=detach_run,
-                        retry_until_up=retry_until_up)
+    managed_jobs.launch(dag, name, detach_run=detach_run)
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
sky/core.py
CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
-
-
-
-
-
+
+    assert handle.storage_name == name, (
+        f'In global_user_state, storage name {name!r} does not match '
+        f'handle.storage_name {handle.storage_name!r}')
+    storage_object = data.Storage(name=handle.storage_name,
+                                  source=handle.source,
+                                  sync_on_reconstruction=False)
+    storage_object.delete()
sky/data/storage.py
CHANGED
@@ -1083,18 +1083,16 @@ class Storage(object):
         if not self.stores:
             logger.info('No backing stores found. Deleting storage.')
             global_user_state.remove_storage(self.name)
-        if store_type:
+        if store_type is not None:
             store = self.stores[store_type]
-            is_sky_managed = store.is_sky_managed
             # We delete a store from the cloud if it's sky managed. Else just
             # remove handle and return
-            if is_sky_managed:
+            if store.is_sky_managed:
                 self.handle.remove_store(store)
                 store.delete()
                 # Check remaining stores - if none is sky managed, remove
                 # the storage from global_user_state.
-                delete = all(
-                    s.is_sky_managed is False for s in self.stores.values())
+                delete = all(not s.is_sky_managed for s in self.stores.values())
                 if delete:
                     global_user_state.remove_storage(self.name)
                 else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
+            PermissionError: If the bucket is external and the user is not
+                allowed to delete it.
         """
         if _bucket_sub_path is not None:
             command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):
 
     def _delete_cos_bucket_objects(self,
                                    bucket: Any,
-                                   prefix: Optional[str] = None):
+                                   prefix: Optional[str] = None) -> None:
         bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
         if bucket_versioning.status == 'Enabled':
             if prefix is not None:
@@ -3947,7 +3956,7 @@ class IBMCosStore(AbstractStore):
         res = list(bucket.objects.delete())
         logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')
 
-    def _delete_cos_bucket(self):
+    def _delete_cos_bucket(self) -> None:
         bucket = self.s3_resource.Bucket(self.name)
         try:
             self._delete_cos_bucket_objects(bucket)
@@ -3968,7 +3977,7 @@ class OciStore(AbstractStore):
 
     def __init__(self,
                  name: str,
-                 source:
+                 source: Optional[SourceType],
                  region: Optional[str] = None,
                  is_sky_managed: Optional[bool] = None,
                  sync_on_reconstruction: Optional[bool] = True,
@@ -3980,13 +3989,53 @@ class OciStore(AbstractStore):
         self.compartment: str
         self.namespace: str
 
-        #
-
+        # Region is from the specified name in <bucket>@<region> format.
+        # Another case is name can also be set by the source, for example:
+        #   /datasets-storage:
+        #     source: oci://RAGData@us-sanjose-1
+        # The name in above mount will be set to RAGData@us-sanjose-1
+        region_in_name = None
+        if name is not None and '@' in name:
+            self._validate_bucket_expr(name)
+            name, region_in_name = name.split('@')
+
+        # Region is from the specified source in oci://<bucket>@<region> format
+        region_in_source = None
+        if isinstance(source,
+                      str) and source.startswith('oci://') and '@' in source:
+            self._validate_bucket_expr(source)
+            source, region_in_source = source.split('@')
+
+        if region_in_name is not None and region_in_source is not None:
+            # This should never happen because name and source will never be
+            # the remote bucket at the same time.
+            assert region_in_name == region_in_source, (
+                f'Mismatch region specified. Region in name {region_in_name}, '
+                f'but region in source is {region_in_source}')
+
+        if region_in_name is not None:
+            region = region_in_name
+        elif region_in_source is not None:
+            region = region_in_source
+
+        # Default region set to what specified in oci config.
+        if region is None:
+            region = oci.get_oci_config()['region']
+
+        # So far from now on, the name and source are canonical, means there
+        # is no region (@<region> suffix) associated with them anymore.
 
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
         # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands
 
+    def _validate_bucket_expr(self, bucket_expr: str):
+        pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'
+        if not re.match(pattern, bucket_expr):
+            raise ValueError(
+                'The format for the bucket portion is <bucket>@<region> '
+                'when specify a region with a bucket.')
+
     def _validate(self):
         if self.source is not None and isinstance(self.source, str):
             if self.source.startswith('oci://'):
@@ -4137,7 +4186,8 @@ class OciStore(AbstractStore):
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--src-dir "{base_dir_path}"
+            f'--region {self.region} --src-dir "{base_dir_path}" '
+            f'{includes}')
 
         return sync_command
 
@@ -4157,8 +4207,8 @@ class OciStore(AbstractStore):
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--
-            f'{excludes}
+            f'--region {self.region} --object-prefix "{dest_dir_name}" '
+            f'--src-dir "{src_dir_path}" {excludes}')
 
         return sync_command
 
@@ -4289,7 +4339,8 @@ class OciStore(AbstractStore):
         def get_file_download_command(remote_path, local_path):
             download_command = (f'oci os object get --bucket-name {self.name} '
                                 f'--namespace-name {self.namespace} '
-                                f'--
+                                f'--region {self.region} --name {remote_path} '
+                                f'--file {local_path}')
 
             return download_command
 
@@ -4346,6 +4397,7 @@ class OciStore(AbstractStore):
         @oci.with_oci_env
         def get_bucket_delete_command(bucket_name):
             remove_command = (f'oci os bucket delete --bucket-name '
+                              f'--region {self.region} '
                               f'{bucket_name} --empty --force')
 
             return remove_command
sky/global_user_state.py
CHANGED
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
 
 
 def get_storage() -> List[Dict[str, Any]]:
-    rows = _DB.cursor.execute('
+    rows = _DB.cursor.execute('SELECT * FROM storage')
     records = []
     for name, launched_at, handle, last_use, status in rows:
         # TODO: use namedtuple instead of dict
sky/jobs/constants.py
CHANGED
@@ -2,18 +2,19 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
 # Resources as a dict for the jobs controller.
-# Use
-#
-#
-#
-#
-#
+# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
+# parallelism limit, and memory / 350MB is the limit to concurrently running
+# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '
+CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
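The new comment documents how the jobs controller's concurrency is sized: launches are capped at 4x the vCPU count and the number of concurrently running jobs at total memory divided by roughly 350 MB per job, implemented by _get_launch_parallelism and _get_job_parallelism in the new sky/jobs/scheduler.py. A rough sketch of that arithmetic follows; the actual scheduler.py implementation may differ in detail.

# Approximate per-job memory footprint from the comment above (in MB).
_JOB_MEMORY_MB = 350


def get_launch_parallelism(num_cpus: int) -> int:
    # Launch parallelism limit: 4 concurrent launches per vCPU.
    return 4 * num_cpus


def get_job_parallelism(memory_mb: int) -> int:
    # Concurrent running-jobs limit: total memory divided by ~350 MB per job.
    return max(1, memory_mb // _JOB_MEMORY_MB)


# Example: a 4-vCPU, 32 GB controller (r6i.xlarge / n2-highmem-4 class).
print(get_launch_parallelism(4))       # 16
print(get_job_parallelism(32 * 1024))  # 93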
|