skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -30,7 +30,6 @@ from sky.backends import backend_utils
  from sky.jobs import constants as managed_job_constants
  from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
- from sky.server import common as server_common
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.skylet import log_lib
@@ -39,7 +38,6 @@ from sky.utils import annotations
  from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import controller_utils
- from sky.utils import env_options
  from sky.utils import infra_utils
  from sky.utils import log_utils
  from sky.utils import message_utils
@@ -136,12 +134,6 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
  def _validate_consolidation_mode_config(
  current_is_consolidation_mode: bool) -> None:
  """Validate the consolidation mode config."""
- if (current_is_consolidation_mode and
- not env_options.Options.IS_DEVELOPER.get() and
- server_common.is_api_server_local()):
- with ux_utils.print_exception_no_traceback():
- raise exceptions.NotSupportedError(
- 'Consolidation mode is not supported when running locally.')
  # Check whether the consolidation mode config is changed.
  if current_is_consolidation_mode:
  controller_cn = (
@@ -239,8 +231,8 @@ def ha_recovery_for_consolidation_mode():
  f.write(f'Total recovery time: {time.time() - start} seconds\n')


- def get_job_status(backend: 'backends.CloudVmRayBackend',
- cluster_name: str) -> Optional['job_lib.JobStatus']:
+ def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
+ job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
  """Check the status of the job running on a managed job cluster.

  It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -253,10 +245,13 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
  logger.info(f'Cluster {cluster_name} not found.')
  return None
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
+ job_ids = None if job_id is None else [job_id]
  for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
  try:
  logger.info('=== Checking the job status... ===')
- statuses = backend.get_job_status(handle, stream_logs=False)
+ statuses = backend.get_job_status(handle,
+ job_ids=job_ids,
+ stream_logs=False)
  status = list(statuses.values())[0]
  if status is None:
  logger.info('No job found.')
@@ -323,13 +318,20 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
  error_msg = None
  tasks = managed_job_state.get_managed_jobs(job_id)
  for task in tasks:
- task_name = task['job_name']
- cluster_name = generate_managed_job_cluster_name(task_name, job_id)
+ pool = task.get('pool', None)
+ if pool is None:
+ task_name = task['job_name']
+ cluster_name = generate_managed_job_cluster_name(
+ task_name, job_id)
+ else:
+ cluster_name, _ = (
+ managed_job_state.get_pool_submit_info(job_id))
  handle = global_user_state.get_handle_from_cluster_name(
  cluster_name)
  if handle is not None:
  try:
- terminate_cluster(cluster_name)
+ if pool is None:
+ terminate_cluster(cluster_name)
  except Exception as e: # pylint: disable=broad-except
  error_msg = (
  f'Failed to terminate cluster {cluster_name}: '
@@ -510,10 +512,10 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):


  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
- get_end_time: bool) -> float:
+ job_id: Optional[int], get_end_time: bool) -> float:
  """Get the submitted/ended time of the job."""
  code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
- job_id=None, get_ended_time=get_end_time)
+ job_id=job_id, get_ended_time=get_end_time)
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
  returncode, stdout, stderr = backend.run_on_head(handle,
  code,
@@ -527,14 +529,17 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,


  def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
- cluster_name: str) -> float:
+ cluster_name: str, job_id: Optional[int]) -> float:
  """Try to get the end time of the job.

  If the job is preempted or we can't connect to the instance for whatever
  reason, fall back to the current time.
  """
  try:
- return get_job_timestamp(backend, cluster_name, get_end_time=True)
+ return get_job_timestamp(backend,
+ cluster_name,
+ job_id=job_id,
+ get_end_time=True)
  except exceptions.CommandError as e:
  if e.returncode == 255:
  # Failed to connect - probably the instance was preempted since the
@@ -556,8 +561,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
  if event_callback is None or task is None:
  return
  event_callback = event_callback.strip()
- cluster_name = generate_managed_job_cluster_name(
- task.name, job_id) if task.name else None
+ pool = managed_job_state.get_pool_from_job_id(job_id)
+ if pool is not None:
+ cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
+ else:
+ cluster_name = generate_managed_job_cluster_name(
+ task.name, job_id) if task.name else None
  logger.info(f'=== START: event callback for {status!r} ===')
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
  'managed_job_event',
@@ -684,6 +693,15 @@ def cancel_job_by_name(job_name: str,
  return f'{job_name!r} {msg}'


+ def cancel_jobs_by_pool(pool_name: str,
+ current_workspace: Optional[str] = None) -> str:
+ """Cancel all jobs in a pool."""
+ job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
+ if not job_ids:
+ return f'No running job found in pool {pool_name!r}.'
+ return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
+
+
  def stream_logs_by_id(job_id: int,
  follow: bool = True,
  tail: Optional[int] = None) -> Tuple[str, int]:
@@ -716,23 +734,41 @@ def stream_logs_by_id(job_id: int,
  if managed_job_status.is_failed():
  job_msg = ('\nFailure reason: '
  f'{managed_job_state.get_failure_reason(job_id)}')
- log_file = managed_job_state.get_local_log_file(job_id, None)
- if log_file is not None:
- with open(os.path.expanduser(log_file), 'r',
- encoding='utf-8') as f:
- # Stream the logs to the console without reading the whole
- # file into memory.
- start_streaming = False
- read_from: Union[TextIO, Deque[str]] = f
- if tail is not None:
- assert tail > 0
- # Read only the last 'tail' lines using deque
- read_from = collections.deque(f, maxlen=tail)
- for line in read_from:
- if log_lib.LOG_FILE_START_STREAMING_AT in line:
- start_streaming = True
- if start_streaming:
- print(line, end='', flush=True)
+ log_file_exists = False
+ task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+ job_id)
+ num_tasks = len(task_info)
+ for task_id, task_name, task_status, log_file in task_info:
+ if log_file:
+ log_file_exists = True
+ task_str = (f'Task {task_name}({task_id})'
+ if task_name else f'Task {task_id}')
+ if num_tasks > 1:
+ print(f'=== {task_str} ===')
+ with open(os.path.expanduser(log_file),
+ 'r',
+ encoding='utf-8') as f:
+ # Stream the logs to the console without reading the
+ # whole file into memory.
+ start_streaming = False
+ read_from: Union[TextIO, Deque[str]] = f
+ if tail is not None:
+ assert tail > 0
+ # Read only the last 'tail' lines using deque
+ read_from = collections.deque(f, maxlen=tail)
+ for line in read_from:
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
+ start_streaming = True
+ if start_streaming:
+ print(line, end='', flush=True)
+ if num_tasks > 1:
+ # Add the "Task finished" message for terminal states
+ if task_status.is_terminal():
+ print(ux_utils.finishing_message(
+ f'{task_str} finished '
+ f'(status: {task_status.value}).'),
+ flush=True)
+ if log_file_exists:
  # Add the "Job finished" message for terminal states
  if managed_job_status.is_terminal():
  print(ux_utils.finishing_message(
@@ -759,12 +795,19 @@ def stream_logs_by_id(job_id: int,

  while should_keep_logging(managed_job_status):
  handle = None
+ job_id_to_tail = None
  if task_id is not None:
- task_name = managed_job_state.get_task_name(job_id, task_id)
- cluster_name = generate_managed_job_cluster_name(
- task_name, job_id)
- handle = global_user_state.get_handle_from_cluster_name(
- cluster_name)
+ pool = managed_job_state.get_pool_from_job_id(job_id)
+ if pool is not None:
+ cluster_name, job_id_to_tail = (
+ managed_job_state.get_pool_submit_info(job_id))
+ else:
+ task_name = managed_job_state.get_task_name(job_id, task_id)
+ cluster_name = generate_managed_job_cluster_name(
+ task_name, job_id)
+ if cluster_name is not None:
+ handle = global_user_state.get_handle_from_cluster_name(
+ cluster_name)

  # Check the handle: The cluster can be preempted and removed from
  # the table before the managed job state is updated by the
@@ -796,7 +839,7 @@ def stream_logs_by_id(job_id: int,
  status_display.stop()
  tail_param = tail if tail is not None else 0
  returncode = backend.tail_logs(handle,
- job_id=None,
+ job_id=job_id_to_tail,
  managed_job_id=job_id,
  follow=follow,
  tail=tail_param)
@@ -1114,9 +1157,15 @@ def dump_managed_job_queue() -> str:
  job['status'] = job['status'].value
  job['schedule_state'] = job['schedule_state'].value

- cluster_name = generate_managed_job_cluster_name(
- job['task_name'], job['job_id'])
- handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+ pool = managed_job_state.get_pool_from_job_id(job['job_id'])
+ if pool is not None:
+ cluster_name, _ = managed_job_state.get_pool_submit_info(
+ job['job_id'])
+ else:
+ cluster_name = generate_managed_job_cluster_name(
+ job['task_name'], job['job_id'])
+ handle = global_user_state.get_handle_from_cluster_name(
+ cluster_name) if cluster_name is not None else None
  if isinstance(handle, backends.CloudVmRayResourceHandle):
  resources_str = resources_utils.get_readable_resources_repr(
  handle, simplify=True)
@@ -1127,6 +1176,11 @@ def dump_managed_job_queue() -> str:
  job['cloud'] = str(handle.launched_resources.cloud)
  job['region'] = handle.launched_resources.region
  job['zone'] = handle.launched_resources.zone
+ job['infra'] = infra_utils.InfraInfo(
+ str(handle.launched_resources.cloud),
+ handle.launched_resources.region,
+ handle.launched_resources.zone).formatted_str()
+ job['accelerators'] = handle.launched_resources.accelerators
  else:
  # FIXME(zongheng): display the last cached values for these.
  job['cluster_resources'] = '-'
@@ -1134,6 +1188,7 @@ def dump_managed_job_queue() -> str:
  job['cloud'] = '-'
  job['region'] = '-'
  job['zone'] = '-'
+ job['infra'] = '-'

  # Add details about schedule state / backoff.
  state_details = None
@@ -1274,10 +1329,13 @@ def format_job_table(
  'JOB DURATION',
  '#RECOVERIES',
  'STATUS',
+ 'WORKER_POOL',
  ]
  if show_all:
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
  columns += [
+ 'WORKER_CLUSTER',
+ 'WORKER_JOB_ID',
  'STARTED',
  'INFRA',
  'RESOURCES',
@@ -1387,11 +1445,14 @@ def format_job_table(
  job_duration,
  recovery_cnt,
  status_str,
+ job_tasks[0].get('pool', '-'),
  ]
  if show_all:
  details = job_tasks[current_task_id].get('details')
  failure_reason = job_tasks[current_task_id]['failure_reason']
  job_values.extend([
+ '-',
+ '-',
  '-',
  '-',
  '-',
@@ -1427,37 +1488,43 @@ def format_job_table(
  job_duration,
  task['recovery_count'],
  task['status'].colored_str(),
+ task.get('pool', '-'),
  ]
  if show_all:
  # schedule_state is only set at the job level, so if we have
  # more than one task, only display on the aggregated row.
  schedule_state = (task['schedule_state']
  if len(job_tasks) == 1 else '-')
- cloud = task.get('cloud')
- if cloud is None:
- # Backward compatibility for old jobs controller without
- # cloud info returned, we parse it from the cluster
- # resources
- # TODO(zhwu): remove this after 0.12.0
- cloud = task['cluster_resources'].split('(')[0].split(
- 'x')[-1]
- task['cluster_resources'] = task[
- 'cluster_resources'].replace(f'{cloud}(',
- '(').replace('x ', 'x')
- region = task['region']
- zone = task.get('zone')
- if cloud == '-':
- cloud = None
- if region == '-':
- region = None
- if zone == '-':
- zone = None
-
- infra = infra_utils.InfraInfo(cloud, region, zone)
+ infra_str = task.get('infra')
+ if infra_str is None:
+ cloud = task.get('cloud')
+ if cloud is None:
+ # Backward compatibility for old jobs controller without
+ # cloud info returned, we parse it from the cluster
+ # resources
+ # TODO(zhwu): remove this after 0.12.0
+ cloud = task['cluster_resources'].split('(')[0].split(
+ 'x')[-1]
+ task['cluster_resources'] = task[
+ 'cluster_resources'].replace(f'{cloud}(',
+ '(').replace(
+ 'x ', 'x')
+ region = task['region']
+ zone = task.get('zone')
+ if cloud == '-':
+ cloud = None
+ if region == '-':
+ region = None
+ if zone == '-':
+ zone = None
+ infra_str = infra_utils.InfraInfo(cloud, region,
+ zone).formatted_str()
  values.extend([
+ task.get('current_cluster_name', '-'),
+ task.get('job_id_on_pool_cluster', '-'),
  # STARTED
  log_utils.readable_time_duration(task['start_at']),
- infra.formatted_str(),
+ infra_str,
  task['cluster_resources'],
  schedule_state,
  generate_details(task.get('details'),
@@ -1549,6 +1616,15 @@ class ManagedJobCodeGen:
  """)
  return cls._build(code)

+ @classmethod
+ def cancel_jobs_by_pool(cls, pool_name: str) -> str:
+ active_workspace = skypilot_config.get_active_workspace()
+ code = textwrap.dedent(f"""\
+ msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
+ print(msg, end="", flush=True)
+ """)
+ return cls._build(code)
+
  @classmethod
  def get_version_and_job_table(cls) -> str:
  """Generate code to get controller version and raw job table."""
sky/logs/aws.py CHANGED
@@ -9,6 +9,8 @@ from sky.skylet import constants
  from sky.utils import common_utils
  from sky.utils import resources_utils

+ EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
+

  class _CloudwatchLoggingConfig(pydantic.BaseModel):
  """Configuration for AWS CloudWatch logging agent."""
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
  # Check if we're running on EC2 with an IAM role or if
  # AWS credentials are available in the environment
  pre_cmd = (
- 'if ! curl -s -m 1 http://169.254.169.254'
- '/latest/meta-data/iam/security-credentials/ > /dev/null; '
+ f'if ! curl -s -m 1 {EC2_MD_URL}'
+ 'latest/meta-data/iam/security-credentials/ > /dev/null; '
  'then '
  # failed EC2 check, look for env vars
  'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
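The pre_cmd above now builds the IMDS probe from EC2_MD_URL, so AWS_EC2_METADATA_SERVICE_ENDPOINT can override the hard-coded http://169.254.169.254/ address; the shell expansion ${VAR:-default} falls back to the default when the variable is unset or empty. A rough Python equivalent of that lookup (illustrative only, not SkyPilot code):

import os

# Mirrors ${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}:
# fall back when the variable is unset *or* empty.
endpoint = (os.environ.get('AWS_EC2_METADATA_SERVICE_ENDPOINT') or
            'http://169.254.169.254/')
probe_url = endpoint + 'latest/meta-data/iam/security-credentials/'
print(probe_url)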
sky/provision/kubernetes/utils.py CHANGED
@@ -3179,10 +3179,12 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
  return pods


- def is_tpu_on_gke(accelerator: str) -> bool:
+ def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
  """Determines if the given accelerator is a TPU supported on GKE."""
- normalized, _ = normalize_tpu_accelerator_name(accelerator)
- return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+ if normalize:
+ normalized, _ = normalize_tpu_accelerator_name(accelerator)
+ return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+ return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION


  def get_node_accelerator_count(context: Optional[str],
@@ -3384,7 +3386,7 @@ def process_skypilot_pods(

  def _gpu_resource_key_helper(context: Optional[str]) -> str:
  """Helper function to get the GPU resource key."""
- gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['amd']
+ gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
  try:
  nodes = kubernetes.core_api(context).list_node().items
  for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
sky/provision/nebius/constants.py CHANGED
@@ -15,6 +15,9 @@ INFINIBAND_ENV_VARS = {
  'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
  }

+ # pylint: disable=line-too-long
+ INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'
+
  # Docker run options for InfiniBand support
  INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']

sky/provision/vast/instance.py CHANGED
@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
  region=region,
  disk_size=config.node_config['DiskSize'],
  preemptible=config.node_config['Preemptible'],
- image_name=config.node_config['ImageId'])
+ image_name=config.node_config['ImageId'],
+ ports=config.ports_to_open_on_launch)
  except Exception as e: # pylint: disable=broad-except
  logger.warning(f'run_instances error: {e}')
  raise
sky/provision/vast/utils.py CHANGED
@@ -5,7 +5,7 @@
  # python sdk.
  #
  """Vast library wrapper for SkyPilot."""
- from typing import Any, Dict, List
+ from typing import Any, Dict, List, Optional

  from sky import sky_logging
  from sky.adaptors import vast
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


  def launch(name: str, instance_type: str, region: str, disk_size: int,
- image_name: str, preemptible: bool) -> str:
+ image_name: str, ports: Optional[List[int]],
+ preemptible: bool) -> str:
  """Launches an instance with the given parameters.

  Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
  The disk size {xx} GB is not exactly matched the requested
  size {yy} GB. It is possible to charge extra cost on disk.

+ * `ports`: This is a feature flag to expose ports to the internet.
+
  * `geolocation`: Geolocation on Vast can be as specific as the
  host chooses to be. They can say, for instance, "Yutakachō,
  Shinagawa District, Tokyo, JP." Such a specific geolocation
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

  * Vast instance types are an invention for skypilot. Refer to
  catalog/vast_catalog.py for the current construction
- of the type.
-
- """
+ of the type."""
  cpu_ram = float(instance_type.split('-')[-1]) / 1024
  gpu_name = instance_type.split('-')[1].replace('_', ' ')
  num_gpus = int(instance_type.split('-')[0].replace('x', ''))
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

  instance_touse = instance_list[0]

+ port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
+
  launch_params = {
  'id': instance_touse['id'],
  'direct': True,
  'ssh': True,
- 'env': '-e __SOURCE=skypilot',
+ 'env': f'-e __SOURCE=skypilot {port_map}',
  'onstart_cmd': ';'.join([
  'touch ~/.no_auto_tmux',
  f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
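The new ports argument is turned into a repeated -p <port>:<port> option string and appended to the instance's env launch parameter. A quick illustration of the mapping (port values are made up):

ports = [8080, 443]
# Same expression as in launch() above.
port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
print(port_map)                            # -p 8080:8080 -p 443:443
print(f'-e __SOURCE=skypilot {port_map}')  # value passed as 'env'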
sky/py.typed ADDED
File without changes
sky/resources.py CHANGED
@@ -19,6 +19,7 @@ from sky.clouds import cloud as sky_cloud
  from sky.provision import docker_utils
  from sky.provision.gcp import constants as gcp_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.provision.nebius import constants as nebius_constants
  from sky.skylet import constants
  from sky.utils import accelerator_registry
  from sky.utils import annotations
@@ -797,8 +798,13 @@ class Resources:

  acc, _ = list(accelerators.items())[0]
  if 'tpu' in acc.lower():
+ # TODO(syang): GCP TPU names are supported on both GCP and
+ # kubernetes (GKE), but this logic automatically assumes
+ # GCP TPUs can only be used on GCP.
+ # Fix the logic such that GCP TPU names can failover between
+ # GCP and kubernetes.
  if self.cloud is None:
- if kubernetes_utils.is_tpu_on_gke(acc):
+ if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
  self._cloud = clouds.Kubernetes()
  else:
  self._cloud = clouds.GCP()
@@ -813,7 +819,8 @@ class Resources:

  use_tpu_vm = accelerator_args.get('tpu_vm', True)
  if (self.cloud.is_same_cloud(clouds.GCP()) and
- not kubernetes_utils.is_tpu_on_gke(acc)):
+ not kubernetes_utils.is_tpu_on_gke(acc,
+ normalize=False)):
  if 'runtime_version' not in accelerator_args:

  def _get_default_runtime_version() -> str:
@@ -1254,15 +1261,19 @@ class Resources:
  ValueError: if the attribute is invalid.
  """

- if (self._network_tier == resources_utils.NetworkTier.BEST and
- isinstance(self._cloud, clouds.GCP)):
- # Handle GPU Direct TCPX requirement for docker images
- if self._image_id is None:
- # No custom image specified - use the default GPU Direct image
- self._image_id = {
- self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
- }
- else:
+ if self._network_tier == resources_utils.NetworkTier.BEST:
+ if isinstance(self._cloud, clouds.GCP):
+ # Handle GPU Direct TCPX requirement for docker images
+ if self._image_id is None:
+ self._image_id = {
+ self._region: gcp_constants.GCP_GPU_DIRECT_IMAGE_ID
+ }
+ elif isinstance(self._cloud, clouds.Nebius):
+ if self._image_id is None:
+ self._image_id = {
+ self._region: nebius_constants.INFINIBAND_IMAGE_ID
+ }
+ elif self._image_id:
  # Custom image specified - validate it's a docker image
  # Check if any of the specified images are not docker images
  non_docker_images = []
@@ -1274,14 +1285,13 @@ class Resources:
  if non_docker_images:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(
- f'When using network_tier=BEST on GCP, image_id '
+ f'When using network_tier=BEST, image_id '
  f'must be a docker image. '
  f'Found non-docker images: '
  f'{", ".join(non_docker_images)}. '
  f'Please either: (1) use a docker image '
  f'(prefix with "docker:"), or '
- f'(2) leave image_id empty to use the default '
- f'GPU Direct TCPX image.')
+ f'(2) leave image_id empty to use the default')

  if self._image_id is None:
  return
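The resources.py change above generalizes the network_tier=BEST image defaulting from GCP-only to GCP and Nebius, and relaxes the error message accordingly. A simplified, standalone sketch of the resulting branching (the string cloud names and placeholder image IDs are illustrative; the real code dispatches on cloud classes and uses gcp_constants.GCP_GPU_DIRECT_IMAGE_ID and nebius_constants.INFINIBAND_IMAGE_ID):

from typing import Optional

# Placeholder defaults standing in for the real constants referenced above.
_DEFAULT_BEST_TIER_IMAGE = {
    'gcp': 'docker:example-gpu-direct-tcpx-image',
    'nebius': 'docker:example-infiniband-nccl-image',
}

def resolve_best_tier_image(cloud: Optional[str],
                            image_id: Optional[str]) -> Optional[str]:
    """Pick a default docker image for network_tier=BEST, else validate."""
    if image_id is None:
        return _DEFAULT_BEST_TIER_IMAGE.get(cloud)  # may stay None
    if not image_id.startswith('docker:'):
        raise ValueError('When using network_tier=BEST, image_id must be a '
                         'docker image (prefix with "docker:"), or leave '
                         'image_id empty to use the default.')
    return image_id

print(resolve_best_tier_image('nebius', None))
print(resolve_best_tier_image('gcp', 'docker:my-image'))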
sky/schemas/db/spot_jobs/002_cluster_pool.py ADDED
@@ -0,0 +1,42 @@
+ """Columns for cluster pool.
+
+ Revision ID: 002
+ Revises: 001
+ Create Date: 2025-07-18
+
+ """
+ # pylint: disable=invalid-name
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+
+ from sky.utils.db import db_utils
+
+ # revision identifiers, used by Alembic.
+ revision: str = '002'
+ down_revision: Union[str, Sequence[str], None] = '001'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade():
+ """Add columns for cluster pool."""
+ with op.get_context().autocommit_block():
+ db_utils.add_column_to_table_alembic('job_info',
+ 'pool',
+ sa.Text(),
+ server_default=None)
+ db_utils.add_column_to_table_alembic('job_info',
+ 'current_cluster_name',
+ sa.Text(),
+ server_default=None)
+ db_utils.add_column_to_table_alembic('job_info',
+ 'job_id_on_pool_cluster',
+ sa.Integer(),
+ server_default=None)
+
+
+ def downgrade():
+ """Remove columns for cluster pool."""
+ pass
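Migration 002 above adds three nullable columns (pool, current_cluster_name, job_id_on_pool_cluster) to the spot-jobs job_info table; its effect is roughly three ALTER TABLE statements. A throwaway SQLite sketch of that effect (the base table schema here is invented for illustration; only the three column names and types come from the migration):

import sqlite3

conn = sqlite3.connect(':memory:')
# Minimal stand-in for the existing job_info table.
conn.execute('CREATE TABLE job_info (spot_job_id INTEGER PRIMARY KEY, name TEXT)')
for stmt in (
        'ALTER TABLE job_info ADD COLUMN pool TEXT DEFAULT NULL',
        'ALTER TABLE job_info ADD COLUMN current_cluster_name TEXT DEFAULT NULL',
        'ALTER TABLE job_info ADD COLUMN job_id_on_pool_cluster INTEGER DEFAULT NULL',
):
    conn.execute(stmt)
# Show the resulting column names.
print([row[1] for row in conn.execute('PRAGMA table_info(job_info)')])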
sky/serve/autoscalers.py CHANGED
@@ -175,6 +175,14 @@ class Autoscaler:
  """Collect request information from aggregator for autoscaling."""
  raise NotImplementedError

+ def info(self) -> Dict[str, Any]:
+ """Get information about the autoscaler."""
+ return {
+ 'target_num_replicas': self.target_num_replicas,
+ 'min_replicas': self.min_replicas,
+ 'max_replicas': self.max_replicas,
+ }
+
  def _generate_scaling_decisions(
  self,
  replica_infos: List['replica_managers.ReplicaInfo'],
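The new Autoscaler.info() hook simply exposes the current replica targets as a plain dict, with a shape like the following (values invented for illustration):

# Illustrative shape of the dict returned by Autoscaler.info(); numbers are made up.
example_info = {
    'target_num_replicas': 2,
    'min_replicas': 1,
    'max_replicas': 5,
}
assert set(example_info) == {'target_num_replicas', 'min_replicas', 'max_replicas'}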