skypilot-nightly 1.0.0.dev20250728__py3-none-any.whl → 1.0.0.dev20250730__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (152)
  1. sky/__init__.py +2 -2
  2. sky/catalog/kubernetes_catalog.py +2 -2
  3. sky/client/cli/command.py +0 -7
  4. sky/client/common.py +12 -9
  5. sky/clouds/kubernetes.py +2 -1
  6. sky/clouds/nebius.py +1 -1
  7. sky/clouds/utils/gcp_utils.py +1 -1
  8. sky/clouds/vast.py +1 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/_r2LwCFLjlWjZDUIJQG_V/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  12. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  13. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  14. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  25. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  26. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  28. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  30. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  31. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  33. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  36. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  40. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  43. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  46. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  62. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  63. sky/dashboard/out/clusters/[cluster].html +1 -1
  64. sky/dashboard/out/clusters.html +1 -1
  65. sky/dashboard/out/config.html +1 -1
  66. sky/dashboard/out/index.html +1 -1
  67. sky/dashboard/out/infra/[context].html +1 -1
  68. sky/dashboard/out/infra.html +1 -1
  69. sky/dashboard/out/jobs/[job].html +1 -1
  70. sky/dashboard/out/jobs.html +1 -1
  71. sky/dashboard/out/users.html +1 -1
  72. sky/dashboard/out/volumes.html +1 -1
  73. sky/dashboard/out/workspace/new.html +1 -1
  74. sky/dashboard/out/workspaces/[name].html +1 -1
  75. sky/dashboard/out/workspaces.html +1 -1
  76. sky/data/data_utils.py +25 -0
  77. sky/data/storage.py +1219 -1775
  78. sky/global_user_state.py +18 -8
  79. sky/jobs/server/core.py +4 -1
  80. sky/jobs/state.py +35 -7
  81. sky/jobs/utils.py +35 -17
  82. sky/logs/agent.py +0 -14
  83. sky/logs/aws.py +4 -30
  84. sky/provision/kubernetes/instance.py +4 -3
  85. sky/provision/kubernetes/utils.py +56 -31
  86. sky/provision/vast/instance.py +2 -1
  87. sky/provision/vast/utils.py +9 -6
  88. sky/resources.py +8 -2
  89. sky/serve/server/core.py +21 -2
  90. sky/serve/service.py +22 -2
  91. sky/server/server.py +7 -2
  92. sky/templates/sky-serve-controller.yaml.j2 +3 -0
  93. sky/utils/kubernetes/gpu_labeler.py +2 -2
  94. sky/utils/schemas.py +5 -1
  95. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/METADATA +1 -1
  96. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/RECORD +101 -100
  97. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  98. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  99. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  100. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  101. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  102. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  105. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  111. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  112. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  114. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  116. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  118. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  121. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  123. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  125. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  126. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  128. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  131. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  133. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  140. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  146. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  147. sky/dashboard/out/_next/static/ucBqsWPN0A5D2kXj8-FqQ/_buildManifest.js +0 -1
  148. /sky/dashboard/out/_next/static/{ucBqsWPN0A5D2kXj8-FqQ → _r2LwCFLjlWjZDUIJQG_V}/_ssgManifest.js +0 -0
  149. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/WHEEL +0 -0
  150. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/entry_points.txt +0 -0
  151. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/licenses/LICENSE +0 -0
  152. {skypilot_nightly-1.0.0.dev20250728.dist-info → skypilot_nightly-1.0.0.dev20250730.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -11,6 +11,7 @@ import json
 import os
 import pickle
 import re
+import threading
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.GLOBAL_USER_STATE_VERSION)


+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE

     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('state')

-    # get an engine to the db
-    engine = migration_utils.get_engine('state')
+        # run migrations if needed
+        create_table(engine)

-    # run migrations if needed
-    create_table(engine)
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE


 def _init_db(func):
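
The locking change in `initialize_and_get_db` above (mirrored in `sky/jobs/state.py` below) is the classic double-checked locking pattern for lazy singleton initialization. A minimal standalone sketch of the pattern, with a hypothetical `_make_engine()` standing in for `migration_utils.get_engine('state')`:

```python
import threading
from typing import Optional

_engine: Optional[object] = None
_engine_lock = threading.Lock()


def _make_engine() -> object:
    # Hypothetical stand-in for an expensive, create-once resource.
    return object()


def get_engine() -> object:
    global _engine
    # Unlocked fast path: cheap once the singleton exists.
    if _engine is not None:
        return _engine
    with _engine_lock:
        # Re-check under the lock: another thread may have initialized
        # the engine between our first check and acquiring the lock.
        if _engine is not None:
            return _engine
        _engine = _make_engine()
        return _engine
```

Without the second check, two threads could both pass the unlocked check and each build an engine, with the loser's engine silently overwritten, which is exactly the race the new comment describes.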
sky/jobs/server/core.py CHANGED
@@ -59,7 +59,10 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     # as uploading to the controller is only a local copy.
     storage_clouds = (
         storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
-    if not managed_job_utils.is_consolidation_mode() and storage_clouds:
+    force_disable_cloud_bucket = skypilot_config.get_nested(
+        ('jobs', 'force_disable_cloud_bucket'), False)
+    if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
+            not force_disable_cloud_bucket):
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
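
For context, `skypilot_config.get_nested(('jobs', 'force_disable_cloud_bucket'), False)` reads a nested key from the user's SkyPilot config and returns the default when any level is missing. A simplified sketch of those lookup semantics (the real helper reads the loaded config rather than taking a dict argument):

```python
from typing import Any, Dict, Tuple


def get_nested(config: Dict[str, Any], keys: Tuple[str, ...],
               default: Any) -> Any:
    """Walk a nested dict by key path; return `default` on any miss."""
    cur: Any = config
    for key in keys:
        if not isinstance(cur, dict) or key not in cur:
            return default
        cur = cur[key]
    return cur


# A user config with `jobs.force_disable_cloud_bucket: true` disables the
# bucket-backed upload path in _upload_files_to_controller above.
assert get_nested({'jobs': {'force_disable_cloud_bucket': True}},
                  ('jobs', 'force_disable_cloud_bucket'), False) is True
```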
sky/jobs/state.py CHANGED
@@ -4,6 +4,7 @@
 import enum
 import functools
 import json
+import threading
 import time
 import typing
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -33,6 +34,7 @@ CallbackType = Callable[[str], None]
 logger = sky_logging.init_logger(__name__)

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()

 Base = declarative.declarative_base()

@@ -131,21 +133,30 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.SPOT_JOBS_VERSION)


+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE

     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE

-    # get an engine to the db
-    engine = migration_utils.get_engine('spot_jobs')
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('spot_jobs')

-    # run migrations if needed
-    create_table(engine)
+        # run migrations if needed
+        create_table(engine)

-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE


 def _init_db(func):
@@ -1045,6 +1056,23 @@ def _get_all_task_ids_statuses(
     return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]


+@_init_db
+def get_all_task_ids_names_statuses_logs(
+        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        id_names = session.execute(
+            sqlalchemy.select(
+                spot_table.c.task_id,
+                spot_table.c.task_name,
+                spot_table.c.status,
+                spot_table.c.local_log_file,
+            ).where(spot_table.c.spot_job_id == job_id).order_by(
+                spot_table.c.task_id.asc())).fetchall()
+    return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+            for row in id_names]
+
+
 @_init_db
 def get_job_status_with_task_id(job_id: int,
                                 task_id: int) -> Optional[ManagedJobStatus]:
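
The new `get_all_task_ids_names_statuses_logs` uses SQLAlchemy Core's `select(...).where(...).order_by(...)` chain. A self-contained sketch of the same query shape against an in-memory SQLite table (table and column names are illustrative, not the real schema):

```python
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
meta = sqlalchemy.MetaData()
spot = sqlalchemy.Table(
    'spot', meta,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
    sqlalchemy.Column('task_id', sqlalchemy.Integer),
    sqlalchemy.Column('task_name', sqlalchemy.String),
    sqlalchemy.Column('status', sqlalchemy.String),
    sqlalchemy.Column('local_log_file', sqlalchemy.String),
)
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(spot.insert(), [
        {'spot_job_id': 1, 'task_id': 0, 'task_name': 'train',
         'status': 'RUNNING', 'local_log_file': '/tmp/t0.log'},
    ])
    # Same shape as the query above: select several columns, filter by
    # job id, order tasks by id.
    rows = conn.execute(
        sqlalchemy.select(spot.c.task_id, spot.c.task_name, spot.c.status,
                          spot.c.local_log_file)
        .where(spot.c.spot_job_id == 1)
        .order_by(spot.c.task_id.asc())).fetchall()
    print(rows)
```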
sky/jobs/utils.py CHANGED
@@ -716,23 +716,41 @@ def stream_logs_by_id(job_id: int,
             if managed_job_status.is_failed():
                 job_msg = ('\nFailure reason: '
                            f'{managed_job_state.get_failure_reason(job_id)}')
-        log_file = managed_job_state.get_local_log_file(job_id, None)
-        if log_file is not None:
-            with open(os.path.expanduser(log_file), 'r',
-                      encoding='utf-8') as f:
-                # Stream the logs to the console without reading the whole
-                # file into memory.
-                start_streaming = False
-                read_from: Union[TextIO, Deque[str]] = f
-                if tail is not None:
-                    assert tail > 0
-                    # Read only the last 'tail' lines using deque
-                    read_from = collections.deque(f, maxlen=tail)
-                for line in read_from:
-                    if log_lib.LOG_FILE_START_STREAMING_AT in line:
-                        start_streaming = True
-                    if start_streaming:
-                        print(line, end='', flush=True)
+        log_file_exists = False
+        task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
+            job_id)
+        num_tasks = len(task_info)
+        for task_id, task_name, task_status, log_file in task_info:
+            if log_file:
+                log_file_exists = True
+                task_str = (f'Task {task_name}({task_id})'
+                            if task_name else f'Task {task_id}')
+                if num_tasks > 1:
+                    print(f'=== {task_str} ===')
+                with open(os.path.expanduser(log_file),
+                          'r',
+                          encoding='utf-8') as f:
+                    # Stream the logs to the console without reading the
+                    # whole file into memory.
+                    start_streaming = False
+                    read_from: Union[TextIO, Deque[str]] = f
+                    if tail is not None:
+                        assert tail > 0
+                        # Read only the last 'tail' lines using deque
+                        read_from = collections.deque(f, maxlen=tail)
+                    for line in read_from:
+                        if log_lib.LOG_FILE_START_STREAMING_AT in line:
+                            start_streaming = True
+                        if start_streaming:
+                            print(line, end='', flush=True)
+                if num_tasks > 1:
+                    # Add the "Task finished" message for terminal states
+                    if task_status.is_terminal():
+                        print(ux_utils.finishing_message(
+                            f'{task_str} finished '
+                            f'(status: {task_status.value}).'),
+                              flush=True)
+        if log_file_exists:
             # Add the "Job finished" message for terminal states
             if managed_job_status.is_terminal():
                 print(ux_utils.finishing_message(
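
The `collections.deque(f, maxlen=tail)` idiom retained above is worth calling out: iterating a file object into a bounded deque keeps only the last `tail` lines in memory (the whole file is still read once, but never materialized as a list). A minimal sketch:

```python
import collections


def tail_lines(path: str, tail: int) -> None:
    """Print only the last `tail` lines of a file. Memory stays bounded
    at `tail` lines because the deque discards older lines as the file
    object is consumed line by line."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in collections.deque(f, maxlen=tail):
            print(line, end='')
```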
sky/logs/agent.py CHANGED
@@ -67,20 +67,6 @@ class FluentbitAgent(LoggingAgent):
         }
         return common_utils.dump_yaml_str(cfg_dict)

-    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """Add fallback outputs to the Fluent Bit configuration.
-
-        This method can be overridden by subclasses to add fallback outputs
-        in case the primary output fails.
-
-        Args:
-            cfg_dict: The Fluent Bit configuration dictionary.
-
-        Returns:
-            The updated configuration dictionary.
-        """
-        return cfg_dict
-
     @abc.abstractmethod
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
sky/logs/aws.py CHANGED
@@ -9,6 +9,8 @@ from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import resources_utils

+EC2_MD_URL = '"${AWS_EC2_METADATA_SERVICE_ENDPOINT:-http://169.254.169.254/}"'
+

 class _CloudwatchLoggingConfig(pydantic.BaseModel):
     """Configuration for AWS CloudWatch logging agent."""
@@ -109,8 +111,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
         # Check if we're running on EC2 with an IAM role or if
         # AWS credentials are available in the environment
         pre_cmd = (
-            'if ! curl -s -m 1 http://169.254.169.254'
-            '/latest/meta-data/iam/security-credentials/ > /dev/null; '
+            f'if ! curl -s -m 1 {EC2_MD_URL}'
+            'latest/meta-data/iam/security-credentials/ > /dev/null; '
             'then '
             # failed EC2 check, look for env vars
             'if [ -z "$AWS_ACCESS_KEY_ID" ] || '
@@ -211,36 +213,8 @@ class CloudwatchLoggingAgent(FluentbitAgent):
             }
         }

-        # Add fallback outputs for graceful failure handling
-        cfg_dict = self.add_fallback_outputs(cfg_dict)
-
         return common_utils.dump_yaml_str(cfg_dict)

-    def add_fallback_outputs(self, cfg_dict: Dict[str, Any]) -> Dict[str, Any]:
-        """Add fallback outputs to the Fluent Bit configuration.
-
-        This adds a local file output as a fallback in case
-        CloudWatch logging fails.
-
-        Args:
-            cfg_dict: The Fluent Bit configuration dictionary.
-
-        Returns:
-            The updated configuration dictionary.
-        """
-        # Add a local file output as a fallback
-        fallback_output = {
-            'name': 'file',
-            'match': '*',
-            'path': '/tmp/skypilot_logs_fallback.log',
-            'format': 'out_file',
-        }
-
-        # Add the fallback output to the configuration
-        cfg_dict['pipeline']['outputs'].append(fallback_output)
-
-        return cfg_dict
-
     def fluentbit_output_config(
             self, cluster_name: resources_utils.ClusterName) -> Dict[str, Any]:
         """Get the Fluent Bit output configuration for CloudWatch.
sky/provision/kubernetes/instance.py CHANGED
@@ -210,7 +210,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
             # case we will need to update this logic.
             # TODO(Doyoung): Update the error message raised
             # with the multi-host TPU support.
-            gpu_resource_key = kubernetes_utils.get_gpu_resource_key()  # pylint: disable=line-too-long
+            gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context)  # pylint: disable=line-too-long
             if 'Insufficient google.com/tpu' in event_message:
                 extra_msg = (
                     f'Verify if '
@@ -797,7 +797,8 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     limits = pod_spec['spec']['containers'][0].get('resources',
                                                    {}).get('limits')
     if limits is not None:
-        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
+        needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
+                                0) > 0

     # TPU pods provisioned on GKE use the default containerd runtime.
     # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
@@ -900,7 +901,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         # to the non-DWS case.
         if needs_gpus:
             gpu_toleration = {
-                'key': kubernetes_utils.get_gpu_resource_key(),
+                'key': kubernetes_utils.get_gpu_resource_key(context),
                 'operator': 'Exists',
                 'effect': 'NoSchedule'
             }
sky/provision/kubernetes/utils.py CHANGED
@@ -147,12 +147,14 @@ MEMORY_SIZE_UNITS = {
 # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
 # nodes. These keys are typically used in the node's status.allocatable
 # or status.capacity fields to indicate the available resources on the node.
-GPU_RESOURCE_KEY = 'nvidia.com/gpu'
+SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}
 TPU_RESOURCE_KEY = 'google.com/tpu'

 NO_ACCELERATOR_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
-    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    f'one of {SUPPORTED_GPU_RESOURCE_KEYS["amd"]}, '
+    f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]} or '
+    f'{TPU_RESOURCE_KEY} resource is available '
     'on the nodes and the node labels for identifying GPUs/TPUs '
     '(e.g., skypilot.co/accelerator) are setup correctly. ')

@@ -391,6 +393,8 @@ def get_gke_accelerator_name(accelerator: str) -> str:
         return 'nvidia-h200-141gb'
     elif accelerator.startswith('tpu-'):
         return accelerator
+    elif accelerator.startswith('amd-'):
+        return accelerator
     else:
         return 'nvidia-tesla-{}'.format(accelerator.lower())

@@ -1098,10 +1102,10 @@ def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
     """Checks if the Kubernetes cluster has GPU/TPU resource.

-    Two types of accelerator resources are available which are each checked
-    with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
+    Three types of accelerator resources are available which are each checked
+    with amd.com/gpu, nvidia.com/gpu and google.com/tpu. If amd.com/gpu or nvidia.com/gpu resource is
     missing, that typically means that the Kubernetes cluster does not have
-    GPUs or the nvidia GPU operator and/or device drivers are not installed.
+    GPUs or the amd/nvidia GPU operator and/or device drivers are not installed.

     Returns:
         bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
@@ -1112,7 +1116,7 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context=context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_accelerator = (get_gpu_resource_key() in cluster_resources or
+    has_accelerator = (get_gpu_resource_key(context) in cluster_resources or
                        TPU_RESOURCE_KEY in cluster_resources)

     return has_accelerator, cluster_resources
@@ -1262,8 +1266,8 @@ def check_instance_fits(context: Optional[str],
     else:
         # Check if any of the GPU nodes have sufficient number of GPUs.
         gpu_nodes = [
-            node for node in gpu_nodes if
-            get_node_accelerator_count(node.status.allocatable) >= acc_count
+            node for node in gpu_nodes if get_node_accelerator_count(
+                context, node.status.allocatable) >= acc_count
         ]
         if not gpu_nodes:
             return False, (
@@ -1325,14 +1329,14 @@ def get_accelerator_label_key_values(
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
             - The cluster does not have GPU/TPU resources
-                (nvidia.com/gpu, google.com/tpu)
+                (amd.com/gpu, nvidia.com/gpu, google.com/tpu)
             - The cluster has GPU/TPU resources, but no node in the cluster has
               an accelerator label.
             - The cluster has a node with an invalid accelerator label value.
             - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
-    # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
+    # TODO(romilb): This assumes the accelerator is a amd/nvidia GPU. We
     # need to support TPUs and other accelerators as well.
     # TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling
     # is configured in config.yaml since the cluster may be scaling up from
@@ -1496,12 +1500,15 @@ def get_accelerator_label_key_values(
                 f'`sky ssh up --infra {context_display_name}`. {suffix}')
         else:
             msg = (
-                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'Could not detect GPU/TPU resources ({SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
                 f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
                 ' contains GPUs, please ensure GPU drivers are installed on '
                 'the node. Check if the GPUs are setup correctly by running '
                 '`kubectl describe nodes` and looking for the '
-                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+                f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
+                f'{TPU_RESOURCE_KEY!r} resource. '
                 'Please refer to the documentation on how to set up GPUs.'
                 f'{suffix}')
             raise exceptions.ResourcesUnavailableError(msg)
@@ -2861,7 +2868,7 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
     nodes = get_kubernetes_nodes(context=context)
     nodes_with_accelerator = []
     for node in nodes:
-        if get_gpu_resource_key() in node.status.capacity:
+        if get_gpu_resource_key(context) in node.status.capacity:
             nodes_with_accelerator.append(node)

     label_formatter, _ = detect_gpu_label_formatter(context)
@@ -2950,7 +2957,8 @@ def get_kubernetes_node_info(
             break

         allocated_qty = 0
-        accelerator_count = get_node_accelerator_count(node.status.allocatable)
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)

         if pods is None:
             accelerators_available = -1
@@ -2965,7 +2973,7 @@ def get_kubernetes_node_info(
                 for container in pod.spec.containers:
                     if container.resources.requests:
                         allocated_qty += get_node_accelerator_count(
-                            container.resources.requests)
+                            context, container.resources.requests)

             accelerators_available = accelerator_count - allocated_qty

@@ -3171,13 +3179,16 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods


-def is_tpu_on_gke(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
-    normalized, _ = normalize_tpu_accelerator_name(accelerator)
-    return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    if normalize:
+        normalized, _ = normalize_tpu_accelerator_name(accelerator)
+        return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
+    return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION


-def get_node_accelerator_count(attribute_dict: dict) -> int:
+def get_node_accelerator_count(context: Optional[str],
+                               attribute_dict: dict) -> int:
     """Retrieves the count of accelerators from a node's resource dictionary.

     This method checks the node's allocatable resources or the accelerators
@@ -3192,7 +3203,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
         resource is found, it returns 0.
     """
-    gpu_resource_name = get_gpu_resource_key()
+    gpu_resource_name = get_gpu_resource_key(context)
     assert not (gpu_resource_name in attribute_dict and
                 TPU_RESOURCE_KEY in attribute_dict)
     if gpu_resource_name in attribute_dict:
@@ -3318,7 +3329,7 @@ def process_skypilot_pods(
             unit='G')
         gpu_count = parse_cpu_or_gpu_resource(
             pod.spec.containers[0].resources.requests.get(
-                'nvidia.com/gpu', '0'))
+                get_gpu_resource_key(context), '0'))
         gpu_name = None
         if gpu_count > 0:
             label_formatter, _ = (detect_gpu_label_formatter(context))
@@ -3373,19 +3384,33 @@ def process_skypilot_pods(
     return list(clusters.values()), jobs_controllers, serve_controllers


-def get_gpu_resource_key():
-    """Get the GPU resource name to use in kubernetes.
-    The function first checks for an environment variable.
-    If defined, it uses its value; otherwise, it returns the default value.
-    Args:
-        name (str): Default GPU resource name, default is "nvidia.com/gpu".
+def _gpu_resource_key_helper(context: Optional[str]) -> str:
+    """Helper function to get the GPU resource key."""
+    gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
+    try:
+        nodes = kubernetes.core_api(context).list_node().items
+        for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
+            if any(gpu_key in node.status.capacity for node in nodes):
+                return gpu_key
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(f'Failed to load kube config or query nodes: {e}. '
+                       'Falling back to default GPU resource key.')
+    return gpu_resource_key
+
+
+@annotations.lru_cache(scope='request')
+def get_gpu_resource_key(context: Optional[str] = None) -> str:
+    """Get the GPU resource name to use in Kubernetes.
+
+    The function auto-detects the GPU resource key by querying the Kubernetes
+    node API. If detection fails, it falls back to a default value.
+    An environment variable can override the detected or default value.
+
     Returns:
         str: The selected GPU resource name.
     """
-    # Retrieve GPU resource name from environment variable, if set.
-    # Else use default.
-    # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
-    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+    gpu_resource_key = _gpu_resource_key_helper(context)
+    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=gpu_resource_key)


 def get_kubeconfig_paths() -> List[str]:
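
The detection in `_gpu_resource_key_helper` can be reproduced with the plain `kubernetes` Python client. A standalone sketch of the same idea (it calls `load_kube_config` directly, whereas SkyPilot goes through its own `kubernetes.core_api` adaptor):

```python
from typing import Optional

from kubernetes import client, config

SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}


def detect_gpu_resource_key(context: Optional[str] = None) -> str:
    """Return the first supported GPU resource key advertised by any node,
    defaulting to nvidia.com/gpu when nothing matches or the query fails."""
    try:
        config.load_kube_config(context=context)
        nodes = client.CoreV1Api().list_node().items
        for key in SUPPORTED_GPU_RESOURCE_KEYS.values():
            if any(key in (node.status.capacity or {}) for node in nodes):
                return key
    except Exception:
        pass  # fall through to the default below
    return SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
```

Note the fallback order matters: the result is cached per request via `annotations.lru_cache(scope='request')` above, so a transient API failure only pins the default key for the current request rather than forever.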
sky/provision/vast/instance.py CHANGED
@@ -97,7 +97,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 region=region,
                 disk_size=config.node_config['DiskSize'],
                 preemptible=config.node_config['Preemptible'],
-                image_name=config.node_config['ImageId'])
+                image_name=config.node_config['ImageId'],
+                ports=config.ports_to_open_on_launch)
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
             raise
sky/provision/vast/utils.py CHANGED
@@ -5,7 +5,7 @@
 # python sdk.
 #
 """Vast library wrapper for SkyPilot."""
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from sky import sky_logging
 from sky.adaptors import vast
@@ -34,7 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:


 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, preemptible: bool) -> str:
+           image_name: str, ports: Optional[List[int]],
+           preemptible: bool) -> str:
     """Launches an instance with the given parameters.

     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -58,6 +59,8 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
           The disk size {xx} GB is not exactly matched the requested
           size {yy} GB. It is possible to charge extra cost on disk.

+    * `ports`: This is a feature flag to expose ports to the internet.
+
     * `geolocation`: Geolocation on Vast can be as specific as the
       host chooses to be. They can say, for instance, "Yutakachō,
       Shinagawa District, Tokyo, JP." Such a specific geolocation
@@ -79,9 +82,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

     * Vast instance types are an invention for skypilot. Refer to
       catalog/vast_catalog.py for the current construction
-      of the type.
-
-    """
+      of the type."""
     cpu_ram = float(instance_type.split('-')[-1]) / 1024
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
@@ -104,11 +105,13 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,

     instance_touse = instance_list[0]

+    port_map = ' '.join([f'-p {p}:{p}' for p in ports]) if ports else ''
+
     launch_params = {
         'id': instance_touse['id'],
         'direct': True,
         'ssh': True,
-        'env': '-e __SOURCE=skypilot',
+        'env': f'-e __SOURCE=skypilot {port_map}',
         'onstart_cmd': ';'.join([
             'touch ~/.no_auto_tmux',
             f'echo "{vast.vast().api_key_access}" > ~/.vast_api_key',
sky/resources.py CHANGED
@@ -797,8 +797,13 @@ class Resources:

         acc, _ = list(accelerators.items())[0]
         if 'tpu' in acc.lower():
+            # TODO(syang): GCP TPU names are supported on both GCP and
+            # kubernetes (GKE), but this logic automatically assumes
+            # GCP TPUs can only be used on GCP.
+            # Fix the logic such that GCP TPU names can failover between
+            # GCP and kubernetes.
             if self.cloud is None:
-                if kubernetes_utils.is_tpu_on_gke(acc):
+                if kubernetes_utils.is_tpu_on_gke(acc, normalize=False):
                     self._cloud = clouds.Kubernetes()
                 else:
                     self._cloud = clouds.GCP()
@@ -813,7 +818,8 @@ class Resources:

             use_tpu_vm = accelerator_args.get('tpu_vm', True)
             if (self.cloud.is_same_cloud(clouds.GCP()) and
-                    not kubernetes_utils.is_tpu_on_gke(acc)):
+                    not kubernetes_utils.is_tpu_on_gke(acc,
+                                                       normalize=False)):
                 if 'runtime_version' not in accelerator_args:

                     def _get_default_runtime_version() -> str:
sky/serve/server/core.py CHANGED
@@ -18,6 +18,7 @@ from sky import skypilot_config
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
+from sky.data import storage as storage_lib
 from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
@@ -151,8 +152,25 @@ def up(

     with rich_utils.safe_status(
             ux_utils.spinner_message('Initializing service')):
-        controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-            task, task_type='serve')
+        # Handle file mounts using two-hop approach when cloud storage
+        # unavailable
+        storage_clouds = (
+            storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
+        force_disable_cloud_bucket = skypilot_config.get_nested(
+            ('serve', 'force_disable_cloud_bucket'), False)
+        if storage_clouds and not force_disable_cloud_bucket:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task, task_type='serve')
+            local_to_controller_file_mounts = {}
+        else:
+            # Fall back to two-hop file_mount uploading when no cloud storage
+            if task.storage_mounts:
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+            local_to_controller_file_mounts = (
+                controller_utils.translate_local_file_mounts_to_two_hop(task))

     tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
         service_name, task)
@@ -183,6 +201,7 @@ def up(
         'service_name': service_name,
         'controller_log_file': controller_log_file,
         'remote_user_config_path': remote_config_yaml_path,
+        'local_to_controller_file_mounts': local_to_controller_file_mounts,
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
         **tls_template_vars,