skypilot-nightly 1.0.0.dev20251002__py3-none-any.whl → 1.0.0.dev20251004__py3-none-any.whl
This diff shows the changes between two publicly released package versions, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -109
- sky/backends/cloud_vm_ray_backend.py +42 -27
- sky/client/cli/command.py +1 -11
- sky/clouds/cudo.py +1 -1
- sky/clouds/kubernetes.py +7 -19
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-ad77b12fc736dca3.js → [job]-72794fc3fcdd517a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-7340bc0f0dd8ae74.js → webpack-3286453d56f3c0a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +9 -0
- sky/execution.py +24 -2
- sky/global_user_state.py +16 -0
- sky/jobs/recovery_strategy.py +45 -0
- sky/jobs/server/core.py +60 -53
- sky/jobs/state.py +21 -1
- sky/jobs/utils.py +29 -11
- sky/provision/kubernetes/config.py +0 -42
- sky/provision/kubernetes/instance.py +1 -33
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network_utils.py +0 -21
- sky/provision/kubernetes/utils.py +136 -300
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +6 -0
- sky/server/server.py +6 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -13
- sky/utils/context.py +12 -7
- sky/utils/env_options.py +4 -0
- sky/utils/kubernetes_enums.py +2 -15
- sky/utils/schemas.py +17 -6
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/METADATA +38 -37
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/RECORD +55 -56
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- /sky/dashboard/out/_next/static/{16g0-hgEgk6Db72hpE8MY → KL03GEega4QqDqTOMtA_w}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251002.dist-info → skypilot_nightly-1.0.0.dev20251004.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -7,6 +7,7 @@ resources:
 """
 import asyncio
 import logging
+import os
 import traceback
 import typing
 from typing import Optional, Set
@@ -16,16 +17,19 @@ from sky import dag as dag_lib
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
 from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -45,6 +49,13 @@ MAX_JOB_CHECKING_RETRY = 10
 # cluster before its status can be updated by the job controller.
 _AUTODOWN_MINUTES = 10

+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
+

 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""
@@ -213,6 +224,7 @@ class StrategyExecutor:
     **kwargs,
     _try_cancel_if_cluster_is_init=True,
 )
+self._logger.debug(f'sdk.cancel request ID: {request_id}')
 await context_utils.to_thread(
     sdk.get,
     request_id,
@@ -371,6 +383,31 @@ class StrategyExecutor:
 usage_lib.messages.usage.set_internal()
 if self.pool is None:
     assert self.cluster_name is not None
+
+    # sdk.launch will implicitly start the API server,
+    # but then the API server will inherit the current
+    # env vars/user, which we may not want.
+    # Instead, clear env vars here and call api_start
+    # explicitly.
+    vars_to_restore = {}
+    try:
+        for env_var in ENV_VARS_TO_CLEAR:
+            vars_to_restore[env_var] = os.environ.pop(
+                env_var, None)
+            self._logger.debug('Cleared env var: '
+                               f'{env_var}')
+        self._logger.debug('Env vars for api_start: '
+                           f'{os.environ}')
+        await context_utils.to_thread(sdk.api_start)
+        self._logger.info('API server started.')
+    finally:
+        for env_var, value in vars_to_restore.items():
+            if value is not None:
+                self._logger.debug(
+                    'Restored env var: '
+                    f'{env_var}: {value}')
+                os.environ[env_var] = value
+
 log_file = _get_logger_file(self._logger)
 request_id = None
 try:
@@ -392,6 +429,8 @@ class StrategyExecutor:
 # down=True,
 _is_launched_by_jobs_controller=True,
 )
+self._logger.debug('sdk.launch request ID: '
+                   f'{request_id}')
 if log_file is None:
     raise OSError('Log file is None')
 with open(log_file, 'a', encoding='utf-8') as f:
@@ -404,6 +443,8 @@ class StrategyExecutor:
 if request_id:
     req = await context_utils.to_thread(
         sdk.api_cancel, request_id)
+    self._logger.debug('sdk.api_cancel request '
+                       f'ID: {req}')
     try:
         await context_utils.to_thread(
             sdk.get, req)
@@ -427,6 +468,8 @@ class StrategyExecutor:
 self.dag,
 cluster_name=self.cluster_name,
 )
+self._logger.debug('sdk.exec request ID: '
+                   f'{request_id}')
 job_id_on_pool_cluster, _ = (
     await context_utils.to_thread(
         sdk.get, request_id))
@@ -434,6 +477,8 @@ class StrategyExecutor:
 if request_id:
     req = await context_utils.to_thread(
         sdk.api_cancel, request_id)
+    self._logger.debug('sdk.api_cancel request '
+                       f'ID: {req}')
     try:
         await context_utils.to_thread(
             sdk.get, req)
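The new block in StrategyExecutor follows a clear-then-restore pattern: selected environment variables are popped before sdk.api_start() runs (so the spawned API server does not inherit the caller's config, user, or debug settings) and are put back in a finally block. Below is a minimal standalone sketch of that pattern, not SkyPilot's code: the variable names and the plain callable stand in for SkyPilot's constants and the async sdk.api_start call.

import os
from typing import Dict, Iterable, Optional

# Hypothetical names; the real diff clears keys from skypilot_config,
# skylet constants, and env_options.
VARS_TO_CLEAR = ['SKYPILOT_CONFIG', 'SKYPILOT_USER_ID', 'SKYPILOT_USER']

def call_with_cleared_env(fn, vars_to_clear: Iterable[str] = VARS_TO_CLEAR):
    """Run fn() with the given env vars removed, restoring them afterwards."""
    saved: Dict[str, Optional[str]] = {}
    try:
        for name in vars_to_clear:
            # os.environ.pop returns None if the variable was not set.
            saved[name] = os.environ.pop(name, None)
        return fn()
    finally:
        for name, value in saved.items():
            if value is not None:
                os.environ[name] = value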
sky/jobs/server/core.py
CHANGED
@@ -27,6 +27,7 @@ from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
 from sky.schemas.api import responses
 from sky.serve import serve_state
@@ -666,6 +667,7 @@ def queue_v2_api(
 ], total, status_counts, total_no_filter


+@metrics_lib.time_me
 def queue_v2(
     refresh: bool,
     skip_finished: bool = False,
@@ -723,11 +725,12 @@ def queue_v2(
 if page is not None:
     raise ValueError('Limit must be specified when page is specified')

- … (5 removed lines not rendered in the source diff view)
+with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+    handle = _maybe_restart_controller(refresh,
+                                       stopped_message='No in-progress '
+                                       'managed jobs.',
+                                       spinner_message='Checking '
+                                       'managed jobs')
 backend = backend_utils.get_backend_from_handle(handle)
 assert isinstance(backend, backends.CloudVmRayBackend)
@@ -778,70 +781,74 @@ def queue_v2(
 except exceptions.SkyletMethodNotImplementedError:
     pass

- … (9 removed lines not rendered in the source diff view)
+with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes, statuses)
+with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+    returncode, job_table_payload, stderr = backend.run_on_head(
+        handle,
+        code,
+        require_outputs=True,
+        stream_logs=False,
+        separate_stderr=True)

 if returncode != 0:
     logger.error(job_table_payload + stderr)
     raise RuntimeError('Failed to fetch managed jobs with returncode: '
                        f'{returncode}.\n{job_table_payload + stderr}')

-(jobs,
- … (1 removed line not rendered in the source diff view)
+with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+    (jobs, total, result_type, total_no_filter, status_counts
+    ) = managed_job_utils.load_managed_job_queue(job_table_payload)

 if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
     return jobs, total, status_counts, total_no_filter

 # Backward compatibility for old jobs controller without filtering
 # TODO(hailong): remove this after 0.12.0
- … (2 removed lines not rendered in the source diff view)
-def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
-    user_hash = job.get('user_hash', None)
-    if user_hash is None:
-        # For backwards compatibility, we show jobs that do not have a
-        # user_hash. TODO(cooperc): Remove before 0.12.0.
-        return True
-    return user_hash == common_utils.get_user_hash()
+with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+    if not all_users:

- … (1 removed line not rendered in the source diff view)
+        def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+            user_hash = job.get('user_hash', None)
+            if user_hash is None:
+                # For backwards compatibility, we show jobs that do not have
+                # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                return True
+            return user_hash == common_utils.get_user_hash()

- … (1 removed line not rendered in the source diff view)
-filter(
-    lambda job: job.get('workspace', skylet_constants.
-    SKYPILOT_DEFAULT_WORKSPACE) in
-    accessible_workspaces, jobs))
+        jobs = list(filter(user_hash_matches_or_missing, jobs))

-if skip_finished:
-    # Filter out the finished jobs. If a multi-task job is partially
-    # finished, we will include all its tasks.
-    non_finished_tasks = list(
-        filter(lambda job: not job['status'].is_terminal(), jobs))
-    non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
 jobs = list(
-    filter(
- … (15 removed lines not rendered in the source diff view)
+        filter(
+            lambda job: job.get('workspace', skylet_constants.
+                                SKYPILOT_DEFAULT_WORKSPACE) in
+            accessible_workspaces, jobs))
+
+    if skip_finished:
+        # Filter out the finished jobs. If a multi-task job is partially
+        # finished, we will include all its tasks.
+        non_finished_tasks = list(
+            filter(lambda job: not job['status'].is_terminal(), jobs))
+        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+        jobs = list(
+            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+    if job_ids:
+        jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+    filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+        jobs,
+        workspace_match,
+        name_match,
+        pool_match,
+        page=page,
+        limit=limit,
+        user_match=user_match,
+        enable_user_match=True,
+        statuses=statuses,
+    )
 return filtered_jobs, total, status_counts, total_no_filter
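This change instruments each stage of queue_v2 (controller restart, code generation, run_on_head, payload loading, filtering) with timing scopes from sky.metrics and decorates the function itself. The metrics_lib API is not shown in this diff; the sketch below is a generic, stdlib-only approximation of such time_it/time_me helpers, with hypothetical names, not SkyPilot's implementation.

import functools
import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def time_block(name: str, group: str = 'default'):
    """Log how long the wrapped block takes (sketch of a time_it-style helper)."""
    start = time.perf_counter()
    try:
        yield
    finally:
        logger.debug('%s/%s took %.3fs', group, name, time.perf_counter() - start)

def timed(fn):
    """Decorator variant (sketch of a time_me-style helper)."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        with time_block(fn.__qualname__):
            return fn(*args, **kwargs)
    return wrapper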
sky/jobs/state.py
CHANGED
@@ -10,7 +10,8 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple,
+from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
+                    Union)
 import urllib.parse

 import colorama
@@ -1250,6 +1251,25 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
 return pool[0] if pool else None


+@_init_db
+def get_pool_and_submit_info_from_job_ids(
+    job_ids: Set[int]
+) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
+    """Get the pool, cluster name, and job id on pool from job id"""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.spot_job_id, job_info_table.c.pool,
+                job_info_table.c.current_cluster_name,
+                job_info_table.c.job_id_on_pool_cluster).where(
+                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
+    return {
+        job_id: (pool, cluster_name, job_id_on_pool_cluster)
+        for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
+    }
+
+
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
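get_pool_and_submit_info_from_job_ids replaces repeated single-row lookups with one SELECT ... WHERE spot_job_id IN (...) that returns a dict keyed by job id. Below is a self-contained sketch of the same batched-query shape, using a toy in-memory SQLite table rather than SkyPilot's real job_info schema.

from typing import Dict, Optional, Set, Tuple

import sqlalchemy

metadata = sqlalchemy.MetaData()
# Toy stand-in for the job_info table used in the diff.
job_info = sqlalchemy.Table(
    'job_info', metadata,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column('pool', sqlalchemy.String, nullable=True),
    sqlalchemy.Column('current_cluster_name', sqlalchemy.String, nullable=True),
    sqlalchemy.Column('job_id_on_pool_cluster', sqlalchemy.Integer, nullable=True))

engine = sqlalchemy.create_engine('sqlite://')
metadata.create_all(engine)

def pool_info_for(
    job_ids: Set[int]
) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
    """One query for all job ids instead of one query per job."""
    with engine.connect() as conn:
        rows = conn.execute(
            sqlalchemy.select(job_info.c.spot_job_id, job_info.c.pool,
                              job_info.c.current_cluster_name,
                              job_info.c.job_id_on_pool_cluster).where(
                                  job_info.c.spot_job_id.in_(job_ids))).fetchall()
    return {jid: (pool, name, pool_job_id) for jid, pool, name, pool_job_id in rows}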
sky/jobs/utils.py
CHANGED
@@ -1325,6 +1325,23 @@ def get_managed_job_queue(
     page,
     limit,
     statuses=statuses)
+
+job_ids = set(job['job_id'] for job in jobs)
+job_id_to_pool_info = (
+    managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
+cluster_names: Dict[int, str] = {}
+for job in jobs:
+    # pool info is (pool, cluster_name, job_id_on_pool_cluster)
+    pool_info = job_id_to_pool_info.get(job['job_id'], None)
+    if pool_info and pool_info[0]:
+        cluster_name = pool_info[1]
+    else:
+        cluster_name = generate_managed_job_cluster_name(
+            job['task_name'], job['job_id'])
+    cluster_names[job['job_id']] = cluster_name
+cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
+    set(cluster_names.values()))
+
 for job in jobs:
     end_at = job['end_at']
     if end_at is None:
@@ -1344,15 +1361,8 @@ def get_managed_job_queue(
 job['status'] = job['status'].value
 job['schedule_state'] = job['schedule_state'].value

- … (2 removed lines not rendered in the source diff view)
-    cluster_name, _ = managed_job_state.get_pool_submit_info(
-        job['job_id'])
-else:
-    cluster_name = generate_managed_job_cluster_name(
-        job['task_name'], job['job_id'])
-handle = global_user_state.get_handle_from_cluster_name(
-    cluster_name) if cluster_name is not None else None
+cluster_name = cluster_names[job['job_id']]
+handle = cluster_name_to_handles.get(cluster_name, None)
 if isinstance(handle, backends.CloudVmRayResourceHandle):
     resources_str = resources_utils.get_readable_resources_repr(
         handle, simplify=True)
@@ -1507,12 +1517,20 @@ def load_managed_job_queue(
 total_no_filter = total
 result_type = ManagedJobQueueResultType.LIST

+job_id_to_user_hash: Dict[int, str] = {}
 for job in jobs:
-    job['status'] = managed_job_state.ManagedJobStatus(job['status'])
     if 'user_hash' in job and job['user_hash'] is not None:
         # Skip jobs that do not have user_hash info.
         # TODO(cooperc): Remove check before 0.12.0.
-        … (1 removed line not rendered in the source diff view)
+        job_id_to_user_hash[job['job_id']] = job['user_hash']
+user_hash_to_user = global_user_state.get_users(
+    job_id_to_user_hash.values())
+
+for job in jobs:
+    job['status'] = managed_job_state.ManagedJobStatus(job['status'])
+    if job['job_id'] in job_id_to_user_hash:
+        user_hash = job_id_to_user_hash[job['job_id']]
+        user = user_hash_to_user.get(user_hash, None)
         job['user_name'] = user.name if user is not None else None
 return jobs, total, result_type, total_no_filter, status_counts
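Both hunks in sky/jobs/utils.py apply the same prefetch pattern: collect the keys up front, fetch everything in one batched call (get_pool_and_submit_info_from_job_ids, get_handles_from_cluster_names, get_users), then read results from a dict inside the per-job loop. A generic sketch of that N+1 removal follows; enrich_jobs and fetch_many are hypothetical names used only for illustration.

from typing import Any, Callable, Dict, List, Set

def enrich_jobs(jobs: List[Dict[str, Any]],
                fetch_many: Callable[[Set[int]], Dict[int, Any]]) -> None:
    """Attach per-job info with one batched lookup instead of a query per job."""
    # Before this change the loop body issued a lookup per job; here everything
    # is fetched once and read from a dict.
    infos = fetch_many({job['job_id'] for job in jobs})
    for job in jobs:
        job['info'] = infos.get(job['job_id'])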
sky/provision/kubernetes/config.py
CHANGED
@@ -7,9 +7,7 @@ from typing import Any, Dict, List, Optional, Union

 from sky.adaptors import kubernetes
 from sky.provision import common
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
-from sky.utils import kubernetes_enums
 from sky.utils import yaml_utils

 logger = logging.getLogger(__name__)
@@ -28,11 +26,6 @@ def bootstrap_instances(

 _configure_services(namespace, context, config.provider_config)

-networking_mode = network_utils.get_networking_mode(
-    config.provider_config.get('networking_mode'), context)
-if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-    config = _configure_ssh_jump(namespace, context, config)
-
 requested_service_account = config.node_config['spec']['serviceAccountName']
 if (requested_service_account ==
         kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
@@ -481,41 +474,6 @@ def _configure_autoscaler_cluster_role_binding(
 f'{created_msg(binding_field, name)}')


-def _configure_ssh_jump(namespace, context, config: common.ProvisionConfig):
-    """Creates a SSH jump pod to connect to the cluster.
-
-    Also updates config['auth']['ssh_proxy_command'] to use the newly created
-    jump pod.
-    """
-    provider_config = config.provider_config
-    pod_cfg = config.node_config
-
-    ssh_jump_name = pod_cfg['metadata']['labels']['skypilot-ssh-jump']
-    ssh_jump_image = provider_config['ssh_jump_image']
-
-    volumes = pod_cfg['spec']['volumes']
-    # find 'secret-volume' and get the secret name
-    secret_volume = next(filter(lambda x: x['name'] == 'secret-volume',
-                                volumes))
-    ssh_key_secret_name = secret_volume['secret']['secretName']
-
-    # TODO(romilb): We currently split SSH jump pod and svc creation. Service
-    # is first created in authentication.py::setup_kubernetes_authentication
-    # and then SSH jump pod creation happens here. This is because we need to
-    # set the ssh_proxy_command in the ray YAML before we pass it to the
-    # autoscaler. If in the future if we can write the ssh_proxy_command to the
-    # cluster yaml through this method, then we should move the service
-    # creation here.
-
-    # TODO(romilb): We should add a check here to make sure the service is up
-    # and available before we create the SSH jump pod. If for any reason the
-    # service is missing, we should raise an error.
-
-    kubernetes_utils.setup_ssh_jump_pod(ssh_jump_name, ssh_jump_image,
-                                        ssh_key_secret_name, namespace, context)
-    return config
-
-
 def _configure_skypilot_system_namespace(
         provider_config: Dict[str, Any]) -> None:
     """Creates the namespace for skypilot-system mounting if it does not exist.
sky/provision/kubernetes/instance.py
CHANGED
@@ -17,7 +17,6 @@ from sky.provision import constants
 from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import constants as k8s_constants
-from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.provision.kubernetes import volume
 from sky.utils import command_runner
@@ -1148,15 +1147,6 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
 if head_pod_name is None and _is_head(pod):
     head_pod_name = pod.metadata.name

-networking_mode = network_utils.get_networking_mode(
-    config.provider_config.get('networking_mode'), context)
-if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-    # Adding the jump pod to the new_nodes list as well so it can be
-    # checked if it's scheduled and running along with other pods.
-    ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-    jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-        ssh_jump_pod_name, namespace)
-    pods.append(jump_pod)
 provision_timeout = provider_config['timeout']

 wait_str = ('indefinitely'
@@ -1320,18 +1310,6 @@ def terminate_instances(
 ray_tag_filter(cluster_name_on_cloud),
 None)

-# Clean up the SSH jump pod if in use
-networking_mode = network_utils.get_networking_mode(
-    provider_config.get('networking_mode'), context)
-if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-    pod_name = list(pods.keys())[0]
-    try:
-        kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
-                                                   pod_name)
-    except Exception as e:  # pylint: disable=broad-except
-        logger.warning('terminate_instances: Error occurred when analyzing '
-                       f'SSH Jump pod: {e}')
-
 if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
                                            namespace):
     # For high availability controllers, terminate the deployment
@@ -1367,15 +1345,6 @@ def get_cluster_info(
 pods: Dict[str, List[common.InstanceInfo]] = {}
 head_pod_name = None

-port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
-network_mode_str = skypilot_config.get_effective_region_config(
-    cloud='kubernetes',
-    region=context,
-    keys=('networking_mode',),
-    default_value=port_forward_mode.value)
-network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-    network_mode_str)
-external_ip = kubernetes_utils.get_external_ip(network_mode, context)
 port = 22
 if not provider_config.get('use_internal_ips', False):
     port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1389,8 +1358,7 @@ def get_cluster_info(
 common.InstanceInfo(
     instance_id=pod_name,
     internal_ip=internal_ip,
-    external_ip=
-    external_ip),
+    external_ip=None,
     ssh_port=port,
     tags=pod.metadata.labels,
 )
sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml
CHANGED
@@ -23,8 +23,7 @@ spec:
     effect: NoExecute
 containers:
 - name: server
-  … (1 removed line not rendered in the source diff view)
-  image: berkeleyskypilot/fusermount-server:latest
+  image: berkeleyskypilot/fusermount-server:0.2.1
   securityContext:
     privileged: true
   volumeMounts:
sky/provision/kubernetes/network_utils.py
CHANGED
@@ -55,27 +55,6 @@ def get_port_mode(
 return port_mode


-def get_networking_mode(
-    mode_str: Optional[str],
-    context: Optional[str],
-) -> kubernetes_enums.KubernetesNetworkingMode:
-    """Get the networking mode from the provider config."""
-    mode_str = mode_str or skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
-        region=context,
-        keys=('networking_mode',),
-        default_value=kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.
-        value)
-    try:
-        networking_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-            mode_str)
-    except ValueError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(str(e) +
-                             ' Please check: ~/.sky/config.yaml.') from None
-    return networking_mode
-
-
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict: