PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

sky/dashboard/out/workspaces/[name].html CHANGED Viewed

	@@ -1 +1 @@
1	- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-~~d427db53e54de9ce~~.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/~~Md3rlE87jmL5uv7gSo8mR~~/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/~~Md3rlE87jmL5uv7gSo8mR~~/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"~~Md3rlE87jmL5uv7gSo8mR~~","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1	+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"N5IdFnjR1RaPGBAVYeTIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces.html CHANGED Viewed

	@@ -1 +1 @@
1	- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-~~d427db53e54de9ce~~.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/~~Md3rlE87jmL5uv7gSo8mR~~/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/~~Md3rlE87jmL5uv7gSo8mR~~/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"~~Md3rlE87jmL5uv7gSo8mR~~","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1	+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/N5IdFnjR1RaPGBAVYeTIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"N5IdFnjR1RaPGBAVYeTIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/metrics/utils.py ADDED Viewed

@@ -0,0 +1,210 @@
+"""Utilities for processing GPU metrics from Kubernetes clusters."""
+import os
+import re
+import subprocess
+import time
+from typing import List, Optional, Tuple
+import httpx
+def start_svc_port_forward(context: str, namespace: str, service: str,
+                           service_port: int) -> Tuple[subprocess.Popen, int]:
+    """Starts a port forward to a service in a Kubernetes cluster.
+    Runs `kubectl port-forward` as a child process, letting kubectl pick a
+    free local port, and parses that port from kubectl's output. On success
+    the child process is left running; the caller owns it and must stop it.
+    Args:
+        context: Kubernetes context name
+        namespace: Namespace where the service is located
+        service: Service name to port forward to
+        service_port: Port on the service to forward to
+    Returns:
+        Tuple of (subprocess.Popen process, local_port assigned)
+    Raises:
+        RuntimeError: If kubectl exits with a non-zero code, or if no local
+            port could be read from its output before the timeout
+    """
+    start_port_forward_timeout = 10  # 10 second timeout
+    terminate_port_forward_timeout = 5  # 5 second timeout
+    # Use ':service_port' to let kubectl choose the local port
+    cmd = [
+        'kubectl', '--context', context, '-n', namespace, 'port-forward',
+        f'service/{service}', f':{service_port}'
+    ]
+    env = os.environ.copy()
+    if 'KUBECONFIG' not in env:
+        env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
+    # start the port forward process; stderr is merged into stdout so that
+    # a single pipe has to be read
+    port_forward_process = subprocess.Popen(cmd,
+                                            stdout=subprocess.PIPE,
+                                            stderr=subprocess.STDOUT,
+                                            text=True,
+                                            env=env)
+    local_port = None
+    start_time = time.time()
+    # wait for the port forward to start and extract the local port
+    while time.time() - start_time < start_port_forward_timeout:
+        if port_forward_process.poll() is not None:
+            # port forward process has terminated
+            if port_forward_process.returncode != 0:
+                raise RuntimeError(
+                    f'Port forward failed for service {service} in namespace '
+                    f'{namespace} on context {context}')
+            break
+        # read output line by line to find the local port
+        # NOTE(review): readline() blocks until kubectl writes a full line,
+        # so the timeout above is only re-checked between lines.
+        if port_forward_process.stdout:
+            line = port_forward_process.stdout.readline()
+            if line:
+                # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
+                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
+                if match:
+                    local_port = int(match.group(1))
+                    break
+        # sleep for 100ms to avoid busy-waiting
+        time.sleep(0.1)
+    if local_port is None:
+        # Stop kubectl before reporting the failure: ask it to terminate
+        # and only kill it if it does not exit in time.
+        try:
+            port_forward_process.terminate()
+            port_forward_process.wait(timeout=terminate_port_forward_timeout)
+        except subprocess.TimeoutExpired:
+            port_forward_process.kill()
+            port_forward_process.wait()
+        # Raised after (not inside a `finally` of) the cleanup above, so that
+        # an unexpected error or interrupt during cleanup is not replaced by
+        # this RuntimeError.
+        raise RuntimeError(
+            f'Failed to extract local port for service {service} in '
+            f'namespace {namespace} on context {context}')
+    return port_forward_process, local_port
+def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
+    """Shuts down a running port-forward process.
+    The process is first asked to terminate; if it has not exited after
+    5 seconds it is killed and then reaped.
+    Args:
+        port_forward_process: The subprocess.Popen process to terminate
+    """
+    proc = port_forward_process
+    # graceful shutdown first
+    proc.terminate()
+    try:
+        proc.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        # did not exit in time: force it and wait for the exit
+        proc.kill()
+        proc.wait()
+async def send_metrics_request_with_port_forward(
+        context: str,
+        namespace: str,
+        service: str,
+        service_port: int,
+        endpoint_path: str = '/federate',
+        match_patterns: Optional[List[str]] = None,
+        timeout: float = 30.0) -> str:
+    """Fetches metrics text from a cluster service through a port forward.
+    A port forward to the service is opened, one GET request is sent to it
+    on localhost, and the port forward is stopped again before returning,
+    whether or not the request succeeded.
+    Args:
+        context: Kubernetes context name
+        namespace: Namespace where the service is located
+        service: Service name to port forward to
+        service_port: Port on the service to forward to
+        endpoint_path: Path appended to the localhost URL (e.g.,
+            '/federate')
+        match_patterns: Metric patterns sent as repeated match[] query
+            parameters; no query parameters are sent if empty or None
+        timeout: Request timeout in seconds, passed to the HTTP client
+    Returns:
+        Response text containing the metrics
+    Raises:
+        RuntimeError: If the port forward cannot be started
+        Exception: Errors from the HTTP client, including the one raised by
+            raise_for_status() for an error response, propagate unchanged
+    """
+    forwarder = None
+    try:
+        forwarder, local_port = start_svc_port_forward(context, namespace,
+                                                       service, service_port)
+        url = f'http://localhost:{local_port}{endpoint_path}'
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            query_params = None
+            if match_patterns:
+                # one match[] parameter per requested pattern
+                query_params = [
+                    ('match[]', pattern) for pattern in match_patterns
+                ]
+            response = await client.get(url, params=query_params)
+            response.raise_for_status()
+            return response.text
+    finally:
+        # the port forward must not outlive this call
+        if forwarder is not None:
+            stop_svc_port_forward(forwarder)
+async def add_cluster_name_label(metrics_text: str, context: str) -> str:
+    """Adds a `cluster` label to each metric sample line.
+    Comment lines (starting with '#') and blank lines are kept unchanged.
+    Samples that already have a label set get `cluster="<context>"`
+    prepended to it; samples without a label set get a new label set
+    holding only that label. Lines that match neither shape are kept as-is.
+    Args:
+        metrics_text: The text containing the metrics, one sample per line
+        context: The cluster name, used as the value of the `cluster` label
+    Returns:
+        The metrics text with the label added, lines joined by newlines
+    """
+    # Escape the characters that are special inside a quoted label value so
+    # that an unusual context name cannot produce an unparsable line.
+    escaped_context = (context.replace('\\', '\\\\').replace(
+        '"', '\\"').replace('\n', '\\n'))
+    cluster_label = f'cluster="{escaped_context}"'
+    modified_lines = []
+    for line in metrics_text.strip().split('\n'):
+        # keep comment lines and empty lines as-is
+        if line.startswith('#') or not line.strip():
+            modified_lines.append(line)
+            continue
+        brace_start = line.find('{')
+        brace_end = line.find('}')
+        if brace_start != -1 and brace_end > brace_start:
+            # metric line with a label set: put the cluster label first
+            metric_name = line[:brace_start]
+            existing_labels = line[brace_start + 1:brace_end]
+            rest_of_line = line[brace_end + 1:]
+            if existing_labels:
+                new_labels = f'{cluster_label},{existing_labels}'
+            else:
+                new_labels = cluster_label
+            modified_lines.append(
+                f'{metric_name}{{{new_labels}}}{rest_of_line}')
+            continue
+        if brace_start == -1 and brace_end == -1:
+            # metric line without labels: '<name> <value> [<timestamp>]'
+            match = re.match(r'^([a-zA-Z_:][a-zA-Z0-9_:]*)(\s+\S.*)$', line)
+            if match:
+                modified_lines.append(
+                    f'{match.group(1)}{{{cluster_label}}}{match.group(2)}')
+                continue
+        # keep other (malformed or unrecognized) lines as-is
+        modified_lines.append(line)
+    return '\n'.join(modified_lines)
+async def get_metrics_for_context(context: str) -> str:
+    """Fetches GPU metrics from one Kubernetes context and labels them.
+    Args:
+        context: Kubernetes context name
+    Returns:
+        The metrics text, with the context added as a cluster label on the
+        metric lines
+    Raises:
+        Exception: If metrics collection fails for any reason
+    """
+    # Both the DCGM metrics and kube_pod_labels are requested, so that the
+    # dashboard can perform joins to filter by skypilot cluster.
+    federate_selectors = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
+    # TODO(rohan): don't hardcode the namespace and service name
+    raw_metrics = await send_metrics_request_with_port_forward(
+        context=context,
+        namespace='skypilot',
+        service='skypilot-prometheus-server',
+        service_port=80,
+        endpoint_path='/federate',
+        match_patterns=federate_selectors)
+    # tag the metric lines with the cluster they came from
+    return await add_cluster_name_label(raw_metrics, context)

sky/optimizer.py CHANGED Viewed

@@ -1375,7 +1375,7 @@ def _fill_in_launchable_resources(
             num_node_str = ''
             if task.num_nodes > 1:
                 num_node_str = f'{task.num_nodes}x '
-            if not quiet:
+            if not (quiet or resources.no_missing_accel_warnings):
                 logger.info(
                     f'No resource satisfying {num_node_str}'
                     f'{resources.repr_with_region_zone} on {clouds_str}.')

sky/resources.py CHANGED Viewed

@@ -1,6 +1,8 @@
 """Resources: compute requirements of Tasks."""
+import collections
 import dataclasses
 import math
+import re
 import textwrap
 import typing
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
@@ -41,6 +43,20 @@ RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',
 }
+# Multipliers for memory-size unit suffixes, keyed by the lowercase suffix.
+# 'b' is the base unit (1); the short and long form of each larger unit map
+# to the same power-of-two multiple of it (e.g. 'k' and 'kb' are both 2**10).
+MEMORY_SIZE_UNITS = {
+    'b': 1,
+    'k': 2**10,
+    'kb': 2**10,
+    'm': 2**20,
+    'mb': 2**20,
+    'g': 2**30,
+    'gb': 2**30,
+    't': 2**40,
+    'tb': 2**40,
+    'p': 2**50,
+    'pb': 2**50,
+}
 @dataclasses.dataclass
 class AutostopConfig:
@@ -110,7 +126,7 @@ class Resources:
     """
     # If any fields changed, increment the version. For backward compatibility,
     # modify the __setstate__ method to handle the old version.
-    _VERSION = 27
+    _VERSION = 28
     def __init__(
         self,
@@ -142,6 +158,7 @@ class Resources:
         _is_image_managed: Optional[bool] = None,
         _requires_fuse: Optional[bool] = None,
         _cluster_config_overrides: Optional[Dict[str, Any]] = None,
+        _no_missing_accel_warnings: Optional[bool] = None,
     ):
         """Initialize a Resources object.
@@ -366,6 +383,7 @@ class Resources:
         self._cluster_config_overrides = _cluster_config_overrides
         self._cached_repr: Optional[str] = None
+        self._no_missing_accel_warnings = _no_missing_accel_warnings
         # Initialize _priority before calling the setter
         self._priority: Optional[int] = None
@@ -649,6 +667,13 @@ class Resources:
             return False
         return self._requires_fuse
+    @property
+    def no_missing_accel_warnings(self) -> bool:
+        """Whether missing-accelerator messages are silenced for this resource.
+        An unset (None) flag is reported as False.
+        """
+        flag = self._no_missing_accel_warnings
+        return False if flag is None else flag
     def set_requires_fuse(self, value: bool) -> None:
         """Sets whether this resource requires FUSE mounting support.
@@ -754,6 +779,8 @@ class Resources:
                 if ':' not in accelerators:
                     accelerators = {accelerators: 1}
                 else:
+                    assert isinstance(accelerators,
+                                      str), (type(accelerators), accelerators)
                     splits = accelerators.split(':')
                     parse_error = ('The "accelerators" field as a str '
                                    'should be <name> or <name>:<cnt>. '
@@ -1778,6 +1805,8 @@ class Resources:
                                            self._is_image_managed),
             _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
             _cluster_config_overrides=override_configs,
+            _no_missing_accel_warnings=override.pop(
+                'no_missing_accel_warnings', self._no_missing_accel_warnings),
         )
         assert not override
         return resources
@@ -1843,10 +1872,75 @@ class Resources:
                 config[canonical] = config[alias]
                 del config[alias]
+    @classmethod
+    def _parse_accelerators_from_str(
+            cls, accelerators: str) -> List[Tuple[str, bool]]:
+        """Parse accelerators string into a list of possible accelerators.
+        The string is split on ':' and matched against these forms, where
+        <memory> is digits followed by MB/GB/TB (any case) and an optional
+        trailing '+':
+            <manufacturer>:<memory>:<count>
+            <memory>:<count>
+            <manufacturer>:<memory>
+            <memory>
+        Anything else is treated as an accelerator name and returned
+        unchanged.
+        Args:
+            accelerators: The accelerators string to parse.
+        Returns:
+            A list of possible accelerators. Each element is a tuple of
+            (accelerator_name, was_user_specified). was_user_specified is True
+            if the accelerator was directly named by the user (for example
+            "H100:2" would be True, but "80GB+" would be False since it doesn't
+            mention the name of the accelerator).
+        Raises:
+            AssertionError: If the input is not a str, or if a three-part
+                string does not have a memory size as its second part.
+            ValueError: If the count part is not an integer.
+        """
+        # sanity check
+        assert isinstance(accelerators, str), accelerators
+        # matches a memory size such as '80GB' or '16gb+'
+        memory_pattern = re.compile(r'^[0-9]+[GgMmTt][Bb]\+?$')
+        manufacturer = None
+        memory = None
+        count = 1
+        split = accelerators.split(':')
+        if len(split) == 3:
+            manufacturer, memory, count_str = split
+            count = int(count_str)
+            assert memory_pattern.match(memory), (
+                'If specifying a GPU manufacturer, you must also '
+                'specify the memory size')
+        elif len(split) == 2 and memory_pattern.match(split[0]):
+            memory = split[0]
+            count = int(split[1])
+        elif len(split) == 2 and memory_pattern.match(split[1]):
+            manufacturer, memory = split
+        elif len(split) == 1 and memory_pattern.match(split[0]):
+            memory = split[0]
+        else:
+            # it is just an accelerator name, not a memory size
+            return [(accelerators, True)]
+        # we know we have some case of manufacturer, memory, count, now we
+        # need to convert that to a list of possible accelerators
+        memory_parsed = resources_utils.parse_memory_resource(memory,
+                                                              'accelerators',
+                                                              allow_plus=True)
+        # a trailing '+' means "at least this much memory"
+        plus = memory_parsed[-1] == '+'
+        if plus:
+            memory_parsed = memory_parsed[:-1]
+        memory_gb = int(memory_parsed)
+        # none of these were named by the user, hence the False flag
+        candidates = [
+            (f'{device}:{count}', False)
+            for device in accelerator_registry.get_devices_by_memory(
+                memory_gb, plus, manufacturer=manufacturer)
+        ]
+        return candidates
     @classmethod
     def from_yaml_config(
         cls, config: Optional[Dict[str, Any]]
     ) -> Union[Set['Resources'], List['Resources']]:
+        """Creates Resources objects from a YAML config.
+        Args:
+            config: A dict of resource config.
+        Returns:
+            A set of Resources objects if any_of is specified, otherwise a list
+            of Resources objects if ordered is specified, otherwise a set with
+            a single Resources object.
+        """
         if config is None:
             return {Resources()}
@@ -1903,13 +1997,48 @@ class Resources:
         accelerators = config.get('accelerators')
         if config and accelerators is not None:
             if isinstance(accelerators, str):
-                accelerators = {accelerators}
+                accelerators_list = cls._parse_accelerators_from_str(
+                    accelerators)
             elif isinstance(accelerators, dict):
-                accelerators = [
+                accelerator_names = [
                     f'{k}:{v}' if v is not None else f'{k}'
                     for k, v in accelerators.items()
                 ]
-                accelerators = set(accelerators)
+                accelerators_list = []
+                for accel_name in accelerator_names:
+                    parsed_accels = cls._parse_accelerators_from_str(accel_name)
+                    accelerators_list.extend(parsed_accels)
+            elif isinstance(accelerators, list) or isinstance(
+                    accelerators, set):
+                accelerators_list = []
+                for accel_name in accelerators:
+                    parsed_accels = cls._parse_accelerators_from_str(accel_name)
+                    accelerators_list.extend(parsed_accels)
+            else:
+                assert False, ('Invalid accelerators type:'
+                               f'{type(accelerators)}')
+            # now that accelerators is a list, we need to decide which to
+            # include in the final set, however, there may be multiple copies
+            # of the same accelerator, some given by name by the user and the
+            # other copy being given by memory size. In this case, we only care
+            # about the user specified ones (so we can give a warning if it
+            # doesn't exist).
+            accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
+            for accel, user_specified in accelerators_list:
+                # If this accelerator is not in dict yet, or if current one is
+                # user specified and existing one is not, update the entry
+                accel_to_user_specified[accel] = (user_specified or
+                                                  accel_to_user_specified.get(
+                                                      accel, False))
+            # only time we care about ordered is when we are given a list,
+            # otherwise we default to a set
+            accelerators_type = list if isinstance(accelerators, list) else set
+            accelerators = accelerators_type([
+                (accel, user_specified)
+                for accel, user_specified in accel_to_user_specified.items()
+            ])
             if len(accelerators) > 1 and ordered_configs:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
@@ -1935,20 +2064,20 @@ class Resources:
             # In Task, we store a list of resources, each with 1 accelerator.
             # This for loop is for format conversion.
             tmp_resources_list = []
-            for acc in accelerators:
+            for acc, user_specified in accelerators:
                 tmp_resource = config.copy()
                 tmp_resource['accelerators'] = acc
+                if not user_specified:
+                    tmp_resource['_no_missing_accel_warnings'] = True
                 tmp_resources_list.append(
                     Resources._from_yaml_config_single(tmp_resource))
             assert isinstance(accelerators, (list, set)), accelerators
             return type(accelerators)(tmp_resources_list)
         return {Resources._from_yaml_config_single(config)}
     @classmethod
     def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
         resources_fields: Dict[str, Any] = {}
         # Extract infra field if present
@@ -2010,6 +2139,8 @@ class Resources:
             # although it will end up being an int, we don't know at this point
             # if it has units or not, so we store it as a string
             resources_fields['disk_size'] = str(resources_fields['disk_size'])
+        resources_fields['_no_missing_accel_warnings'] = config.pop(
+            '_no_missing_accel_warnings', None)
         assert not config, f'Invalid resource args: {config.keys()}'
         return Resources(**resources_fields)
@@ -2060,6 +2191,9 @@ class Resources:
             config['volumes'] = volumes
         if self._autostop_config is not None:
             config['autostop'] = self._autostop_config.to_yaml_config()
+        add_if_not_none('_no_missing_accel_warnings',
+                        self._no_missing_accel_warnings)
         add_if_not_none('priority', self.priority)
         if self._docker_login_config is not None:
             config['_docker_login_config'] = dataclasses.asdict(
@@ -2232,6 +2366,10 @@ class Resources:
         if version < 27:
             self._priority = None
+        if version < 28:
+            self._no_missing_accel_warnings = state.get(
+                '_no_missing_accel_warnings', None)
         self.__dict__.update(state)

sky/server/server.py CHANGED Viewed

@@ -39,6 +39,7 @@ from sky import models
 from sky import sky_logging
 from sky.data import storage_utils
 from sky.jobs.server import server as jobs_rest
+from sky.metrics import utils as metrics_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.serve.server import server as serve_rest
 from sky.server import common
@@ -218,14 +219,26 @@ class RequestIDMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
 def _get_auth_user_header(request: fastapi.Request) -> Optional[models.User]:
-    if 'X-Auth-Request-Email' not in request.headers:
+    header_name = os.environ.get(constants.ENV_VAR_SERVER_AUTH_USER_HEADER,
+                                 'X-Auth-Request-Email')
+    if header_name not in request.headers:
         return None
-    user_name = request.headers['X-Auth-Request-Email']
+    user_name = request.headers[header_name]
     user_hash = hashlib.md5(
         user_name.encode()).hexdigest()[:common_utils.USER_HASH_LENGTH]
     return models.User(id=user_hash, name=user_name)
+class InitializeRequestAuthUserMiddleware(
+        starlette.middleware.base.BaseHTTPMiddleware):
+    """Middleware that sets request.state.auth_user to None for each request.
+    This gives later code a defined value to read; it does no
+    authentication itself.
+    """
+    async def dispatch(self, request: fastapi.Request, call_next):
+        # Make sure that request.state.auth_user is set. Otherwise, we may get a
+        # KeyError while trying to read it.
+        # NOTE(review): Starlette's State appears to raise AttributeError
+        # (not KeyError) for an attribute that was never set - confirm.
+        request.state.auth_user = None
+        return await call_next(request)
 class BasicAuthMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     """Middleware to handle HTTP Basic Auth."""
@@ -406,6 +419,18 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
     async def dispatch(self, request: fastapi.Request, call_next):
         auth_user = _get_auth_user_header(request)
+        if request.state.auth_user is not None:
+            # Previous middleware is trusted more than this middleware.  For
+            # instance, a client could set the Authorization and the
+            # X-Auth-Request-Email header. In that case, the auth proxy will be
+            # skipped and we should rely on the Bearer token to authenticate the
+            # user - but that means the user could set X-Auth-Request-Email to
+            # whatever the user wants. We should thus ignore it.
+            if auth_user is not None:
+                logger.debug('Warning: ignoring auth proxy header since the '
+                             'auth user was already set.')
+            return await call_next(request)
         # Add user to database if auth_user is present
         if auth_user is not None:
             newly_added = global_user_state.add_or_update_user(auth_user)
@@ -416,8 +441,6 @@ class AuthProxyMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
         # Store user info in request.state for access by GET endpoints
         if auth_user is not None:
             request.state.auth_user = auth_user
-        else:
-            request.state.auth_user = None
         await _override_user_info_in_request_body(request, auth_user)
         return await call_next(request)
@@ -536,10 +559,17 @@ class GracefulShutdownMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
 app = fastapi.FastAPI(prefix='/api/v1', debug=True, lifespan=lifespan)
+# Middleware wraps in the order defined here. E.g., given
+#   app.add_middleware(Middleware1)
+#   app.add_middleware(Middleware2)
+#   app.add_middleware(Middleware3)
+# The effect will be like:
+#   Middleware3(Middleware2(Middleware1(request)))
+# If MiddlewareN does something like print(n); call_next(); print(n), you'll get
+#   3; 2; 1; <request>; 1; 2; 3
 # Use environment variable to make the metrics middleware optional.
 if os.environ.get(constants.ENV_VAR_SERVER_METRICS_ENABLED):
     app.add_middleware(metrics.PrometheusMiddleware)
-app.add_middleware(RBACMiddleware)
 app.add_middleware(InternalDashboardPrefixMiddleware)
 app.add_middleware(GracefulShutdownMiddleware)
 app.add_middleware(PathCleanMiddleware)
@@ -552,15 +582,26 @@ app.add_middleware(
     allow_credentials=True,
     allow_methods=['*'],
     allow_headers=['*'],
-    # TODO(syang): remove X-Request-ID when v0.10.0 is released.
+    # TODO(syang): remove X-Request-ID when v0.10.0 is released.
     expose_headers=['X-Request-ID', 'X-Skypilot-Request-ID'])
+# The order of all the authentication-related middleware is important.
+# RBACMiddleware must precede all the auth middleware, so it can access
+# request.state.auth_user.
+app.add_middleware(RBACMiddleware)
+# AuthProxyMiddleware should precede BasicAuthMiddleware and
+# BearerTokenMiddleware, since it should be skipped if either of those set the
+# auth user.
+app.add_middleware(AuthProxyMiddleware)
 enable_basic_auth = os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false')
 if str(enable_basic_auth).lower() == 'true':
     app.add_middleware(BasicAuthMiddleware)
 # Bearer token middleware should always be present to handle service account
 # authentication
 app.add_middleware(BearerTokenMiddleware)
-app.add_middleware(AuthProxyMiddleware)
+# InitializeRequestAuthUserMiddleware must be the last added middleware so that
+# request.state.auth_user is always set, but can be overridden by the auth
+# middleware above.
+app.add_middleware(InitializeRequestAuthUserMiddleware)
 app.add_middleware(RequestIDMiddleware)
 app.include_router(jobs_rest.router, prefix='/jobs', tags=['jobs'])
 app.include_router(serve_rest.router, prefix='/serve', tags=['serve'])
@@ -1576,6 +1617,38 @@ async def all_contexts(request: fastapi.Request) -> None:
     )
+@app.get('/gpu-metrics')
+async def gpu_metrics() -> fastapi.Response:
+    """Gets the GPU metrics from multiple external k8s clusters.
+    Metrics are requested concurrently from every known context except
+    'in-cluster'. A context whose request fails is logged and left out of
+    the response; the remaining metrics are still returned.
+    Returns:
+        A plain-text response containing the metrics of all contexts that
+        succeeded, separated by blank lines.
+    """
+    # Filter first, so that each result below can be paired with the context
+    # that actually produced it.
+    contexts = [
+        context for context in core.get_all_contexts()
+        if context != 'in-cluster'
+    ]
+    results = await asyncio.gather(
+        *(metrics_utils.get_metrics_for_context(context)
+          for context in contexts),
+        return_exceptions=True)
+    all_metrics = []
+    for context, result in zip(contexts, results):
+        # BaseException also covers asyncio.CancelledError, which gather()
+        # can hand back as a result and which is not a str to be joined.
+        if isinstance(result, BaseException):
+            logger.error(
+                f'Failed to get metrics for context {context}: {result}')
+        else:
+            all_metrics.append(result)
+    combined_metrics = '\n\n'.join(all_metrics)
+    # Return as plain text for Prometheus compatibility
+    return fastapi.Response(
+        content=combined_metrics,
+        media_type='text/plain; version=0.0.4; charset=utf-8')
 # === Internal APIs ===
 @app.get('/api/completion/cluster_name')
 async def complete_cluster_name(incomplete: str,) -> List[str]:

sky/setup_files/MANIFEST.in CHANGED Viewed

@@ -17,3 +17,4 @@ include sky/utils/kubernetes/*
 include sky/server/html/*
 recursive-include sky/dashboard/out *
 include sky/users/*.conf
+include sky/metrics/*

skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl

skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250702__py3-none-any.whl