PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250703__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250703__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

sky/dashboard/out/workspaces/[name].html CHANGED Viewed

	@@ -1 +1 @@
1	- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d427db53e54de9ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"Md3rlE87jmL5uv7gSo8mR","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1	+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-162f3033ffcd3d31.js" defer=""></script><script src="/dashboard/_next/static/chunks/5230-df791914b54d91d9.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-5ea3ffa10fc884f2.js" defer=""></script><script src="/dashboard/_next/static/chunks/1664-d65361e92b85e786.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-9f5e98ce84d46bdd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/3947-b059261d6fa88a1f.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-6ff4e45dfb49d11d.js" defer=""></script><script src="/dashboard/_next/static/chunks/3698-52ad1ca228faa776.js" defer=""></script><script src="/dashboard/_next/static/chunks/9470-21d059a1dfa03f61.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-d0dc765474fa0eca.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-909d53833da080cb.js" defer=""></script><script src="/dashboard/_next/static/chunks/1043-1b39779691bb4030.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/6601-fcfad0ddf92ec7ab.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-044ad21de8b4626b.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-726e5a3f00b67185.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-7c0187f43757a548.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"A-fbCEgJE_q2cV8biIOIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces.html CHANGED Viewed

	@@ -1 +1 @@
1	- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-d427db53e54de9ce.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/Md3rlE87jmL5uv7gSo8mR/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"Md3rlE87jmL5uv7gSo8mR","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1	+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/0da6afe66176678a.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/0da6afe66176678a.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-9a81ea998672c303.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-efc06c2733009cd3.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-c0a4f1ea606d48d2.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-a37b06ddb64521fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-06bde99155fa6292.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/A-fbCEgJE_q2cV8biIOIr/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"A-fbCEgJE_q2cV8biIOIr","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/jobs/server/core.py CHANGED Viewed

@@ -45,18 +45,21 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
-def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
-    """Maybe upload files to the controller.
+def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Upload files to the controller.
-    In consolidation mode, we don't need to upload files to the controller as
-    the API server and the controller are colocated.
+    In consolidation mode, we still need to upload files to the controller as
+    we should keep a separate workdir for each jobs. Assuming two jobs using
+    the same workdir, if there are some modifications to the workdir after job 1
+    is submitted, on recovery of job 1, the modifications should not be applied.
     """
     local_to_controller_file_mounts: Dict[str, str] = {}
-    if managed_job_utils.is_consolidation_mode():
-        return local_to_controller_file_mounts
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
+    # For consolidation mode, we don't need to use cloud storage,
+    # as uploading to the controller is only a local copy.
+    storage_clouds = (
+        storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
+    if not managed_job_utils.is_consolidation_mode() and storage_clouds:
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
@@ -67,7 +70,7 @@ def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
         # directly to the controller, because the controller may not
         # even be up yet.
         for task_ in dag.tasks:
-            if task_.storage_mounts:
+            if task_.storage_mounts and not storage_clouds:
                 # Technically, we could convert COPY storage_mounts that
                 # have a local source and do not specify `store`, but we
                 # will not do that for now. Only plain file_mounts are
@@ -242,7 +245,7 @@ def launch(
                         f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                         f'Reason: {common_utils.format_exception(e)}')
-    local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
+    local_to_controller_file_mounts = _upload_files_to_controller(dag)
     # Has to use `\` to avoid yapf issue.
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',

sky/metrics/__init__.py ADDED Viewed

File without changes

sky/metrics/utils.py ADDED Viewed

@@ -0,0 +1,210 @@
+"""Utilities for processing GPU metrics from Kubernetes clusters."""
+import os
+import re
+import subprocess
+import time
+from typing import List, Optional, Tuple
+import httpx
+def start_svc_port_forward(context: str, namespace: str, service: str,
+                           service_port: int) -> Tuple[subprocess.Popen, int]:
+    """Starts a port forward to a service in a Kubernetes cluster.
+    Args:
+        context: Kubernetes context name
+        namespace: Namespace where the service is located
+        service: Service name to port forward to
+        service_port: Port on the service to forward to
+    Returns:
+        Tuple of (subprocess.Popen process, local_port assigned)
+    Raises:
+        RuntimeError: If port forward fails to start
+    """
+    start_port_forward_timeout = 10  # 10 second timeout
+    terminate_port_forward_timeout = 5  # 5 second timeout
+    # Use ':service_port' to let kubectl choose the local port
+    cmd = [
+        'kubectl', '--context', context, '-n', namespace, 'port-forward',
+        f'service/{service}', f':{service_port}'
+    ]
+    env = os.environ.copy()
+    if 'KUBECONFIG' not in env:
+        env['KUBECONFIG'] = os.path.expanduser('~/.kube/config')
+    # start the port forward process
+    port_forward_process = subprocess.Popen(cmd,
+                                            stdout=subprocess.PIPE,
+                                            stderr=subprocess.STDOUT,
+                                            text=True,
+                                            env=env)
+    local_port = None
+    start_time = time.time()
+    # wait for the port forward to start and extract the local port
+    while time.time() - start_time < start_port_forward_timeout:
+        if port_forward_process.poll() is not None:
+            # port forward process has terminated
+            if port_forward_process.returncode != 0:
+                raise RuntimeError(
+                    f'Port forward failed for service {service} in namespace '
+                    f'{namespace} on context {context}')
+            break
+        # read output line by line to find the local port
+        if port_forward_process.stdout:
+            line = port_forward_process.stdout.readline()
+            if line:
+                # look for 'Forwarding from 127.0.0.1:XXXXX -> service_port'
+                match = re.search(r'Forwarding from 127\.0\.0\.1:(\d+)', line)
+                if match:
+                    local_port = int(match.group(1))
+                    break
+        # sleep for 100ms to avoid busy-waiting
+        time.sleep(0.1)
+    if local_port is None:
+        try:
+            port_forward_process.terminate()
+            port_forward_process.wait(timeout=terminate_port_forward_timeout)
+        except subprocess.TimeoutExpired:
+            port_forward_process.kill()
+            port_forward_process.wait()
+        finally:
+            raise RuntimeError(
+                f'Failed to extract local port for service {service} in '
+                f'namespace {namespace} on context {context}')
+    return port_forward_process, local_port
+def stop_svc_port_forward(port_forward_process: subprocess.Popen) -> None:
+    """Stops a port forward to a service in a Kubernetes cluster.
+    Args:
+        port_forward_process: The subprocess.Popen process to terminate
+    """
+    try:
+        port_forward_process.terminate()
+        port_forward_process.wait(timeout=5)
+    except subprocess.TimeoutExpired:
+        port_forward_process.kill()
+        port_forward_process.wait()
+async def send_metrics_request_with_port_forward(
+        context: str,
+        namespace: str,
+        service: str,
+        service_port: int,
+        endpoint_path: str = '/federate',
+        match_patterns: Optional[List[str]] = None,
+        timeout: float = 30.0) -> str:
+    """Sends a metrics request to a Prometheus endpoint via port forwarding.
+    Args:
+        context: Kubernetes context name
+        namespace: Namespace where the service is located
+        service: Service name to port forward to
+        service_port: Port on the service to forward to
+        endpoint_path: Path to append to the localhost endpoint (e.g.,
+            '/federate')
+        match_patterns: List of metric patterns to match (for federate
+            endpoint)
+        timeout: Request timeout in seconds
+    Returns:
+        Response text containing the metrics
+    Raises:
+        RuntimeError: If port forward or HTTP request fails
+    """
+    port_forward_process = None
+    try:
+        # Start port forward
+        port_forward_process, local_port = start_svc_port_forward(
+            context, namespace, service, service_port)
+        # Build endpoint URL
+        endpoint = f'http://localhost:{local_port}{endpoint_path}'
+        # Make HTTP request
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            if match_patterns:
+                # For federate endpoint, add match[] parameters
+                params = [('match[]', pattern) for pattern in match_patterns]
+                response = await client.get(endpoint, params=params)
+            else:
+                response = await client.get(endpoint)
+            response.raise_for_status()
+            return response.text
+    finally:
+        # Always clean up port forward
+        if port_forward_process:
+            stop_svc_port_forward(port_forward_process)
+async def add_cluster_name_label(metrics_text: str, context: str) -> str:
+    """Adds a cluster_name label to each metric line.
+    Args:
+        metrics_text: The text containing the metrics
+        context: The cluster name
+    """
+    lines = metrics_text.strip().split('\n')
+    modified_lines = []
+    for line in lines:
+        # keep comment lines and empty lines as-is
+        if line.startswith('#') or not line.strip():
+            modified_lines.append(line)
+            continue
+        # if line is a metric line with labels, add cluster label
+        brace_start = line.find('{')
+        brace_end = line.find('}')
+        if brace_start != -1 and brace_end != -1:
+            metric_name = line[:brace_start]
+            existing_labels = line[brace_start + 1:brace_end]
+            rest_of_line = line[brace_end + 1:]
+            if existing_labels:
+                new_labels = f'cluster="{context}",{existing_labels}'
+            else:
+                new_labels = f'cluster="{context}"'
+            modified_line = f'{metric_name}{{{new_labels}}}{rest_of_line}'
+            modified_lines.append(modified_line)
+        else:
+            # keep other lines as-is
+            modified_lines.append(line)
+    return '\n'.join(modified_lines)
+async def get_metrics_for_context(context: str) -> str:
+    """Get GPU metrics for a single Kubernetes context.
+    Args:
+        context: Kubernetes context name
+    Returns:
+        metrics_text: String containing the metrics
+    Raises:
+        Exception: If metrics collection fails for any reason
+    """
+    # Query both DCGM metrics and kube_pod_labels metrics
+    # This ensures the dashboard can perform joins to filter by skypilot cluster
+    match_patterns = ['{__name__=~"DCGM_.*"}', 'kube_pod_labels']
+    # TODO(rohan): don't hardcode the namespace and service name
+    metrics_text = await send_metrics_request_with_port_forward(
+        context=context,
+        namespace='skypilot',
+        service='skypilot-prometheus-server',
+        service_port=80,
+        endpoint_path='/federate',
+        match_patterns=match_patterns)
+    # add cluster name as a label to each metric line
+    metrics_text = await add_cluster_name_label(metrics_text, context)
+    return metrics_text

sky/optimizer.py CHANGED Viewed

@@ -1375,7 +1375,7 @@ def _fill_in_launchable_resources(
             num_node_str = ''
             if task.num_nodes > 1:
                 num_node_str = f'{task.num_nodes}x '
-            if not quiet:
+            if not (quiet or resources.no_missing_accel_warnings):
                 logger.info(
                     f'No resource satisfying {num_node_str}'
                     f'{resources.repr_with_region_zone} on {clouds_str}.')

sky/resources.py CHANGED Viewed

@@ -1,6 +1,8 @@
 """Resources: compute requirements of Tasks."""
+import collections
 import dataclasses
 import math
+import re
 import textwrap
 import typing
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
@@ -41,6 +43,20 @@ RESOURCE_CONFIG_ALIASES = {
     'gpus': 'accelerators',
 }
+MEMORY_SIZE_UNITS = {
+    'b': 1,
+    'k': 2**10,
+    'kb': 2**10,
+    'm': 2**20,
+    'mb': 2**20,
+    'g': 2**30,
+    'gb': 2**30,
+    't': 2**40,
+    'tb': 2**40,
+    'p': 2**50,
+    'pb': 2**50,
+}
 @dataclasses.dataclass
 class AutostopConfig:
@@ -110,7 +126,7 @@ class Resources:
     """
     # If any fields changed, increment the version. For backward compatibility,
     # modify the __setstate__ method to handle the old version.
-    _VERSION = 27
+    _VERSION = 28
     def __init__(
         self,
@@ -142,6 +158,7 @@ class Resources:
         _is_image_managed: Optional[bool] = None,
         _requires_fuse: Optional[bool] = None,
         _cluster_config_overrides: Optional[Dict[str, Any]] = None,
+        _no_missing_accel_warnings: Optional[bool] = None,
     ):
         """Initialize a Resources object.
@@ -366,6 +383,7 @@ class Resources:
         self._cluster_config_overrides = _cluster_config_overrides
         self._cached_repr: Optional[str] = None
+        self._no_missing_accel_warnings = _no_missing_accel_warnings
         # Initialize _priority before calling the setter
         self._priority: Optional[int] = None
@@ -649,6 +667,13 @@ class Resources:
             return False
         return self._requires_fuse
+    @property
+    def no_missing_accel_warnings(self) -> bool:
+        """Returns whether to force quiet mode for this resource."""
+        if self._no_missing_accel_warnings is None:
+            return False
+        return self._no_missing_accel_warnings
     def set_requires_fuse(self, value: bool) -> None:
         """Sets whether this resource requires FUSE mounting support.
@@ -754,6 +779,8 @@ class Resources:
                 if ':' not in accelerators:
                     accelerators = {accelerators: 1}
                 else:
+                    assert isinstance(accelerators,
+                                      str), (type(accelerators), accelerators)
                     splits = accelerators.split(':')
                     parse_error = ('The "accelerators" field as a str '
                                    'should be <name> or <name>:<cnt>. '
@@ -1778,6 +1805,8 @@ class Resources:
                                            self._is_image_managed),
             _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
             _cluster_config_overrides=override_configs,
+            _no_missing_accel_warnings=override.pop(
+                'no_missing_accel_warnings', self._no_missing_accel_warnings),
         )
         assert not override
         return resources
@@ -1843,10 +1872,75 @@ class Resources:
                 config[canonical] = config[alias]
                 del config[alias]
+    @classmethod
+    def _parse_accelerators_from_str(
+            cls, accelerators: str) -> List[Tuple[str, bool]]:
+        """Parse accelerators string into a list of possible accelerators.
+        Returns:
+            A list of possible accelerators. Each element is a tuple of
+            (accelerator_name, was_user_specified). was_user_specified is True
+            if the accelerator was directly named by the user (for example
+            "H100:2" would be True, but "80GB+" would be False since it doesn't
+            mention the name of the accelerator).
+        """
+        # sanity check
+        assert isinstance(accelerators, str), accelerators
+        manufacturer = None
+        memory = None
+        count = 1
+        split = accelerators.split(':')
+        if len(split) == 3:
+            manufacturer, memory, count_str = split
+            count = int(count_str)
+            assert re.match(r'^[0-9]+[GgMmTt][Bb]\+?$', memory), \
+                'If specifying a GPU manufacturer, you must also' \
+                'specify the memory size'
+        elif len(split) == 2 and re.match(r'^[0-9]+[GgMmTt][Bb]\+?$', split[0]):
+            memory = split[0]
+            count = int(split[1])
+        elif len(split) == 2 and re.match(r'^[0-9]+[GgMmTt][Bb]\+?$', split[1]):
+            manufacturer, memory = split
+        elif len(split) == 1 and re.match(r'^[0-9]+[GgMmTt][Bb]\+?$', split[0]):
+            memory = split[0]
+        else:
+            # it is just an accelerator name, not a memory size
+            return [(accelerators, True)]
+        # we know we have some case of manufacturer, memory, count, now we
+        # need to convert that to a list of possible accelerators
+        memory_parsed = resources_utils.parse_memory_resource(memory,
+                                                              'accelerators',
+                                                              allow_plus=True)
+        plus = memory_parsed[-1] == '+'
+        if plus:
+            memory_parsed = memory_parsed[:-1]
+        memory_gb = int(memory_parsed)
+        accelerators = [
+            (f'{device}:{count}', False)
+            for device in accelerator_registry.get_devices_by_memory(
+                memory_gb, plus, manufacturer=manufacturer)
+        ]
+        return accelerators
     @classmethod
     def from_yaml_config(
         cls, config: Optional[Dict[str, Any]]
     ) -> Union[Set['Resources'], List['Resources']]:
+        """Creates Resources objects from a YAML config.
+        Args:
+            config: A dict of resource config.
+        Returns:
+            A set of Resources objects if any_of is specified, otherwise a list
+            of Resources objects if ordered is specified, otherwise a set with
+            a single Resources object.
+        """
         if config is None:
             return {Resources()}
@@ -1903,13 +1997,48 @@ class Resources:
         accelerators = config.get('accelerators')
         if config and accelerators is not None:
             if isinstance(accelerators, str):
-                accelerators = {accelerators}
+                accelerators_list = cls._parse_accelerators_from_str(
+                    accelerators)
             elif isinstance(accelerators, dict):
-                accelerators = [
+                accelerator_names = [
                     f'{k}:{v}' if v is not None else f'{k}'
                     for k, v in accelerators.items()
                 ]
-                accelerators = set(accelerators)
+                accelerators_list = []
+                for accel_name in accelerator_names:
+                    parsed_accels = cls._parse_accelerators_from_str(accel_name)
+                    accelerators_list.extend(parsed_accels)
+            elif isinstance(accelerators, list) or isinstance(
+                    accelerators, set):
+                accelerators_list = []
+                for accel_name in accelerators:
+                    parsed_accels = cls._parse_accelerators_from_str(accel_name)
+                    accelerators_list.extend(parsed_accels)
+            else:
+                assert False, ('Invalid accelerators type:'
+                               f'{type(accelerators)}')
+            # now that accelerators is a list, we need to decide which to
+            # include in the final set, however, there may be multiple copies
+            # of the same accelerator, some given by name by the user and the
+            # other copy being given by memory size. In this case, we only care
+            # about the user specified ones (so we can give a warning if it
+            # doesn't exist).
+            accel_to_user_specified: Dict[str, bool] = collections.OrderedDict()
+            for accel, user_specified in accelerators_list:
+                # If this accelerator is not in dict yet, or if current one is
+                # user specified and existing one is not, update the entry
+                accel_to_user_specified[accel] = (user_specified or
+                                                  accel_to_user_specified.get(
+                                                      accel, False))
+            # only time we care about ordered is when we are given a list,
+            # otherwise we default to a set
+            accelerators_type = list if isinstance(accelerators, list) else set
+            accelerators = accelerators_type([
+                (accel, user_specified)
+                for accel, user_specified in accel_to_user_specified.items()
+            ])
             if len(accelerators) > 1 and ordered_configs:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
@@ -1935,20 +2064,20 @@ class Resources:
             # In Task, we store a list of resources, each with 1 accelerator.
             # This for loop is for format conversion.
             tmp_resources_list = []
-            for acc in accelerators:
+            for acc, user_specified in accelerators:
                 tmp_resource = config.copy()
                 tmp_resource['accelerators'] = acc
+                if not user_specified:
+                    tmp_resource['_no_missing_accel_warnings'] = True
                 tmp_resources_list.append(
                     Resources._from_yaml_config_single(tmp_resource))
             assert isinstance(accelerators, (list, set)), accelerators
             return type(accelerators)(tmp_resources_list)
         return {Resources._from_yaml_config_single(config)}
     @classmethod
     def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
         resources_fields: Dict[str, Any] = {}
         # Extract infra field if present
@@ -2010,6 +2139,8 @@ class Resources:
             # although it will end up being an int, we don't know at this point
             # if it has units or not, so we store it as a string
             resources_fields['disk_size'] = str(resources_fields['disk_size'])
+        resources_fields['_no_missing_accel_warnings'] = config.pop(
+            '_no_missing_accel_warnings', None)
         assert not config, f'Invalid resource args: {config.keys()}'
         return Resources(**resources_fields)
@@ -2060,6 +2191,9 @@ class Resources:
             config['volumes'] = volumes
         if self._autostop_config is not None:
             config['autostop'] = self._autostop_config.to_yaml_config()
+        add_if_not_none('_no_missing_accel_warnings',
+                        self._no_missing_accel_warnings)
         add_if_not_none('priority', self.priority)
         if self._docker_login_config is not None:
             config['_docker_login_config'] = dataclasses.asdict(
@@ -2232,6 +2366,10 @@ class Resources:
         if version < 27:
             self._priority = None
+        if version < 28:
+            self._no_missing_accel_warnings = state.get(
+                '_no_missing_accel_warnings', None)
         self.__dict__.update(state)

sky/server/common.py CHANGED Viewed

@@ -165,14 +165,25 @@ def set_api_cookie_jar(cookie_jar: CookieJar,
     if not cookie_path.parent.exists():
         cookie_path.parent.mkdir(parents=True, exist_ok=True)
-    file_cookie_jar = MozillaCookieJar(cookie_path)
+    # Writing directly to the cookie jar path can race with other processes that
+    # are reading the cookie jar, making it look malformed. Instead, write to a
+    # temporary file and then move it to the final location.
+    # Avoid hardcoding the tmp file path, since it could cause a race with other
+    # processes that are also writing to the tmp file.
+    with tempfile.NamedTemporaryFile(dir=cookie_path.parent,
+                                     delete=False) as tmp_file:
+        tmp_cookie_path = tmp_file.name
+    file_cookie_jar = MozillaCookieJar(tmp_cookie_path)
     if cookie_path.exists():
-        file_cookie_jar.load()
+        file_cookie_jar.load(str(cookie_path))
     for cookie in cookie_jar:
         file_cookie_jar.set_cookie(cookie)
     file_cookie_jar.save()
+    # Move the temporary file to the final location.
+    os.replace(tmp_cookie_path, cookie_path)
 def get_cookies_from_response(
         response: 'requests.Response') -> requests.cookies.RequestsCookieJar:

skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250703__py3-none-any.whl

skypilot-nightly 1.0.0.dev20250701__py3-none-any.whl → 1.0.0.dev20250703__py3-none-any.whl