PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (207) hide show

sky/__init__.py +4 -2
sky/adaptors/slurm.py +159 -72
sky/backends/backend_utils.py +52 -10
sky/backends/cloud_vm_ray_backend.py +192 -32
sky/backends/task_codegen.py +40 -2
sky/catalog/data_fetchers/fetch_gcp.py +9 -1
sky/catalog/data_fetchers/fetch_nebius.py +1 -1
sky/catalog/data_fetchers/fetch_vast.py +4 -2
sky/catalog/seeweb_catalog.py +30 -15
sky/catalog/shadeform_catalog.py +5 -2
sky/catalog/slurm_catalog.py +0 -7
sky/catalog/vast_catalog.py +30 -6
sky/check.py +11 -8
sky/client/cli/command.py +106 -54
sky/client/interactive_utils.py +190 -0
sky/client/sdk.py +8 -0
sky/client/sdk_async.py +9 -0
sky/clouds/aws.py +60 -2
sky/clouds/azure.py +2 -0
sky/clouds/kubernetes.py +2 -0
sky/clouds/runpod.py +38 -7
sky/clouds/slurm.py +44 -12
sky/clouds/ssh.py +1 -1
sky/clouds/vast.py +30 -17
sky/core.py +69 -1
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -1
sky/dashboard/out/infra.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs/pools/[pool].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/plugins/[...slug].html +1 -1
sky/dashboard/out/users.html +1 -1
sky/dashboard/out/volumes.html +1 -1
sky/dashboard/out/workspace/new.html +1 -1
sky/dashboard/out/workspaces/[name].html +1 -1
sky/dashboard/out/workspaces.html +1 -1
sky/data/data_utils.py +26 -12
sky/data/mounting_utils.py +29 -4
sky/global_user_state.py +108 -16
sky/jobs/client/sdk.py +8 -3
sky/jobs/controller.py +191 -31
sky/jobs/recovery_strategy.py +109 -11
sky/jobs/server/core.py +81 -4
sky/jobs/server/server.py +14 -0
sky/jobs/state.py +417 -19
sky/jobs/utils.py +73 -80
sky/models.py +9 -0
sky/optimizer.py +2 -1
sky/provision/__init__.py +11 -9
sky/provision/kubernetes/utils.py +122 -15
sky/provision/kubernetes/volume.py +52 -17
sky/provision/provisioner.py +2 -1
sky/provision/runpod/instance.py +3 -1
sky/provision/runpod/utils.py +13 -1
sky/provision/runpod/volume.py +25 -9
sky/provision/slurm/instance.py +75 -29
sky/provision/slurm/utils.py +213 -107
sky/provision/vast/utils.py +1 -0
sky/resources.py +135 -13
sky/schemas/api/responses.py +4 -0
sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
sky/schemas/db/spot_jobs/009_job_events.py +32 -0
sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
sky/schemas/db/spot_jobs/011_add_links.py +34 -0
sky/schemas/generated/jobsv1_pb2.py +9 -5
sky/schemas/generated/jobsv1_pb2.pyi +12 -0
sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
sky/serve/serve_utils.py +232 -40
sky/server/common.py +17 -0
sky/server/constants.py +1 -1
sky/server/metrics.py +6 -3
sky/server/plugins.py +16 -0
sky/server/requests/payloads.py +18 -0
sky/server/requests/request_names.py +2 -0
sky/server/requests/requests.py +28 -10
sky/server/requests/serializers/encoders.py +5 -0
sky/server/requests/serializers/return_value_serializers.py +14 -4
sky/server/server.py +434 -107
sky/server/uvicorn.py +5 -0
sky/setup_files/MANIFEST.in +1 -0
sky/setup_files/dependencies.py +21 -10
sky/sky_logging.py +2 -1
sky/skylet/constants.py +22 -5
sky/skylet/executor/slurm.py +4 -6
sky/skylet/job_lib.py +89 -4
sky/skylet/services.py +18 -3
sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
sky/templates/kubernetes-ray.yml.j2 +4 -6
sky/templates/slurm-ray.yml.j2 +32 -2
sky/templates/websocket_proxy.py +18 -41
sky/users/permission.py +61 -51
sky/utils/auth_utils.py +42 -0
sky/utils/cli_utils/status_utils.py +19 -5
sky/utils/cluster_utils.py +10 -3
sky/utils/command_runner.py +256 -94
sky/utils/command_runner.pyi +16 -0
sky/utils/common_utils.py +30 -29
sky/utils/context.py +32 -0
sky/utils/db/db_utils.py +36 -6
sky/utils/db/migration_utils.py +41 -21
sky/utils/infra_utils.py +5 -1
sky/utils/instance_links.py +139 -0
sky/utils/interactive_utils.py +49 -0
sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
sky/utils/kubernetes/rsync_helper.sh +5 -1
sky/utils/plugin_extensions/__init__.py +14 -0
sky/utils/plugin_extensions/external_failure_source.py +176 -0
sky/utils/resources_utils.py +10 -8
sky/utils/rich_utils.py +9 -11
sky/utils/schemas.py +63 -20
sky/utils/status_lib.py +7 -0
sky/utils/subprocess_utils.py +17 -0
sky/volumes/client/sdk.py +6 -3
sky/volumes/server/core.py +65 -27
sky_templates/ray/start_cluster +8 -4
{skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
{skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
/sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
/sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
{skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0

sky/provision/slurm/utils.py CHANGED Viewed

@@ -1,7 +1,10 @@
 """Slurm utilities for SkyPilot."""
+import json
 import math
 import os
 import re
+import shlex
+import time
 from typing import Any, Dict, List, Optional, Tuple, Union
 from paramiko.config import SSHConfig
@@ -9,15 +12,39 @@ from paramiko.config import SSHConfig
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import slurm
+from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils.db import kv_cache
 logger = sky_logging.init_logger(__name__)
-# TODO(jwj): Choose commonly used default values.
 DEFAULT_SLURM_PATH = '~/.slurm/config'
-DEFAULT_CLUSTER_NAME = 'localcluster'
-DEFAULT_PARTITION = 'dev'
+SLURM_MARKER_FILE = '.sky_slurm_cluster'
+# Regex pattern for parsing GPU GRES strings.
+# Format: 'gpu[:acc_type]:acc_count(optional_extra_info)'
+# Examples: 'gpu:8', 'gpu:H100:8', 'gpu:nvidia_h100_80gb_hbm3:8(S:0-1)'
+_GRES_GPU_PATTERN = re.compile(r'\bgpu:(?:(?P<type>[^:(]+):)?(?P<count>\d+)',
+                               re.IGNORECASE)
+_SLURM_NODES_INFO_CACHE_TTL = 30 * 60
+def get_gpu_type_and_count(gres_str: str) -> Tuple[Optional[str], int]:
+    """Parses GPU type and count from a GRES string.
+    Returns:
+        A tuple of (GPU type, GPU count). If no GPU is found, returns (None, 0).
+    """
+    match = _GRES_GPU_PATTERN.search(gres_str)
+    if not match:
+        return None, 0
+    return match.group('type'), int(match.group('count'))
+# SSH host key filename for sshd.
+SLURM_SSHD_HOST_KEY_FILENAME = 'skypilot_host_key'
 def get_slurm_ssh_config() -> SSHConfig:
@@ -27,6 +54,42 @@ def get_slurm_ssh_config() -> SSHConfig:
     return slurm_config
+@annotations.lru_cache(scope='request')
+def _get_slurm_nodes_info(cluster: str) -> List[slurm.NodeInfo]:
+    cache_key = f'slurm:nodes_info:{cluster}'
+    cached = kv_cache.get_cache_entry(cache_key)
+    if cached is not None:
+        logger.debug(f'Slurm nodes info found in cache ({cache_key})')
+        return [slurm.NodeInfo(**item) for item in json.loads(cached)]
+    ssh_config = get_slurm_ssh_config()
+    ssh_config_dict = ssh_config.lookup(cluster)
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+        ssh_proxy_jump=ssh_config_dict.get('proxyjump', None),
+    )
+    nodes_info = client.info_nodes()
+    try:
+        # Nodes in a cluster are unlikely to change frequently, so cache
+        # the result for a short period of time.
+        kv_cache.add_or_update_cache_entry(
+            cache_key, json.dumps([n._asdict() for n in nodes_info]),
+            time.time() + _SLURM_NODES_INFO_CACHE_TTL)
+    except Exception as e:  # pylint: disable=broad-except
+        # Catch the error and continue.
+        # Failure to cache the result is not critical to the
+        # success of this function.
+        logger.debug(f'Failed to cache slurm nodes info for {cluster}: '
+                     f'{common_utils.format_exception(e)}')
+    return nodes_info
 class SlurmInstanceType:
     """Class to represent the "Instance Type" in a Slurm cluster.
@@ -170,35 +233,23 @@ def instance_id(job_id: str, node: str) -> str:
     return f'job{job_id}-{node}'
-def get_cluster_name_from_config(provider_config: Dict[str, Any]) -> str:
-    """Return the cluster name from the provider config.
-    The concept of cluster can be mapped to a cloud region.
-    """
-    return provider_config.get('cluster', DEFAULT_CLUSTER_NAME)
 def get_partition_from_config(provider_config: Dict[str, Any]) -> str:
     """Return the partition from the provider config.
     The concept of partition can be mapped to a cloud zone.
     """
-    return provider_config.get('partition', DEFAULT_PARTITION)
+    partition = provider_config.get('partition')
+    if partition is None:
+        raise ValueError('Partition not specified in provider config.')
+    return partition
 @annotations.lru_cache(scope='request')
-def get_cluster_default_partition(cluster_name: str) -> str:
+def get_cluster_default_partition(cluster_name: str) -> Optional[str]:
     """Get the default partition for a Slurm cluster.
     Queries the Slurm cluster for the partition marked with an asterisk (*)
-    in sinfo output. Falls back to DEFAULT_PARTITION if the query fails or
-    no default partition is found.
-    Args:
-        cluster_name: Name of the Slurm cluster.
-    Returns:
-        The default partition name for the cluster.
+    in sinfo output. If no default partition is marked, returns None.
     """
     try:
         ssh_config = get_slurm_ssh_config()
@@ -214,16 +265,10 @@ def get_cluster_default_partition(cluster_name: str) -> str:
         ssh_config_dict['user'],
         ssh_config_dict['identityfile'][0],
         ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+        ssh_proxy_jump=ssh_config_dict.get('proxyjump', None),
     )
-    default_partition = client.get_default_partition()
-    if default_partition is None:
-        # TODO(kevin): Have a way to specify default partition in
-        # ~/.sky/config.yaml if needed, in case a Slurm cluster
-        # really does not have a default partition.
-        raise ValueError('No default partition found for cluster '
-                         f'{cluster_name}.')
-    return default_partition
+    return client.get_default_partition()
 def get_all_slurm_cluster_names() -> List[str]:
@@ -296,7 +341,7 @@ def check_instance_fits(
     """
     # Get Slurm node list in the given cluster (region).
     try:
-        ssh_config = get_slurm_ssh_config()
+        nodes = _get_slurm_nodes_info(cluster)
     except FileNotFoundError:
         return (False, f'Could not query Slurm cluster {cluster} '
                 f'because the Slurm configuration file '
@@ -305,20 +350,13 @@ def check_instance_fits(
         return (False, f'Could not query Slurm cluster {cluster} '
                 f'because Slurm SSH configuration at {DEFAULT_SLURM_PATH} '
                 f'could not be loaded: {common_utils.format_exception(e)}.')
-    ssh_config_dict = ssh_config.lookup(cluster)
-    client = slurm.SlurmClient(
-        ssh_config_dict['hostname'],
-        int(ssh_config_dict.get('port', 22)),
-        ssh_config_dict['user'],
-        ssh_config_dict['identityfile'][0],
-        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
-    )
-    nodes = client.info_nodes()
     default_partition = get_cluster_default_partition(cluster)
     def is_default_partition(node_partition: str) -> bool:
+        if default_partition is None:
+            return False
         # info_nodes does not strip the '*' from the default partition name.
         # But non-default partition names can also end with '*',
         # so we need to check whether the partition name without the '*'
@@ -352,27 +390,18 @@ def check_instance_fits(
         assert acc_count is not None, (acc_type, acc_count)
         gpu_nodes = []
-        # GRES string format: 'gpu:acc_type:acc_count(optional_extra_info)'
-        # Examples:
-        # - gpu:nvidia_h100_80gb_hbm3:8(S:0-1)
-        # - gpu:a10g:8
-        # - gpu:l4:1
-        gres_pattern = re.compile(r'^gpu:([^:]+):(\d+)')
         for node_info in nodes:
-            gres_str = node_info.gres
             # Extract the GPU type and count from the GRES string
-            match = gres_pattern.match(gres_str)
-            if not match:
+            node_acc_type, node_acc_count = get_gpu_type_and_count(
+                node_info.gres)
+            if node_acc_type is None:
                 continue
-            node_acc_type = match.group(1).lower()
-            node_acc_count = int(match.group(2))
             # TODO(jwj): Handle status check.
             # Check if the node has the requested GPU type and at least the
             # requested count
-            if (node_acc_type == acc_type.lower() and
+            if (node_acc_type.lower() == acc_type.lower() and
                     node_acc_count >= acc_count):
                 gpu_nodes.append(node_info)
         if len(gpu_nodes) == 0:
@@ -394,6 +423,51 @@ def check_instance_fits(
     return fits, reason
+# GRES names are highly unlikely to change within a cluster.
+# TODO(kevin): Cache using sky/utils/db/kv_cache.py too.
+@annotations.lru_cache(scope='global', maxsize=10)
+def get_gres_gpu_type(cluster: str, requested_gpu_type: str) -> str:
+    """Get the actual GPU type as it appears in the cluster's GRES.
+    Args:
+        cluster: Name of the Slurm cluster.
+        requested_gpu_type: The GPU type requested by the user.
+    Returns:
+        The actual GPU type as it appears in the cluster's GRES string.
+        Falls back to the requested type if not found.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+        ssh_config_dict = ssh_config.lookup(cluster)
+        client = slurm.SlurmClient(
+            ssh_config_dict['hostname'],
+            int(ssh_config_dict.get('port', 22)),
+            ssh_config_dict['user'],
+            ssh_config_dict['identityfile'][0],
+            ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+            ssh_proxy_jump=ssh_config_dict.get('proxyjump', None),
+        )
+        nodes = client.info_nodes()
+        for node_info in nodes:
+            node_gpu_type, _ = get_gpu_type_and_count(node_info.gres)
+            if node_gpu_type is None:
+                continue
+            if node_gpu_type.lower() == requested_gpu_type.lower():
+                return node_gpu_type
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(
+            'Failed to determine the exact GPU GRES type from the Slurm '
+            f'cluster {cluster!r}. Falling back to '
+            f'{requested_gpu_type.lower()!r}. This may cause issues if the '
+            f'casing is incorrect. Error: {common_utils.format_exception(e)}')
+    # GRES names are more commonly in lowercase from what we've seen so far.
+    return requested_gpu_type.lower()
 def _get_slurm_node_info_list(
         slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
     """Gathers detailed information about each node in the Slurm cluster.
@@ -423,6 +497,7 @@ def _get_slurm_node_info_list(
         slurm_config_dict['user'],
         slurm_config_dict['identityfile'][0],
         ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+        ssh_proxy_jump=slurm_config_dict.get('proxyjump', None),
     )
     node_infos = slurm_client.info_nodes()
@@ -434,8 +509,8 @@ def _get_slurm_node_info_list(
     # 2. Process each node, aggregating partitions per node
     slurm_nodes_info: Dict[str, Dict[str, Any]] = {}
-    gres_gpu_pattern = re.compile(r'((gpu)(?::([^:]+))?:(\d+))')
+    nodes_to_jobs_gres = slurm_client.get_all_jobs_gres()
     for node_info in node_infos:
         node_name = node_info.node
         state = node_info.state
@@ -447,43 +522,27 @@ def _get_slurm_node_info_list(
             continue
         # Extract GPU info from GRES
-        gres_match = gres_gpu_pattern.search(gres_str)
-        total_gpus = 0
-        gpu_type_from_sinfo = None  # Default to None for CPU-only nodes
-        if gres_match:
-            try:
-                total_gpus = int(gres_match.group(4))
-                if gres_match.group(3):
-                    gpu_type_from_sinfo = gres_match.group(3).upper()
-                # If total_gpus > 0 but no type, default to 'GPU'
-                elif total_gpus > 0:
-                    gpu_type_from_sinfo = 'GPU'
-            except ValueError:
-                logger.warning(
-                    f'Could not parse GPU count from GRES for {node_name}.')
-        # Get allocated GPUs via squeue
+        node_gpu_type, total_gpus = get_gpu_type_and_count(gres_str)
+        if total_gpus > 0:
+            if node_gpu_type is not None:
+                node_gpu_type = node_gpu_type.upper()
+            else:
+                node_gpu_type = 'GPU'
+        # Get allocated GPUs
         allocated_gpus = 0
         # TODO(zhwu): move to enum
         if state in ('alloc', 'mix', 'drain', 'drng', 'drained', 'resv',
                      'comp'):
-            try:
-                node_jobs = slurm_client.get_node_jobs(node_name)
-                if node_jobs:
-                    job_gres_pattern = re.compile(r'gpu(?::[^:]+)*:(\d+)')
-                    for job_line in node_jobs:
-                        gres_job_match = job_gres_pattern.search(job_line)
-                        if gres_job_match:
-                            allocated_gpus += int(gres_job_match.group(1))
-            except Exception as e:  # pylint: disable=broad-except
-                if state == 'alloc':
-                    # We can infer allocated GPUs only if the node is
-                    # in 'alloc' state.
-                    allocated_gpus = total_gpus
-                else:
-                    # Otherwise, just raise the error.
-                    raise e
+            jobs_gres = nodes_to_jobs_gres.get(node_name, [])
+            if jobs_gres:
+                for job_line in jobs_gres:
+                    _, job_gpu_count = get_gpu_type_and_count(job_line)
+                    allocated_gpus += job_gpu_count
+            elif state == 'alloc':
+                # If no GRES info found but node is fully allocated,
+                # assume all GPUs are in use.
+                allocated_gpus = total_gpus
         elif state == 'idle':
             allocated_gpus = 0
@@ -493,27 +552,16 @@ def _get_slurm_node_info_list(
                                                                  'maint') else 0
         free_gpus = max(0, free_gpus)
-        # Get CPU/Mem info via scontrol
-        vcpu_total = 0
-        mem_gb = 0.0
-        try:
-            node_details = slurm_client.node_details(node_name)
-            vcpu_total = int(node_details.get('CPUTot', '0'))
-            mem_gb = float(node_details.get('RealMemory', '0')) / 1024.0
-        except Exception as e:  # pylint: disable=broad-except
-            logger.warning(
-                f'Failed to get CPU/memory info for {node_name}: {e}')
         slurm_nodes_info[node_name] = {
             'node_name': node_name,
             'slurm_cluster_name': slurm_cluster_name,
             'partitions': [partition],
             'node_state': state,
-            'gpu_type': gpu_type_from_sinfo,
+            'gpu_type': node_gpu_type,
             'total_gpus': total_gpus,
             'free_gpus': free_gpus,
-            'vcpu_count': vcpu_total,
-            'memory_gb': round(mem_gb, 2),
+            'vcpu_count': node_info.cpus,
+            'memory_gb': round(node_info.memory_gb, 2),
         }
     for node_info in slurm_nodes_info.values():
@@ -539,10 +587,15 @@ def slurm_node_info(
     return node_list
-def is_inside_slurm_job() -> bool:
-    return os.environ.get('SLURM_JOB_ID') is not None
+def is_inside_slurm_cluster() -> bool:
+    # Check for the marker file in the current home directory. When run by
+    # the skylet on a compute node, the HOME environment variable is set to
+    # the cluster's sky home directory by the SlurmCommandRunner.
+    marker_file = os.path.join(os.path.expanduser('~'), SLURM_MARKER_FILE)
+    return os.path.exists(marker_file)
+@annotations.lru_cache(scope='request')
 def get_partitions(cluster_name: str) -> List[str]:
     """Get unique partition names available in a Slurm cluster.
@@ -565,6 +618,7 @@ def get_partitions(cluster_name: str) -> List[str]:
             slurm_config_dict['user'],
             slurm_config_dict['identityfile'][0],
             ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+            ssh_proxy_jump=slurm_config_dict.get('proxyjump', None),
         )
         partitions_info = client.get_partitions_info()
@@ -577,7 +631,59 @@ def get_partitions(cluster_name: str) -> List[str]:
                 other_partitions.append(partition.name)
         return default_partitions + sorted(other_partitions)
     except Exception as e:  # pylint: disable=broad-except
-        logger.warning(
-            f'Failed to get partitions for cluster {cluster_name}: {e}')
-        # Fall back to default partition if query fails
-        return [DEFAULT_PARTITION]
+        raise ValueError(
+            f'Failed to get partitions for cluster '
+            f'{cluster_name}: {common_utils.format_exception(e)}') from e
+def srun_sshd_command(
+    job_id: str,
+    target_node: str,
+    unix_user: str,
+) -> str:
+    """Build srun command for launching sshd -i inside a Slurm job.
+    This is used by the API server to proxy SSH connections to Slurm jobs
+    via sshd running in inetd mode within srun.
+    Args:
+        job_id: The Slurm job ID
+        target_node: The target compute node hostname
+        unix_user: The Unix user for the job
+    Returns:
+        A single shell-quoted command string to be appended to the ssh base command
+    """
+    # We use ~username to ensure we use the real home of the user ssh'ing in,
+    # because we override the home directory in SlurmCommandRunner.run.
+    user_home_ssh_dir = f'~{unix_user}/.ssh'
+    return shlex.join([
+        'srun',
+        '--quiet',
+        '--unbuffered',
+        '--overlap',
+        '--jobid',
+        job_id,
+        '-w',
+        target_node,
+        '/usr/sbin/sshd',
+        '-i',  # Uses stdin/stdout
+        '-e',  # Writes errors to stderr
+        '-f',  # Use /dev/null to avoid reading system sshd_config
+        '/dev/null',
+        '-h',
+        f'{user_home_ssh_dir}/{SLURM_SSHD_HOST_KEY_FILENAME}',
+        '-o',
+        f'AuthorizedKeysFile={user_home_ssh_dir}/authorized_keys',
+        '-o',
+        'PasswordAuthentication=no',
+        '-o',
+        'PubkeyAuthentication=yes',
+        # If UsePAM is enabled, we will not be able to run sshd(8)
+        # as a non-root user.
+        # See https://man7.org/linux/man-pages/man5/sshd_config.5.html
+        '-o',
+        'UsePAM=no',
+        '-o',
+        f'AcceptEnv={constants.SKY_CLUSTER_NAME_ENV_VAR_KEY}',
+    ])

sky/provision/vast/utils.py CHANGED Viewed

@@ -98,6 +98,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
     ]
     if secure_only:
         query.append('datacenter=true')
+        query.append('hosting_type>=1')
     query_str = ' '.join(query)
     instance_list = vast.vast().search_offers(query=query_str)

sky/resources.py CHANGED Viewed

@@ -219,6 +219,9 @@ class Resources:
             - strategy: the recovery strategy to use.
             - max_restarts_on_errors: the max number of restarts on user code
               errors.
+            - recover_on_exit_codes: a list of exit codes that should trigger
+              job recovery. If any task exits with a code in this list, the job
+              will be recovered regardless of max_restarts_on_errors limit.
           region: the region to use. Deprecated. Use `infra` instead.
           zone: the zone to use. Deprecated. Use `infra` instead.
@@ -569,7 +572,8 @@ class Resources:
         if self.cloud is not None and self._instance_type is not None:
             vcpus, _ = self.cloud.get_vcpus_mem_from_instance_type(
                 self._instance_type)
-            return str(vcpus)
+            if vcpus is not None:
+                return str(vcpus)
         return None
     @property
@@ -1645,6 +1649,7 @@ class Resources:
         other: Union[List['Resources'], 'Resources'],
         requested_num_nodes: int = 1,
         check_ports: bool = False,
+        check_cloud: bool = True,
     ) -> bool:
         """Returns whether this resources is less demanding than the other.
@@ -1654,24 +1659,29 @@ class Resources:
             requested_num_nodes: Number of nodes that the current task
               requests from the cluster.
             check_ports: Whether to check the ports field.
+            check_cloud: Whether we check the cloud/region/zone fields. Useful
+              for resources that don't have cloud specified, like some launched
+              resources.
         """
         if isinstance(other, list):
             resources_list = [self.less_demanding_than(o) for o in other]
             return requested_num_nodes <= sum(resources_list)
-        assert other.cloud is not None, 'Other cloud must be specified'
+        if check_cloud:
+            assert other.cloud is not None, 'Other cloud must be specified'
-        if self.cloud is not None and not self.cloud.is_same_cloud(other.cloud):
-            return False
-        # self.cloud <= other.cloud
+            if self.cloud is not None and not self.cloud.is_same_cloud(
+                    other.cloud):
+                return False
+            # self.cloud <= other.cloud
-        if self.region is not None and self.region != other.region:
-            return False
-        # self.region <= other.region
+            if self.region is not None and self.region != other.region:
+                return False
+            # self.region <= other.region
-        if self.zone is not None and self.zone != other.zone:
-            return False
-        # self.zone <= other.zone
+            if self.zone is not None and self.zone != other.zone:
+                return False
+            # self.zone <= other.zone
         if self.image_id is not None:
             if other.image_id is None:
@@ -1743,8 +1753,10 @@ class Resources:
             # On Kubernetes, we can't launch a task that requires FUSE on a pod
             # that wasn't initialized with FUSE support at the start.
             # Other clouds don't have this limitation.
-            if other.cloud.is_same_cloud(clouds.Kubernetes()):
-                return False
+            if check_cloud:
+                assert other.cloud is not None
+                if other.cloud.is_same_cloud(clouds.Kubernetes()):
+                    return False
         # self <= other
         return True
@@ -1792,6 +1804,101 @@ class Resources:
             self._docker_login_config is None,
         ])
+    def __add__(self, other: Optional['Resources']) -> Optional['Resources']:
+        """Add two Resources objects together.
+        Only cpus, memory and accelerators are summed; no other field of
+        either operand is passed to the returned object.
+        Args:
+            other: Another Resources object to add (may be None)
+        Returns:
+            New Resources object with summed resources, or self unchanged if
+            other is None. Despite the Optional return annotation, no path
+            in this method returns None.
+        """
+        if other is None:
+            return self
+        # Sum CPUs
+        # _parse_value yields a float or None (a trailing '+' is dropped).
+        self_cpus = _parse_value(self.cpus)
+        other_cpus = _parse_value(other.cpus)
+        total_cpus = None
+        # The total stays None only when neither side has a value; otherwise
+        # a missing side counts as 0.
+        if self_cpus is not None or other_cpus is not None:
+            total_cpus = (self_cpus or 0) + (other_cpus or 0)
+        # Sum memory
+        # Same rule as for CPUs.
+        self_memory = _parse_value(self.memory)
+        other_memory = _parse_value(other.memory)
+        total_memory = None
+        if self_memory is not None or other_memory is not None:
+            total_memory = (self_memory or 0) + (other_memory or 0)
+        # Sum accelerators
+        # Counts are accumulated per accelerator type as floats.
+        total_accelerators = {}
+        if self.accelerators:
+            for acc_type, count in self.accelerators.items():
+                total_accelerators[acc_type] = float(count)
+        if other.accelerators:
+            for acc_type, count in other.accelerators.items():
+                if acc_type not in total_accelerators:
+                    total_accelerators[acc_type] = 0
+                total_accelerators[acc_type] += float(count)
+        # cpus and memory are passed as strings; an empty accelerator dict is
+        # passed as None.
+        return Resources(
+            cpus=str(total_cpus) if total_cpus is not None else None,
+            memory=str(total_memory) if total_memory is not None else None,
+            accelerators=total_accelerators if total_accelerators else None)
+    def __sub__(self, other: Optional['Resources']) -> 'Resources':
+        """Subtract another Resources object from this one.
+        Only cpus, memory and accelerators are considered; no other field of
+        either operand is passed to the returned object.
+        Args:
+            other: Resources to subtract (may be None)
+        Returns:
+            New Resources object with subtracted resources, or self unchanged
+            if other is None. A difference that would be negative is clamped
+            to 0, and a cpus/memory amount of 0 is then reported as None. An
+            accelerator type whose remaining count is 0 is omitted.
+        """
+        if other is None:
+            return self
+        # Subtract CPUs
+        # _parse_value yields a float or None (a trailing '+' is dropped).
+        self_cpus = _parse_value(self.cpus)
+        other_cpus = _parse_value(other.cpus)
+        free_cpus = None
+        # When self has no CPU value the result stays None, whatever other
+        # holds; when only other is missing, self's value is kept as is.
+        if self_cpus is not None:
+            if other_cpus is not None:
+                free_cpus = max(0, self_cpus - other_cpus)
+            else:
+                free_cpus = self_cpus
+        # Subtract memory
+        # Same rule as for CPUs.
+        self_memory = _parse_value(self.memory)
+        other_memory = _parse_value(other.memory)
+        free_memory = None
+        if self_memory is not None:
+            if other_memory is not None:
+                free_memory = max(0, self_memory - other_memory)
+            else:
+                free_memory = self_memory
+        # Subtract accelerators
+        # Only accelerator types present in self are considered; types that
+        # appear only in other are ignored.
+        free_accelerators = {}
+        if self.accelerators:
+            for acc_type, total_count in self.accelerators.items():
+                used_count = (other.accelerators.get(acc_type, 0)
+                              if other.accelerators else 0)
+                free_count = max(0, float(total_count) - float(used_count))
+                if free_count > 0:
+                    free_accelerators[acc_type] = free_count
+        # Report an exhausted resource (amount 0, or no accelerators left) as
+        # None for that field. A Resources object is always returned, even
+        # when every field ends up None.
+        free_cpus = None if free_cpus == 0 else free_cpus
+        free_memory = None if free_memory == 0 else free_memory
+        free_accelerators = None if not free_accelerators else free_accelerators
+        # NOTE(review): cpus/memory are passed here as numbers, whereas
+        # __add__ passes them as strings - confirm Resources accepts both.
+        return Resources(cpus=free_cpus,
+                         memory=free_memory,
+                         accelerators=free_accelerators)
     def copy(self, **override) -> 'Resources':
         """Returns a copy of the given Resources."""
         use_spot = self.use_spot if self._use_spot_specified else None
@@ -2456,3 +2563,18 @@ def _maybe_add_docker_prefix_to_image_id(
     for k, v in image_id_dict.items():
         if not v.startswith('docker:'):
             image_id_dict[k] = f'docker:{v}'
+def _parse_value(val):
+    """Convert a cpus/memory style value to a float, or None if not numeric.
+    Args:
+        val: None, an int/float, or a string such as '4' or '4+'.
+    Returns:
+        float(val) for numbers and numeric strings (after removing any
+        trailing '+' characters); None when val is None, is a string that
+        float() cannot parse, or is of any other type.
+    """
+    if val is None:
+        return None
+    if isinstance(val, (int, float)):
+        return float(val)
+    if isinstance(val, str):
+        # Remove '+' suffix if present
+        # (rstrip removes every trailing '+', not just one.)
+        val = val.rstrip('+')
+        try:
+            return float(val)
+        except ValueError:
+            # Non-numeric strings are treated as "no value" rather than
+            # raising.
+            return None
+    return None

skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl