PyPI - dstack - Versions diffs - 0.19.30rc1__py3-none-any.whl → 0.19.31__py3-none-any.whl - Mend

dstack 0.19.30rc1__py3-none-any.whl → 0.19.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dstack might be problematic. Click here for more details.

Files changed (47)

dstack/_internal/cli/commands/__init__.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import ClassVar, Optional
 from rich_argparse import RichHelpFormatter
 from dstack._internal.cli.services.completion import ProjectNameCompleter
+from dstack._internal.cli.utils.common import configure_logging
 from dstack._internal.core.errors import CLIError
 from dstack.api import Client
@@ -52,9 +53,16 @@ class BaseCommand(ABC):
     @abstractmethod
     def _command(self, args: argparse.Namespace):
+        self._configure_logging()
         if not self.ACCEPT_EXTRA_ARGS and args.extra_args:
             raise CLIError(f"Unrecognized arguments: {shlex.join(args.extra_args)}")
+    def _configure_logging(self) -> None:
+        """
+        Override this method to configure command-specific logging
+        """
+        configure_logging()
 class APIBaseCommand(BaseCommand):
     api: Client

dstack/_internal/cli/commands/project.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import argparse
+from typing import Any, Union
 from requests import HTTPError
 from rich.table import Table
 import dstack.api.server
 from dstack._internal.cli.commands import BaseCommand
-from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.cli.utils.common import add_row_from_dict, confirm_ask, console
 from dstack._internal.core.errors import ClientError, CLIError
 from dstack._internal.core.services.configs import ConfigManager
 from dstack._internal.utils.logging import get_logger
@@ -58,6 +59,10 @@ class ProjectCommand(BaseCommand):
         # List subcommand
         list_parser = subparsers.add_parser("list", help="List configured projects")
         list_parser.set_defaults(subfunc=self._list)
+        for parser in [self._parser, list_parser]:
+            parser.add_argument(
+                "-v", "--verbose", action="store_true", help="Show more information"
+            )
         # Set default subcommand
         set_default_parser = subparsers.add_parser("set-default", help="Set default project")
@@ -122,30 +127,32 @@ class ProjectCommand(BaseCommand):
         table = Table(box=None)
         table.add_column("PROJECT", style="bold", no_wrap=True)
         table.add_column("URL", style="grey58")
-        table.add_column("USER", style="grey58")
+        if args.verbose:
+            table.add_column("USER", style="grey58")
         table.add_column("DEFAULT", justify="center")
         for project_config in config_manager.list_project_configs():
             project_name = project_config.name
             is_default = project_name == default_project.name if default_project else False
-            # Get username from API
-            try:
-                api_client = dstack.api.server.APIClient(
-                    base_url=project_config.url, token=project_config.token
-                )
-                user_info = api_client.users.get_my_user()
-                username = user_info.username
-            except ClientError:
-                username = "(invalid token)"
-            table.add_row(
-                project_name,
-                project_config.url,
-                username,
-                "✓" if is_default else "",
-                style="bold" if is_default else None,
-            )
+            row: dict[Union[str, int], Any] = {
+                "PROJECT": project_name,
+                "URL": project_config.url,
+                "DEFAULT": "✓" if is_default else "",
+            }
+            if args.verbose:
+                # Get username from API
+                try:
+                    api_client = dstack.api.server.APIClient(
+                        base_url=project_config.url, token=project_config.token
+                    )
+                    user_info = api_client.users.get_my_user()
+                    username = user_info.username
+                except ClientError:
+                    username = "(invalid token)"
+                row["USER"] = username
+            add_row_from_dict(table, row, style="bold" if is_default else None)
         console.print(table)

dstack/_internal/cli/commands/server.py CHANGED Viewed

@@ -82,3 +82,8 @@ class ServerCommand(BaseCommand):
             log_level=uvicorn_log_level,
             workers=1,
         )
+    def _configure_logging(self) -> None:
+        # Server logging is configured in the FastAPI lifespan function.
+        # No need to configure CLI logging.
+        pass

dstack/_internal/cli/services/configurators/fleet.py CHANGED Viewed

@@ -159,12 +159,19 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator[Fle
         console.print(
             get_fleets_table(
                 [fleet],
-                verbose=_failed_provisioning(fleet),
+                verbose=_fleet_has_failed_instances(fleet),
                 format_date=local_time,
             )
         )
-        if _failed_provisioning(fleet):
-            console.print("\n[error]Some instances failed. Check the table above for errors.[/]")
+        if _fleet_has_failed_instances(fleet):
+            if _fleet_retrying(fleet):
+                console.print(
+                    "\n[error]Some instances failed. Provisioning will be retried in the background.[/]"
+                )
+            else:
+                console.print(
+                    "\n[error]Some instances failed. Check the table above for errors.[/]"
+                )
             exit(1)
     def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Namespace):
@@ -253,11 +260,11 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator[Fle
         console.print(
             get_fleets_table(
                 [fleet],
-                verbose=_failed_provisioning(fleet),
+                verbose=_fleet_has_failed_instances(fleet),
                 format_date=local_time,
             )
         )
-        if _failed_provisioning(fleet):
+        if _fleet_has_failed_instances(fleet):
             console.print("\n[error]Some instances failed. Check the table above for errors.[/]")
             exit(1)
@@ -462,13 +469,20 @@ def _finished_provisioning(fleet: Fleet) -> bool:
     return True
-def _failed_provisioning(fleet: Fleet) -> bool:
+def _fleet_has_failed_instances(fleet: Fleet) -> bool:
     for instance in fleet.instances:
         if instance.status == InstanceStatus.TERMINATED:
             return True
     return False
+def _fleet_retrying(fleet: Fleet) -> bool:
+    if fleet.spec.configuration.nodes is None:
+        return False
+    active_instances = [i for i in fleet.instances if i.status.is_active()]
+    return len(active_instances) < fleet.spec.configuration.nodes.min
 def _apply_plan(api: Client, plan: FleetPlan) -> Fleet:
     try:
         return api.client.fleets.apply_plan(

dstack/_internal/cli/utils/gpu.py CHANGED Viewed

@@ -9,7 +9,7 @@ from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_
 from dstack._internal.server.schemas.gpus import GpuGroup
-def print_gpu_json(gpu_response, run_spec, group_by_cli, api_project):
+def print_gpu_json(gpus, run_spec, group_by_cli, api_project):
     """Print GPU information in JSON format."""
     req = Requirements(
         resources=run_spec.configuration.resources,
@@ -36,7 +36,7 @@ def print_gpu_json(gpu_response, run_spec, group_by_cli, api_project):
         "gpus": [],
     }
-    for gpu_group in gpu_response.gpus:
+    for gpu_group in gpus:
         gpu_data = {
             "name": gpu_group.name,
             "memory_mib": gpu_group.memory_mib,

dstack/_internal/core/backends/aws/compute.py CHANGED Viewed

@@ -24,6 +24,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
     generate_unique_gateway_instance_name,
@@ -90,6 +91,7 @@ def _ec2client_cache_methodkey(self, ec2_client, *args, **kwargs):
 class AWSCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithReservationSupport,
     ComputeWithPlacementGroupSupport,
@@ -291,7 +293,11 @@ class AWSCompute(
                 image_id, username = self._get_image_id_and_username(
                     ec2_client=ec2_client,
                     region=instance_offer.region,
-                    cuda=len(instance_offer.instance.resources.gpus) > 0,
+                    gpu_name=(
+                        instance_offer.instance.resources.gpus[0].name
+                        if len(instance_offer.instance.resources.gpus) > 0
+                        else None
+                    ),
                     instance_type=instance_offer.instance.name,
                     image_config=self.config.os_images,
                 )
@@ -897,11 +903,13 @@ class AWSCompute(
         self,
         ec2_client: botocore.client.BaseClient,
         region: str,
-        cuda: bool,
+        gpu_name: Optional[str],
         instance_type: str,
         image_config: Optional[AWSOSImageConfig] = None,
     ) -> tuple:
-        return hashkey(region, cuda, instance_type, image_config.json() if image_config else None)
+        return hashkey(
+            region, gpu_name, instance_type, image_config.json() if image_config else None
+        )
     @cachedmethod(
         cache=lambda self: self._get_image_id_and_username_cache,
@@ -912,13 +920,13 @@ class AWSCompute(
         self,
         ec2_client: botocore.client.BaseClient,
         region: str,
-        cuda: bool,
+        gpu_name: Optional[str],
         instance_type: str,
         image_config: Optional[AWSOSImageConfig] = None,
     ) -> tuple[str, str]:
         return aws_resources.get_image_id_and_username(
             ec2_client=ec2_client,
-            cuda=cuda,
+            gpu_name=gpu_name,
             instance_type=instance_type,
             image_config=image_config,
         )

dstack/_internal/core/backends/aws/resources.py CHANGED Viewed

@@ -6,6 +6,8 @@ import botocore.exceptions
 import dstack.version as version
 from dstack._internal.core.backends.aws.models import AWSOSImageConfig
+from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
+from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
 from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError
 from dstack._internal.utils.logging import get_logger
@@ -17,14 +19,14 @@ DLAMI_OWNER_ACCOUNT_ID = "898082745236"
 def get_image_id_and_username(
     ec2_client: botocore.client.BaseClient,
-    cuda: bool,
+    gpu_name: Optional[str],
     instance_type: str,
     image_config: Optional[AWSOSImageConfig] = None,
 ) -> tuple[str, str]:
     if image_config is not None:
-        image = image_config.nvidia if cuda else image_config.cpu
+        image = image_config.nvidia if gpu_name else image_config.cpu
         if image is None:
-            logger.warning("%s image not configured", "nvidia" if cuda else "cpu")
+            logger.warning("%s image not configured", "nvidia" if gpu_name else "cpu")
             raise ComputeResourceNotFoundError()
         image_name = image.name
         image_owner = image.owner
@@ -35,9 +37,12 @@ def get_image_id_and_username(
         image_owner = DLAMI_OWNER_ACCOUNT_ID
         username = "ubuntu"
     else:
-        image_name = (
-            f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
-        )
+        if gpu_name is None:
+            image_name = f"dstack-{version.base_image}"
+        elif not requires_nvidia_proprietary_kernel_modules(gpu_name):
+            image_name = f"dstack-cuda-{version.base_image}"
+        else:
+            image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
         image_owner = DSTACK_ACCOUNT_ID
         username = "ubuntu"
     response = ec2_client.describe_images(

dstack/_internal/core/backends/azure/compute.py CHANGED Viewed

@@ -43,13 +43,16 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithGatewaySupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name,
     get_gateway_user_data,
     get_user_data,
     merge_tags,
+    requires_nvidia_proprietary_kernel_modules,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
 from dstack._internal.core.errors import ComputeError, NoCapacityError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.gateways import (
@@ -76,6 +79,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("30GB"), max=Memory.pars
 class AzureCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithGatewaySupport,
     Compute,
@@ -372,6 +376,7 @@ def _parse_config_vpc_id(vpc_id: str) -> Tuple[str, str]:
 class VMImageVariant(enum.Enum):
     GRID = enum.auto()
     CUDA = enum.auto()
+    CUDA_WITH_PROPRIETARY_KERNEL_MODULES = enum.auto()
     STANDARD = enum.auto()
     @classmethod
@@ -379,18 +384,24 @@ class VMImageVariant(enum.Enum):
         if "_A10_v5" in instance.name:
             return cls.GRID
         elif len(instance.resources.gpus) > 0:
-            return cls.CUDA
+            if not requires_nvidia_proprietary_kernel_modules(instance.resources.gpus[0].name):
+                return cls.CUDA
+            else:
+                return cls.CUDA_WITH_PROPRIETARY_KERNEL_MODULES
         else:
             return cls.STANDARD
     def get_image_name(self) -> str:
-        name = "dstack-"
         if self is self.GRID:
-            name += "grid-"
+            return f"dstack-grid-{version.base_image}"
         elif self is self.CUDA:
-            name += "cuda-"
-        name += version.base_image
-        return name
+            return f"dstack-cuda-{version.base_image}"
+        elif self is self.CUDA_WITH_PROPRIETARY_KERNEL_MODULES:
+            return f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
+        elif self is self.STANDARD:
+            return f"dstack-{version.base_image}"
+        else:
+            raise ValueError(f"Unexpected image variant {self!r}")
 _SUPPORTED_VM_SERIES_PATTERNS = [

dstack/_internal/core/backends/base/compute.py CHANGED Viewed

@@ -5,14 +5,16 @@ import string
 import threading
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
+from enum import Enum
 from functools import lru_cache
 from pathlib import Path
-from typing import Callable, Dict, List, Literal, Optional
+from typing import Callable, Dict, List, Optional
 import git
 import requests
 import yaml
 from cachetools import TTLCache, cachedmethod
+from gpuhunt import CPUArchitecture
 from dstack._internal import settings
 from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
@@ -48,8 +50,38 @@ logger = get_logger(__name__)
 DSTACK_SHIM_BINARY_NAME = "dstack-shim"
 DSTACK_RUNNER_BINARY_NAME = "dstack-runner"
 DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
+NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES = frozenset(
+    # All NVIDIA architectures prior to Turing do not support Open Kernel Modules and require
+    # proprietary modules. This list is incomplete, update when necessary.
+    [
+        "v100",
+        "p100",
+        "p40",
+        "p4",
+        "m60",
+        "m40",
+        "m4",
+        "k80",
+        "k40",
+        "k20",
+    ]
+)
-GoArchType = Literal["amd64", "arm64"]
+class GoArchType(str, Enum):
+    """
+    A subset of GOARCH values
+    """
+    AMD64 = "amd64"
+    ARM64 = "arm64"
+    def to_cpu_architecture(self) -> CPUArchitecture:
+        if self == self.AMD64:
+            return CPUArchitecture.X86
+        if self == self.ARM64:
+            return CPUArchitecture.ARM
+        assert False, self
 class Compute(ABC):
@@ -288,6 +320,15 @@ class ComputeWithCreateInstanceSupport(ABC):
             ]
+class ComputeWithPrivilegedSupport:
+    """
+    Must be subclassed to support runs with `privileged: true`.
+    All VM-based Computes (that is, Computes that use the shim) should subclass this mixin.
+    """
+    pass
 class ComputeWithMultinodeSupport:
     """
     Must be subclassed to support multinode tasks and cluster fleets.
@@ -688,14 +729,14 @@ def normalize_arch(arch: Optional[str] = None) -> GoArchType:
     If the arch is not specified, falls back to `amd64`.
     """
     if not arch:
-        return "amd64"
+        return GoArchType.AMD64
     arch_lower = arch.lower()
     if "32" in arch_lower or arch_lower in ["i386", "i686"]:
         raise ValueError(f"32-bit architectures are not supported: {arch}")
     if arch_lower.startswith("x86") or arch_lower.startswith("amd"):
-        return "amd64"
+        return GoArchType.AMD64
     if arch_lower.startswith("arm") or arch_lower.startswith("aarch"):
-        return "arm64"
+        return GoArchType.ARM64
     raise ValueError(f"Unsupported architecture: {arch}")
@@ -711,8 +752,7 @@ def get_dstack_runner_download_url(arch: Optional[str] = None) -> str:
             "/{version}/binaries/dstack-runner-linux-{arch}"
         )
     version = get_dstack_runner_version()
-    arch = normalize_arch(arch)
-    return url_template.format(version=version, arch=arch)
+    return url_template.format(version=version, arch=normalize_arch(arch).value)
 def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
@@ -727,8 +767,7 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
             "/{version}/binaries/dstack-shim-linux-{arch}"
         )
     version = get_dstack_runner_version()
-    arch = normalize_arch(arch)
-    return url_template.format(version=version, arch=arch)
+    return url_template.format(version=version, arch=normalize_arch(arch).value)
 def get_setup_cloud_instance_commands(
@@ -969,3 +1008,12 @@ def merge_tags(
         for k, v in resource_tags.items():
             res.setdefault(k, v)
     return res
+def requires_nvidia_proprietary_kernel_modules(gpu_name: str) -> bool:
+    """
+    Returns:
+        Whether this NVIDIA GPU requires NVIDIA proprietary kernel modules
+        instead of open kernel modules.
+    """
+    return gpu_name.lower() in NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES

dstack/_internal/core/backends/base/offers.py CHANGED Viewed

@@ -22,6 +22,7 @@ from dstack._internal.utils.common import get_or_error
 SUPPORTED_GPUHUNT_FLAGS = [
     "oci-spot",
     "lambda-arm",
+    "gcp-a4",
 ]

dstack/_internal/core/backends/cloudrift/compute.py CHANGED Viewed

@@ -4,6 +4,7 @@ from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,7 @@ logger = get_logger(__name__)
 class CloudRiftCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: CloudRiftConfig):

dstack/_internal/core/backends/cudo/compute.py CHANGED Viewed

@@ -6,6 +6,7 @@ from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithFilteredOffersCached,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_shim_commands,
 )
@@ -32,6 +33,7 @@ MAX_RESOURCE_NAME_LEN = 30
 class CudoCompute(
     ComputeWithFilteredOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: CudoConfig):

dstack/_internal/core/backends/datacrunch/compute.py CHANGED Viewed

@@ -8,6 +8,7 @@ from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_shim_commands,
 )
@@ -39,6 +40,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=IMAGE_SIZE, max=None)
 class DataCrunchCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: DataCrunchConfig):

dstack/_internal/core/backends/digitalocean_base/compute.py CHANGED Viewed

@@ -7,6 +7,7 @@ from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
 )
@@ -40,6 +41,7 @@ DOCKER_INSTALL_COMMANDS = [
 class BaseDigitalOceanCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: BaseDigitalOceanConfig, api_url: str, type: BackendType):

dstack/_internal/core/backends/features.py CHANGED Viewed

@@ -4,6 +4,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
 )
@@ -38,6 +39,10 @@ BACKENDS_WITH_CREATE_INSTANCE_SUPPORT = _get_backends_with_compute_feature(
     configurator_classes=_configurator_classes,
     compute_feature_class=ComputeWithCreateInstanceSupport,
 )
+BACKENDS_WITH_PRIVILEGED_SUPPORT = _get_backends_with_compute_feature(
+    configurator_classes=_configurator_classes,
+    compute_feature_class=ComputeWithPrivilegedSupport,
+)
 BACKENDS_WITH_MULTINODE_SUPPORT = [BackendType.REMOTE] + _get_backends_with_compute_feature(
     configurator_classes=_configurator_classes,
     compute_feature_class=ComputeWithMultinodeSupport,

dstack 0.19.30rc1__py3-none-any.whl → 0.19.31__py3-none-any.whl

Potentially problematic release.

dstack 0.19.30rc1__py3-none-any.whl → 0.19.31__py3-none-any.whl