dstack 0.19.17__py3-none-any.whl → 0.19.18__py3-none-any.whl
- dstack/_internal/cli/services/configurators/fleet.py +13 -1
- dstack/_internal/core/backends/aws/compute.py +237 -18
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/cudo/compute.py +23 -9
- dstack/_internal/core/backends/gcp/compute.py +13 -7
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/compatibility/fleets.py +12 -11
- dstack/_internal/core/compatibility/gateways.py +9 -8
- dstack/_internal/core/compatibility/logs.py +4 -3
- dstack/_internal/core/compatibility/runs.py +17 -20
- dstack/_internal/core/compatibility/volumes.py +9 -8
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/common.py +7 -0
- dstack/_internal/core/services/diff.py +36 -3
- dstack/_internal/server/app.py +20 -0
- dstack/_internal/server/background/__init__.py +61 -37
- dstack/_internal/server/background/tasks/process_fleets.py +19 -3
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +13 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_runs.py +8 -4
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +36 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
- dstack/_internal/server/background/tasks/process_volumes.py +2 -2
- dstack/_internal/server/services/fleets.py +5 -4
- dstack/_internal/server/services/gateways/__init__.py +4 -2
- dstack/_internal/server/services/jobs/configurators/base.py +5 -1
- dstack/_internal/server/services/locking.py +101 -12
- dstack/_internal/server/services/runs.py +24 -40
- dstack/_internal/server/services/volumes.py +2 -2
- dstack/_internal/server/settings.py +18 -4
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js → main-d1ac2e8c38ed5f08a114.js} +68 -64
- dstack/_internal/server/statics/{main-d151637af20f70b2e796.js.map → main-d1ac2e8c38ed5f08a114.js.map} +1 -1
- dstack/_internal/server/statics/{main-d48635d8fe670d53961c.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
- dstack/_internal/server/testing/common.py +7 -3
- dstack/version.py +1 -1
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/METADATA +11 -10
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/RECORD +43 -43
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/WHEEL +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.17.dist-info → dstack-0.19.18.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/services/configurators/fleet.py

@@ -35,6 +35,7 @@ from dstack._internal.core.models.fleets import (
 )
 from dstack._internal.core.models.instances import InstanceAvailability, InstanceStatus, SSHKey
 from dstack._internal.core.models.repos.base import Repo
+from dstack._internal.core.services.diff import diff_models
 from dstack._internal.utils.common import local_time
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.ssh import convert_ssh_key_to_pem, generate_public_key, pkey_from_str
@@ -82,7 +83,18 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
             confirm_message += "Create the fleet?"
         else:
             action_message += f"Found fleet [code]{plan.spec.configuration.name}[/]."
-
+            diff = diff_models(
+                old=plan.current_resource.spec.configuration,
+                new=plan.spec.configuration,
+                ignore={
+                    "ssh_config": {
+                        "ssh_key": True,
+                        "proxy_jump": {"ssh_key"},
+                        "hosts": {"__all__": {"ssh_key": True, "proxy_jump": {"ssh_key"}}},
+                    }
+                },
+            )
+            if not diff:
             if command_args.yes and not command_args.force:
                 # --force is required only with --yes,
                 # otherwise we may ask for force apply interactively.
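Note: the `ignore` mapping above follows pydantic's nested include/exclude format: `True` drops a field, a set drops the named sub-fields, and `"__all__"` applies a rule to every list element. `diff_models` lives in `dstack/_internal/core/services/diff.py` (extended by +36 -3 in this release); the sketch below only illustrates how such an ignore-aware diff can work, assuming pydantic v1-style models, and is not the actual implementation:

```python
from typing import Any, Optional

from pydantic import BaseModel


def diff_models(old: BaseModel, new: BaseModel, ignore: Optional[dict] = None) -> dict[str, Any]:
    """Illustrative sketch, not dstack's actual implementation."""
    if type(old) is not type(new):
        raise TypeError("Both models must be of the same type")
    # .dict(exclude=...) understands the same nested ignore format, so ignored
    # fields (e.g. generated SSH keys) never enter the comparison.
    old_dict = old.dict(exclude=ignore)
    new_dict = new.dict(exclude=ignore)
    changes: dict[str, Any] = {}
    for field, old_value in old_dict.items():
        if old_value != new_dict[field]:
            changes[field] = {"old": old_value, "new": new_dict[field]}
    return changes
```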
dstack/_internal/core/backends/aws/compute.py

@@ -1,14 +1,21 @@
+import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Optional, Tuple
 
 import boto3
 import botocore.client
 import botocore.exceptions
+from cachetools import Cache, TTLCache, cachedmethod
+from cachetools.keys import hashkey
 from pydantic import ValidationError
 
 import dstack._internal.core.backends.aws.resources as aws_resources
 from dstack._internal import settings
-from dstack._internal.core.backends.aws.models import
+from dstack._internal.core.backends.aws.models import (
+    AWSAccessKeyCreds,
+    AWSConfig,
+    AWSOSImageConfig,
+)
 from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithCreateInstanceSupport,
@@ -26,7 +33,12 @@ from dstack._internal.core.backends.base.compute import (
     merge_tags,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
-from dstack._internal.core.errors import
+from dstack._internal.core.errors import (
+    ComputeError,
+    NoCapacityError,
+    PlacementGroupInUseError,
+    PlacementGroupNotSupportedError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.gateways import (
@@ -39,7 +51,11 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
-from dstack._internal.core.models.placement import
+from dstack._internal.core.models.placement import (
+    PlacementGroup,
+    PlacementGroupProvisioningData,
+    PlacementStrategy,
+)
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.core.models.volumes import (
@@ -66,6 +82,10 @@ class AWSVolumeBackendData(CoreModel):
     iops: int
 
 
+def _ec2client_cache_methodkey(self, ec2_client, *args, **kwargs):
+    return hashkey(*args, **kwargs)
+
+
 class AWSCompute(
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
@@ -86,6 +106,24 @@ class AWSCompute(
             )
         else:  # default creds
             self.session = boto3.Session()
+        # Caches to avoid redundant API calls when provisioning many instances
+        # get_offers is already cached but we still cache its sub-functions
+        # with more aggressive/longer caches.
+        self._get_regions_to_quotas_cache_lock = threading.Lock()
+        self._get_regions_to_quotas_execution_lock = threading.Lock()
+        self._get_regions_to_quotas_cache = TTLCache(maxsize=10, ttl=300)
+        self._get_regions_to_zones_cache_lock = threading.Lock()
+        self._get_regions_to_zones_cache = Cache(maxsize=10)
+        self._get_vpc_id_subnet_id_or_error_cache_lock = threading.Lock()
+        self._get_vpc_id_subnet_id_or_error_cache = TTLCache(maxsize=100, ttl=600)
+        self._get_maximum_efa_interfaces_cache_lock = threading.Lock()
+        self._get_maximum_efa_interfaces_cache = Cache(maxsize=100)
+        self._get_subnets_availability_zones_cache_lock = threading.Lock()
+        self._get_subnets_availability_zones_cache = Cache(maxsize=100)
+        self._create_security_group_cache_lock = threading.Lock()
+        self._create_security_group_cache = TTLCache(maxsize=100, ttl=600)
+        self._get_image_id_and_username_cache_lock = threading.Lock()
+        self._get_image_id_and_username_cache = TTLCache(maxsize=100, ttl=600)
 
     def get_offers(
         self, requirements: Optional[Requirements] = None
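The pattern used throughout this file is `cachetools.cachedmethod` with per-instance caches: `cache` and `lock` are callables receiving `self`, and a custom `key` function can drop unhashable arguments such as a boto3 client. A self-contained sketch of the pattern (the `Example.describe` method is hypothetical, not dstack code):

```python
import threading

from cachetools import TTLCache, cachedmethod
from cachetools.keys import hashkey


def _client_methodkey(self, client, *args, **kwargs):
    # Drop the unhashable client from the cache key; key on the rest.
    return hashkey(*args, **kwargs)


class Example:
    def __init__(self):
        # Per-instance cache and lock, reached through the callables below.
        self._describe_cache = TTLCache(maxsize=100, ttl=600)
        self._describe_cache_lock = threading.Lock()

    @cachedmethod(
        cache=lambda self: self._describe_cache,
        key=_client_methodkey,
        lock=lambda self: self._describe_cache_lock,
    )
    def describe(self, client, region: str) -> str:
        print(f"calling the API for {region}")  # runs at most once per region per TTL
        return f"described {region}"
```

`Example().describe(object(), "us-east-1")` performs the call once; repeated calls within the TTL return the cached value even when a different client object is passed.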
@@ -126,8 +164,11 @@ class AWSCompute(
             extra_filter=filter,
         )
         regions = list(set(i.region for i in offers))
-
-
+        with self._get_regions_to_quotas_execution_lock:
+            # Cache lock does not prevent concurrent execution.
+            # We use a separate lock to avoid requesting quotas in parallel and hitting rate limits.
+            regions_to_quotas = self._get_regions_to_quotas(self.session, regions)
+        regions_to_zones = self._get_regions_to_zones(self.session, regions)
 
         availability_offers = []
         for offer in offers:
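The comment in this hunk points at a real cachetools subtlety: the `lock` given to `cachedmethod`/`cached` guards only cache lookups and stores, so two threads that miss at the same time both execute the wrapped function. Serializing the call itself takes a second lock, roughly as in this illustrative sketch:

```python
import threading
import time

from cachetools import TTLCache, cached

_cache = TTLCache(maxsize=10, ttl=300)
_cache_lock = threading.Lock()      # guards cache get/set only
_execution_lock = threading.Lock()  # serializes the underlying call


@cached(cache=_cache, lock=_cache_lock)
def _fetch_quotas(region: str) -> str:
    time.sleep(1)  # stands in for a rate-limited quotas API call
    return f"quotas for {region}"


def fetch_quotas(region: str) -> str:
    # Without this outer lock, threads that miss the cache simultaneously
    # would all hit the rate-limited API in parallel.
    with _execution_lock:
        return _fetch_quotas(region)
```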
@@ -186,21 +227,24 @@ class AWSCompute(
         tags = aws_resources.filter_invalid_tags(tags)
 
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
-        max_efa_interfaces = _get_maximum_efa_interfaces(
-            ec2_client=ec2_client,
+        max_efa_interfaces = self._get_maximum_efa_interfaces(
+            ec2_client=ec2_client,
+            region=instance_offer.region,
+            instance_type=instance_offer.instance.name,
         )
         enable_efa = max_efa_interfaces > 0
         is_capacity_block = False
         try:
-            vpc_id, subnet_ids =
+            vpc_id, subnet_ids = self._get_vpc_id_subnet_id_or_error(
                 ec2_client=ec2_client,
                 config=self.config,
                 region=instance_offer.region,
                 allocate_public_ip=allocate_public_ip,
                 availability_zones=zones,
             )
-            subnet_id_to_az_map =
+            subnet_id_to_az_map = self._get_subnets_availability_zones(
                 ec2_client=ec2_client,
+                region=instance_offer.region,
                 subnet_ids=subnet_ids,
             )
             if instance_config.reservation:
@@ -229,12 +273,19 @@ class AWSCompute(
                 tried_zones.add(az)
                 try:
                     logger.debug("Trying provisioning %s in %s", instance_offer.instance.name, az)
-                    image_id, username =
+                    image_id, username = self._get_image_id_and_username(
                         ec2_client=ec2_client,
+                        region=instance_offer.region,
                         cuda=len(instance_offer.instance.resources.gpus) > 0,
                         instance_type=instance_offer.instance.name,
                         image_config=self.config.os_images,
                     )
+                    security_group_id = self._create_security_group(
+                        ec2_client=ec2_client,
+                        region=instance_offer.region,
+                        project_id=project_name,
+                        vpc_id=vpc_id,
+                    )
                     response = ec2_resource.create_instances(
                         **aws_resources.create_instances_struct(
                             disk_size=disk_size,
@@ -243,11 +294,7 @@ class AWSCompute(
                             iam_instance_profile=self.config.iam_instance_profile,
                             user_data=get_user_data(authorized_keys=instance_config.get_public_keys()),
                             tags=aws_resources.make_tags(tags),
-                            security_group_id=
-                                ec2_client=ec2_client,
-                                project_id=project_name,
-                                vpc_id=vpc_id,
-                            ),
+                            security_group_id=security_group_id,
                             spot=instance_offer.instance.resources.spot,
                             subnet_id=subnet_id,
                             allocate_public_ip=allocate_public_ip,
@@ -296,6 +343,8 @@ class AWSCompute(
         placement_group: PlacementGroup,
         master_instance_offer: InstanceOffer,
     ) -> PlacementGroupProvisioningData:
+        if not _offer_supports_placement_group(master_instance_offer, placement_group):
+            raise PlacementGroupNotSupportedError()
         ec2_client = self.session.client("ec2", region_name=placement_group.configuration.region)
         logger.debug("Creating placement group %s...", placement_group.name)
         ec2_client.create_placement_group(
@@ -332,6 +381,8 @@ class AWSCompute(
         placement_group: PlacementGroup,
         instance_offer: InstanceOffer,
     ) -> bool:
+        if not _offer_supports_placement_group(instance_offer, placement_group):
+            return False
         return (
             placement_group.configuration.backend == BackendType.AWS
             and placement_group.configuration.region == instance_offer.region
@@ -361,7 +412,7 @@ class AWSCompute(
         tags = aws_resources.filter_invalid_tags(tags)
         tags = aws_resources.make_tags(tags)
 
-        vpc_id, subnets_ids =
+        vpc_id, subnets_ids = self._get_vpc_id_subnet_id_or_error(
             ec2_client=ec2_client,
             config=self.config,
             region=configuration.region,
@@ -696,6 +747,165 @@ class AWSCompute(
             return True
         return True
 
+    def _get_regions_to_quotas_key(
+        self,
+        session: boto3.Session,
+        regions: List[str],
+    ) -> tuple:
+        return hashkey(tuple(regions))
+
+    @cachedmethod(
+        cache=lambda self: self._get_regions_to_quotas_cache,
+        key=_get_regions_to_quotas_key,
+        lock=lambda self: self._get_regions_to_quotas_cache_lock,
+    )
+    def _get_regions_to_quotas(
+        self,
+        session: boto3.Session,
+        regions: List[str],
+    ) -> Dict[str, Dict[str, int]]:
+        return _get_regions_to_quotas(session=session, regions=regions)
+
+    def _get_regions_to_zones_key(
+        self,
+        session: boto3.Session,
+        regions: List[str],
+    ) -> tuple:
+        return hashkey(tuple(regions))
+
+    @cachedmethod(
+        cache=lambda self: self._get_regions_to_zones_cache,
+        key=_get_regions_to_zones_key,
+        lock=lambda self: self._get_regions_to_zones_cache_lock,
+    )
+    def _get_regions_to_zones(
+        self,
+        session: boto3.Session,
+        regions: List[str],
+    ) -> Dict[str, List[str]]:
+        return _get_regions_to_zones(session=session, regions=regions)
+
+    def _get_vpc_id_subnet_id_or_error_cache_key(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        config: AWSConfig,
+        region: str,
+        allocate_public_ip: bool,
+        availability_zones: Optional[List[str]] = None,
+    ) -> tuple:
+        return hashkey(
+            region, allocate_public_ip, tuple(availability_zones) if availability_zones else None
+        )
+
+    @cachedmethod(
+        cache=lambda self: self._get_vpc_id_subnet_id_or_error_cache,
+        key=_get_vpc_id_subnet_id_or_error_cache_key,
+        lock=lambda self: self._get_vpc_id_subnet_id_or_error_cache_lock,
+    )
+    def _get_vpc_id_subnet_id_or_error(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        config: AWSConfig,
+        region: str,
+        allocate_public_ip: bool,
+        availability_zones: Optional[List[str]] = None,
+    ) -> Tuple[str, List[str]]:
+        return get_vpc_id_subnet_id_or_error(
+            ec2_client=ec2_client,
+            config=config,
+            region=region,
+            allocate_public_ip=allocate_public_ip,
+            availability_zones=availability_zones,
+        )
+
+    @cachedmethod(
+        cache=lambda self: self._get_maximum_efa_interfaces_cache,
+        key=_ec2client_cache_methodkey,
+        lock=lambda self: self._get_maximum_efa_interfaces_cache_lock,
+    )
+    def _get_maximum_efa_interfaces(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        region: str,
+        instance_type: str,
+    ) -> int:
+        return _get_maximum_efa_interfaces(
+            ec2_client=ec2_client,
+            instance_type=instance_type,
+        )
+
+    def _get_subnets_availability_zones_key(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        region: str,
+        subnet_ids: List[str],
+    ) -> tuple:
+        return hashkey(region, tuple(subnet_ids))
+
+    @cachedmethod(
+        cache=lambda self: self._get_subnets_availability_zones_cache,
+        key=_get_subnets_availability_zones_key,
+        lock=lambda self: self._get_subnets_availability_zones_cache_lock,
+    )
+    def _get_subnets_availability_zones(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        region: str,
+        subnet_ids: List[str],
+    ) -> Dict[str, str]:
+        return aws_resources.get_subnets_availability_zones(
+            ec2_client=ec2_client,
+            subnet_ids=subnet_ids,
+        )
+
+    @cachedmethod(
+        cache=lambda self: self._create_security_group_cache,
+        key=_ec2client_cache_methodkey,
+        lock=lambda self: self._create_security_group_cache_lock,
+    )
+    def _create_security_group(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        region: str,
+        project_id: str,
+        vpc_id: Optional[str],
+    ) -> str:
+        return aws_resources.create_security_group(
+            ec2_client=ec2_client,
+            project_id=project_id,
+            vpc_id=vpc_id,
+        )
+
+    def _get_image_id_and_username_cache_key(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        region: str,
+        cuda: bool,
+        instance_type: str,
+        image_config: Optional[AWSOSImageConfig] = None,
+    ) -> tuple:
+        return hashkey(region, cuda, instance_type, image_config.json() if image_config else None)
+
+    @cachedmethod(
+        cache=lambda self: self._get_image_id_and_username_cache,
+        key=_get_image_id_and_username_cache_key,
+        lock=lambda self: self._get_image_id_and_username_cache_lock,
+    )
+    def _get_image_id_and_username(
+        self,
+        ec2_client: botocore.client.BaseClient,
+        region: str,
+        cuda: bool,
+        instance_type: str,
+        image_config: Optional[AWSOSImageConfig] = None,
+    ) -> tuple[str, str]:
+        return aws_resources.get_image_id_and_username(
+            ec2_client=ec2_client,
+            cuda=cuda,
+            instance_type=instance_type,
+            image_config=image_config,
+        )
+
 
 def get_vpc_id_subnet_id_or_error(
     ec2_client: botocore.client.BaseClient,
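Several of these wrappers accept a `region` argument the delegated call never uses; it appears to exist purely for the cache key, since `_ec2client_cache_methodkey` excludes the unhashable `ec2_client`. For example:

```python
from cachetools.keys import hashkey

# _ec2client_cache_methodkey reduces (self, ec2_client, region, instance_type)
# to hashkey(region, instance_type), so clients for different regions map to
# distinct cache entries even though the client object itself is ignored.
assert hashkey("us-east-1", "p4d.24xlarge") != hashkey("us-west-2", "p4d.24xlarge")
```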
@@ -798,7 +1008,7 @@ def _get_regions_to_quotas(
         return region_quotas
 
     regions_to_quotas = {}
-    with ThreadPoolExecutor(max_workers=
+    with ThreadPoolExecutor(max_workers=12) as executor:
        future_to_region = {}
        for region in regions:
            future = executor.submit(
@@ -823,7 +1033,7 @@ def _has_quota(quotas: Dict[str, int], instance_name: str) -> Optional[bool]:
 
 def _get_regions_to_zones(session: boto3.Session, regions: List[str]) -> Dict[str, List[str]]:
     regions_to_zones = {}
-    with ThreadPoolExecutor(max_workers=
+    with ThreadPoolExecutor(max_workers=12) as executor:
        future_to_region = {}
        for region in regions:
            future = executor.submit(
@@ -862,6 +1072,15 @@ def _supported_instances(offer: InstanceOffer) -> bool:
         return False
 
 
+def _offer_supports_placement_group(offer: InstanceOffer, placement_group: PlacementGroup) -> bool:
+    if placement_group.configuration.placement_strategy != PlacementStrategy.CLUSTER:
+        return True
+    for family in ["t3.", "t2."]:
+        if offer.instance.name.startswith(family):
+            return False
+    return True
+
+
 def _get_maximum_efa_interfaces(ec2_client: botocore.client.BaseClient, instance_type: str) -> int:
     try:
         response = ec2_client.describe_instance_types(
dstack/_internal/core/backends/base/compute.py

@@ -57,7 +57,7 @@ class Compute(ABC):
 
     def __init__(self):
         self._offers_cache_lock = threading.Lock()
-        self._offers_cache = TTLCache(maxsize=
+        self._offers_cache = TTLCache(maxsize=10, ttl=180)
 
     @abstractmethod
     def get_offers(
@@ -559,7 +559,8 @@ def get_shim_commands(
     backend_shim_env: Optional[Dict[str, str]] = None,
     arch: Optional[str] = None,
 ) -> List[str]:
-    commands =
+    commands = get_setup_cloud_instance_commands()
+    commands += get_shim_pre_start_commands(
         base_path=base_path,
         bin_path=bin_path,
         arch=arch,
@@ -641,6 +642,23 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str:
     return url_template.format(version=version, arch=arch)
 
 
+def get_setup_cloud_instance_commands() -> list[str]:
+    return [
+        # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+        # Attempts to patch /etc/docker/daemon.json while keeping any custom settings it may have.
+        (
+            "/bin/sh -c '"  # wrap in /bin/sh to avoid interfering with other cloud init commands
+            " grep -q nvidia /etc/docker/daemon.json"
+            " && ! grep -q native.cgroupdriver /etc/docker/daemon.json"
+            " && jq '\\''.\"exec-opts\" = ((.\"exec-opts\" // []) + [\"native.cgroupdriver=cgroupfs\"])'\\'' /etc/docker/daemon.json > /tmp/daemon.json"
+            " && sudo mv /tmp/daemon.json /etc/docker/daemon.json"
+            " && sudo service docker restart"
+            " || true"
+            "'"
+        ),
+    ]
+
+
 def get_shim_pre_start_commands(
     base_path: Optional[PathLike] = None,
     bin_path: Optional[PathLike] = None,
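The embedded jq program appends `native.cgroupdriver=cgroupfs` to Docker's `exec-opts` while preserving any other `daemon.json` settings. Its effect, translated to Python purely for illustration (the sample input is hypothetical):

```python
import json

# Equivalent of the jq filter:
#   ."exec-opts" = ((."exec-opts" // []) + ["native.cgroupdriver=cgroupfs"])
daemon = json.loads('{"runtimes": {"nvidia": {"path": "nvidia-container-runtime"}}}')
daemon["exec-opts"] = daemon.get("exec-opts", []) + ["native.cgroupdriver=cgroupfs"]
print(json.dumps(daemon))
# {"runtimes": {"nvidia": {"path": "nvidia-container-runtime"}},
#  "exec-opts": ["native.cgroupdriver=cgroupfs"]}
```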
dstack/_internal/core/backends/cudo/compute.py

@@ -65,12 +65,13 @@ class CudoCompute(
         public_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
-        commands = get_shim_commands(authorized_keys=public_keys)
         gpus_no = len(instance_offer.instance.resources.gpus)
-
-
-
-
+        if gpus_no > 0:
+            # we'll need jq for patching /etc/docker/daemon.json, see get_shim_commands()
+            commands = install_jq_commands()
+        else:
+            commands = install_docker_commands()
+        commands += get_shim_commands(authorized_keys=public_keys)
 
         try:
             resp_data = self.api_client.create_virtual_machine(
@@ -85,7 +86,7 @@ class CudoCompute(
                 memory_gib=memory_size,
                 vcpus=instance_offer.instance.resources.cpus,
                 vm_id=vm_id,
-                start_script=
+                start_script=" && ".join(commands),
                 password=None,
                 customSshKeys=public_keys,
             )
@@ -151,6 +152,19 @@ def _get_image_id(cuda: bool) -> str:
     return image_name
 
 
-def
-
-
+def install_jq_commands():
+    return [
+        "export DEBIAN_FRONTEND=noninteractive",
+        "apt-get --assume-yes install jq",
+    ]
+
+
+def install_docker_commands():
+    return [
+        "export DEBIAN_FRONTEND=noninteractive",
+        "mkdir -p /etc/apt/keyrings",
+        "curl --max-time 60 -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg",
+        'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null',
+        "apt-get update",
+        "apt-get --assume-yes install docker-ce docker-ce-cli containerd.io docker-compose-plugin",
+    ]
dstack/_internal/core/backends/gcp/compute.py

@@ -8,6 +8,7 @@ import google.api_core.exceptions
 import google.cloud.compute_v1 as compute_v1
 from cachetools import TTLCache, cachedmethod
 from google.cloud import tpu_v2
+from google.cloud.compute_v1.types.compute import Instance
 from gpuhunt import KNOWN_TPUS
 
 import dstack._internal.core.backends.gcp.auth as auth
@@ -19,6 +20,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithGatewaySupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
+    ComputeWithPrivateGatewaySupport,
     ComputeWithVolumeSupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name,
@@ -83,6 +85,7 @@ class GCPCompute(
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithGatewaySupport,
+    ComputeWithPrivateGatewaySupport,
     ComputeWithVolumeSupport,
     Compute,
 ):
@@ -395,11 +398,7 @@ class GCPCompute(
         if instance.status in ["PROVISIONING", "STAGING"]:
             return
         if instance.status == "RUNNING":
-
-                hostname = instance.network_interfaces[0].access_configs[0].nat_i_p
-            else:
-                hostname = instance.network_interfaces[0].network_i_p
-            provisioning_data.hostname = hostname
+            provisioning_data.hostname = _get_instance_ip(instance, allocate_public_ip)
             provisioning_data.internal_ip = instance.network_interfaces[0].network_i_p
             return
         raise ProvisioningError(
@@ -500,7 +499,7 @@ class GCPCompute(
         request.instance_resource = gcp_resources.create_instance_struct(
             disk_size=10,
             image_id=_get_gateway_image_id(),
-            machine_type="e2-
+            machine_type="e2-medium",
             accelerators=[],
             spot=False,
             user_data=get_gateway_user_data(configuration.ssh_key_pub),
@@ -512,6 +511,7 @@ class GCPCompute(
             service_account=self.config.vm_service_account,
             network=self.config.vpc_resource_name,
             subnetwork=subnetwork,
+            allocate_public_ip=configuration.public_ip,
         )
         operation = self.instances_client.insert(request=request)
         gcp_resources.wait_for_extended_operation(operation, "instance creation")
@@ -522,7 +522,7 @@ class GCPCompute(
             instance_id=instance_name,
             region=configuration.region,  # used for instance termination
             availability_zone=zone,
-            ip_address=instance.
+            ip_address=_get_instance_ip(instance, configuration.public_ip),
             backend_data=json.dumps({"zone": zone}),
         )
 
@@ -1024,3 +1024,9 @@ def _is_tpu_provisioning_data(provisioning_data: JobProvisioningData) -> bool:
     backend_data_dict = json.loads(provisioning_data.backend_data)
     is_tpu = backend_data_dict.get("is_tpu", False)
     return is_tpu
+
+
+def _get_instance_ip(instance: Instance, public_ip: bool) -> str:
+    if public_ip:
+        return instance.network_interfaces[0].access_configs[0].nat_i_p
+    return instance.network_interfaces[0].network_i_p
dstack/_internal/core/backends/lambdalabs/compute.py

@@ -1,4 +1,5 @@
 import hashlib
+import shlex
 import subprocess
 import tempfile
 from threading import Thread
@@ -98,7 +99,7 @@ class LambdaCompute(
             arch=provisioning_data.instance_type.resources.cpu_arch,
         )
         # shim is assumed to be run under root
-        launch_command = "sudo sh -c
+        launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
         thread = Thread(
             target=_start_runner,
             kwargs={
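`shlex.quote` turns the joined command list into a single safely-quoted argument for the outer `sudo sh -c`, so `&&` chains and embedded quotes survive intact. For example (the sample commands are illustrative):

```python
import shlex

commands = ["mkdir -p /opt/dstack", "echo 'hi' > /opt/dstack/ok"]
print("sudo sh -c " + shlex.quote(" && ".join(commands)))
# sudo sh -c 'mkdir -p /opt/dstack && echo '"'"'hi'"'"' > /opt/dstack/ok'
```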
dstack/_internal/core/compatibility/fleets.py

@@ -1,19 +1,20 @@
-from typing import
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
 from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec
 from dstack._internal.core.models.instances import Instance
 
 
-def get_get_plan_excludes(fleet_spec: FleetSpec) ->
-    get_plan_excludes = {}
+def get_get_plan_excludes(fleet_spec: FleetSpec) -> IncludeExcludeDictType:
+    get_plan_excludes: IncludeExcludeDictType = {}
     spec_excludes = get_fleet_spec_excludes(fleet_spec)
     if spec_excludes:
         get_plan_excludes["spec"] = spec_excludes
     return get_plan_excludes
 
 
-def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
-    apply_plan_excludes = {}
+def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> IncludeExcludeDictType:
+    apply_plan_excludes: IncludeExcludeDictType = {}
     spec_excludes = get_fleet_spec_excludes(plan_input.spec)
     if spec_excludes:
         apply_plan_excludes["spec"] = spec_excludes
@@ -28,23 +29,23 @@ def get_apply_plan_excludes(plan_input: ApplyFleetPlanInput) -> Dict:
     return {"plan": apply_plan_excludes}
 
 
-def get_create_fleet_excludes(fleet_spec: FleetSpec) ->
-    create_fleet_excludes = {}
+def get_create_fleet_excludes(fleet_spec: FleetSpec) -> IncludeExcludeDictType:
+    create_fleet_excludes: IncludeExcludeDictType = {}
     spec_excludes = get_fleet_spec_excludes(fleet_spec)
     if spec_excludes:
         create_fleet_excludes["spec"] = spec_excludes
     return create_fleet_excludes
 
 
-def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[
+def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDictType]:
     """
     Returns `fleet_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes:
-    configuration_excludes:
-    profile_excludes:
+    spec_excludes: IncludeExcludeDictType = {}
+    configuration_excludes: IncludeExcludeDictType = {}
+    profile_excludes: IncludeExcludeSetType = set()
     profile = fleet_spec.profile
     if profile.fleets is None:
         profile_excludes.add("fleets")