dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +30 -9
- dstack/_internal/core/backends/aws/compute.py +94 -53
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +32 -24
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +23 -3
- dstack/_internal/core/models/volumes.py +26 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +73 -35
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +27 -23
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +32 -3
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +217 -45
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +96 -10
- dstack/_internal/server/services/pools.py +98 -14
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +9 -6
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +130 -61
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +109 -1
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0

dstack/_internal/core/backends/oci/resources.py

@@ -203,34 +203,29 @@ def check_availability_in_domain(
     return available
 
 
-def
+def check_availability_per_domain(
     shape_names: Iterable[str],
     shapes_quota: ShapesQuota,
     region: OCIRegionClient,
     compartment_id: str,
-) -> Set[str]:
-    """
-    Returns a subset of `shape_names` with only the shapes available in at least
-    one availability domain within `region`.
-    """
-
+) -> Dict[str, Set[str]]:
     all_shapes = set(shape_names)
-
+    available_shapes_per_domain = {}
 
     for availability_domain in region.availability_domains:
        shapes_to_check = {
            shape
-            for shape in all_shapes
+            for shape in all_shapes
            if shapes_quota.is_within_domain_quota(shape, availability_domain.name)
        }
-
+        available_shapes_per_domain[availability_domain.name] = check_availability_in_domain(
            shape_names=shapes_to_check,
            availability_domain_name=availability_domain.name,
            client=region.compute_client,
            compartment_id=compartment_id,
        )
 
-    return
+    return available_shapes_per_domain
 
 
 def get_shapes_availability(

@@ -239,12 +234,11 @@ def get_shapes_availability(
     regions: Mapping[str, OCIRegionClient],
     compartment_id: str,
     executor: Executor,
-) -> Dict[str,
+) -> Dict[str, Dict[str, List[str]]]:
     """
-    Returns
-
+    Returns availability domains where shapes are available as regions->shapes->availability_domains mapping.
+    Only shapes from `offers` are checked.
     """
-
     shape_names_per_region = {region: set() for region in regions}
     for offer in offers:
         if shapes_quota.is_within_region_quota(offer.instance.name, offer.region):

@@ -253,7 +247,7 @@ def get_shapes_availability(
     future_to_region_name = {}
     for region_name, shape_names in shape_names_per_region.items():
         future = executor.submit(
-
+            check_availability_per_domain,
             shape_names,
             shapes_quota,
             regions[region_name],

@@ -263,29 +257,32 @@
 
     result = {}
     for future in as_completed(future_to_region_name):
-
-
+        domains_to_shape_names = future.result()
+        shape_names_to_domains = {}
+        for domain, shape_names in domains_to_shape_names.items():
+            for shape_name in shape_names:
+                shape_names_to_domains.setdefault(shape_name, []).append(domain)
+        result[future_to_region_name[future]] = shape_names_to_domains
 
     return result
 
 
-def
+def get_available_domains(
     shape_name: str, shapes_quota: ShapesQuota, region: OCIRegionClient, compartment_id: str
-) ->
+) -> List[str]:
     """
-    Returns the
-    `shape_name` is available
-    `shapes_quota` in all domains.
+    Returns the names of all availability domains in `region` in which
+    `shape_name` is available and within `shapes_quota`.
     """
-
+    domains = []
     for domain in region.availability_domains:
         if shapes_quota.is_within_domain_quota(
             shape_name, domain.name
         ) and check_availability_in_domain(
             {shape_name}, domain.name, region.compute_client, compartment_id
         ):
-
-            return
+            domains.append(domain.name)
+    return domains
 
 
 def get_instance_vnic(
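
Illustrative note (not part of the diff): after this change, `get_shapes_availability` returns a nested regions -> shapes -> availability domains mapping instead of a flat result. A minimal sketch of consuming such a mapping; the region, shape, and domain names below are made up:

# Hypothetical result shape of get_shapes_availability(); all values are invented.
availability = {
    "us-ashburn-1": {
        "VM.GPU.A10.1": ["Uocm:US-ASHBURN-AD-1", "Uocm:US-ASHBURN-AD-2"],
        "VM.Standard3.Flex": ["Uocm:US-ASHBURN-AD-1"],
    },
}

def first_available_domain(region: str, shape: str) -> str | None:
    # Pick any availability domain where the shape is available in the region.
    domains = availability.get(region, {}).get(shape, [])
    return domains[0] if domains else None

print(first_available_domain("us-ashburn-1", "VM.GPU.A10.1"))  # Uocm:US-ASHBURN-AD-1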

dstack/_internal/core/backends/remote/provisioning.py

@@ -1,9 +1,9 @@
 import io
 import json
 import time
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from textwrap import dedent
-from typing import Any, Dict, Generator, List
+from typing import Any, Dict, Generator, List, Optional
 
 import paramiko
 from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib

@@ -17,6 +17,7 @@ from dstack._internal.core.models.instances import (
     Gpu,
     InstanceType,
     Resources,
+    SSHConnectionParams,
 )
 from dstack._internal.utils.gpu import (
     convert_amd_gpu_name,

@@ -262,35 +263,72 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
 
 @contextmanager
 def get_paramiko_connection(
-    ssh_user: str,
+    ssh_user: str,
+    host: str,
+    port: int,
+    pkeys: List[paramiko.PKey],
+    proxy: Optional[SSHConnectionParams] = None,
+    proxy_pkeys: Optional[list[paramiko.PKey]] = None,
 ) -> Generator[paramiko.SSHClient, None, None]:
-
-
-
-
+    if proxy is not None:
+        if proxy_pkeys is None:
+            raise ProvisioningError("Missing proxy private keys")
+        proxy_ctx = get_paramiko_connection(
+            proxy.username, proxy.hostname, proxy.port, proxy_pkeys
+        )
+    else:
+        proxy_ctx = nullcontext()
+    conn_url = f"{ssh_user}@{host}:{port}"
+    with proxy_ctx as proxy_client, paramiko.SSHClient() as client:
+        proxy_channel: Optional[paramiko.Channel] = None
+        if proxy_client is not None:
            try:
-
-
-                username=ssh_user,
-                hostname=host,
-                port=port,
-                pkey=pkey,
-                look_for_keys=False,
-                allow_agent=False,
-                timeout=SSH_CONNECT_TIMEOUT,
+                proxy_channel = proxy_client.get_transport().open_channel(
+                    "direct-tcpip", (host, port), ("", 0)
                )
-            except paramiko.AuthenticationException:
-                logger.debug(
-                    f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
-                )
-                continue  # try next key
            except (paramiko.SSHException, OSError) as e:
-                raise ProvisioningError(f"
-
+                raise ProvisioningError(f"Proxy channel failed: {e}") from e
+        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        for pkey in pkeys:
+            logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
+            connected = _paramiko_connect(client, ssh_user, host, port, pkey, proxy_channel)
+            if connected:
                yield client
                return
-
-
-        raise ProvisioningError(
-            f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
+            logger.debug(
+                f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
            )
+        keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
+        raise ProvisioningError(
+            f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
+        )
+
+
+def _paramiko_connect(
+    client: paramiko.SSHClient,
+    user: str,
+    host: str,
+    port: int,
+    pkey: paramiko.PKey,
+    channel: Optional[paramiko.Channel] = None,
+) -> bool:
+    """
+    Returns `True` if connected, `False` if auth failed, and raises `ProvisioningError`
+    on other errors.
+    """
+    try:
+        client.connect(
+            username=user,
+            hostname=host,
+            port=port,
+            pkey=pkey,
+            look_for_keys=False,
+            allow_agent=False,
+            timeout=SSH_CONNECT_TIMEOUT,
+            sock=channel,
+        )
+        return True
+    except paramiko.AuthenticationException:
+        return False
+    except (paramiko.SSHException, OSError) as e:
+        raise ProvisioningError(f"Connect failed: {e}") from e
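
Illustrative usage (not part of the diff): the reworked `get_paramiko_connection` accepts an optional jump host. A minimal sketch, assuming `SSHConnectionParams` carries the `hostname`, `username`, and `port` attributes used above; the hosts, users, and key paths are made up:

import paramiko

from dstack._internal.core.backends.remote.provisioning import get_paramiko_connection
from dstack._internal.core.models.instances import SSHConnectionParams

# Made-up proxy and target hosts; private keys are loaded from hypothetical paths.
proxy = SSHConnectionParams(hostname="203.0.113.10", username="ubuntu", port=22)
proxy_pkeys = [paramiko.Ed25519Key.from_private_key_file("/path/to/proxy_key")]
pkeys = [paramiko.Ed25519Key.from_private_key_file("/path/to/host_key")]

with get_paramiko_connection(
    "ubuntu", "10.0.0.5", 22, pkeys, proxy=proxy, proxy_pkeys=proxy_pkeys
) as client:
    # Traffic to 10.0.0.5:22 is tunneled through the "direct-tcpip" channel
    # opened on the proxy connection, as implemented in the diff above.
    _, stdout, _ = client.exec_command("hostname")
    print(stdout.read().decode())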

dstack/_internal/core/models/backends/azure.py

@@ -11,6 +11,7 @@ class AzureConfigInfo(CoreModel):
     type: Literal["azure"] = "azure"
     tenant_id: str
     subscription_id: str
+    resource_group: Optional[str] = None
     locations: Optional[List[str]] = None
     vpc_ids: Optional[Dict[str, str]] = None
     public_ips: Optional[bool] = None

@@ -48,6 +49,7 @@ class AzureConfigInfoWithCredsPartial(CoreModel):
     creds: Optional[AnyAzureCreds]
     tenant_id: Optional[str]
     subscription_id: Optional[str]
+    resource_group: Optional[str]
     locations: Optional[List[str]]
     vpc_ids: Optional[Dict[str, str]]
     public_ips: Optional[bool]

@@ -63,4 +65,4 @@ class AzureConfigValues(CoreModel):
 
 
 class AzureStoredConfig(AzureConfigInfo):
-    resource_group: str
+    resource_group: str = ""

dstack/_internal/core/models/configurations.py

@@ -10,7 +10,7 @@ from dstack._internal.core.models.common import CoreModel, Duration, RegistryAut
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.fleets import FleetConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration
-from dstack._internal.core.models.profiles import ProfileParams
+from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
 from dstack._internal.core.models.repos.base import Repo
 from dstack._internal.core.models.repos.virtual import VirtualRepo
 from dstack._internal.core.models.resources import Range, ResourcesSpec

@@ -212,6 +212,29 @@ class DevEnvironmentConfigurationParams(CoreModel):
     ide: Annotated[Literal["vscode"], Field(description="The IDE to run")]
     version: Annotated[Optional[str], Field(description="The version of the IDE")]
     init: Annotated[CommandsList, Field(description="The bash commands to run on startup")] = []
+    inactivity_duration: Annotated[
+        Optional[Union[Literal["off"], int, bool, str]],
+        Field(
+            description=(
+                "The maximum amount of time the dev environment can be inactive"
+                " (e.g., `2h`, `1d`, etc)."
+                " After it elapses, the dev environment is automatically stopped."
+                " Inactivity is defined as the absence of SSH connections to the"
+                " dev environment, including VS Code connections, `ssh <run name>`"
+                " shells, and attached `dstack apply` or `dstack attach` commands."
+                " Use `off` for unlimited duration. Defaults to `off`"
+            )
+        ),
+    ]
+
+    @validator("inactivity_duration", pre=True, allow_reuse=True)
+    def parse_inactivity_duration(
+        cls, v: Optional[Union[Literal["off"], int, bool, str]]
+    ) -> Optional[int]:
+        v = parse_off_duration(v)
+        if isinstance(v, int):
+            return v
+        return None
 
 
 class DevEnvironmentConfiguration(
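
Illustrative note (not part of the diff): the `inactivity_duration` validator above normalizes the raw value via `parse_off_duration` and keeps only integer results. A minimal sketch of the expected normalization; the conversion of strings like `2h` to seconds via `Duration.parse` is an assumption, not shown in this diff:

from dstack._internal.core.models.profiles import parse_off_duration

print(parse_off_duration("off"))  # "off" -> the validator then stores None (no limit)
print(parse_off_duration(False))  # False is treated the same as "off"
print(parse_off_duration("2h"))   # assumed to yield 7200 (seconds)
print(parse_off_duration(900))    # plain ints are assumed to pass through as seconds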

dstack/_internal/core/models/fleets.py

@@ -39,6 +39,14 @@ class InstanceGroupPlacement(str, Enum):
     CLUSTER = "cluster"
 
 
+class SSHProxyParams(CoreModel):
+    hostname: Annotated[str, Field(description="The IP address or domain of proxy host")]
+    port: Annotated[Optional[int], Field(description="The SSH port of proxy host")] = None
+    user: Annotated[str, Field(description="The user to log in with for proxy host")]
+    identity_file: Annotated[str, Field(description="The private key to use for proxy host")]
+    ssh_key: Optional[SSHKey] = None
+
+
 class SSHHostParams(CoreModel):
     hostname: Annotated[str, Field(description="The IP address or domain to connect to")]
     port: Annotated[

@@ -50,6 +58,9 @@ class SSHHostParams(CoreModel):
     identity_file: Annotated[
         Optional[str], Field(description="The private key to use for this host")
     ] = None
+    proxy_jump: Annotated[
+        Optional[SSHProxyParams], Field(description="The SSH proxy configuration for this host")
+    ] = None
     internal_ip: Annotated[
         Optional[str],
         Field(

@@ -61,6 +72,19 @@ class SSHHostParams(CoreModel):
     ] = None
     ssh_key: Optional[SSHKey] = None
 
+    blocks: Annotated[
+        Union[Literal["auto"], int],
+        Field(
+            description=(
+                "The amount of blocks to split the instance into, a number or `auto`."
+                " `auto` means as many as possible."
+                " The number of GPUs and CPUs must be divisible by the number of blocks."
+                " Defaults to `1`, i.e. do not split"
+            ),
+            ge=1,
+        ),
+    ] = 1
+
     @validator("internal_ip")
     def validate_internal_ip(cls, value):
         if value is None:

@@ -83,6 +107,9 @@ class SSHParams(CoreModel):
         Optional[str], Field(description="The private key to use for all hosts")
     ] = None
     ssh_key: Optional[SSHKey] = None
+    proxy_jump: Annotated[
+        Optional[SSHProxyParams], Field(description="The SSH proxy configuration for all hosts")
+    ] = None
     hosts: Annotated[
         List[Union[SSHHostParams, str]],
         Field(

@@ -142,6 +169,19 @@ class InstanceGroupParams(CoreModel):
         Field(description="The resources requirements"),
     ] = ResourcesSpec()
 
+    blocks: Annotated[
+        Union[Literal["auto"], int],
+        Field(
+            description=(
+                "The amount of blocks to split the instance into, a number or `auto`."
+                " `auto` means as many as possible."
+                " The number of GPUs and CPUs must be divisible by the number of blocks."
+                " Defaults to `1`, i.e. do not split"
+            ),
+            ge=1,
+        ),
+    ] = 1
+
     backends: Annotated[
         Optional[List[BackendType]],
         Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"),

@@ -152,6 +192,12 @@ class InstanceGroupParams(CoreModel):
             description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
         ),
     ] = None
+    availability_zones: Annotated[
+        Optional[List[str]],
+        Field(
+            description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
+        ),
+    ] = None
     instance_types: Annotated[
         Optional[List[str]],
         Field(
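
Illustrative note (not part of the diff): the new `blocks` option splits an instance into equal slices, and the field description above requires the GPU and CPU counts to be divisible by the block count. A minimal sketch of that arithmetic with made-up numbers:

def split_into_blocks(gpus: int, cpus: int, blocks: int) -> tuple[int, int]:
    # Per the field description, GPUs and CPUs must divide evenly across blocks.
    if gpus % blocks != 0 or cpus % blocks != 0:
        raise ValueError("The number of GPUs and CPUs must be divisible by the number of blocks")
    return gpus // blocks, cpus // blocks

# An 8-GPU, 96-CPU host split into 4 blocks gives 2 GPUs and 24 CPUs per block.
print(split_into_blocks(gpus=8, cpus=96, blocks=4))  # (2, 24)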

dstack/_internal/core/models/instances.py

@@ -92,6 +92,8 @@ class RemoteConnectionInfo(CoreModel):
     port: int
     ssh_user: str
     ssh_keys: List[SSHKey]
+    ssh_proxy: Optional[SSHConnectionParams] = None
+    ssh_proxy_keys: Optional[list[SSHKey]] = None
     env: Env = Env()
 
 

@@ -101,7 +103,6 @@ class InstanceConfiguration(CoreModel):
     user: str # dstack user name
     ssh_keys: List[SSHKey]
     instance_id: Optional[str] = None
-    availability_zone: Optional[str] = None
     placement_group_name: Optional[str] = None
     reservation: Optional[str] = None
     volumes: Optional[List[Volume]] = None

@@ -140,7 +141,10 @@ class InstanceOffer(CoreModel):
 
 class InstanceOfferWithAvailability(InstanceOffer):
     availability: InstanceAvailability
+    availability_zones: Optional[List[str]] = None
     instance_runtime: InstanceRuntime = InstanceRuntime.SHIM
+    blocks: int = 1
+    total_blocks: int = 1
 
 
 class InstanceStatus(str, Enum):

dstack/_internal/core/models/pools.py

@@ -25,14 +25,17 @@ class Instance(CoreModel):
     fleet_name: Optional[str] = None
     instance_num: int
     pool_name: Optional[str] = None
-    job_name: Optional[str] = None
+    job_name: Optional[str] = None # deprecated, always None (instance can have more than one job)
     hostname: Optional[str] = None
     status: InstanceStatus
     unreachable: bool = False
     termination_reason: Optional[str] = None
     created: datetime.datetime
     region: Optional[str] = None
+    availability_zone: Optional[str] = None
     price: Optional[float] = None
+    total_blocks: Optional[int] = None
+    busy_blocks: int = 0
 
 
 class PoolInstances(CoreModel):
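
Illustrative note (not part of the diff): with the new `total_blocks` and `busy_blocks` fields, an instance can presumably host several jobs at once. A minimal sketch of how remaining capacity could be derived; this is an assumption for illustration, not the server's actual scheduling logic:

def free_blocks(total_blocks: int, busy_blocks: int) -> int:
    # Blocks still available for new jobs on the instance.
    return max(total_blocks - busy_blocks, 0)

print(free_blocks(total_blocks=4, busy_blocks=1))  # 3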

dstack/_internal/core/models/profiles.py

@@ -40,15 +40,15 @@ def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     return Duration.parse(v)
 
 
-def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
+def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
     return parse_off_duration(v)
 
 
-def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
+def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
     return parse_off_duration(v)
 
 
-def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
+def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
     if v == "off" or v is False:
         return "off"
     if v is True:

@@ -123,6 +123,12 @@ class ProfileParams(CoreModel):
             description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
         ),
     ]
+    availability_zones: Annotated[
+        Optional[List[str]],
+        Field(
+            description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
+        ),
+    ] = None
     instance_types: Annotated[
         Optional[List[str]],
         Field(

@@ -162,7 +168,7 @@ class ProfileParams(CoreModel):
         Optional[Union[Literal["off"], str, int, bool]],
         Field(
             description=(
-                "The maximum duration of a run
+                "The maximum duration of a run graceful stopping."
                 " After it elapses, the run is automatically forced stopped."
                 " This includes force detaching volumes used by the run."
                 " Use `off` for unlimited duration. Defaults to `5m`"

dstack/_internal/core/models/runs.py

@@ -27,6 +27,7 @@ from dstack._internal.core.models.profiles import (
 from dstack._internal.core.models.repos import AnyRunRepoData
 from dstack._internal.core.models.resources import Memory, ResourcesSpec
 from dstack._internal.core.models.unix import UnixUser
+from dstack._internal.core.models.volumes import MountPoint
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.common import format_pretty_duration
 

@@ -112,6 +113,7 @@ class JobTerminationReason(str, Enum):
     DONE_BY_RUNNER = "done_by_runner"
     ABORTED_BY_USER = "aborted_by_user"
     TERMINATED_BY_SERVER = "terminated_by_server"
+    INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
     # Set by the runner
     CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
     PORTS_BINDING_FAILED = "ports_binding_failed"

@@ -132,6 +134,7 @@ class JobTerminationReason(str, Enum):
             self.DONE_BY_RUNNER: JobStatus.DONE,
             self.ABORTED_BY_USER: JobStatus.ABORTED,
             self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
+            self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
             self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
             self.PORTS_BINDING_FAILED: JobStatus.FAILED,
             self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,

@@ -147,9 +150,9 @@ class JobTerminationReason(str, Enum):
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
     resources: ResourcesSpec
-    max_price: Optional[float]
-    spot: Optional[bool]
-    reservation: Optional[str]
+    max_price: Optional[float] = None
+    spot: Optional[bool] = None
+    reservation: Optional[str] = None
 
     def pretty_format(self, resources_only: bool = False):
         res = self.resources.pretty_format()

@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]
+    volumes: Optional[List[MountPoint]] = None
     # For backward compatibility with 0.18.x when retry_policy was required.
     # TODO: remove in 0.19
     retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False)

@@ -231,6 +235,17 @@ class JobProvisioningData(CoreModel):
 
 
 class JobRuntimeData(CoreModel):
+    """
+    Holds various information only available after the job is submitted, such as:
+    * offer (depends on the instance)
+    * volumes used by the job
+    * resource constraints for container (depend on the instance)
+    * port mapping (reported by the shim only after the container is started)
+
+    Some fields are mutable, for example, `ports` only available when the shim starts
+    the container.
+    """
+
     network_mode: NetworkMode
     # GPU, CPU, memory resource shares. None means all available (no limit)
     gpu: Optional[int] = None

@@ -240,6 +255,10 @@ class JobRuntimeData(CoreModel):
     # None if data is not yet available (on vm-based backends and ssh instances)
     # or not applicable (container-based backends)
     ports: Optional[dict[int, int]] = None
+    # List of volumes used by the job
+    volume_names: Optional[list[str]] = None # None for backward compalibility
+    # Virtual shared offer
+    offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
 
 
 class ClusterInfo(CoreModel):

@@ -254,6 +273,7 @@ class JobSubmission(CoreModel):
     submitted_at: datetime
     last_processed_at: datetime
     finished_at: Optional[datetime]
+    inactivity_secs: Optional[int]
     status: JobStatus
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]

dstack/_internal/core/models/volumes.py

@@ -32,6 +32,9 @@ class VolumeConfiguration(CoreModel):
     name: Annotated[Optional[str], Field(description="The volume name")] = None
     backend: Annotated[BackendType, Field(description="The volume backend")]
     region: Annotated[str, Field(description="The volume region")]
+    availability_zone: Annotated[
+        Optional[str], Field(description="The volume availability zone")
+    ] = None
     size: Annotated[
         Optional[Memory],
         Field(description="The volume size. Must be specified when creating new volumes"),

@@ -68,6 +71,18 @@ class VolumeAttachmentData(CoreModel):
     device_name: Optional[str] = None
 
 
+class VolumeInstance(CoreModel):
+    name: str
+    fleet_name: Optional[str] = None
+    instance_num: int
+    instance_id: Optional[str] = None
+
+
+class VolumeAttachment(CoreModel):
+    instance: VolumeInstance
+    attachment_data: Optional[VolumeAttachmentData] = None
+
+
 class Volume(CoreModel):
     id: uuid.UUID
     name: str

@@ -83,8 +98,19 @@ class Volume(CoreModel):
     deleted: bool
     volume_id: Optional[str] = None # id of the volume in the cloud
     provisioning_data: Optional[VolumeProvisioningData] = None
+    attachments: Optional[List[VolumeAttachment]] = None
+    # attachment_data is deprecated in favor of attachments.
+    # It's only set for volumes that were attached before attachments.
     attachment_data: Optional[VolumeAttachmentData] = None
 
+    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]:
+        if self.attachments is not None:
+            for attachment in self.attachments:
+                if attachment.instance.instance_id == instance_id:
+                    return attachment.attachment_data
+        # volume was attached before attachments were introduced
+        return self.attachment_data
+
 
 class VolumePlan(CoreModel):
     project_name: