PyPI - dstack - Versions diffs - 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl - Mend

dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dstack might be problematic. Click here for more details.

Files changed (54) hide show

dstack/_internal/cli/commands/__init__.py +8 -0
dstack/_internal/cli/commands/project.py +27 -20
dstack/_internal/cli/commands/server.py +5 -0
dstack/_internal/cli/services/configurators/fleet.py +20 -6
dstack/_internal/cli/utils/gpu.py +2 -2
dstack/_internal/core/backends/aws/compute.py +13 -5
dstack/_internal/core/backends/aws/resources.py +11 -6
dstack/_internal/core/backends/azure/compute.py +17 -6
dstack/_internal/core/backends/base/compute.py +57 -9
dstack/_internal/core/backends/base/offers.py +1 -0
dstack/_internal/core/backends/cloudrift/compute.py +2 -0
dstack/_internal/core/backends/cudo/compute.py +2 -0
dstack/_internal/core/backends/datacrunch/compute.py +2 -0
dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
dstack/_internal/core/backends/features.py +5 -0
dstack/_internal/core/backends/gcp/compute.py +87 -38
dstack/_internal/core/backends/gcp/configurator.py +1 -1
dstack/_internal/core/backends/gcp/models.py +14 -1
dstack/_internal/core/backends/gcp/resources.py +35 -12
dstack/_internal/core/backends/hotaisle/compute.py +22 -0
dstack/_internal/core/backends/kubernetes/compute.py +531 -215
dstack/_internal/core/backends/kubernetes/models.py +13 -16
dstack/_internal/core/backends/kubernetes/utils.py +145 -8
dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
dstack/_internal/core/backends/local/compute.py +2 -0
dstack/_internal/core/backends/nebius/compute.py +17 -0
dstack/_internal/core/backends/nebius/configurator.py +15 -0
dstack/_internal/core/backends/nebius/models.py +57 -5
dstack/_internal/core/backends/nebius/resources.py +45 -2
dstack/_internal/core/backends/oci/compute.py +7 -1
dstack/_internal/core/backends/oci/resources.py +8 -3
dstack/_internal/core/backends/template/compute.py.jinja +2 -0
dstack/_internal/core/backends/tensordock/compute.py +2 -0
dstack/_internal/core/backends/vultr/compute.py +2 -0
dstack/_internal/core/compatibility/runs.py +8 -0
dstack/_internal/core/consts.py +2 -0
dstack/_internal/core/models/profiles.py +11 -4
dstack/_internal/core/services/repos.py +101 -11
dstack/_internal/server/background/tasks/common.py +2 -0
dstack/_internal/server/background/tasks/process_fleets.py +75 -17
dstack/_internal/server/background/tasks/process_instances.py +3 -5
dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
dstack/_internal/server/background/tasks/process_runs.py +27 -23
dstack/_internal/server/background/tasks/process_submitted_jobs.py +107 -54
dstack/_internal/server/services/offers.py +7 -1
dstack/_internal/server/testing/common.py +2 -0
dstack/_internal/server/utils/provisioning.py +3 -10
dstack/_internal/utils/ssh.py +22 -2
dstack/version.py +2 -2
{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA +20 -18
{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/RECORD +54 -54
{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
{dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/gcp/compute.py CHANGED Viewed

@@ -23,6 +23,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithVolumeSupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name,
@@ -31,6 +32,7 @@ from dstack._internal.core.backends.base.compute import (
     get_shim_commands,
     get_user_data,
     merge_tags,
+    requires_nvidia_proprietary_kernel_modules,
 )
 from dstack._internal.core.backends.base.offers import (
     get_catalog_offers,
@@ -38,6 +40,7 @@ from dstack._internal.core.backends.base.offers import (
 )
 from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
 from dstack._internal.core.backends.gcp.models import GCPConfig
+from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
 from dstack._internal.core.errors import (
     ComputeError,
     ComputeResourceNotFoundError,
@@ -88,6 +91,7 @@ class GCPVolumeDiskBackendData(CoreModel):
 class GCPCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithGatewaySupport,
@@ -109,8 +113,8 @@ class GCPCompute(
         self.resource_policies_client = compute_v1.ResourcePoliciesClient(
             credentials=self.credentials
         )
-        self._extra_subnets_cache_lock = threading.Lock()
-        self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)
+        self._usable_subnets_cache_lock = threading.Lock()
+        self._usable_subnets_cache = TTLCache(maxsize=1, ttl=120)
     def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         regions = get_or_error(self.config.regions)
@@ -201,12 +205,12 @@ class GCPCompute(
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
         # Choose any usable subnet in a VPC.
         # Configuring a specific subnet per region is not supported yet.
-        subnetwork = _get_vpc_subnet(
-            subnetworks_client=self.subnetworks_client,
-            config=self.config,
+        subnetwork = self._get_vpc_subnet(instance_offer.region)
+        extra_subnets = self._get_extra_subnets(
             region=instance_offer.region,
+            instance_type_name=instance_offer.instance.name,
         )
-        extra_subnets = self._get_extra_subnets(
+        roce_subnets = self._get_roce_subnets(
             region=instance_offer.region,
             instance_type_name=instance_offer.instance.name,
         )
@@ -293,7 +297,11 @@ class GCPCompute(
         image = _get_image(
             instance_type_name=instance_offer.instance.name,
-            cuda=len(instance_offer.instance.resources.gpus) > 0,
+            gpu_name=(
+                instance_offer.instance.resources.gpus[0].name
+                if len(instance_offer.instance.resources.gpus) > 0
+                else None
+            ),
         )
         for zone in zones:
@@ -324,6 +332,7 @@ class GCPCompute(
                 network=self.config.vpc_resource_name,
                 subnetwork=subnetwork,
                 extra_subnetworks=extra_subnets,
+                roce_subnetworks=roce_subnets,
                 allocate_public_ip=allocate_public_ip,
                 placement_policy=placement_policy,
             )
@@ -333,6 +342,13 @@ class GCPCompute(
                 # If the request succeeds, we'll probably timeout and update_provisioning_data() will get hostname.
                 operation = self.instances_client.insert(request=request)
                 gcp_resources.wait_for_extended_operation(operation, timeout=30)
+            except google.api_core.exceptions.BadRequest as e:
+                if "Network profile only allows resource creation in location" in e.message:
+                    # A hack to find the correct RoCE VPC zone by trial and error.
+                    # Could be better to find it via the API.
+                    logger.debug("Got GCP error when provisioning a VM: %s", e)
+                    continue
+                raise
             except (
                 google.api_core.exceptions.ServiceUnavailable,
                 google.api_core.exceptions.NotFound,
@@ -481,11 +497,7 @@ class GCPCompute(
         )
         # Choose any usable subnet in a VPC.
         # Configuring a specific subnet per region is not supported yet.
-        subnetwork = _get_vpc_subnet(
-            subnetworks_client=self.subnetworks_client,
-            config=self.config,
-            region=configuration.region,
-        )
+        subnetwork = self._get_vpc_subnet(configuration.region)
         labels = {
             "owner": "dstack",
@@ -787,10 +799,6 @@ class GCPCompute(
             instance_id,
         )
-    @cachedmethod(
-        cache=lambda self: self._extra_subnets_cache,
-        lock=lambda self: self._extra_subnets_cache_lock,
-    )
     def _get_extra_subnets(
         self,
         region: str,
@@ -802,15 +810,16 @@ class GCPCompute(
             subnets_num = 8
         elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
             subnets_num = 4
+        elif instance_type_name == "a4-highgpu-8g":
+            subnets_num = 1  # 1 main + 1 extra + 8 RoCE
         else:
             return []
         extra_subnets = []
         for vpc_name in self.config.extra_vpcs[:subnets_num]:
             subnet = gcp_resources.get_vpc_subnet_or_error(
-                subnetworks_client=self.subnetworks_client,
-                vpc_project_id=self.config.vpc_project_id or self.config.project_id,
                 vpc_name=vpc_name,
                 region=region,
+                usable_subnets=self._list_usable_subnets(),
             )
             vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
                 project_id=self.config.vpc_project_id or self.config.project_id,
@@ -819,6 +828,58 @@ class GCPCompute(
             extra_subnets.append((vpc_resource_name, subnet))
         return extra_subnets
+    def _get_roce_subnets(
+        self,
+        region: str,
+        instance_type_name: str,
+    ) -> List[Tuple[str, str]]:
+        if not self.config.roce_vpcs:
+            return []
+        if instance_type_name == "a4-highgpu-8g":
+            nics_num = 8
+        else:
+            return []
+        roce_vpc = self.config.roce_vpcs[0]  # roce_vpcs is validated to have at most 1 item
+        subnets = gcp_resources.get_vpc_subnets(
+            vpc_name=roce_vpc,
+            region=region,
+            usable_subnets=self._list_usable_subnets(),
+        )
+        if len(subnets) < nics_num:
+            raise ComputeError(
+                f"{instance_type_name} requires {nics_num} RoCE subnets,"
+                f" but only {len(subnets)} are available in VPC {roce_vpc}"
+            )
+        vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
+            project_id=self.config.vpc_project_id or self.config.project_id,
+            vpc_name=roce_vpc,
+        )
+        nic_subnets = []
+        for subnet in subnets[:nics_num]:
+            nic_subnets.append((vpc_resource_name, subnet))
+        return nic_subnets
+    @cachedmethod(
+        cache=lambda self: self._usable_subnets_cache,
+        lock=lambda self: self._usable_subnets_cache_lock,
+    )
+    def _list_usable_subnets(self) -> list[compute_v1.UsableSubnetwork]:
+        # To avoid hitting the `ListUsable requests per minute` system limit, we fetch all subnets
+        # at once and cache them
+        return gcp_resources.list_project_usable_subnets(
+            subnetworks_client=self.subnetworks_client,
+            project_id=self.config.vpc_project_id or self.config.project_id,
+        )
+    def _get_vpc_subnet(self, region: str) -> Optional[str]:
+        if self.config.vpc_name is None:
+            return None
+        return gcp_resources.get_vpc_subnet_or_error(
+            vpc_name=self.config.vpc_name,
+            region=region,
+            usable_subnets=self._list_usable_subnets(),
+        )
 def _supported_instances_and_zones(
     regions: List[str],
@@ -861,8 +922,8 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
     gpu = resources.gpus[0]
     if _is_tpu(gpu.name):
         return True
-    if gpu.name == "H100":
-        # H100 and H100_MEGA quotas are not returned by `regions_client.list`
+    if gpu.name in ["B200", "H100"]:
+        # B200, H100 and H100_MEGA quotas are not returned by `regions_client.list`
         return True
     quota_name = f"NVIDIA_{gpu.name}_GPUS"
     if gpu.name == "A100" and gpu.memory_mib == 80 * 1024:
@@ -883,28 +944,13 @@ def _unique_instance_name(instance: InstanceType) -> str:
     return f"{name}-{gpu.name}-{gpu.memory_mib}"
-def _get_vpc_subnet(
-    subnetworks_client: compute_v1.SubnetworksClient,
-    config: GCPConfig,
-    region: str,
-) -> Optional[str]:
-    if config.vpc_name is None:
-        return None
-    return gcp_resources.get_vpc_subnet_or_error(
-        subnetworks_client=subnetworks_client,
-        vpc_project_id=config.vpc_project_id or config.project_id,
-        vpc_name=config.vpc_name,
-        region=region,
-    )
 @dataclass
 class GCPImage:
     id: str
     is_ufw_installed: bool
-def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
+def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
     if instance_type_name == "a3-megagpu-8g":
         image_name = "dstack-a3mega-5"
         is_ufw_installed = False
@@ -913,8 +959,11 @@ def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
             id="projects/cos-cloud/global/images/cos-105-17412-535-78",
             is_ufw_installed=False,
         )
-    elif cuda:
-        image_name = f"dstack-cuda-{version.base_image}"
+    elif gpu_name is not None:
+        if not requires_nvidia_proprietary_kernel_modules(gpu_name):
+            image_name = f"dstack-cuda-{version.base_image}"
+        else:
+            image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
         is_ufw_installed = True
     else:
         image_name = f"dstack-{version.base_image}"

dstack/_internal/core/backends/gcp/configurator.py CHANGED Viewed

@@ -202,5 +202,5 @@ class GCPConfigurator(
             )
         except BackendError as e:
             raise ServerClientError(e.args[0])
-        # Not checking config.extra_vpc so that users are not required to configure subnets for all regions
+        # Not checking config.extra_vpcs and config.roce_vpcs so that users are not required to configure subnets for all regions
         # but only for regions they intend to use. Validation will be done on provisioning.

dstack/_internal/core/backends/gcp/models.py CHANGED Viewed

@@ -41,11 +41,24 @@ class GCPBackendConfig(CoreModel):
         Optional[List[str]],
         Field(
             description=(
-                "The names of additional VPCs used for GPUDirect. Specify eight VPCs to maximize bandwidth."
+                "The names of additional VPCs used for multi-NIC instances, such as those that support GPUDirect."
+                " Specify eight VPCs to maximize bandwidth in clusters with eight-GPU instances."
                 " Each VPC must have a subnet and a firewall rule allowing internal traffic across all subnets"
             )
         ),
     ] = None
+    roce_vpcs: Annotated[
+        Optional[List[str]],
+        Field(
+            description=(
+                "The names of additional VPCs with the RoCE network profile."
+                " Used for RDMA on GPU instances that support the MRDMA interface type."
+                " A VPC should have eight subnets to maximize the bandwidth in clusters"
+                " with eight-GPU instances."
+            ),
+            max_items=1,  # The currently supported instance types only need one VPC with eight subnets.
+        ),
+    ] = None
     vpc_project_id: Annotated[
         Optional[str],
         Field(description="The shared VPC hosted project ID. Required for shared VPC only"),

dstack/_internal/core/backends/gcp/resources.py CHANGED Viewed

@@ -19,6 +19,7 @@ DSTACK_INSTANCE_TAG = "dstack-runner-instance"
 DSTACK_GATEWAY_TAG = "dstack-gateway-instance"
 supported_accelerators = [
+    {"accelerator_name": "nvidia-b200", "gpu_name": "B200", "memory_mb": 1024 * 180},
     {"accelerator_name": "nvidia-a100-80gb", "gpu_name": "A100", "memory_mb": 1024 * 80},
     {"accelerator_name": "nvidia-tesla-a100", "gpu_name": "A100", "memory_mb": 1024 * 40},
     {"accelerator_name": "nvidia-l4", "gpu_name": "L4", "memory_mb": 1024 * 24},
@@ -58,8 +59,6 @@ def check_vpc(
         )
         for region in regions:
             get_vpc_subnet_or_error(
-                subnetworks_client=subnetworks_client,
-                vpc_project_id=vpc_project_id,
                 vpc_name=vpc_name,
                 region=region,
                 usable_subnets=usable_subnets,
@@ -121,6 +120,7 @@ def create_instance_struct(
     network: str = "global/networks/default",
     subnetwork: Optional[str] = None,
     extra_subnetworks: Optional[List[Tuple[str, str]]] = None,
+    roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
     allocate_public_ip: bool = True,
     placement_policy: Optional[str] = None,
 ) -> compute_v1.Instance:
@@ -132,6 +132,7 @@ def create_instance_struct(
         subnetwork=subnetwork,
         allocate_public_ip=allocate_public_ip,
         extra_subnetworks=extra_subnetworks,
+        roce_subnetworks=roce_subnetworks,
     )
     disk = compute_v1.AttachedDisk()
@@ -194,6 +195,7 @@ def _get_network_interfaces(
     subnetwork: Optional[str],
     allocate_public_ip: bool,
     extra_subnetworks: Optional[List[Tuple[str, str]]],
+    roce_subnetworks: Optional[List[Tuple[str, str]]],
 ) -> List[compute_v1.NetworkInterface]:
     network_interface = compute_v1.NetworkInterface()
     network_interface.network = network
@@ -221,6 +223,14 @@ def _get_network_interfaces(
                 nic_type=compute_v1.NetworkInterface.NicType.GVNIC.name,
             )
         )
+    for network, subnetwork in roce_subnetworks or []:
+        network_interfaces.append(
+            compute_v1.NetworkInterface(
+                network=network,
+                subnetwork=subnetwork,
+                nic_type=compute_v1.NetworkInterface.NicType.MRDMA.name,
+            )
+        )
     return network_interfaces
@@ -233,29 +243,41 @@ def list_project_usable_subnets(
 def get_vpc_subnet_or_error(
-    subnetworks_client: compute_v1.SubnetworksClient,
-    vpc_project_id: str,
     vpc_name: str,
     region: str,
-    usable_subnets: Optional[List[compute_v1.UsableSubnetwork]] = None,
+    usable_subnets: list[compute_v1.UsableSubnetwork],
 ) -> str:
     """
     Returns resource name of any usable subnet in a given VPC
     (e.g. "projects/example-project/regions/europe-west4/subnetworks/example-subnet")
     """
-    if usable_subnets is None:
-        usable_subnets = list_project_usable_subnets(subnetworks_client, vpc_project_id)
+    vpc_subnets = get_vpc_subnets(vpc_name, region, usable_subnets)
+    if vpc_subnets:
+        return vpc_subnets[0]
+    raise ComputeError(
+        f"No usable subnetwork found in region {region} for VPC {vpc_name}."
+        f" Ensure that VPC {vpc_name} exists and has usable subnetworks."
+    )
+def get_vpc_subnets(
+    vpc_name: str,
+    region: str,
+    usable_subnets: list[compute_v1.UsableSubnetwork],
+) -> list[str]:
+    """
+    Returns resource names of all usable subnets in a given VPC
+    (e.g. ["projects/example-project/regions/europe-west4/subnetworks/example-subnet"])
+    """
+    result = []
     for subnet in usable_subnets:
         network_name = subnet.network.split("/")[-1]
         subnet_url = subnet.subnetwork
         subnet_resource_name = remove_prefix(subnet_url, "https://www.googleapis.com/compute/v1/")
         subnet_region = subnet_resource_name.split("/")[3]
         if network_name == vpc_name and subnet_region == region:
-            return subnet_resource_name
-    raise ComputeError(
-        f"No usable subnetwork found in region {region} for VPC {vpc_name} in project {vpc_project_id}."
-        f" Ensure that VPC {vpc_name} exists and has usable subnetworks."
-    )
+            result.append(subnet_resource_name)
+    return result
 def create_runner_firewall_rules(
@@ -476,5 +498,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
             "n4-",
             "h3-",
             "v6e",
+            "a4-",
         ]
     )

dstack/_internal/core/backends/hotaisle/compute.py CHANGED Viewed

@@ -11,6 +11,7 @@ from dstack._internal.core.backends.base.compute import (
     Compute,
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -41,12 +42,33 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }
 class HotAisleCompute(
     ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
+    ComputeWithPrivilegedSupport,
     Compute,
 ):
     def __init__(self, config: HotAisleConfig):

dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl

Potentially problematic release.

dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl