dstack 0.18.42__py3-none-any.whl → 0.18.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +10 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/core/backends/aws/compute.py +22 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +14 -8
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +25 -19
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +11 -4
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/configurations.py +0 -1
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_running_jobs.py +9 -16
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/config.py +11 -1
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
- dstack/_internal/server/testing/common.py +33 -8
- dstack/api/_public/runs.py +1 -1
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/RECORD +80 -71
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +1 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_runs.py +2 -2
- tests/_internal/server/services/test_logs.py +3 -3
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/core/backends/base/compute.py:
@@ -1,5 +1,7 @@
 import os
+import random
 import re
+import string
 import threading
 from abc import ABC, abstractmethod
 from functools import lru_cache
@@ -31,6 +33,7 @@ from dstack._internal.core.models.volumes import (
     VolumeAttachmentData,
     VolumeProvisioningData,
 )
+from dstack._internal.core.services import is_valid_dstack_resource_name
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -209,8 +212,105 @@ class Compute(ABC):
         return self.get_offers(requirements)
 
 
-def get_instance_name(run: Run, job: Job) -> str:
-    return ...
+def get_job_instance_name(run: Run, job: Job) -> str:
+    return job.job_spec.job_name
+
+
+_DEFAULT_MAX_RESOURCE_NAME_LEN = 60
+_CLOUD_RESOURCE_SUFFIX_LEN = 8
+
+
+def generate_unique_instance_name(
+    instance_configuration: InstanceConfiguration,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique instance name valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=instance_configuration.instance_name,
+        project_name=instance_configuration.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_instance_name_for_job(
+    run: Run,
+    job: Job,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique instance name for a job valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=get_job_instance_name(run, job),
+        project_name=run.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_gateway_instance_name(
+    gateway_compute_configuration: GatewayComputeConfiguration,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique gateway instance name valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=gateway_compute_configuration.instance_name,
+        project_name=gateway_compute_configuration.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_volume_name(
+    volume: Volume,
+    max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
+) -> str:
+    """
+    Generates a unique volume name valid across all backends.
+    """
+    return generate_unique_backend_name(
+        resource_name=volume.name,
+        project_name=volume.project_name,
+        max_length=max_length,
+    )
+
+
+def generate_unique_backend_name(
+    resource_name: str,
+    project_name: Optional[str],
+    max_length: int,
+) -> str:
+    """
+    Generates a unique resource name valid across all backends.
+    Backend resource names must be unique on every provisioning so that
+    resource re-submission/re-creation doesn't lead to conflicts
+    on backends that require unique names (e.g. Azure, GCP).
+    """
+    # resource_name is guaranteed to be valid in all backends
+    prefix = f"dstack-{resource_name}"
+    if project_name is not None and is_valid_dstack_resource_name(project_name):
+        # project_name is not guaranteed to be valid in all backends,
+        # so we add it only if it passes the validation
+        prefix = f"dstack-{project_name}-{resource_name}"
+    return _generate_unique_backend_name_with_prefix(
+        prefix=prefix,
+        max_length=max_length,
+    )
+
+
+def _generate_unique_backend_name_with_prefix(
+    prefix: str,
+    max_length: int,
+) -> str:
+    prefix_len = max_length - _CLOUD_RESOURCE_SUFFIX_LEN - 1
+    prefix = prefix[:prefix_len]
+    suffix = "".join(
+        random.choice(string.ascii_lowercase + string.digits)
+        for _ in range(_CLOUD_RESOURCE_SUFFIX_LEN)
+    )
+    return f"{prefix}-{suffix}"
 
 
 def get_cloud_config(**config) -> str:
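The helpers above always reserve 9 characters (a hyphen plus an 8-character lowercase-alphanumeric suffix) and truncate the dstack-[<project>-]<name> prefix to fit max_length. A condensed standalone sketch of the same arithmetic; unique_name here is a hypothetical stand-in for _generate_unique_backend_name_with_prefix, and the sample name is illustrative:

    import random
    import string

    SUFFIX_LEN = 8  # mirrors _CLOUD_RESOURCE_SUFFIX_LEN

    def unique_name(prefix: str, max_length: int = 60) -> str:
        # Reserve SUFFIX_LEN chars for the suffix plus 1 for the hyphen,
        # then truncate the prefix so the result never exceeds max_length.
        prefix = prefix[: max_length - SUFFIX_LEN - 1]
        suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=SUFFIX_LEN))
        return f"{prefix}-{suffix}"

    # Re-submitting the same run yields a fresh suffix, so backends that
    # require globally unique names (Azure, GCP) never see a conflict.
    print(unique_name("dstack-myproj-train-0"))  # e.g. dstack-myproj-train-0-k3f9x2ab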
dstack/_internal/core/backends/base/offers.py:
@@ -14,6 +14,12 @@ from dstack._internal.core.models.instances import (
 from dstack._internal.core.models.resources import DEFAULT_DISK, Memory, Range
 from dstack._internal.core.models.runs import Requirements
 
+# Offers not supported by all dstack versions are hidden behind one or more flags.
+# This list enables the flags that are currently supported.
+SUPPORTED_GPUHUNT_FLAGS = [
+    "oci-spot",
+]
+
 
 def get_catalog_offers(
     backend: BackendType,
@@ -110,7 +116,7 @@ def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
 
 
 def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFilter:
-    q = gpuhunt.QueryFilter()
+    q = gpuhunt.QueryFilter(allowed_flags=SUPPORTED_GPUHUNT_FLAGS)
     if req is None:
         return q
 
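allowed_flags opts in to offers that gpuhunt hides behind feature flags; flags absent from the list (for example, ones introduced by future gpuhunt releases) keep their offers hidden, so older dstack versions never see offers they cannot provision. A minimal sketch of that gating rule, assuming each catalog item carries a list of gating flags (a simplification, not gpuhunt's exact data model):

    from typing import List, Optional

    def offer_visible(item_flags: List[str], allowed_flags: Optional[List[str]]) -> bool:
        # An item gated behind flags is visible only if every flag is allowed.
        if allowed_flags is None:
            return not item_flags  # default: only unflagged items pass
        return all(flag in allowed_flags for flag in item_flags)

    assert offer_visible([], ["oci-spot"])                   # regular offer
    assert offer_visible(["oci-spot"], ["oci-spot"])         # newly enabled
    assert not offer_visible(["future-flag"], ["oci-spot"])  # still hidden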
dstack/_internal/core/backends/cudo/compute.py:
@@ -4,7 +4,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-    get_instance_name,
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers

@@ -25,6 +26,9 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+MAX_RESOURCE_NAME_LEN = 30
+
+
 class CudoCompute(Compute):
     def __init__(self, config: CudoConfig):
         super().__init__()

@@ -58,7 +62,7 @@ class CudoCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],

@@ -71,6 +75,7 @@ class CudoCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         public_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)

@@ -81,13 +86,12 @@ class CudoCompute(Compute):
             shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
         )
 
-        vm_id = f"{instance_config.instance_name}-{instance_offer.region}"
         try:
             resp_data = self.api_client.create_virtual_machine(
                 project_id=self.config.project_id,
                 boot_disk_storage_class="STORAGE_CLASS_NETWORK",
                 boot_disk_size_gib=disk_size,
-                book_disk_id=f"{...}",
+                book_disk_id=f"{vm_id}_disk_id",
                 boot_disk_image_id=_get_image_id(gpus_no > 0),
                 data_center_id=instance_offer.region,
                 gpus=gpus_no,
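Worked example of Cudo's 30-character cap: the prefix keeps 30 - 8 - 1 = 21 characters, so overlong project and instance names are truncated rather than rejected (values below are illustrative):

    prefix = "dstack-myproj-train-0-0"  # 23 chars from project + job name
    prefix = prefix[: 30 - 8 - 1]       # -> "dstack-myproj-train-0" (21 chars)
    vm_id = prefix + "-" + "a1b2c3d4"   # 30 chars total, within Cudo's limit
    assert len(vm_id) == 30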
dstack/_internal/core/backends/datacrunch/compute.py:
@@ -2,6 +2,7 @@ from typing import Dict, List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers

@@ -22,6 +23,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger("datacrunch.compute")
 
+MAX_INSTANCE_NAME_LEN = 60
+
 # Ubuntu 22.04 + CUDA 12.0 + Docker
 # from API https://datacrunch.stoplight.io/docs/datacrunch-public/c46ab45dbc508-get-all-image-types
 IMAGE_ID = "2088da25-bb0d-41cc-a191-dccae45d96fd"

@@ -78,6 +81,9 @@ class DataCrunchCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         public_keys = instance_config.get_public_keys()
         ssh_ids = []
         for ssh_public_key in public_keys:

@@ -106,8 +112,8 @@ class DataCrunchCompute(Compute):
             instance_type=instance_offer.instance.name,
             ssh_key_ids=ssh_ids,
             startup_script_id=startup_script_ids,
-            hostname=instance_config.instance_name,
-            description=instance_config.instance_name,
+            hostname=instance_name,
+            description=instance_name,
             image=IMAGE_ID,
             disk_size=disk_size,
             location=instance_offer.region,

@@ -119,8 +125,8 @@ class DataCrunchCompute(Compute):
             "instance_type": instance_offer.instance.name,
             "ssh_key_ids": ssh_ids,
             "startup_script_id": startup_script_ids,
-            "hostname": instance_config.instance_name,
-            "description": instance_config.instance_name,
+            "hostname": instance_name,
+            "description": instance_name,
             "image": IMAGE_ID,
             "disk_size": disk_size,
             "location": instance_offer.region,
dstack/_internal/core/backends/gcp/auth.py:
@@ -1,10 +1,11 @@
 import json
 from typing import Optional, Tuple
 
+import google.api_core.exceptions
 import google.auth
+import google.cloud.compute_v1 as compute_v1
 from google.auth.credentials import Credentials
 from google.auth.exceptions import DefaultCredentialsError
-from google.cloud import storage
 from google.oauth2 import service_account
 
 from dstack._internal.core.errors import BackendAuthError

@@ -16,13 +17,16 @@ from dstack._internal.core.models.backends.gcp import (
 from dstack._internal.core.models.common import is_core_model_instance
 
 
-def authenticate(creds: AnyGCPCreds) -> Tuple[Credentials, str]:
-    ...
+def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[Credentials, str]:
+    credentials, credentials_project_id = get_credentials(creds)
+    if project_id is None:
+        # If project_id is not specified explicitly, try using credentials' project_id.
+        # Explicit project_id takes precedence because credentials' project_id may be irrelevant.
+        # For example, with Workload Identity Federation for GKE, it's cluster project_id.
+        project_id = credentials_project_id
+    if project_id is None:
+        raise BackendAuthError("Credentials require project_id to be specified")
+    validate_credentials(credentials, project_id)
     return credentials, project_id
 
 

@@ -40,17 +44,19 @@ def get_credentials(creds: AnyGCPCreds) -> Tuple[Credentials, Optional[str]]:
     try:
         default_credentials, project_id = google.auth.default()
     except DefaultCredentialsError:
-        raise BackendAuthError()
+        raise BackendAuthError("Failed to find default credentials")
 
     return default_credentials, project_id
 
 
-def validate_credentials(credentials: Credentials):
+def validate_credentials(credentials: Credentials, project_id: str):
     try:
-        ...
+        regions_client = compute_v1.RegionsClient(credentials=credentials)
+        regions_client.list(project=project_id)
+    except google.api_core.exceptions.NotFound:
+        raise BackendAuthError(f"project_id {project_id} not found")
     except Exception:
-        raise BackendAuthError()
+        raise BackendAuthError("Insufficient permissions")
 
 
 def default_creds_available() -> bool:
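The reworked authenticate lets a backend pass its configured project_id, which now takes precedence over the project attached to the credentials, and validation switched from Cloud Storage to listing Compute Engine regions. A hedged usage sketch, assuming GCPDefaultCreds is the application-default-credentials model in dstack._internal.core.models.backends.gcp (the module shown in the hunk header above):

    import dstack._internal.core.backends.gcp.auth as auth
    from dstack._internal.core.models.backends.gcp import GCPDefaultCreds

    # The explicit project_id wins over the credentials' own project (e.g. the
    # GKE cluster project under Workload Identity Federation). BackendAuthError
    # is raised if no project_id can be resolved, the project does not exist,
    # or the credentials lack permission to list regions.
    credentials, project_id = auth.authenticate(GCPDefaultCreds(), project_id="my-gcp-project")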
dstack/_internal/core/backends/gcp/compute.py:
@@ -12,8 +12,11 @@ import dstack._internal.core.backends.gcp.auth as auth
 import dstack._internal.core.backends.gcp.resources as gcp_resources
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    generate_unique_gateway_instance_name,
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_gateway_user_data,
-    get_instance_name,
+    get_job_instance_name,
     get_shim_commands,
     get_user_data,
     merge_tags,

@@ -70,7 +73,7 @@ class GCPCompute(Compute):
     def __init__(self, config: GCPConfig):
         super().__init__()
         self.config = config
-        self.credentials, _ = auth.authenticate(config.creds)
+        self.credentials, _ = auth.authenticate(config.creds, self.config.project_id)
         self.instances_client = compute_v1.InstancesClient(credentials=self.credentials)
         self.firewalls_client = compute_v1.FirewallsClient(credentials=self.credentials)
         self.regions_client = compute_v1.RegionsClient(credentials=self.credentials)

@@ -147,17 +150,10 @@ class GCPCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
-        instance_name = instance_config.instance_name
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
+        )
         allocate_public_ip = self.config.allocate_public_ips
-        if not gcp_resources.is_valid_resource_name(instance_name):
-            # In a rare case the instance name is invalid in GCP,
-            # we better use a random instance name than fail provisioning.
-            instance_name = gcp_resources.generate_random_resource_name()
-            logger.warning(
-                "Invalid GCP instance name: %s. A new valid name is generated: %s",
-                instance_config.instance_name,
-                instance_name,
-            )
         authorized_keys = instance_config.get_public_keys()
 
         # get_offers always fills instance_offer.availability_zones

@@ -182,6 +178,7 @@ class GCPCompute(Compute):
         labels = {
             "owner": "dstack",
             "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
             "dstack_user": instance_config.user.lower(),
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}

@@ -192,7 +189,7 @@ class GCPCompute(Compute):
             else False
         )
         if is_tpu:
-            instance_id = ...
+            instance_id = instance_name
             startup_script = _get_tpu_startup_script(authorized_keys)
             # GCP does not allow attaching disks while a TPU is being created,
             # so we need to attach the disks on creation.

@@ -378,7 +375,7 @@ class GCPCompute(Compute):
         # TODO: run_job is the same for vm-based backends, refactor
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],

@@ -421,6 +418,9 @@ class GCPCompute(Compute):
         else:
             raise ComputeResourceNotFoundError()
 
+        instance_name = generate_unique_gateway_instance_name(
+            configuration, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
+        )
         # Choose any usable subnet in a VPC.
         # Configuring a specific subnet per region is not supported yet.
         subnetwork = _get_vpc_subnet(

@@ -432,6 +432,7 @@ class GCPCompute(Compute):
         labels = {
             "owner": "dstack",
             "dstack_project": configuration.project_name.lower(),
+            "dstack_name": configuration.instance_name,
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
         labels = merge_tags(tags=labels, backend_tags=self.config.tags)

@@ -449,7 +450,7 @@ class GCPCompute(Compute):
             authorized_keys=[configuration.ssh_key_pub],
             labels=labels,
             tags=[gcp_resources.DSTACK_GATEWAY_TAG],
-            instance_name=configuration.instance_name,
+            instance_name=instance_name,
             zone=zone,
             service_account=self.config.vm_service_account,
             network=self.config.vpc_resource_name,

@@ -458,10 +459,10 @@ class GCPCompute(Compute):
         operation = self.instances_client.insert(request=request)
         gcp_resources.wait_for_extended_operation(operation, "instance creation")
         instance = self.instances_client.get(
-            project=self.config.project_id, zone=zone, instance=configuration.instance_name
+            project=self.config.project_id, zone=zone, instance=instance_name
         )
         return GatewayProvisioningData(
-            instance_id=configuration.instance_name,
+            instance_id=instance_name,
             region=configuration.region,  # used for instance termination
             availability_zone=zone,
             ip_address=instance.network_interfaces[0].access_configs[0].nat_i_p,

@@ -525,16 +526,21 @@ class GCPCompute(Compute):
         )
         zone = zones[0]
 
+        disk_name = generate_unique_volume_name(
+            volume, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
+        )
+
         labels = {
             "owner": "dstack",
             "dstack_project": volume.project_name.lower(),
+            "dstack_name": volume.name,
             "dstack_user": volume.user,
         }
         labels = {k: v for k, v in labels.items() if gcp_resources.is_valid_label_value(v)}
         labels = merge_tags(tags=labels, backend_tags=self.config.tags)
 
         disk = compute_v1.Disk()
-        disk.name = ...
+        disk.name = disk_name
         disk.size_gb = volume.configuration.size_gb
         disk.type_ = f"zones/{zone}/diskTypes/pd-balanced"
         disk.labels = labels

@@ -552,7 +558,7 @@ class GCPCompute(Compute):
         created_disk = self.disk_client.get(
             project=self.config.project_id,
             zone=zone,
-            disk=...,
+            disk=disk_name,
         )
         logger.debug("Created persistent disk for volume %s", volume.name)
         return VolumeProvisioningData(
dstack/_internal/core/backends/gcp/resources.py:
@@ -1,7 +1,5 @@
 import concurrent.futures
-import random
 import re
-import string
 from typing import Dict, List, Optional
 
 import google.api_core.exceptions

@@ -64,7 +62,7 @@ def check_vpc(
             region=region,
         )
     except google.api_core.exceptions.NotFound:
-        raise ComputeError(f"Failed to find ...")
+        raise ComputeError(f"Failed to find VPC project {vpc_project_id}")
 
     if allocate_public_ip:
         return

@@ -322,12 +320,13 @@ def _is_valid_label(key: str, value: str) -> bool:
     return is_valid_resource_name(key) and is_valid_label_value(value)
 
 
+MAX_RESOURCE_NAME_LEN = 63
 NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
 LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
 
 
 def is_valid_resource_name(name: str) -> bool:
-    if len(name) < 1 or len(name) > 63:
+    if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
         return False
     match = re.match(NAME_PATTERN, name)
     return match is not None

@@ -338,12 +337,6 @@ def is_valid_label_value(value: str) -> bool:
     return match is not None
 
 
-def generate_random_resource_name(length: int = 40) -> str:
-    return random.choice(string.ascii_lowercase) + "".join(
-        random.choice(string.ascii_lowercase + string.digits) for _ in range(length)
-    )
-
-
 def create_tpu_node_struct(
     instance_name: str,
     startup_script: str,
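generate_random_resource_name could be dropped because every GCP name now flows through generate_unique_backend_name: the "dstack-" prefix guarantees a lowercase first character, the suffix is lowercase alphanumeric, and MAX_RESOURCE_NAME_LEN = 63 keeps results within NAME_PATTERN. A quick property check, as a standalone sketch reusing the pattern above with an illustrative project/run name:

    import random
    import re
    import string

    NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")  # as in resources.py

    def sample_name() -> str:
        # Mirrors generate_unique_instance_name with max_length=63.
        prefix = ("dstack-" + "some-project-some-run")[: 63 - 8 - 1]
        suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=8))
        return f"{prefix}-{suffix}"

    assert all(NAME_PATTERN.match(sample_name()) for _ in range(1000))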
dstack/_internal/core/backends/kubernetes/compute.py:
@@ -9,9 +9,10 @@ from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    generate_unique_gateway_instance_name,
+    generate_unique_instance_name_for_job,
     get_docker_commands,
     get_dstack_gateway_commands,
-    get_instance_name,
 )
 from dstack._internal.core.backends.base.offers import match_requirements
 from dstack._internal.core.backends.kubernetes.config import KubernetesConfig

@@ -99,7 +100,7 @@ class KubernetesCompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
-        instance_name = get_instance_name(run, job)
+        instance_name = generate_unique_instance_name_for_job(run, job)
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )

@@ -231,7 +232,7 @@ class KubernetesCompute(Compute):
         # TODO: By default EKS creates a Classic Load Balancer for Load Balancer services.
         # Consider deploying an NLB. It seems it requires some extra configuration on the cluster:
         # https://docs.aws.amazon.com/eks/latest/userguide/network-load-balancing.html
-        instance_name = configuration.instance_name
+        instance_name = generate_unique_gateway_instance_name(configuration)
         commands = _get_gateway_commands(authorized_keys=[configuration.ssh_key_pub])
         self.api.create_namespaced_pod(
             namespace=DEFAULT_NAMESPACE,
dstack/_internal/core/backends/lambdalabs/compute.py:
@@ -6,7 +6,8 @@ from typing import Dict, List, Optional
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
-    get_instance_name,
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_shim_commands,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers

@@ -23,6 +24,8 @@ from dstack._internal.core.models.instances import (
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume
 
+MAX_INSTANCE_NAME_LEN = 60
+
 
 class LambdaCompute(Compute):
     def __init__(self, config: LambdaConfig):

@@ -44,6 +47,9 @@ class LambdaCompute(Compute):
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         project_ssh_key = instance_config.ssh_keys[0]
         project_key_name = _add_project_ssh_key(
             api_client=self.api_client,

@@ -53,7 +59,7 @@ class LambdaCompute(Compute):
             region_name=instance_offer.region,
             instance_type_name=instance_offer.instance.name,
             ssh_key_names=[project_key_name],
-            name=instance_config.instance_name,
+            name=instance_name,
             quantity=1,
             file_system_names=[],
         )

@@ -107,7 +113,7 @@ class LambdaCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(
                     public=project_ssh_public_key.strip(), private=project_ssh_private_key.strip()
dstack/_internal/core/backends/nebius/compute.py:
@@ -6,7 +6,7 @@ from typing import List, Optional
 import dstack.version as version
 from dstack._internal import settings
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import get_instance_name, get_user_data
+from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
 from dstack._internal.core.backends.nebius.config import NebiusConfig

@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],
dstack/_internal/core/backends/oci/compute.py:
@@ -4,7 +4,12 @@ from typing import List, Optional
 
 import oci
 
-from dstack._internal.core.backends.base.compute import Compute, get_instance_name, get_user_data
+from dstack._internal.core.backends.base.compute import (
+    Compute,
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_user_data,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.config import OCIConfig

@@ -98,7 +103,7 @@ class OCICompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=get_instance_name(run, job),
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )

@@ -148,6 +153,7 @@ class OCICompute(Compute):
         ]
         cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
 
+        display_name = generate_unique_instance_name(instance_config)
         try:
             instance = resources.launch_instance(
                 region=region,

@@ -155,7 +161,7 @@ class OCICompute(Compute):
                 compartment_id=self.config.compartment_id,
                 subnet_id=subnet.id,
                 security_group_id=security_group.id,
-                display_name=instance_config.instance_name,
+                display_name=display_name,
                 cloud_init_user_data=cloud_init_user_data,
                 shape=instance_offer.instance.name,
                 is_spot=instance_offer.instance.resources.spot,

@@ -163,7 +169,7 @@ class OCICompute(Compute):
                 image_id=package.image_id,
             )
         except oci.exceptions.ServiceError as e:
-            if e.code in ("LimitExceeded", "QuotaExceeded"):
+            if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
                 raise NoCapacityError(e.message)
             raise