dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +11 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +23 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +15 -9
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +26 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +32 -7
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -2
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +24 -4
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +6 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +50 -8
- dstack/api/_public/runs.py +4 -1
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +81 -58
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
```diff
@@ -6,7 +6,7 @@ from typing import List, Optional
 import dstack.version as version
 from dstack._internal import settings
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import get_job_instance_name, get_user_data
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient
 from dstack._internal.core.backends.nebius.config import NebiusConfig
@@ -130,7 +130,7 @@ class NebiusCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=project_ssh_public_key.strip()),
             ],
```
```diff
@@ -4,7 +4,12 @@ from typing import List, Optional
 
 import oci
 
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    Compute,
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_user_data,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.config import OCIConfig
@@ -98,7 +103,7 @@ class OCICompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -148,6 +153,7 @@ class OCICompute(Compute):
         ]
         cloud_init_user_data = get_user_data(instance_config.get_public_keys(), setup_commands)
 
+        display_name = generate_unique_instance_name(instance_config)
         try:
             instance = resources.launch_instance(
                 region=region,
@@ -155,7 +161,7 @@ class OCICompute(Compute):
                 compartment_id=self.config.compartment_id,
                 subnet_id=subnet.id,
                 security_group_id=security_group.id,
-                display_name=
+                display_name=display_name,
                 cloud_init_user_data=cloud_init_user_data,
                 shape=instance_offer.instance.name,
                 is_spot=instance_offer.instance.resources.spot,
@@ -163,7 +169,7 @@ class OCICompute(Compute):
                 image_id=package.image_id,
             )
         except oci.exceptions.ServiceError as e:
-            if e.code in ("LimitExceeded", "QuotaExceeded"):
+            if e.code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in e.message:
                 raise NoCapacityError(e.message)
             raise
 
```
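The last OCI hunk widens no-capacity detection: in addition to the `LimitExceeded` and `QuotaExceeded` error codes, any `ServiceError` whose message contains "Out of host capacity" is now mapped to `NoCapacityError`, so such errors are treated as a capacity issue rather than an unexpected failure. A minimal standalone sketch of that predicate (the sample codes and messages below are illustrative, not taken from OCI documentation):

```python
def is_no_capacity(code: str, message: str) -> bool:
    """Mirror of the condition added in the OCI hunk above."""
    return code in ("LimitExceeded", "QuotaExceeded") or "Out of host capacity" in message


# Illustrative inputs, not actual OCI responses:
print(is_no_capacity("LimitExceeded", "Max number of instances exceeded"))  # True
print(is_no_capacity("InternalError", "Out of host capacity."))             # True
print(is_no_capacity("NotAuthorizedOrNotFound", "Authorization failed"))    # False
```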
```diff
@@ -5,8 +5,10 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    generate_unique_volume_name,
     get_docker_commands,
-
+    get_job_instance_name,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
@@ -31,6 +33,9 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+# Undocumented but names of len 60 work
+MAX_RESOURCE_NAME_LEN = 60
+
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
 
@@ -47,8 +52,9 @@ class RunpodCompute(Compute):
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.RUNPOD,
-            locations=self.config.regions,
+            locations=self.config.regions or None,
             requirements=requirements,
+            extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
         )
         offers = [
             InstanceOfferWithAvailability(
@@ -69,7 +75,7 @@ class RunpodCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
@@ -77,6 +83,7 @@ class RunpodCompute(Compute):
             user=run.user,
         )
 
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         authorized_keys = instance_config.get_public_keys()
         memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
         disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -96,13 +103,22 @@ class RunpodCompute(Compute):
         bid_per_gpu = None
         if instance_offer.instance.resources.spot and gpu_count:
            bid_per_gpu = instance_offer.price / gpu_count
+        if _is_secure_cloud(instance_offer.region):
+            cloud_type = "SECURE"
+            data_center_id = instance_offer.region
+            country_code = None
+        else:
+            cloud_type = "COMMUNITY"
+            data_center_id = None
+            country_code = instance_offer.region
 
         resp = self.api_client.create_pod(
-            name=
+            name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
-            cloud_type=
-            data_center_id=
+            cloud_type=cloud_type,
+            data_center_id=data_center_id,
+            country_code=country_code,
             gpu_count=gpu_count,
             container_disk_in_gb=disk_size,
             min_vcpu_count=instance_offer.instance.resources.cpus,
@@ -197,9 +213,10 @@ class RunpodCompute(Compute):
         )
 
     def create_volume(self, volume: Volume) -> VolumeProvisioningData:
+        volume_name = generate_unique_volume_name(volume, max_length=MAX_RESOURCE_NAME_LEN)
        size_gb = volume.configuration.size_gb
        volume_id = self.api_client.create_network_volume(
-            name=
+            name=volume_name,
            region=volume.configuration.region,
            size=size_gb,
        )
@@ -250,3 +267,11 @@ def _get_volume_price(size: int) -> float:
    if size < 1000:
        return 0.07 * size
    return 0.05 * size
+
+
+def _is_secure_cloud(region: str) -> str:
+    """
+    Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
+    Community cloud regions are country codes: CA, NL, etc.
+    """
+    return "-" in region
```
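The new `_is_secure_cloud` helper (which effectively returns a boolean, despite the `-> str` annotation in the release) drives how `create_pod` is parameterized: secure-cloud offers carry datacenter IDs and are pinned via `data_center_id`, while community-cloud offers carry country codes and are passed as `country_code`. A standalone sketch of that routing; the `PodPlacement` container is a hypothetical stand-in, not a dstack type:

```python
from dataclasses import dataclass
from typing import Optional


def _is_secure_cloud(region: str) -> bool:
    # Secure cloud regions are datacenter IDs (CA-MTL-1, EU-NL-1);
    # community cloud regions are country codes (CA, NL).
    return "-" in region


@dataclass
class PodPlacement:  # hypothetical container, for illustration only
    cloud_type: str
    data_center_id: Optional[str]
    country_code: Optional[str]


def placement_for(region: str) -> PodPlacement:
    if _is_secure_cloud(region):
        return PodPlacement("SECURE", data_center_id=region, country_code=None)
    return PodPlacement("COMMUNITY", data_center_id=None, country_code=region)


print(placement_for("CA-MTL-1"))  # SECURE, pinned to the CA-MTL-1 datacenter
print(placement_for("NL"))        # COMMUNITY, restricted to the NL country code
```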
```diff
@@ -4,6 +4,14 @@ from dstack._internal.core.models.backends.runpod import (
     RunpodStoredConfig,
 )
 
+RUNPOD_COMMUNITY_CLOUD_DEFAULT = True
+
 
 class RunpodConfig(RunpodStoredConfig, BackendConfig):
     creds: AnyRunpodCreds
+
+    @property
+    def allow_community_cloud(self) -> bool:
+        if self.community_cloud is not None:
+            return self.community_cloud
+        return RUNPOD_COMMUNITY_CLOUD_DEFAULT
```
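`community_cloud` is an optional tri-state setting (see the `RunpodConfigInfo` hunk further below): leaving it unset means "use the default", which `allow_community_cloud` resolves to `RUNPOD_COMMUNITY_CLOUD_DEFAULT = True`. A minimal stand-in sketch of that resolution pattern, using a plain dataclass instead of the real pydantic model:

```python
from dataclasses import dataclass
from typing import Optional

RUNPOD_COMMUNITY_CLOUD_DEFAULT = True


@dataclass
class RunpodConfigSketch:  # illustrative stand-in, not the real RunpodConfig
    community_cloud: Optional[bool] = None

    @property
    def allow_community_cloud(self) -> bool:
        if self.community_cloud is not None:
            return self.community_cloud
        return RUNPOD_COMMUNITY_CLOUD_DEFAULT


print(RunpodConfigSketch().allow_community_cloud)                       # True (default)
print(RunpodConfigSketch(community_cloud=False).allow_community_cloud)  # False (opt out)
```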
```diff
@@ -4,7 +4,11 @@ from typing import List, Optional
 import requests
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name,
+    get_job_instance_name,
+    get_shim_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.tensordock.api_client import TensorDockAPIClient
 from dstack._internal.core.backends.tensordock.config import TensorDockConfig
@@ -23,6 +27,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class TensorDockCompute(Compute):
     def __init__(self, config: TensorDockConfig):
         super().__init__()
@@ -49,10 +57,13 @@ class TensorDockCompute(Compute):
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_shim_commands(authorized_keys=instance_config.get_public_keys())
         try:
             resp = self.api_client.deploy_single(
-                instance_name=
+                instance_name=instance_name,
                 instance=instance_offer.instance,
                 cloudinit={
                     "ssh_pwauth": False,  # disable password auth
@@ -113,7 +124,7 @@ class TensorDockCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),  # TODO: generate name
             ssh_keys=[
                 SSHKey(public=run.run_spec.ssh_key_pub.strip()),
                 SSHKey(public=project_ssh_public_key.strip()),
```
```diff
@@ -4,7 +4,10 @@ import gpuhunt
 from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base import Compute
-from dstack._internal.core.backends.base.compute import
+from dstack._internal.core.backends.base.compute import (
+    generate_unique_instance_name_for_job,
+    get_docker_commands,
+)
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.vastai.api_client import VastAIAPIClient
 from dstack._internal.core.backends.vastai.config import VastAIConfig
@@ -23,6 +26,10 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Undocumented but names of len 60 work
+MAX_INSTANCE_NAME_LEN = 60
+
+
 class VastAICompute(Compute):
     def __init__(self, config: VastAIConfig):
         super().__init__()
@@ -70,11 +77,14 @@ class VastAICompute(Compute):
         project_ssh_private_key: str,
         volumes: List[Volume],
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name_for_job(
+            run, job, max_length=MAX_INSTANCE_NAME_LEN
+        )
         commands = get_docker_commands(
             [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()]
         )
         resp = self.api_client.create_instance(
-            instance_name=
+            instance_name=instance_name,
             bundle_id=instance_offer.instance.name,
             image_name=job.job_spec.image_name,
             onstart=" && ".join(commands),
```
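A recurring theme across the backend hunks is replacing ad hoc instance names with `generate_unique_instance_name` / `generate_unique_instance_name_for_job` from `base/compute.py` (whose 102-line addition is not expanded in this diff), always capped by a provider-specific `MAX_INSTANCE_NAME_LEN`. The sketch below is a hypothetical illustration of why such a helper takes a `max_length`: a readable prefix plus a random suffix must still fit the provider's limit. It is not dstack's actual implementation.

```python
import secrets

SUFFIX_LEN = 8  # assumed length of the random suffix, for illustration only


def generate_unique_name_sketch(base_name: str, max_length: int) -> str:
    """Hypothetical: truncate the readable part so prefix + '-' + suffix fits max_length."""
    suffix = secrets.token_hex(SUFFIX_LEN // 2)  # 8 hex characters
    keep = max_length - SUFFIX_LEN - 1
    return f"{base_name[:keep]}-{suffix}"


name = generate_unique_name_sketch("my-fleet-with-a-rather-long-descriptive-name" * 2, max_length=60)
print(name, len(name) <= 60)  # unique and within the provider's length limit
```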
```diff
@@ -20,7 +20,7 @@ class VultrApiClient:
                 return False
         return True
 
-    def get_instance(self, instance_id: str, plan_type: str):
+    def get_instance(self, instance_id: str, plan_type: str) -> dict:
         if plan_type == "bare-metal":
             response = self._make_request("GET", f"/bare-metals/{instance_id}")
             return response.json()["bare_metal"]
@@ -28,7 +28,7 @@ class VultrApiClient:
             response = self._make_request("GET", f"/instances/{instance_id}")
             return response.json()["instance"]
 
-    def get_vpc_for_region(self, region: str) -> Optional[
+    def get_vpc_for_region(self, region: str) -> Optional[dict]:
         response = self._make_request("GET", "/vpcs?per_page=500")
         vpcs = response.json().get("vpcs", [])
         if vpcs:
@@ -37,7 +37,7 @@ class VultrApiClient:
                 return vpc
         return None
 
-    def create_vpc(self, region: str):
+    def create_vpc(self, region: str) -> dict:
         data = {"region": region, "description": f"dstack-vpc-{region}"}
         response = self._make_request("POST", "/vpcs", data=data)
         return response.json()["vpc"]
```
```diff
@@ -6,7 +6,8 @@ import requests
 
 from dstack._internal.core.backends.base import Compute
 from dstack._internal.core.backends.base.compute import (
-
+    generate_unique_instance_name,
+    get_job_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -27,6 +28,8 @@ from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
 
+MAX_INSTANCE_NAME_LEN = 64
+
 
 class VultrCompute(Compute):
     def __init__(self, config: VultrConfig):
@@ -62,7 +65,7 @@ class VultrCompute(Compute):
     ) -> JobProvisioningData:
         instance_config = InstanceConfiguration(
             project_name=run.project_name,
-            instance_name=
+            instance_name=get_job_instance_name(run, job),
             ssh_keys=[SSHKey(public=project_ssh_public_key.strip())],
             user=run.user,
         )
@@ -71,6 +74,9 @@
     def create_instance(
         self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
     ) -> JobProvisioningData:
+        instance_name = generate_unique_instance_name(
+            instance_config, max_length=MAX_INSTANCE_NAME_LEN
+        )
         # create vpc
         vpc = self.api_client.get_vpc_for_region(instance_offer.region)
         if not vpc:
@@ -85,7 +91,7 @@ class VultrCompute(Compute):
         ]
         instance_id = self.api_client.launch_instance(
             region=instance_offer.region,
-            label=
+            label=instance_name,
             plan=instance_offer.instance.name,
             user_data=get_user_data(
                 authorized_keys=instance_config.get_public_keys(),
```
```diff
@@ -32,6 +32,7 @@ class AWSConfigInfo(CoreModel):
     vpc_ids: Optional[Dict[str, str]] = None
     default_vpcs: Optional[bool] = None
     public_ips: Optional[bool] = None
+    iam_instance_profile: Optional[str] = None
     tags: Optional[Dict[str, str]] = None
     os_images: Optional[AWSOSImageConfig] = None
 
@@ -70,6 +71,7 @@ class AWSConfigInfoWithCredsPartial(CoreModel):
     vpc_ids: Optional[Dict[str, str]]
     default_vpcs: Optional[bool]
     public_ips: Optional[bool]
+    iam_instance_profile: Optional[str]
     tags: Optional[Dict[str, str]]
     os_images: Optional["AWSOSImageConfig"]
 
```
```diff
@@ -15,6 +15,7 @@ class BackendType(str, enum.Enum):
         DATACRUNCH (BackendType): DataCrunch
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
+        OCI (BackendType): Oracle Cloud Infrastructure
         RUNPOD (BackendType): Runpod Cloud
         TENSORDOCK (BackendType): TensorDock Marketplace
         VASTAI (BackendType): Vast.ai Marketplace
```
```diff
@@ -10,6 +10,7 @@ from dstack._internal.core.models.common import CoreModel
 class RunpodConfigInfo(CoreModel):
     type: Literal["runpod"] = "runpod"
     regions: Optional[List[str]] = None
+    community_cloud: Optional[bool] = None
 
 
 class RunpodStoredConfig(RunpodConfigInfo):
@@ -33,6 +34,7 @@ class RunpodConfigInfoWithCredsPartial(CoreModel):
     type: Literal["runpod"] = "runpod"
     creds: Optional[AnyRunpodCreds]
     regions: Optional[List[str]]
+    community_cloud: Optional[bool]
 
 
 class RunpodConfigValues(CoreModel):
```
```diff
@@ -31,7 +31,6 @@ class RunConfigurationType(str, Enum):
 
 
 class PythonVersion(str, Enum):
-    PY38 = "3.8"  # TODO(0.19 or earlier): drop 3.8, stop building Docker images with 3.8
     PY39 = "3.9"
     PY310 = "3.10"
     PY311 = "3.11"
@@ -222,7 +221,8 @@ class DevEnvironmentConfigurationParams(CoreModel):
                 " Inactivity is defined as the absence of SSH connections to the"
                 " dev environment, including VS Code connections, `ssh <run name>`"
                 " shells, and attached `dstack apply` or `dstack attach` commands."
-                " Use `off` for unlimited duration.
+                " Use `off` for unlimited duration. Can be updated in-place."
+                " Defaults to `off`"
             )
         ),
     ]
```
```diff
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Union
+from typing import List, Optional, Union, overload
 
 from pydantic import Field, root_validator, validator
 from typing_extensions import Annotated, Literal
@@ -34,6 +34,14 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"
 
 
+@overload
+def parse_duration(v: None) -> None: ...
+
+
+@overload
+def parse_duration(v: Union[int, str]) -> int: ...
+
+
 def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     if v is None:
         return None
```
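The `@overload` declarations only affect type checking: they tell mypy/pyright that `parse_duration(None)` is `None` while `parse_duration(int | str)` is always `int`, so callers passing a known-non-None value no longer have to handle `Optional[int]`. A self-contained sketch of the same pattern with a simplified stand-in parser (the real dstack parser accepts more duration formats):

```python
from typing import Optional, Union, overload


@overload
def parse_duration(v: None) -> None: ...
@overload
def parse_duration(v: Union[int, str]) -> int: ...


def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
    # Simplified stand-in: ints pass through, "<N>m"/"<N>h" become seconds.
    if v is None:
        return None
    if isinstance(v, int):
        return v
    units = {"m": 60, "h": 3600}
    return int(v[:-1]) * units[v[-1]]


window: int = parse_duration("30m")   # type checkers see int, not Optional[int]
nothing: None = parse_duration(None)  # and None stays None
print(window, nothing)
```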
```diff
@@ -112,6 +120,39 @@ class ProfileRetry(CoreModel):
         return values
 
 
+class UtilizationPolicy(CoreModel):
+    _min_time_window = "5m"
+
+    min_gpu_utilization: Annotated[
+        int,
+        Field(
+            description=(
+                "Minimum required GPU utilization, percent."
+                " If any GPU has utilization below specified value during the whole time window,"
+                " the run is terminated"
+            ),
+            ge=0,
+            le=100,
+        ),
+    ]
+    time_window: Annotated[
+        Union[int, str],
+        Field(
+            description=(
+                "The time window of metric samples taking into account to measure utilization"
+                f" (e.g., `30m`, `1h`). Minimum is `{_min_time_window}`"
+            )
+        ),
+    ]
+
+    @validator("time_window", pre=True)
+    def validate_time_window(cls, v: Union[int, str]) -> int:
+        v = parse_duration(v)
+        if v < parse_duration(cls._min_time_window):
+            raise ValueError(f"Minimum time_window is {cls._min_time_window}")
+        return v
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],
@@ -194,6 +235,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ]
+    utilization_policy: Annotated[
+        Optional[UtilizationPolicy],
+        Field(description="Run termination policy based on utilization"),
+    ]
     # Deprecated:
     termination_policy: Annotated[
         Optional[TerminationPolicy],
```
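With `utilization_policy` wired into `ProfileParams` and into `JobSpec` (see the runs.py hunks below), a run can now be terminated when every sample in the window shows a GPU below the threshold; the new `TERMINATED_DUE_TO_UTILIZATION_POLICY` termination reason maps to `JobStatus.TERMINATED`. A hedged usage sketch, assuming a dstack version with this change is installed (field semantics taken from the hunk above, not from documentation):

```python
from dstack._internal.core.models.profiles import UtilizationPolicy

# Accepted: a 30-minute window is above the "5m" minimum; the pre-validator
# normalizes the duration string to an integer via parse_duration.
policy = UtilizationPolicy(min_gpu_utilization=10, time_window="30m")
print(policy.min_gpu_utilization, policy.time_window)

# Rejected: windows shorter than "5m" fail validation (pydantic's
# ValidationError is a ValueError subclass).
try:
    UtilizationPolicy(min_gpu_utilization=10, time_window="1m")
except ValueError as err:
    print(err)
```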
```diff
@@ -23,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     ProfileRetryPolicy,
     RetryEvent,
     SpotPolicy,
+    UtilizationPolicy,
 )
 from dstack._internal.core.models.repos import AnyRunRepoData
 from dstack._internal.core.models.resources import Memory, ResourcesSpec
@@ -114,6 +115,7 @@ class JobTerminationReason(str, Enum):
     ABORTED_BY_USER = "aborted_by_user"
     TERMINATED_BY_SERVER = "terminated_by_server"
     INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
+    TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy"
     # Set by the runner
     CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
     PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -135,6 +137,7 @@ class JobTerminationReason(str, Enum):
             self.ABORTED_BY_USER: JobStatus.ABORTED,
             self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
             self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
+            self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED,
             self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
             self.PORTS_BINDING_FAILED: JobStatus.FAILED,
             self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
     single_branch: Optional[bool] = None
     max_duration: Optional[int]
     stop_duration: Optional[int] = None
+    utilization_policy: Optional[UtilizationPolicy] = None
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]
```
```diff
@@ -4,5 +4,9 @@ from dstack._internal.core.errors import ServerClientError
 
 
 def validate_dstack_resource_name(resource_name: str):
-    if not
+    if not is_valid_dstack_resource_name(resource_name):
         raise ServerClientError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'")
+
+
+def is_valid_dstack_resource_name(resource_name: str) -> bool:
+    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None
```
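The validation logic is now split so other services can ask the yes/no question without catching `ServerClientError`. A standalone sketch of the rule, with the regex copied from the hunk above:

```python
import re


def is_valid_dstack_resource_name(resource_name: str) -> bool:
    return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None


print(is_valid_dstack_resource_name("my-volume-1"))  # True
print(is_valid_dstack_resource_name("MyVolume"))     # False: must start with a lowercase letter
print(is_valid_dstack_resource_name("x"))            # False: at least two characters required
```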
```diff
@@ -65,6 +65,9 @@ class ConfigManager:
         if len(self.config.projects) == 1:
             self.config.projects[0].default = True
 
+    def list_projects(self):
+        return [project.name for project in self.config.projects]
+
     def delete_project(self, name: str):
         self.config.projects = [p for p in self.config.projects if p.name != name]
 
```
dstack/_internal/server/app.py (CHANGED)

```diff
@@ -29,6 +29,7 @@ from dstack._internal.server.routers import (
     metrics,
     pools,
     projects,
+    prometheus,
     repos,
     runs,
     secrets,
@@ -185,6 +186,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
     app.include_router(pools.root_router)
     app.include_router(pools.router)
+    app.include_router(prometheus.router)
 
     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +254,11 @@ def register_routes(app: FastAPI, ui: bool = True):
 
     @app.exception_handler(404)
     async def custom_http_exception_handler(request, exc):
-        if
+        if (
+            request.url.path.startswith("/api")
+            or _is_proxy_request(request)
+            or _is_prometheus_request(request)
+        ):
             return JSONResponse(
                 {"detail": exc.detail},
                 status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +289,10 @@ def _is_proxy_request(request: Request) -> bool:
     ) and referrer.path.startswith("/proxy")
 
 
+def _is_prometheus_request(request: Request) -> bool:
+    return request.url.path.startswith("/metrics")
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
```
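The 404 handler now also returns JSON for Prometheus scrape paths (`/metrics/...`) instead of falling through to the single-page UI. A minimal FastAPI sketch of the same routing decision; only the path checks mirror the hunk above, while the SPA branch is reduced to a placeholder response:

```python
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse, PlainTextResponse

app = FastAPI()


def _is_prometheus_request(request: Request) -> bool:
    return request.url.path.startswith("/metrics")


@app.exception_handler(404)
async def custom_http_exception_handler(request: Request, exc):
    if request.url.path.startswith("/api") or _is_prometheus_request(request):
        # API and metrics clients get a machine-readable 404.
        return JSONResponse({"detail": exc.detail}, status_code=status.HTTP_404_NOT_FOUND)
    # dstack serves the UI's index.html here; a plain response stands in.
    return PlainTextResponse("Not Found", status_code=status.HTTP_404_NOT_FOUND)
```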
```diff
@@ -1,6 +1,7 @@
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 
+from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
     process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_prometheus_metrics import (
+    collect_prometheus_metrics,
+    delete_prometheus_metrics,
+)
 from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
 from dstack._internal.server.background.tasks.process_runs import process_runs
 from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
     # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
+    if settings.ENABLE_PROMETHEUS_METRICS:
+        _scheduler.add_job(
+            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
+        )
+        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
     # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
     _scheduler.add_job(
         process_submitted_jobs,
```
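Prometheus metric collection is registered only when the new `ENABLE_PROMETHEUS_METRICS` server setting is on, with `max_instances=1` so a slow collection cycle never overlaps with the next tick. A minimal runnable sketch of the same APScheduler pattern, with stand-in task functions and a module-level flag instead of dstack's settings:

```python
import asyncio

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

ENABLE_PROMETHEUS_METRICS = True  # stand-in for the dstack server setting


async def collect_prometheus_metrics():
    print("collecting runner metrics")


async def delete_prometheus_metrics():
    print("deleting stale metrics")


def start_background_tasks() -> AsyncIOScheduler:
    scheduler = AsyncIOScheduler()
    if ENABLE_PROMETHEUS_METRICS:
        # max_instances=1 prevents overlapping executions of the same task.
        scheduler.add_job(collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1)
        scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    scheduler.start()
    return scheduler


async def main():
    start_background_tasks()
    await asyncio.sleep(30)  # let a few ticks fire


if __name__ == "__main__":
    asyncio.run(main())
```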
```diff
@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
```