dstack 0.19.28__py3-none-any.whl → 0.19.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/main.py +3 -1
- dstack/_internal/cli/services/configurators/fleet.py +20 -6
- dstack/_internal/cli/utils/gpu.py +2 -2
- dstack/_internal/core/backends/aws/compute.py +62 -41
- dstack/_internal/core/backends/aws/resources.py +11 -6
- dstack/_internal/core/backends/azure/compute.py +25 -13
- dstack/_internal/core/backends/base/compute.py +121 -14
- dstack/_internal/core/backends/base/offers.py +34 -4
- dstack/_internal/core/backends/cloudrift/compute.py +5 -7
- dstack/_internal/core/backends/cudo/compute.py +4 -2
- dstack/_internal/core/backends/datacrunch/compute.py +13 -11
- dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
- dstack/_internal/core/backends/gcp/compute.py +25 -11
- dstack/_internal/core/backends/hotaisle/compute.py +4 -7
- dstack/_internal/core/backends/kubernetes/compute.py +6 -4
- dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
- dstack/_internal/core/backends/local/compute.py +1 -3
- dstack/_internal/core/backends/nebius/compute.py +10 -7
- dstack/_internal/core/backends/oci/compute.py +15 -8
- dstack/_internal/core/backends/oci/resources.py +8 -3
- dstack/_internal/core/backends/runpod/compute.py +15 -6
- dstack/_internal/core/backends/template/compute.py.jinja +3 -1
- dstack/_internal/core/backends/tensordock/compute.py +1 -3
- dstack/_internal/core/backends/tensordock/models.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +7 -3
- dstack/_internal/core/backends/vultr/compute.py +5 -5
- dstack/_internal/core/consts.py +2 -0
- dstack/_internal/core/models/projects.py +8 -0
- dstack/_internal/core/services/repos.py +101 -10
- dstack/_internal/server/background/tasks/process_instances.py +3 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +100 -47
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/projects.py +11 -3
- dstack/_internal/server/services/runs.py +2 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
- dstack/_internal/utils/ssh.py +22 -2
- dstack/version.py +2 -2
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/METADATA +8 -6
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/RECORD +46 -50
- dstack/_internal/core/backends/tensordock/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/api_client.py +0 -104
- dstack/_internal/core/backends/tensordock/backend.py +0 -16
- dstack/_internal/core/backends/tensordock/configurator.py +0 -74
- dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/WHEEL +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/tensordock/compute.py CHANGED
@@ -39,9 +39,7 @@ class TensorDockCompute(
         self.config = config
         self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token)

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.TENSORDOCK,
             requirements=requirements,
dstack/_internal/core/backends/tensordock/models.py CHANGED
@@ -4,6 +4,8 @@ from pydantic import Field

 from dstack._internal.core.models.common import CoreModel

+# TODO: TensorDock is deprecated and will be removed in the future
+

 class TensorDockAPIKeyCreds(CoreModel):
     type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
dstack/_internal/core/backends/vastai/compute.py CHANGED
@@ -5,6 +5,7 @@ from gpuhunt.providers.vastai import VastAIProvider

 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithFilteredOffersCached,
     generate_unique_instance_name_for_job,
     get_docker_commands,
 )
@@ -30,7 +31,10 @@ logger = get_logger(__name__)
 MAX_INSTANCE_NAME_LEN = 60


-class VastAICompute(Compute):
+class VastAICompute(
+    ComputeWithFilteredOffersCached,
+    Compute,
+):
     def __init__(self, config: VastAIConfig):
         super().__init__()
         self.config = config
@@ -49,8 +53,8 @@ class VastAICompute(Compute):
             )
         )

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
+    def get_offers_by_requirements(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
dstack/_internal/core/backends/vultr/compute.py CHANGED
@@ -6,6 +6,7 @@ import requests

 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     generate_unique_instance_name,
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 64


 class VultrCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     Compute,
@@ -41,12 +43,10 @@ class VultrCompute(
         self.config = config
         self.api_client = VultrApiClient(config.creds.api_key)

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VULTR,
-            requirements=requirements,
+            requirements=None,
             locations=self.config.regions or None,
             extra_filter=_supported_instances,
         )
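The Vultr and VastAI changes above are two sides of the offer-caching refactor in dstack/_internal/core/backends/base/compute.py (+121 -14): instead of every backend overriding get_offers, backends now mix in ComputeWithAllOffersCached (implement get_all_offers_with_availability; the base class caches and filters) or ComputeWithFilteredOffersCached (implement get_offers_by_requirements; results are cached per requirements). A minimal sketch of the first pattern; the mixin and method names come from the diff, while the Offer/Requirements stand-ins, the cache policy, and the filtering rule are assumptions for illustration:

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Offer:  # hypothetical stand-in for InstanceOfferWithAvailability
    instance_type: str
    gpu_count: int
    price: float


@dataclass
class Requirements:  # hypothetical stand-in for dstack's Requirements model
    min_gpu_count: int = 0


class ComputeWithAllOffersCached(ABC):
    """Subclasses return every offer once; get_offers() filters the cached list."""

    def __init__(self) -> None:
        self._cache: Optional[List[Offer]] = None

    @abstractmethod
    def get_all_offers_with_availability(self) -> List[Offer]: ...

    def get_offers(self, requirements: Requirements) -> List[Offer]:
        if self._cache is None:  # assumed: the real cache would also expire entries
            self._cache = self.get_all_offers_with_availability()
        return [o for o in self._cache if o.gpu_count >= requirements.min_gpu_count]


class FakeCompute(ComputeWithAllOffersCached):
    def get_all_offers_with_availability(self) -> List[Offer]:
        return [Offer("cpu-small", 0, 0.01), Offer("gpu-large", 8, 12.50)]


print(FakeCompute().get_offers(Requirements(min_gpu_count=1)))  # only gpu-large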
dstack/_internal/core/consts.py CHANGED
@@ -4,3 +4,5 @@ DSTACK_SHIM_HTTP_PORT = 10998
 DSTACK_RUNNER_HTTP_PORT = 10999
 # ssh server (runs alongside the runner inside a container) listen port
 DSTACK_RUNNER_SSH_PORT = 10022
+# legacy AWS, Azure, GCP, and OCI image for older GPUs
+DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10"
dstack/_internal/core/models/projects.py CHANGED
@@ -26,3 +26,11 @@ class Project(CoreModel):
     backends: List[BackendInfo]
     members: List[Member]
     is_public: bool = False
+
+
+class ProjectHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the project creation configuration passed to the hooks.
+    """
+
+    pass
dstack/_internal/core/services/repos.py CHANGED
@@ -36,24 +36,59 @@ def get_repo_creds_and_default_branch(

     # no auth
     with suppress(InvalidRepoCredentialsError):
-        return _get_repo_creds_and_default_branch_https(url)
+        creds, default_branch = _get_repo_creds_and_default_branch_https(url)
+        logger.debug(
+            "Git repo %s is public. Using no auth. Default branch: %s", repo_url, default_branch
+        )
+        return creds, default_branch

     # ssh key provided by the user or pulled from the server
     if identity_file is not None or private_key is not None:
         if identity_file is not None:
             private_key = _read_private_key(identity_file)
-            return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
+            creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                url, identity_file, private_key
+            )
+            logger.debug(
+                "Git repo %s is private. Using identity file: %s. Default branch: %s",
+                repo_url,
+                identity_file,
+                default_branch,
+            )
+            return creds, default_branch
         elif private_key is not None:
             with NamedTemporaryFile("w+", 0o600) as f:
                 f.write(private_key)
                 f.flush()
-                return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
+                creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                    url, f.name, private_key
+                )
+                masked_key = "***" + private_key[-10:] if len(private_key) > 10 else "***MASKED***"
+                logger.debug(
+                    "Git repo %s is private. Using private key: %s. Default branch: %s",
+                    repo_url,
+                    masked_key,
+                    default_branch,
+                )
+                return creds, default_branch
         else:
             assert False, "should not reach here"

     # oauth token provided by the user or pulled from the server
     if oauth_token is not None:
-        return _get_repo_creds_and_default_branch_https(url, oauth_token)
+        creds, default_branch = _get_repo_creds_and_default_branch_https(url, oauth_token)
+        masked_token = (
+            len(oauth_token[:-4]) * "*" + oauth_token[-4:]
+            if len(oauth_token) > 4
+            else "***MASKED***"
+        )
+        logger.debug(
+            "Git repo %s is private. Using provided OAuth token: %s. Default branch: %s",
+            repo_url,
+            masked_token,
+            default_branch,
+        )
+        return creds, default_branch

     # key from ssh config
     identities = get_host_config(url.original_host).get("identityfile")
@@ -61,7 +96,16 @@ def get_repo_creds_and_default_branch(
         _identity_file = identities[0]
         with suppress(InvalidRepoCredentialsError):
             _private_key = _read_private_key(_identity_file)
-            return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)
+            creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                url, _identity_file, _private_key
+            )
+            logger.debug(
+                "Git repo %s is private. Using SSH config identity file: %s. Default branch: %s",
+                repo_url,
+                _identity_file,
+                default_branch,
+            )
+            return creds, default_branch

     # token from gh config
     if os.path.exists(gh_config_path):
@@ -70,13 +114,35 @@ def get_repo_creds_and_default_branch(
         _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
         if _oauth_token is not None:
             with suppress(InvalidRepoCredentialsError):
-                return _get_repo_creds_and_default_branch_https(url, _oauth_token)
+                creds, default_branch = _get_repo_creds_and_default_branch_https(url, _oauth_token)
+                masked_token = (
+                    len(_oauth_token[:-4]) * "*" + _oauth_token[-4:]
+                    if len(_oauth_token) > 4
+                    else "***MASKED***"
+                )
+                logger.debug(
+                    "Git repo %s is private. Using GitHub config token: %s from %s. Default branch: %s",
+                    repo_url,
+                    masked_token,
+                    gh_config_path,
+                    default_branch,
+                )
+                return creds, default_branch

     # default user key
     if os.path.exists(default_ssh_key):
         with suppress(InvalidRepoCredentialsError):
             _private_key = _read_private_key(default_ssh_key)
-            return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)
+            creds, default_branch = _get_repo_creds_and_default_branch_ssh(
+                url, default_ssh_key, _private_key
+            )
+            logger.debug(
+                "Git repo %s is private. Using default identity file: %s. Default branch: %s",
+                repo_url,
+                default_ssh_key,
+                default_branch,
+            )
+            return creds, default_branch

     raise InvalidRepoCredentialsError(
         "No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
@@ -87,8 +153,9 @@ def _get_repo_creds_and_default_branch_ssh(
     url: GitRepoURL, identity_file: PathLike, private_key: str
 ) -> tuple[RemoteRepoCreds, Optional[str]]:
     _url = url.as_ssh()
+    env = _make_git_env_for_creds_check(identity_file=identity_file)
     try:
-        default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
+        default_branch = _get_repo_default_branch(_url, env)
     except GitCommandError as e:
         message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
         raise InvalidRepoCredentialsError(message) from e
@@ -104,8 +171,9 @@ def _get_repo_creds_and_default_branch_https(
     url: GitRepoURL, oauth_token: Optional[str] = None
 ) -> tuple[RemoteRepoCreds, Optional[str]]:
     _url = url.as_https()
+    env = _make_git_env_for_creds_check()
     try:
-        default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
+        default_branch = _get_repo_default_branch(url.as_https(oauth_token), env)
     except GitCommandError as e:
         message = f"Cannot access `{_url}`"
         if oauth_token is not None:
@@ -120,9 +188,32 @@ def _get_repo_creds_and_default_branch_https(
     return creds, default_branch


+def _make_git_env_for_creds_check(identity_file: Optional[PathLike] = None) -> dict[str, str]:
+    # Our goal is to check if _provided_ creds (if any) are correct, so we need to be sure that
+    # only the provided creds are used, without falling back to any additional mechanisms.
+    # To do this, we:
+    # 1. Disable all configs to ignore any stored creds
+    # 2. Disable askpass to avoid asking for creds interactively or fetching stored creds from
+    # a non-interactive askpass helper (for example, VS Code sets GIT_ASKPASS to its own helper,
+    # which silently provides creds to Git).
+    return make_git_env(disable_config=True, disable_askpass=True, identity_file=identity_file)
+
+
 def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
+    # Git shipped by Apple with XCode is patched to support an additional config scope
+    # above "system" called "xcode". There is no option in `git config list` to show this config,
+    # but you can list the merged config (`git config list` without options) and then exclude
+    # all settings listed in `git config list --{system,global,local,worktree}`.
+    # As of time of writing, there are only two settings in the "xcode" config, one of which breaks
+    # our "is repo public?" check, namely "credential.helper=osxkeychain".
+    # As there is no way to disable "xcode" config (no env variable, no CLI option, etc.),
+    # the only way to disable credential helper is to override this specific setting with an empty
+    # string via command line argument: `git -c credential.helper= COMMAND [ARGS ...]`.
+    # See: https://github.com/git/git/commit/3d4355712b9fe77a96ad4ad877d92dc7ff6e0874
+    # See: https://gist.github.com/ChrisTollefson/ab9c0a5d1dd4dd615217345c6936a307
+    _git = git.cmd.Git()(c="credential.helper=")
     # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
-    output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
+    output: str = _git.ls_remote("--symref", url, "HEAD", env=env)
     for line in output.splitlines():
         # line format: `<oid> TAB <ref> LF`
         oid, _, ref = line.partition("\t")
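Two pieces of the repos.py change can be exercised standalone: the token-masking expression used by the new debug logs, and the `<oid> TAB <ref>` parsing of `git ls-remote --symref` output. The masking expression and the partition("\t") loop are taken from the diff; the final `ref: refs/heads/` extraction is not shown in the hunk, so that part is one plausible way to finish the parse:

from typing import Optional


def mask_token(token: str) -> str:
    # Same expression as in the diff: star everything except the last 4 characters.
    return len(token[:-4]) * "*" + token[-4:] if len(token) > 4 else "***MASKED***"


def parse_default_branch(ls_remote_output: str) -> Optional[str]:
    # Output example from the diff's comment:
    # "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
    for line in ls_remote_output.splitlines():
        oid, _, ref = line.partition("\t")
        if ref == "HEAD" and oid.startswith("ref: refs/heads/"):
            return oid[len("ref: refs/heads/"):]
    return None


output = "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
print(mask_token("ghp_abcdefgh123456"))  # **************3456
print(parse_default_branch(output))      # dev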
dstack/_internal/server/background/tasks/process_instances.py CHANGED
@@ -578,7 +578,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         if placement_group_model is None:  # error occurred
             continue
         session.add(placement_group_model)
-        await session.flush()
         placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
@@ -636,7 +635,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
             },
         )
         if instance.fleet_id and _is_fleet_master_instance(instance):
-            # Clean up placement groups that did not end up being used
+            # Clean up placement groups that did not end up being used.
+            # Flush to update still uncommitted placement groups.
+            await session.flush()
             await schedule_fleet_placement_groups_deletion(
                 session=session,
                 fleet_id=instance.fleet_id,
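Moving the flush relies on standard SQLAlchemy semantics: session.flush() sends pending INSERTs to the database inside the still-open transaction, so later statements in the same session can see the rows before commit. A minimal synchronous sketch; the PlacementGroup model is a hypothetical stand-in, and autoflush is disabled so the explicit flush is what makes the row visible:

from sqlalchemy import Integer, String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class PlacementGroup(Base):  # hypothetical stand-in for dstack's model
    __tablename__ = "placement_groups"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine, autoflush=False) as session:
    session.add(PlacementGroup(name="pg-1"))
    session.flush()  # INSERT is emitted now; the transaction is still open
    names = session.scalars(select(PlacementGroup.name)).all()
    print(names)  # ['pg-1'] -- visible to queries before commit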
dstack/_internal/server/background/tasks/process_running_jobs.py CHANGED
@@ -1139,7 +1139,7 @@ def _patch_base_image_for_aws_efa(
     efa_enabled_patterns = [
         # TODO: p6-b200 isn't supported yet in gpuhunt
         r"^p6-b200\.(48xlarge)$",
-        r"^p5\.(48xlarge)$",
+        r"^p5\.(4xlarge|48xlarge)$",
         r"^p5e\.(48xlarge)$",
         r"^p5en\.(48xlarge)$",
         r"^p4d\.(24xlarge)$",
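The one-line change above extends EFA-enabled provisioning to p5.4xlarge. The patterns are easy to exercise directly; the regexes are copied from the diff, while the is_efa_enabled helper is hypothetical:

import re

efa_enabled_patterns = [
    r"^p6-b200\.(48xlarge)$",
    r"^p5\.(4xlarge|48xlarge)$",
    r"^p5e\.(48xlarge)$",
    r"^p5en\.(48xlarge)$",
    r"^p4d\.(24xlarge)$",
]


def is_efa_enabled(instance_type: str) -> bool:
    return any(re.match(p, instance_type) for p in efa_enabled_patterns)


print(is_efa_enabled("p5.4xlarge"))  # True -- newly matched by this release
print(is_efa_enabled("p5.2xlarge"))  # False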
dstack/_internal/server/background/tasks/process_submitted_jobs.py CHANGED
@@ -3,7 +3,7 @@ import itertools
 import math
 import uuid
 from datetime import datetime, timedelta
-from typing import List, Optional
+from typing import List, Optional

 from sqlalchemy import and_, func, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -25,6 +25,7 @@ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
 from dstack._internal.core.models.profiles import (
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
     CreationPolicy,
+    Profile,
     TerminationPolicy,
 )
 from dstack._internal.core.models.resources import Memory
@@ -34,6 +35,7 @@ from dstack._internal.core.models.runs import (
     JobRuntimeData,
     JobStatus,
     JobTerminationReason,
+    Requirements,
     Run,
     RunSpec,
 )
@@ -186,7 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     project = run_model.project
     run = run_model_to_run(run_model)
     run_spec = run.run_spec
-    profile = run_spec.merged_profile
+    run_profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
     multinode = job.job_spec.jobs_per_replica > 1

@@ -289,7 +291,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance_filters=instance_filters,
     )
     fleet_models = fleet_models_with_instances + fleet_models_without_instances
-    fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
+    fleet_model, fleet_instances_with_offers = await _find_optimal_fleet_with_offers(
+        project=project,
         fleet_models=fleet_models,
         run_model=run_model,
         run_spec=run.run_spec,
@@ -332,7 +335,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
-        if profile.creation_policy == CreationPolicy.REUSE:
+        if run_profile.creation_policy == CreationPolicy.REUSE:
             logger.debug("%s: reuse instance failed", fmt(job_model))
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
@@ -361,7 +364,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         return

     logger.info("%s: now is provisioning a new instance", fmt(job_model))
-    job_provisioning_data, offer = run_job_result
+    job_provisioning_data, offer, effective_profile, _ = run_job_result
     job_model.job_provisioning_data = job_provisioning_data.json()
     job_model.status = JobStatus.PROVISIONING
     if fleet_model is None:
@@ -381,12 +384,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance = _create_instance_model_for_job(
             project=project,
             fleet_model=fleet_model,
-            run_spec=run_spec,
             job_model=job_model,
-            job=job,
             job_provisioning_data=job_provisioning_data,
             offer=offer,
             instance_num=instance_num,
+            profile=effective_profile,
         )
         job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
         # Both this task and process_fleets can add instances to fleets.
@@ -492,7 +494,8 @@ async def _refetch_fleet_models_with_instances(
     return fleet_models


-def _find_optimal_fleet_with_offers(
+async def _find_optimal_fleet_with_offers(
+    project: ProjectModel,
     fleet_models: list[FleetModel],
     run_model: RunModel,
     run_spec: RunSpec,
@@ -502,58 +505,98 @@ def _find_optimal_fleet_with_offers(
 ) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
     if run_model.fleet is not None:
         # Using the fleet that was already chosen by the master job
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=run_model.fleet,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        return run_model.fleet, fleet_instances_with_offers
+        return run_model.fleet, fleet_instances_with_pool_offers

     if len(fleet_models) == 0:
         return None, []

     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
-    # The current strategy is to first consider fleets that can accommodate
-    # the run without additional provisioning and choose the one with the cheapest offer.
-    #
+    # The current strategy is first to consider fleets that can accommodate
+    # the run without additional provisioning and choose the one with the cheapest pool offer.
+    # Then choose a fleet with the cheapest pool offer among all fleets with pool offers.
+    # If there are no fleets with pool offers, choose a fleet with a cheapest backend offer.
+    # Fallback to autocreated fleet if fleets have no pool or backend offers.
+    # TODO: Consider trying all backend offers and then choosing a fleet.
     candidate_fleets_with_offers: list[
         tuple[
             Optional[FleetModel],
             list[tuple[InstanceModel, InstanceOfferWithAvailability]],
             int,
-
+            int,
+            tuple[int, float, float],
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-
-
-
-
-
-
-
-
+        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+        fleet_cheapest_pool_offer = math.inf
+        if len(fleet_instances_with_pool_offers) > 0:
+            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
+
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
+        profile = None
+        requirements = None
+        try:
+            profile, requirements = _get_run_profile_and_requirements_in_fleet(
+                job=job,
+                run_spec=run_spec,
+                fleet=candidate_fleet,
+            )
+        except ValueError:
+            pass
+        fleet_backend_offers = []
+        if profile is not None and requirements is not None:
+            multinode = (
+                candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
+                or job.job_spec.jobs_per_replica > 1
+            )
+            fleet_backend_offers = await get_offers_by_requirements(
+                project=project,
+                profile=profile,
+                requirements=requirements,
+                exclude_not_available=True,
+                multinode=multinode,
+                master_job_provisioning_data=master_job_provisioning_data,
+                volumes=volumes,
+                privileged=job.job_spec.privileged,
+                instance_mounts=check_run_spec_requires_instance_mounts(run_spec),
+            )
+
+        fleet_cheapest_backend_offer = math.inf
+        if len(fleet_backend_offers) > 0:
+            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
+
+        fleet_priority = (
+            not fleet_has_available_capacity,
+            fleet_cheapest_pool_offer,
+            fleet_cheapest_backend_offer,
+        )
         candidate_fleets_with_offers.append(
             (
                 candidate_fleet_model,
-                fleet_instances_with_offers,
-                len(fleet_instances_with_offers),
+                fleet_instances_with_pool_offers,
+                len(fleet_instances_with_pool_offers),
+                len(fleet_backend_offers),
                 fleet_priority,
             )
         )
     if run_spec.merged_profile.fleets is None and all(
-        t[2] == 0 for t in candidate_fleets_with_offers
+        t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available offers, create a new fleet.
+        # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
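The fleet_priority tuple makes fleet ranking a plain lexicographic comparison: fleets with enough pool capacity sort first (False orders before True), ties break on the cheapest pool offer, then on the cheapest backend offer, and math.inf pushes fleets with no offers to the back. The selection step itself is outside this hunk; the candidates below are invented to show the ordering:

import math

candidates = {
    "fleet-a": (False, 0.80, 1.20),         # has pool capacity, pool offer at $0.80
    "fleet-b": (True, math.inf, 0.50),      # no capacity, backend offer at $0.50
    "fleet-c": (True, math.inf, math.inf),  # no pool or backend offers
}

print(min(candidates, key=candidates.get))     # fleet-a
print(sorted(candidates, key=candidates.get))  # ['fleet-a', 'fleet-b', 'fleet-c']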
@@ -573,7 +616,7 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num


-def _get_fleet_instances_with_offers(
+def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
     job: Job,
@@ -661,7 +704,7 @@ async def _run_job_on_new_instance(
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
     fleet_model: Optional[FleetModel] = None,
-) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
+) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
     if volumes is None:
         volumes = []
     profile = run.run_spec.merged_profile
@@ -669,21 +712,14 @@ async def _run_job_on_new_instance(
     fleet = None
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
-
-
-
-
-
-        profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
-        if profile is None:
-            logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
-            return None
-        fleet_requirements = get_fleet_requirements(fleet.spec)
-        requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
-        if requirements is None:
-            logger.debug(
-                "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
+        try:
+            profile, requirements = _get_run_profile_and_requirements_in_fleet(
+                job=job,
+                run_spec=run.run_spec,
+                fleet=fleet,
             )
+        except ValueError as e:
+            logger.debug("%s: %s", fmt(job_model), e.args[0])
             return None
     # TODO: Respect fleet provisioning properties such as tags

@@ -723,7 +759,7 @@ async def _run_job_on_new_instance(
             project_ssh_private_key,
             offer_volumes,
         )
-        return job_provisioning_data, offer
+        return job_provisioning_data, offer, profile, requirements
     except BackendError as e:
         logger.warning(
             "%s: %s launch in %s/%s failed: %s",
@@ -746,6 +782,25 @@ async def _run_job_on_new_instance(
     return None


+def _get_run_profile_and_requirements_in_fleet(
+    job: Job,
+    run_spec: RunSpec,
+    fleet: Fleet,
+) -> tuple[Profile, Requirements]:
+    if not _check_can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+    profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
+    if profile is None:
+        raise ValueError("Cannot combine fleet profile")
+    fleet_requirements = get_fleet_requirements(fleet.spec)
+    requirements = combine_fleet_and_run_requirements(
+        fleet_requirements, job.job_spec.requirements
+    )
+    if requirements is None:
+        raise ValueError("Cannot combine fleet requirements")
+    return profile, requirements
+
+
 def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
     if fleet.spec.configuration.ssh_config is not None:
         return False
@@ -814,14 +869,12 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
 def _create_instance_model_for_job(
     project: ProjectModel,
     fleet_model: FleetModel,
-    run_spec: RunSpec,
     job_model: JobModel,
-    job: Job,
     job_provisioning_data: JobProvisioningData,
     offer: InstanceOfferWithAvailability,
     instance_num: int,
+    profile: Profile,
 ) -> InstanceModel:
-    profile = run_spec.merged_profile
     if not job_provisioning_data.dockerized:
         # terminate vastai/k8s instances immediately
         termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
dstack/_internal/server/services/backends/__init__.py CHANGED
@@ -345,7 +345,7 @@ async def get_instance_offers(
     Returns list of instances satisfying minimal resource requirements sorted by price
     """
     logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends])
-    tasks = [run_async(backend.compute().get_offers_cached, requirements) for backend in backends]
+    tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends]
     offers_by_backend = []
     for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
         if isinstance(result, BackendError):
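get_instance_offers keeps its fan-out shape: one task per backend, asyncio.gather(..., return_exceptions=True), and BackendError results skipped instead of failing the whole query. A self-contained sketch of that pattern; dstack wraps the synchronous compute call with run_async and logs skipped backends, while the fake backends, prices, and helper here are stand-ins:

import asyncio


class BackendError(Exception):  # stand-in for dstack's BackendError
    pass


class FakeBackend:
    def __init__(self, name, offers, fail=False):
        self.name, self.offers, self.fail = name, offers, fail

    async def get_offers(self, requirements):
        if self.fail:
            raise BackendError(f"{self.name} unavailable")
        return self.offers


async def get_instance_offers(backends, requirements):
    tasks = [b.get_offers(requirements) for b in backends]
    offers = []
    for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
        if isinstance(result, BackendError):
            continue  # the real code logs the failure and moves on
        if isinstance(result, BaseException):
            raise result
        offers.extend((backend.name, price) for price in result)
    return sorted(offers, key=lambda t: t[1])  # cheapest first, as the docstring says


backends = [FakeBackend("aws", [1.2, 0.4]), FakeBackend("gcp", [0.9], fail=True)]
print(asyncio.run(get_instance_offers(backends, requirements=None)))  # [('aws', 0.4), ('aws', 1.2)]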
dstack/_internal/server/services/projects.py CHANGED
@@ -13,7 +13,12 @@ from dstack._internal.core.backends.dstack.models import (
 )
 from dstack._internal.core.backends.models import BackendInfo
 from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
-from dstack._internal.core.models.projects import Member, MemberPermissions, Project
+from dstack._internal.core.models.projects import (
+    Member,
+    MemberPermissions,
+    Project,
+    ProjectHookConfig,
+)
 from dstack._internal.core.models.runs import RunStatus
 from dstack._internal.core.models.users import GlobalRole, ProjectRole
 from dstack._internal.server.models import (
@@ -120,6 +125,7 @@ async def create_project(
     user: UserModel,
     project_name: str,
     is_public: bool = False,
+    config: Optional[ProjectHookConfig] = None,
 ) -> Project:
     user_permissions = users.get_user_permissions(user)
     if not user_permissions.can_create_projects:
@@ -147,7 +153,7 @@ async def create_project(
         session=session, project_name=project_name
     )
     for hook in _CREATE_PROJECT_HOOKS:
-        await hook(session, project_model)
+        await hook(session, project_model, config)
     # a hook may change project
     session.expire(project_model)
     project_model = await get_project_model_by_name_or_error(
@@ -609,7 +615,9 @@ def get_member_permissions(member_model: MemberModel) -> MemberPermissions:
 _CREATE_PROJECT_HOOKS = []


-def register_create_project_hook(func):
+def register_create_project_hook(
+    func: Callable[[AsyncSession, ProjectModel, Optional[ProjectHookConfig]], Awaitable[None]],
+):
     _CREATE_PROJECT_HOOKS.append(func)

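Together with the ProjectHookConfig model added in dstack/_internal/core/models/projects.py, the typed hook signature lets server plugins receive extra settings at project creation: create_project now forwards its config argument to every registered hook. A hedged sketch of how a plugin might use this API; the CustomConfig field and the hook body are hypothetical:

from typing import Optional

from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.core.models.projects import ProjectHookConfig
from dstack._internal.server.models import ProjectModel
from dstack._internal.server.services.projects import register_create_project_hook


class CustomConfig(ProjectHookConfig):
    default_quota_gpus: int = 0  # hypothetical extension field


async def on_project_created(
    session: AsyncSession, project: ProjectModel, config: Optional[ProjectHookConfig]
) -> None:
    quota = config.default_quota_gpus if isinstance(config, CustomConfig) else 0
    ...  # hypothetical: persist per-project defaults for `project` using `session` and `quota`


register_create_project_hook(on_project_created)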