dstack 0.19.28__py3-none-any.whl → 0.19.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (51):
  1. dstack/_internal/cli/main.py +3 -1
  2. dstack/_internal/cli/services/configurators/fleet.py +20 -6
  3. dstack/_internal/cli/utils/gpu.py +2 -2
  4. dstack/_internal/core/backends/aws/compute.py +62 -41
  5. dstack/_internal/core/backends/aws/resources.py +11 -6
  6. dstack/_internal/core/backends/azure/compute.py +25 -13
  7. dstack/_internal/core/backends/base/compute.py +121 -14
  8. dstack/_internal/core/backends/base/offers.py +34 -4
  9. dstack/_internal/core/backends/cloudrift/compute.py +5 -7
  10. dstack/_internal/core/backends/cudo/compute.py +4 -2
  11. dstack/_internal/core/backends/datacrunch/compute.py +13 -11
  12. dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
  13. dstack/_internal/core/backends/gcp/compute.py +25 -11
  14. dstack/_internal/core/backends/hotaisle/compute.py +4 -7
  15. dstack/_internal/core/backends/kubernetes/compute.py +6 -4
  16. dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
  17. dstack/_internal/core/backends/local/compute.py +1 -3
  18. dstack/_internal/core/backends/nebius/compute.py +10 -7
  19. dstack/_internal/core/backends/oci/compute.py +15 -8
  20. dstack/_internal/core/backends/oci/resources.py +8 -3
  21. dstack/_internal/core/backends/runpod/compute.py +15 -6
  22. dstack/_internal/core/backends/template/compute.py.jinja +3 -1
  23. dstack/_internal/core/backends/tensordock/compute.py +1 -3
  24. dstack/_internal/core/backends/tensordock/models.py +2 -0
  25. dstack/_internal/core/backends/vastai/compute.py +7 -3
  26. dstack/_internal/core/backends/vultr/compute.py +5 -5
  27. dstack/_internal/core/consts.py +2 -0
  28. dstack/_internal/core/models/projects.py +8 -0
  29. dstack/_internal/core/services/repos.py +101 -10
  30. dstack/_internal/server/background/tasks/process_instances.py +3 -2
  31. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +100 -47
  33. dstack/_internal/server/services/backends/__init__.py +1 -1
  34. dstack/_internal/server/services/projects.py +11 -3
  35. dstack/_internal/server/services/runs.py +2 -0
  36. dstack/_internal/server/statics/index.html +1 -1
  37. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
  38. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
  39. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
  40. dstack/_internal/utils/ssh.py +22 -2
  41. dstack/version.py +2 -2
  42. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/METADATA +8 -6
  43. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/RECORD +46 -50
  44. dstack/_internal/core/backends/tensordock/__init__.py +0 -0
  45. dstack/_internal/core/backends/tensordock/api_client.py +0 -104
  46. dstack/_internal/core/backends/tensordock/backend.py +0 -16
  47. dstack/_internal/core/backends/tensordock/configurator.py +0 -74
  48. dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
  49. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/WHEEL +0 -0
  50. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/entry_points.txt +0 -0
  51. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/licenses/LICENSE.md +0 -0
@@ -39,9 +39,7 @@ class TensorDockCompute(
39
39
  self.config = config
40
40
  self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token)
41
41
 
42
- def get_offers(
43
- self, requirements: Optional[Requirements] = None
44
- ) -> List[InstanceOfferWithAvailability]:
42
+ def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
45
43
  offers = get_catalog_offers(
46
44
  backend=BackendType.TENSORDOCK,
47
45
  requirements=requirements,
@@ -4,6 +4,8 @@ from pydantic import Field
4
4
 
5
5
  from dstack._internal.core.models.common import CoreModel
6
6
 
7
+ # TODO: TensorDock is deprecated and will be removed in the future
8
+
7
9
 
8
10
  class TensorDockAPIKeyCreds(CoreModel):
9
11
  type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
@@ -5,6 +5,7 @@ from gpuhunt.providers.vastai import VastAIProvider
5
5
 
6
6
  from dstack._internal.core.backends.base.backend import Compute
7
7
  from dstack._internal.core.backends.base.compute import (
8
+ ComputeWithFilteredOffersCached,
8
9
  generate_unique_instance_name_for_job,
9
10
  get_docker_commands,
10
11
  )
@@ -30,7 +31,10 @@ logger = get_logger(__name__)
30
31
  MAX_INSTANCE_NAME_LEN = 60
31
32
 
32
33
 
33
- class VastAICompute(Compute):
34
+ class VastAICompute(
35
+ ComputeWithFilteredOffersCached,
36
+ Compute,
37
+ ):
34
38
  def __init__(self, config: VastAIConfig):
35
39
  super().__init__()
36
40
  self.config = config
@@ -49,8 +53,8 @@ class VastAICompute(Compute):
49
53
  )
50
54
  )
51
55
 
52
- def get_offers(
53
- self, requirements: Optional[Requirements] = None
56
+ def get_offers_by_requirements(
57
+ self, requirements: Requirements
54
58
  ) -> List[InstanceOfferWithAvailability]:
55
59
  offers = get_catalog_offers(
56
60
  backend=BackendType.VASTAI,
@@ -6,6 +6,7 @@ import requests
6
6
 
7
7
  from dstack._internal.core.backends.base.backend import Compute
8
8
  from dstack._internal.core.backends.base.compute import (
9
+ ComputeWithAllOffersCached,
9
10
  ComputeWithCreateInstanceSupport,
10
11
  ComputeWithMultinodeSupport,
11
12
  generate_unique_instance_name,
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
23
24
  InstanceOfferWithAvailability,
24
25
  )
25
26
  from dstack._internal.core.models.placement import PlacementGroup
26
- from dstack._internal.core.models.runs import JobProvisioningData, Requirements
27
+ from dstack._internal.core.models.runs import JobProvisioningData
27
28
  from dstack._internal.utils.logging import get_logger
28
29
 
29
30
  logger = get_logger(__name__)
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 64
32
33
 
33
34
 
34
35
  class VultrCompute(
36
+ ComputeWithAllOffersCached,
35
37
  ComputeWithCreateInstanceSupport,
36
38
  ComputeWithMultinodeSupport,
37
39
  Compute,
@@ -41,12 +43,10 @@ class VultrCompute(
41
43
  self.config = config
42
44
  self.api_client = VultrApiClient(config.creds.api_key)
43
45
 
44
- def get_offers(
45
- self, requirements: Optional[Requirements] = None
46
- ) -> List[InstanceOfferWithAvailability]:
46
+ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
47
47
  offers = get_catalog_offers(
48
48
  backend=BackendType.VULTR,
49
- requirements=requirements,
49
+ requirements=None,
50
50
  locations=self.config.regions or None,
51
51
  extra_filter=_supported_instances,
52
52
  )
@@ -4,3 +4,5 @@ DSTACK_SHIM_HTTP_PORT = 10998
4
4
  DSTACK_RUNNER_HTTP_PORT = 10999
5
5
  # ssh server (runs alongside the runner inside a container) listen port
6
6
  DSTACK_RUNNER_SSH_PORT = 10022
7
+ # legacy AWS, Azure, GCP, and OCI image for older GPUs
8
+ DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES = "0.10"
@@ -26,3 +26,11 @@ class Project(CoreModel):
26
26
  backends: List[BackendInfo]
27
27
  members: List[Member]
28
28
  is_public: bool = False
29
+
30
+
31
+ class ProjectHookConfig(CoreModel):
32
+ """
33
+ This class can be inherited to extend the project creation configuration passed to the hooks.
34
+ """
35
+
36
+ pass
@@ -36,24 +36,59 @@ def get_repo_creds_and_default_branch(
36
36
 
37
37
  # no auth
38
38
  with suppress(InvalidRepoCredentialsError):
39
- return _get_repo_creds_and_default_branch_https(url)
39
+ creds, default_branch = _get_repo_creds_and_default_branch_https(url)
40
+ logger.debug(
41
+ "Git repo %s is public. Using no auth. Default branch: %s", repo_url, default_branch
42
+ )
43
+ return creds, default_branch
40
44
 
41
45
  # ssh key provided by the user or pulled from the server
42
46
  if identity_file is not None or private_key is not None:
43
47
  if identity_file is not None:
44
48
  private_key = _read_private_key(identity_file)
45
- return _get_repo_creds_and_default_branch_ssh(url, identity_file, private_key)
49
+ creds, default_branch = _get_repo_creds_and_default_branch_ssh(
50
+ url, identity_file, private_key
51
+ )
52
+ logger.debug(
53
+ "Git repo %s is private. Using identity file: %s. Default branch: %s",
54
+ repo_url,
55
+ identity_file,
56
+ default_branch,
57
+ )
58
+ return creds, default_branch
46
59
  elif private_key is not None:
47
60
  with NamedTemporaryFile("w+", 0o600) as f:
48
61
  f.write(private_key)
49
62
  f.flush()
50
- return _get_repo_creds_and_default_branch_ssh(url, f.name, private_key)
63
+ creds, default_branch = _get_repo_creds_and_default_branch_ssh(
64
+ url, f.name, private_key
65
+ )
66
+ masked_key = "***" + private_key[-10:] if len(private_key) > 10 else "***MASKED***"
67
+ logger.debug(
68
+ "Git repo %s is private. Using private key: %s. Default branch: %s",
69
+ repo_url,
70
+ masked_key,
71
+ default_branch,
72
+ )
73
+ return creds, default_branch
51
74
  else:
52
75
  assert False, "should not reach here"
53
76
 
54
77
  # oauth token provided by the user or pulled from the server
55
78
  if oauth_token is not None:
56
- return _get_repo_creds_and_default_branch_https(url, oauth_token)
79
+ creds, default_branch = _get_repo_creds_and_default_branch_https(url, oauth_token)
80
+ masked_token = (
81
+ len(oauth_token[:-4]) * "*" + oauth_token[-4:]
82
+ if len(oauth_token) > 4
83
+ else "***MASKED***"
84
+ )
85
+ logger.debug(
86
+ "Git repo %s is private. Using provided OAuth token: %s. Default branch: %s",
87
+ repo_url,
88
+ masked_token,
89
+ default_branch,
90
+ )
91
+ return creds, default_branch
57
92
 
58
93
  # key from ssh config
59
94
  identities = get_host_config(url.original_host).get("identityfile")
@@ -61,7 +96,16 @@ def get_repo_creds_and_default_branch(
61
96
  _identity_file = identities[0]
62
97
  with suppress(InvalidRepoCredentialsError):
63
98
  _private_key = _read_private_key(_identity_file)
64
- return _get_repo_creds_and_default_branch_ssh(url, _identity_file, _private_key)
99
+ creds, default_branch = _get_repo_creds_and_default_branch_ssh(
100
+ url, _identity_file, _private_key
101
+ )
102
+ logger.debug(
103
+ "Git repo %s is private. Using SSH config identity file: %s. Default branch: %s",
104
+ repo_url,
105
+ _identity_file,
106
+ default_branch,
107
+ )
108
+ return creds, default_branch
65
109
 
66
110
  # token from gh config
67
111
  if os.path.exists(gh_config_path):
@@ -70,13 +114,35 @@ def get_repo_creds_and_default_branch(
70
114
  _oauth_token = gh_hosts.get(url.host, {}).get("oauth_token")
71
115
  if _oauth_token is not None:
72
116
  with suppress(InvalidRepoCredentialsError):
73
- return _get_repo_creds_and_default_branch_https(url, _oauth_token)
117
+ creds, default_branch = _get_repo_creds_and_default_branch_https(url, _oauth_token)
118
+ masked_token = (
119
+ len(_oauth_token[:-4]) * "*" + _oauth_token[-4:]
120
+ if len(_oauth_token) > 4
121
+ else "***MASKED***"
122
+ )
123
+ logger.debug(
124
+ "Git repo %s is private. Using GitHub config token: %s from %s. Default branch: %s",
125
+ repo_url,
126
+ masked_token,
127
+ gh_config_path,
128
+ default_branch,
129
+ )
130
+ return creds, default_branch
74
131
 
75
132
  # default user key
76
133
  if os.path.exists(default_ssh_key):
77
134
  with suppress(InvalidRepoCredentialsError):
78
135
  _private_key = _read_private_key(default_ssh_key)
79
- return _get_repo_creds_and_default_branch_ssh(url, default_ssh_key, _private_key)
136
+ creds, default_branch = _get_repo_creds_and_default_branch_ssh(
137
+ url, default_ssh_key, _private_key
138
+ )
139
+ logger.debug(
140
+ "Git repo %s is private. Using default identity file: %s. Default branch: %s",
141
+ repo_url,
142
+ default_ssh_key,
143
+ default_branch,
144
+ )
145
+ return creds, default_branch
80
146
 
81
147
  raise InvalidRepoCredentialsError(
82
148
  "No valid default Git credentials found. Pass valid `--token` or `--git-identity`."
@@ -87,8 +153,9 @@ def _get_repo_creds_and_default_branch_ssh(
87
153
  url: GitRepoURL, identity_file: PathLike, private_key: str
88
154
  ) -> tuple[RemoteRepoCreds, Optional[str]]:
89
155
  _url = url.as_ssh()
156
+ env = _make_git_env_for_creds_check(identity_file=identity_file)
90
157
  try:
91
- default_branch = _get_repo_default_branch(_url, make_git_env(identity_file=identity_file))
158
+ default_branch = _get_repo_default_branch(_url, env)
92
159
  except GitCommandError as e:
93
160
  message = f"Cannot access `{_url}` using the `{identity_file}` private SSH key"
94
161
  raise InvalidRepoCredentialsError(message) from e
@@ -104,8 +171,9 @@ def _get_repo_creds_and_default_branch_https(
104
171
  url: GitRepoURL, oauth_token: Optional[str] = None
105
172
  ) -> tuple[RemoteRepoCreds, Optional[str]]:
106
173
  _url = url.as_https()
174
+ env = _make_git_env_for_creds_check()
107
175
  try:
108
- default_branch = _get_repo_default_branch(url.as_https(oauth_token), make_git_env())
176
+ default_branch = _get_repo_default_branch(url.as_https(oauth_token), env)
109
177
  except GitCommandError as e:
110
178
  message = f"Cannot access `{_url}`"
111
179
  if oauth_token is not None:
@@ -120,9 +188,32 @@ def _get_repo_creds_and_default_branch_https(
120
188
  return creds, default_branch
121
189
 
122
190
 
191
+ def _make_git_env_for_creds_check(identity_file: Optional[PathLike] = None) -> dict[str, str]:
192
+ # Our goal is to check if _provided_ creds (if any) are correct, so we need to be sure that
193
+ # only the provided creds are used, without falling back to any additional mechanisms.
194
+ # To do this, we:
195
+ # 1. Disable all configs to ignore any stored creds
196
+ # 2. Disable askpass to avoid asking for creds interactively or fetching stored creds from
197
+ # a non-interactive askpass helper (for example, VS Code sets GIT_ASKPASS to its own helper,
198
+ # which silently provides creds to Git).
199
+ return make_git_env(disable_config=True, disable_askpass=True, identity_file=identity_file)
200
+
201
+
123
202
  def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
203
+ # Git shipped by Apple with XCode is patched to support an additional config scope
204
+ # above "system" called "xcode". There is no option in `git config list` to show this config,
205
+ # but you can list the merged config (`git config list` without options) and then exclude
206
+ # all settings listed in `git config list --{system,global,local,worktree}`.
207
+ # As of time of writing, there are only two settings in the "xcode" config, one of which breaks
208
+ # our "is repo public?" check, namely "credential.helper=osxkeychain".
209
+ # As there is no way to disable "xcode" config (no env variable, no CLI option, etc.),
210
+ # the only way to disable credential helper is to override this specific setting with an empty
211
+ # string via command line argument: `git -c credential.helper= COMMAND [ARGS ...]`.
212
+ # See: https://github.com/git/git/commit/3d4355712b9fe77a96ad4ad877d92dc7ff6e0874
213
+ # See: https://gist.github.com/ChrisTollefson/ab9c0a5d1dd4dd615217345c6936a307
214
+ _git = git.cmd.Git()(c="credential.helper=")
124
215
  # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
125
- output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
216
+ output: str = _git.ls_remote("--symref", url, "HEAD", env=env)
126
217
  for line in output.splitlines():
127
218
  # line format: `<oid> TAB <ref> LF`
128
219
  oid, _, ref = line.partition("\t")
@@ -578,7 +578,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
578
578
  if placement_group_model is None: # error occurred
579
579
  continue
580
580
  session.add(placement_group_model)
581
- await session.flush()
582
581
  placement_group_models.append(placement_group_model)
583
582
  logger.debug(
584
583
  "Trying %s in %s/%s for $%0.4f per hour",
@@ -636,7 +635,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
636
635
  },
637
636
  )
638
637
  if instance.fleet_id and _is_fleet_master_instance(instance):
639
- # Clean up placement groups that did not end up being used
638
+ # Clean up placement groups that did not end up being used.
639
+ # Flush to update still uncommitted placement groups.
640
+ await session.flush()
640
641
  await schedule_fleet_placement_groups_deletion(
641
642
  session=session,
642
643
  fleet_id=instance.fleet_id,
@@ -1139,7 +1139,7 @@ def _patch_base_image_for_aws_efa(
1139
1139
  efa_enabled_patterns = [
1140
1140
  # TODO: p6-b200 isn't supported yet in gpuhunt
1141
1141
  r"^p6-b200\.(48xlarge)$",
1142
- r"^p5\.(48xlarge)$",
1142
+ r"^p5\.(4xlarge|48xlarge)$",
1143
1143
  r"^p5e\.(48xlarge)$",
1144
1144
  r"^p5en\.(48xlarge)$",
1145
1145
  r"^p4d\.(24xlarge)$",
@@ -3,7 +3,7 @@ import itertools
3
3
  import math
4
4
  import uuid
5
5
  from datetime import datetime, timedelta
6
- from typing import List, Optional, Tuple
6
+ from typing import List, Optional
7
7
 
8
8
  from sqlalchemy import and_, func, not_, or_, select
9
9
  from sqlalchemy.ext.asyncio import AsyncSession
@@ -25,6 +25,7 @@ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
25
25
  from dstack._internal.core.models.profiles import (
26
26
  DEFAULT_RUN_TERMINATION_IDLE_TIME,
27
27
  CreationPolicy,
28
+ Profile,
28
29
  TerminationPolicy,
29
30
  )
30
31
  from dstack._internal.core.models.resources import Memory
@@ -34,6 +35,7 @@ from dstack._internal.core.models.runs import (
34
35
  JobRuntimeData,
35
36
  JobStatus,
36
37
  JobTerminationReason,
38
+ Requirements,
37
39
  Run,
38
40
  RunSpec,
39
41
  )
@@ -186,7 +188,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
186
188
  project = run_model.project
187
189
  run = run_model_to_run(run_model)
188
190
  run_spec = run.run_spec
189
- profile = run_spec.merged_profile
191
+ run_profile = run_spec.merged_profile
190
192
  job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
191
193
  multinode = job.job_spec.jobs_per_replica > 1
192
194
 
@@ -289,7 +291,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
289
291
  instance_filters=instance_filters,
290
292
  )
291
293
  fleet_models = fleet_models_with_instances + fleet_models_without_instances
292
- fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
294
+ fleet_model, fleet_instances_with_offers = await _find_optimal_fleet_with_offers(
295
+ project=project,
293
296
  fleet_models=fleet_models,
294
297
  run_model=run_model,
295
298
  run_spec=run.run_spec,
@@ -332,7 +335,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
332
335
  job_model.status = JobStatus.PROVISIONING
333
336
  else:
334
337
  # Assigned no instance, create a new one
335
- if profile.creation_policy == CreationPolicy.REUSE:
338
+ if run_profile.creation_policy == CreationPolicy.REUSE:
336
339
  logger.debug("%s: reuse instance failed", fmt(job_model))
337
340
  job_model.status = JobStatus.TERMINATING
338
341
  job_model.termination_reason = JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
@@ -361,7 +364,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
361
364
  return
362
365
 
363
366
  logger.info("%s: now is provisioning a new instance", fmt(job_model))
364
- job_provisioning_data, offer = run_job_result
367
+ job_provisioning_data, offer, effective_profile, _ = run_job_result
365
368
  job_model.job_provisioning_data = job_provisioning_data.json()
366
369
  job_model.status = JobStatus.PROVISIONING
367
370
  if fleet_model is None:
@@ -381,12 +384,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
381
384
  instance = _create_instance_model_for_job(
382
385
  project=project,
383
386
  fleet_model=fleet_model,
384
- run_spec=run_spec,
385
387
  job_model=job_model,
386
- job=job,
387
388
  job_provisioning_data=job_provisioning_data,
388
389
  offer=offer,
389
390
  instance_num=instance_num,
391
+ profile=effective_profile,
390
392
  )
391
393
  job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
392
394
  # Both this task and process_fleets can add instances to fleets.
@@ -492,7 +494,8 @@ async def _refetch_fleet_models_with_instances(
492
494
  return fleet_models
493
495
 
494
496
 
495
- def _find_optimal_fleet_with_offers(
497
+ async def _find_optimal_fleet_with_offers(
498
+ project: ProjectModel,
496
499
  fleet_models: list[FleetModel],
497
500
  run_model: RunModel,
498
501
  run_spec: RunSpec,
@@ -502,58 +505,98 @@ def _find_optimal_fleet_with_offers(
502
505
  ) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
503
506
  if run_model.fleet is not None:
504
507
  # Using the fleet that was already chosen by the master job
505
- fleet_instances_with_offers = _get_fleet_instances_with_offers(
508
+ fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
506
509
  fleet_model=run_model.fleet,
507
510
  run_spec=run_spec,
508
511
  job=job,
509
512
  master_job_provisioning_data=master_job_provisioning_data,
510
513
  volumes=volumes,
511
514
  )
512
- return run_model.fleet, fleet_instances_with_offers
515
+ return run_model.fleet, fleet_instances_with_pool_offers
513
516
 
514
517
  if len(fleet_models) == 0:
515
518
  return None, []
516
519
 
517
520
  nodes_required_num = _get_nodes_required_num_for_run(run_spec)
518
- # The current strategy is to first consider fleets that can accommodate
519
- # the run without additional provisioning and choose the one with the cheapest offer.
520
- # Fallback to fleet with the cheapest offer among all fleets with offers.
521
+ # The current strategy is first to consider fleets that can accommodate
522
+ # the run without additional provisioning and choose the one with the cheapest pool offer.
523
+ # Then choose a fleet with the cheapest pool offer among all fleets with pool offers.
524
+ # If there are no fleets with pool offers, choose a fleet with a cheapest backend offer.
525
+ # Fallback to autocreated fleet if fleets have no pool or backend offers.
526
+ # TODO: Consider trying all backend offers and then choosing a fleet.
521
527
  candidate_fleets_with_offers: list[
522
528
  tuple[
523
529
  Optional[FleetModel],
524
530
  list[tuple[InstanceModel, InstanceOfferWithAvailability]],
525
531
  int,
526
- tuple[int, float],
532
+ int,
533
+ tuple[int, float, float],
527
534
  ]
528
535
  ] = []
529
536
  for candidate_fleet_model in fleet_models:
530
- fleet_instances_with_offers = _get_fleet_instances_with_offers(
537
+ fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
531
538
  fleet_model=candidate_fleet_model,
532
539
  run_spec=run_spec,
533
540
  job=job,
534
541
  master_job_provisioning_data=master_job_provisioning_data,
535
542
  volumes=volumes,
536
543
  )
537
- fleet_available_offers = [
538
- o for _, o in fleet_instances_with_offers if o.availability.is_available()
539
- ]
540
- fleet_has_available_capacity = nodes_required_num <= len(fleet_available_offers)
541
- fleet_cheapest_offer = math.inf
542
- if len(fleet_available_offers) > 0:
543
- fleet_cheapest_offer = fleet_available_offers[0].price
544
- fleet_priority = (not fleet_has_available_capacity, fleet_cheapest_offer)
544
+ fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
545
+ fleet_cheapest_pool_offer = math.inf
546
+ if len(fleet_instances_with_pool_offers) > 0:
547
+ fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
548
+
549
+ candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
550
+ profile = None
551
+ requirements = None
552
+ try:
553
+ profile, requirements = _get_run_profile_and_requirements_in_fleet(
554
+ job=job,
555
+ run_spec=run_spec,
556
+ fleet=candidate_fleet,
557
+ )
558
+ except ValueError:
559
+ pass
560
+ fleet_backend_offers = []
561
+ if profile is not None and requirements is not None:
562
+ multinode = (
563
+ candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
564
+ or job.job_spec.jobs_per_replica > 1
565
+ )
566
+ fleet_backend_offers = await get_offers_by_requirements(
567
+ project=project,
568
+ profile=profile,
569
+ requirements=requirements,
570
+ exclude_not_available=True,
571
+ multinode=multinode,
572
+ master_job_provisioning_data=master_job_provisioning_data,
573
+ volumes=volumes,
574
+ privileged=job.job_spec.privileged,
575
+ instance_mounts=check_run_spec_requires_instance_mounts(run_spec),
576
+ )
577
+
578
+ fleet_cheapest_backend_offer = math.inf
579
+ if len(fleet_backend_offers) > 0:
580
+ fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
581
+
582
+ fleet_priority = (
583
+ not fleet_has_available_capacity,
584
+ fleet_cheapest_pool_offer,
585
+ fleet_cheapest_backend_offer,
586
+ )
545
587
  candidate_fleets_with_offers.append(
546
588
  (
547
589
  candidate_fleet_model,
548
- fleet_instances_with_offers,
549
- len(fleet_available_offers),
590
+ fleet_instances_with_pool_offers,
591
+ len(fleet_instances_with_pool_offers),
592
+ len(fleet_backend_offers),
550
593
  fleet_priority,
551
594
  )
552
595
  )
553
596
  if run_spec.merged_profile.fleets is None and all(
554
- t[2] == 0 for t in candidate_fleets_with_offers
597
+ t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
555
598
  ):
556
- # If fleets are not specified and no fleets have available offers, create a new fleet.
599
+ # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
557
600
  # This is for compatibility with non-fleet-first UX when runs created new fleets
558
601
  # if there are no instances to reuse.
559
602
  return None, []
@@ -573,7 +616,7 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
573
616
  return nodes_required_num
574
617
 
575
618
 
576
- def _get_fleet_instances_with_offers(
619
+ def _get_fleet_instances_with_pool_offers(
577
620
  fleet_model: FleetModel,
578
621
  run_spec: RunSpec,
579
622
  job: Job,
@@ -661,7 +704,7 @@ async def _run_job_on_new_instance(
661
704
  master_job_provisioning_data: Optional[JobProvisioningData] = None,
662
705
  volumes: Optional[List[List[Volume]]] = None,
663
706
  fleet_model: Optional[FleetModel] = None,
664
- ) -> Optional[Tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
707
+ ) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
665
708
  if volumes is None:
666
709
  volumes = []
667
710
  profile = run.run_spec.merged_profile
@@ -669,21 +712,14 @@ async def _run_job_on_new_instance(
669
712
  fleet = None
670
713
  if fleet_model is not None:
671
714
  fleet = fleet_model_to_fleet(fleet_model)
672
- if not _check_can_create_new_instance_in_fleet(fleet):
673
- logger.debug(
674
- "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
675
- )
676
- return None
677
- profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
678
- if profile is None:
679
- logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
680
- return None
681
- fleet_requirements = get_fleet_requirements(fleet.spec)
682
- requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
683
- if requirements is None:
684
- logger.debug(
685
- "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
715
+ try:
716
+ profile, requirements = _get_run_profile_and_requirements_in_fleet(
717
+ job=job,
718
+ run_spec=run.run_spec,
719
+ fleet=fleet,
686
720
  )
721
+ except ValueError as e:
722
+ logger.debug("%s: %s", fmt(job_model), e.args[0])
687
723
  return None
688
724
  # TODO: Respect fleet provisioning properties such as tags
689
725
 
@@ -723,7 +759,7 @@ async def _run_job_on_new_instance(
723
759
  project_ssh_private_key,
724
760
  offer_volumes,
725
761
  )
726
- return job_provisioning_data, offer
762
+ return job_provisioning_data, offer, profile, requirements
727
763
  except BackendError as e:
728
764
  logger.warning(
729
765
  "%s: %s launch in %s/%s failed: %s",
@@ -746,6 +782,25 @@ async def _run_job_on_new_instance(
746
782
  return None
747
783
 
748
784
 
785
+ def _get_run_profile_and_requirements_in_fleet(
786
+ job: Job,
787
+ run_spec: RunSpec,
788
+ fleet: Fleet,
789
+ ) -> tuple[Profile, Requirements]:
790
+ if not _check_can_create_new_instance_in_fleet(fleet):
791
+ raise ValueError("Cannot fit new instance into fleet")
792
+ profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
793
+ if profile is None:
794
+ raise ValueError("Cannot combine fleet profile")
795
+ fleet_requirements = get_fleet_requirements(fleet.spec)
796
+ requirements = combine_fleet_and_run_requirements(
797
+ fleet_requirements, job.job_spec.requirements
798
+ )
799
+ if requirements is None:
800
+ raise ValueError("Cannot combine fleet requirements")
801
+ return profile, requirements
802
+
803
+
749
804
  def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
750
805
  if fleet.spec.configuration.ssh_config is not None:
751
806
  return False
@@ -814,14 +869,12 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
814
869
  def _create_instance_model_for_job(
815
870
  project: ProjectModel,
816
871
  fleet_model: FleetModel,
817
- run_spec: RunSpec,
818
872
  job_model: JobModel,
819
- job: Job,
820
873
  job_provisioning_data: JobProvisioningData,
821
874
  offer: InstanceOfferWithAvailability,
822
875
  instance_num: int,
876
+ profile: Profile,
823
877
  ) -> InstanceModel:
824
- profile = run_spec.merged_profile
825
878
  if not job_provisioning_data.dockerized:
826
879
  # terminate vastai/k8s instances immediately
827
880
  termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE
@@ -345,7 +345,7 @@ async def get_instance_offers(
345
345
  Returns list of instances satisfying minimal resource requirements sorted by price
346
346
  """
347
347
  logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends])
348
- tasks = [run_async(backend.compute().get_offers_cached, requirements) for backend in backends]
348
+ tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends]
349
349
  offers_by_backend = []
350
350
  for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
351
351
  if isinstance(result, BackendError):
@@ -13,7 +13,12 @@ from dstack._internal.core.backends.dstack.models import (
13
13
  )
14
14
  from dstack._internal.core.backends.models import BackendInfo
15
15
  from dstack._internal.core.errors import ForbiddenError, ResourceExistsError, ServerClientError
16
- from dstack._internal.core.models.projects import Member, MemberPermissions, Project
16
+ from dstack._internal.core.models.projects import (
17
+ Member,
18
+ MemberPermissions,
19
+ Project,
20
+ ProjectHookConfig,
21
+ )
17
22
  from dstack._internal.core.models.runs import RunStatus
18
23
  from dstack._internal.core.models.users import GlobalRole, ProjectRole
19
24
  from dstack._internal.server.models import (
@@ -120,6 +125,7 @@ async def create_project(
120
125
  user: UserModel,
121
126
  project_name: str,
122
127
  is_public: bool = False,
128
+ config: Optional[ProjectHookConfig] = None,
123
129
  ) -> Project:
124
130
  user_permissions = users.get_user_permissions(user)
125
131
  if not user_permissions.can_create_projects:
@@ -147,7 +153,7 @@ async def create_project(
147
153
  session=session, project_name=project_name
148
154
  )
149
155
  for hook in _CREATE_PROJECT_HOOKS:
150
- await hook(session, project_model)
156
+ await hook(session, project_model, config)
151
157
  # a hook may change project
152
158
  session.expire(project_model)
153
159
  project_model = await get_project_model_by_name_or_error(
@@ -609,7 +615,9 @@ def get_member_permissions(member_model: MemberModel) -> MemberPermissions:
609
615
  _CREATE_PROJECT_HOOKS = []
610
616
 
611
617
 
612
- def register_create_project_hook(func: Callable[[AsyncSession, ProjectModel], Awaitable[None]]):
618
+ def register_create_project_hook(
619
+ func: Callable[[AsyncSession, ProjectModel, Optional[ProjectHookConfig]], Awaitable[None]],
620
+ ):
613
621
  _CREATE_PROJECT_HOOKS.append(func)
614
622
 
615
623