dstack 0.19.28__py3-none-any.whl → 0.19.30rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (43)
  1. dstack/_internal/core/backends/aws/compute.py +51 -36
  2. dstack/_internal/core/backends/azure/compute.py +10 -7
  3. dstack/_internal/core/backends/base/compute.py +96 -14
  4. dstack/_internal/core/backends/base/offers.py +34 -4
  5. dstack/_internal/core/backends/cloudrift/compute.py +5 -7
  6. dstack/_internal/core/backends/cudo/compute.py +4 -2
  7. dstack/_internal/core/backends/datacrunch/compute.py +13 -11
  8. dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
  9. dstack/_internal/core/backends/gcp/compute.py +12 -7
  10. dstack/_internal/core/backends/hotaisle/compute.py +4 -7
  11. dstack/_internal/core/backends/kubernetes/compute.py +6 -4
  12. dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
  13. dstack/_internal/core/backends/local/compute.py +1 -3
  14. dstack/_internal/core/backends/nebius/compute.py +10 -7
  15. dstack/_internal/core/backends/oci/compute.py +10 -7
  16. dstack/_internal/core/backends/runpod/compute.py +15 -6
  17. dstack/_internal/core/backends/template/compute.py.jinja +3 -1
  18. dstack/_internal/core/backends/tensordock/compute.py +1 -3
  19. dstack/_internal/core/backends/tensordock/models.py +2 -0
  20. dstack/_internal/core/backends/vastai/compute.py +7 -3
  21. dstack/_internal/core/backends/vultr/compute.py +5 -5
  22. dstack/_internal/core/models/projects.py +8 -0
  23. dstack/_internal/core/services/repos.py +2 -1
  24. dstack/_internal/server/background/tasks/process_instances.py +3 -2
  25. dstack/_internal/server/background/tasks/process_submitted_jobs.py +65 -22
  26. dstack/_internal/server/services/backends/__init__.py +1 -1
  27. dstack/_internal/server/services/projects.py +11 -3
  28. dstack/_internal/server/services/runs.py +2 -0
  29. dstack/_internal/server/statics/index.html +1 -1
  30. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
  31. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
  32. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
  33. dstack/version.py +1 -1
  34. {dstack-0.19.28.dist-info → dstack-0.19.30rc1.dist-info}/METADATA +1 -1
  35. {dstack-0.19.28.dist-info → dstack-0.19.30rc1.dist-info}/RECORD +38 -42
  36. dstack/_internal/core/backends/tensordock/__init__.py +0 -0
  37. dstack/_internal/core/backends/tensordock/api_client.py +0 -104
  38. dstack/_internal/core/backends/tensordock/backend.py +0 -16
  39. dstack/_internal/core/backends/tensordock/configurator.py +0 -74
  40. dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
  41. {dstack-0.19.28.dist-info → dstack-0.19.30rc1.dist-info}/WHEEL +0 -0
  42. {dstack-0.19.28.dist-info → dstack-0.19.30rc1.dist-info}/entry_points.txt +0 -0
  43. {dstack-0.19.28.dist-info → dstack-0.19.30rc1.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/hotaisle/compute.py
@@ -9,6 +9,7 @@ from gpuhunt.providers.hotaisle import HotAisleProvider
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     get_shim_commands,
 )
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -44,6 +45,7 @@ INSTANCE_TYPE_SPECS = {
 
 
 class HotAisleCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     Compute,
 ):
@@ -56,16 +58,12 @@ class HotAisleCompute(
             HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
         )
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.HOTAISLE,
             locations=self.config.regions or None,
-            requirements=requirements,
             catalog=self.catalog,
         )
-
         supported_offers = []
         for offer in offers:
             if offer.instance.name in INSTANCE_TYPE_SPECS:
@@ -78,7 +76,6 @@ class HotAisleCompute(
                 logger.warning(
                     f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
                 )
-
         return supported_offers
 
     def get_payload_from_offer(self, instance_type) -> dict:
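
Several backends in this release stop implementing `get_offers()` directly and instead implement `get_all_offers_with_availability()` while mixing in the new `ComputeWithAllOffersCached` base. That base class is not part of this diff; the sketch below is only an assumption of the contract it provides (a cached, requirements-independent offer list that is filtered and optionally modified per request), not dstack's actual implementation.

    # Hypothetical sketch of the ComputeWithAllOffersCached contract (not shown in this diff).
    from typing import Callable, List, Optional

    from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
    from dstack._internal.core.models.instances import InstanceOfferWithAvailability
    from dstack._internal.core.models.runs import Requirements


    class ComputeWithAllOffersCached:
        _all_offers_cache: Optional[List[InstanceOfferWithAvailability]] = None

        def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
            # Implemented by backends such as HotAisleCompute; returns every offer
            # regardless of requirements so the result can be cached and reused.
            raise NotImplementedError

        def get_offers_modifier(
            self, requirements: Requirements
        ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
            # Default: keep offers unchanged; backends may override (see Nebius/OCI/RunPod below).
            return lambda offer: offer

        def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
            # The real base class presumably also handles cache TTL and invalidation.
            if self._all_offers_cache is None:
                self._all_offers_cache = self.get_all_offers_with_availability()
            modifier = self.get_offers_modifier(requirements)
            modified = [m for o in self._all_offers_cache if (m := modifier(o)) is not None]
            return filter_offers_by_requirements(modified, requirements)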

dstack/_internal/core/backends/kubernetes/compute.py
@@ -9,13 +9,14 @@ from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithFilteredOffersCached,
     ComputeWithGatewaySupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name_for_job,
     get_docker_commands,
     get_dstack_gateway_commands,
 )
-from dstack._internal.core.backends.base.offers import match_requirements
+from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
 from dstack._internal.core.backends.kubernetes.models import (
     KubernetesConfig,
     KubernetesNetworkingConfig,
@@ -58,6 +59,7 @@ NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
 
 class KubernetesCompute(
+    ComputeWithFilteredOffersCached,
    ComputeWithGatewaySupport,
    Compute,
 ):
@@ -70,8 +72,8 @@ class KubernetesCompute(
         self.networking_config = networking_config
         self.api = get_api_from_config_data(config.kubeconfig.data)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
+    def get_offers_by_requirements(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         nodes = self.api.list_node()
         instance_offers = []
@@ -99,7 +101,7 @@ class KubernetesCompute(
                 availability=InstanceAvailability.AVAILABLE,
                 instance_runtime=InstanceRuntime.RUNNER,
             )
-            instance_offers.extend(match_requirements([instance_offer], requirements))
+            instance_offers.extend(filter_offers_by_requirements([instance_offer], requirements))
         return instance_offers
 
     def run_job(
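
Kubernetes (and Vast.ai below) instead mix in `ComputeWithFilteredOffersCached` and rename `get_offers()` to `get_offers_by_requirements()`. Again, the base class is not part of this diff; a minimal sketch of the assumed behavior, with caching keyed by the requirements, might look like the following. The cache keying and lifetime shown here are assumptions.

    # Hypothetical sketch of ComputeWithFilteredOffersCached (not shown in this diff).
    from typing import Dict, List, Optional

    from dstack._internal.core.models.instances import InstanceOfferWithAvailability
    from dstack._internal.core.models.runs import Requirements


    class ComputeWithFilteredOffersCached:
        _offers_cache: Optional[Dict[str, List[InstanceOfferWithAvailability]]] = None

        def get_offers_by_requirements(
            self, requirements: Requirements
        ) -> List[InstanceOfferWithAvailability]:
            # Implemented by backends whose offer listing depends on the requirements
            # (e.g. KubernetesCompute inspects cluster nodes).
            raise NotImplementedError

        def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
            if self._offers_cache is None:
                self._offers_cache = {}
            key = requirements.json()  # Requirements is a pydantic model, so this gives a stable key
            if key not in self._offers_cache:
                self._offers_cache[key] = self.get_offers_by_requirements(requirements)
            return self._offers_cache[key]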

dstack/_internal/core/backends/lambdalabs/compute.py
@@ -7,6 +7,7 @@ from typing import Dict, List, Optional
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     generate_unique_instance_name,
     get_shim_commands,
@@ -22,12 +23,13 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 
 MAX_INSTANCE_NAME_LEN = 60
 
 
 class LambdaCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     Compute,
 ):
@@ -36,13 +38,10 @@ class LambdaCompute(
         self.config = config
         self.api_client = LambdaAPIClient(config.creds.api_key)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.LAMBDA,
             locations=self.config.regions or None,
-            requirements=requirements,
         )
         offers_with_availability = self._get_offers_with_availability(offers)
         return offers_with_availability

dstack/_internal/core/backends/local/compute.py
@@ -28,9 +28,7 @@ class LocalCompute(
     ComputeWithVolumeSupport,
     Compute,
 ):
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         return [
             InstanceOfferWithAvailability(
                 backend=BackendType.LOCAL,

dstack/_internal/core/backends/nebius/compute.py
@@ -3,7 +3,7 @@ import random
 import shlex
 import time
 from functools import cached_property
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -12,13 +12,14 @@ from nebius.sdk import SDK
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -76,6 +77,7 @@ SUPPORTED_PLATFORMS = [
 
 
 class NebiusCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
@@ -106,15 +108,11 @@ class NebiusCompute(
             ).metadata.id
         return self._subnet_id_cache[region]
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.NEBIUS,
             locations=list(self._region_to_project_id),
-            requirements=requirements,
             extra_filter=_supported_instances,
-            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
         )
         return [
             InstanceOfferWithAvailability(
@@ -124,6 +122,11 @@ class NebiusCompute(
             for offer in offers
         ]
 
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def create_instance(
         self,
         instance_offer: InstanceOfferWithAvailability,
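
Because cached offers are no longer produced per set of requirements, the `configurable_disk_size=...` argument to `get_catalog_offers()` is dropped and the disk-size adjustment moves into a per-request `get_offers_modifier()` hook backed by the new `get_offers_disk_modifier()` helper. The helper's implementation is not in this diff; conceptually it clamps each offer's disk to the intersection of the backend's supported range and the run's requested range, as in this simplified, plain-float sketch:

    # Conceptual illustration only: the real helper works on Range[Memory] and offer objects
    # via dstack._internal.core.backends.base.offers.get_offers_disk_modifier.
    from typing import Optional, Tuple


    def clamp_disk_size(
        supported: Tuple[float, Optional[float]],            # backend-configurable (min, max) in GB
        requested: Tuple[Optional[float], Optional[float]],  # run's requested (min, max) in GB
    ) -> Optional[float]:
        """Smallest disk size satisfying both ranges, or None if the offer cannot satisfy them."""
        lo, hi = supported
        req_min, req_max = requested
        if req_min is not None:
            lo = max(lo, req_min)
        if req_max is not None:
            hi = req_max if hi is None else min(hi, req_max)
        if hi is not None and lo > hi:
            return None
        return lo


    # Example: a backend supporting 50GB..32768GB and a run requesting at least 100GB.
    assert clamp_disk_size((50.0, 32768.0), (100.0, None)) == 100.0
    # A request capped below the backend minimum yields no usable offer.
    assert clamp_disk_size((50.0, 32768.0), (None, 20.0)) is None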

dstack/_internal/core/backends/oci/compute.py
@@ -1,17 +1,18 @@
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 import oci
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -47,6 +48,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
 
 
 class OCICompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     Compute,
@@ -60,14 +62,10 @@ class OCICompute(
     def shapes_quota(self) -> resources.ShapesQuota:
         return resources.ShapesQuota.load(self.regions, self.config.compartment_id)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.OCI,
             locations=self.config.regions,
-            requirements=requirements,
-            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
             extra_filter=_supported_instances,
         )
 
@@ -96,6 +94,11 @@ class OCICompute(
 
         return offers_with_availability
 
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
     ) -> None:

dstack/_internal/core/backends/runpod/compute.py
@@ -1,17 +1,18 @@
 import json
 import uuid
 from datetime import timedelta
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithVolumeSupport,
     generate_unique_instance_name,
     generate_unique_volume_name,
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -27,6 +28,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     SSHKey,
 )
+from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
 from dstack._internal.utils.common import get_current_datetime
@@ -39,8 +41,12 @@ MAX_RESOURCE_NAME_LEN = 60
 
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
+# RunPod does not seem to have any limits on the disk size.
+CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
+
 
 class RunpodCompute(
+    ComputeWithAllOffersCached,
     ComputeWithVolumeSupport,
     Compute,
 ):
@@ -51,13 +57,11 @@ class RunpodCompute(
         self.config = config
         self.api_client = RunpodApiClient(config.creds.api_key)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.RUNPOD,
             locations=self.config.regions or None,
-            requirements=requirements,
+            requirements=None,
             extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
         )
         offers = [
@@ -68,6 +72,11 @@ class RunpodCompute(
         ]
         return offers
 
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def run_job(
         self,
         run: Run,

dstack/_internal/core/backends/template/compute.py.jinja
@@ -2,6 +2,7 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithGatewaySupport,
     ComputeWithMultinodeSupport,
@@ -28,6 +29,7 @@ logger = get_logger(__name__)
 
 class {{ backend_name }}Compute(
     # TODO: Choose ComputeWith* classes to extend and implement
+    # ComputeWithAllOffersCached,
     # ComputeWithCreateInstanceSupport,
     # ComputeWithMultinodeSupport,
     # ComputeWithReservationSupport,
@@ -42,7 +44,7 @@ class {{ backend_name }}Compute(
         self.config = config
 
     def get_offers(
-        self, requirements: Optional[Requirements] = None
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         # If the provider is added to gpuhunt, you'd typically get offers
         # using `get_catalog_offers()` and extend them with availability info.

dstack/_internal/core/backends/tensordock/compute.py
@@ -39,9 +39,7 @@ class TensorDockCompute(
         self.config = config
         self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.TENSORDOCK,
             requirements=requirements,

dstack/_internal/core/backends/tensordock/models.py
@@ -4,6 +4,8 @@ from pydantic import Field
 
 from dstack._internal.core.models.common import CoreModel
 
+# TODO: TensorDock is deprecated and will be removed in the future
+
 
 class TensorDockAPIKeyCreds(CoreModel):
     type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"

dstack/_internal/core/backends/vastai/compute.py
@@ -5,6 +5,7 @@ from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithFilteredOffersCached,
     generate_unique_instance_name_for_job,
     get_docker_commands,
 )
@@ -30,7 +31,10 @@ logger = get_logger(__name__)
 MAX_INSTANCE_NAME_LEN = 60
 
 
-class VastAICompute(Compute):
+class VastAICompute(
+    ComputeWithFilteredOffersCached,
+    Compute,
+):
     def __init__(self, config: VastAIConfig):
         super().__init__()
         self.config = config
@@ -49,8 +53,8 @@ class VastAICompute(Compute):
             )
         )
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
+    def get_offers_by_requirements(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,

dstack/_internal/core/backends/vultr/compute.py
@@ -6,6 +6,7 @@ import requests
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     generate_unique_instance_name,
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 64
 
 
 class VultrCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     Compute,
@@ -41,12 +43,10 @@ class VultrCompute(
         self.config = config
         self.api_client = VultrApiClient(config.creds.api_key)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VULTR,
-            requirements=requirements,
+            requirements=None,
             locations=self.config.regions or None,
             extra_filter=_supported_instances,
         )

dstack/_internal/core/models/projects.py
@@ -26,3 +26,11 @@ class Project(CoreModel):
     backends: List[BackendInfo]
     members: List[Member]
     is_public: bool = False
+
+
+class ProjectHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the project creation configuration passed to the hooks.
+    """
+
+    pass
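
The new `ProjectHookConfig` model gives server plugins a base class for extending the project creation configuration passed to hooks (the related server-side changes are in `dstack/_internal/server/services/projects.py`, listed above but not shown here). A hypothetical subclass, with made-up fields, could look like:

    # Hypothetical plugin-side extension; the field names below are illustrative only,
    # not part of dstack's API.
    from dstack._internal.core.models.projects import ProjectHookConfig


    class MyProjectHookConfig(ProjectHookConfig):
        cost_center: str = "default"
        enable_audit_log: bool = False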

dstack/_internal/core/services/repos.py
@@ -122,7 +122,8 @@ def _get_repo_creds_and_default_branch_https(
 
 def _get_repo_default_branch(url: str, env: dict[str, str]) -> Optional[str]:
     # output example: "ref: refs/heads/dev\tHEAD\n545344f77c0df78367085952a97fc3a058eb4c65\tHEAD"
-    output: str = git.cmd.Git().ls_remote("--symref", url, "HEAD", env=env)
+    # Disable credential helpers to exclude any default credentials from being used
+    output: str = git.cmd.Git()(c="credential.helper=").ls_remote("--symref", url, "HEAD", env=env)
     for line in output.splitlines():
         # line format: `<oid> TAB <ref> LF`
         oid, _, ref = line.partition("\t")
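
The `c="credential.helper="` call makes GitPython pass `-c credential.helper=` on the git command line; an empty value resets git's credential-helper list, so no locally configured credentials are attached to the `ls-remote`. A standalone sketch of the same technique, derived from the hunk above with the branch parsing made explicit:

    # Standalone sketch using GitPython; mirrors the change above (env handling omitted).
    from typing import Optional

    import git


    def get_default_branch(url: str) -> Optional[str]:
        # -c credential.helper= (empty value) resets the credential helper list,
        # so no stored credentials are used for this anonymous ls-remote.
        output: str = git.cmd.Git()(c="credential.helper=").ls_remote("--symref", url, "HEAD")
        for line in output.splitlines():
            # line format: `<oid> TAB <ref> LF`, e.g. "ref: refs/heads/dev\tHEAD"
            oid, _, ref = line.partition("\t")
            if ref == "HEAD" and oid.startswith("ref: refs/heads/"):
                return oid[len("ref: refs/heads/"):]
        return None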

dstack/_internal/server/background/tasks/process_instances.py
@@ -578,7 +578,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         if placement_group_model is None:  # error occurred
             continue
         session.add(placement_group_model)
-        await session.flush()
         placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
@@ -636,7 +635,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
            },
        )
    if instance.fleet_id and _is_fleet_master_instance(instance):
-        # Clean up placement groups that did not end up being used
+        # Clean up placement groups that did not end up being used.
+        # Flush to update still uncommitted placement groups.
+        await session.flush()
        await schedule_fleet_placement_groups_deletion(
            session=session,
            fleet_id=instance.fleet_id,

dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -289,7 +289,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance_filters=instance_filters,
     )
     fleet_models = fleet_models_with_instances + fleet_models_without_instances
-    fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
+    fleet_model, fleet_instances_with_offers = await _find_optimal_fleet_with_offers(
+        project=project,
         fleet_models=fleet_models,
         run_model=run_model,
         run_spec=run.run_spec,
@@ -492,7 +493,8 @@ async def _refetch_fleet_models_with_instances(
     return fleet_models
 
 
-def _find_optimal_fleet_with_offers(
+async def _find_optimal_fleet_with_offers(
+    project: ProjectModel,
     fleet_models: list[FleetModel],
     run_model: RunModel,
     run_spec: RunSpec,
@@ -502,58 +504,99 @@ def _find_optimal_fleet_with_offers(
 ) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
     if run_model.fleet is not None:
         # Using the fleet that was already chosen by the master job
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=run_model.fleet,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        return run_model.fleet, fleet_instances_with_offers
+        return run_model.fleet, fleet_instances_with_pool_offers
 
     if len(fleet_models) == 0:
         return None, []
 
     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
-    # The current strategy is to first consider fleets that can accommodate
-    # the run without additional provisioning and choose the one with the cheapest offer.
-    # Fallback to fleet with the cheapest offer among all fleets with offers.
+    # The current strategy is first to consider fleets that can accommodate
+    # the run without additional provisioning and choose the one with the cheapest pool offer.
+    # Then choose a fleet with the cheapest pool offer among all fleets with pool offers.
+    # If there are no fleets with pool offers, choose a fleet with a cheapest backend offer.
+    # Fallback to autocreated fleet if fleets have no pool or backend offers.
+    # TODO: Consider trying all backend offers and then choosing a fleet.
     candidate_fleets_with_offers: list[
         tuple[
             Optional[FleetModel],
             list[tuple[InstanceModel, InstanceOfferWithAvailability]],
             int,
-            tuple[int, float],
+            int,
+            tuple[int, float, float],
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        fleet_available_offers = [
-            o for _, o in fleet_instances_with_offers if o.availability.is_available()
-        ]
-        fleet_has_available_capacity = nodes_required_num <= len(fleet_available_offers)
-        fleet_cheapest_offer = math.inf
-        if len(fleet_available_offers) > 0:
-            fleet_cheapest_offer = fleet_available_offers[0].price
-        fleet_priority = (not fleet_has_available_capacity, fleet_cheapest_offer)
+        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+        fleet_cheapest_pool_offer = math.inf
+        if len(fleet_instances_with_pool_offers) > 0:
+            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
+
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
+        profile = combine_fleet_and_run_profiles(
+            candidate_fleet.spec.merged_profile, run_spec.merged_profile
+        )
+        fleet_requirements = get_fleet_requirements(candidate_fleet.spec)
+        requirements = combine_fleet_and_run_requirements(
+            fleet_requirements, job.job_spec.requirements
+        )
+        multinode = (
+            candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
+            or job.job_spec.jobs_per_replica > 1
+        )
+        fleet_backend_offers = []
+        if (
+            _check_can_create_new_instance_in_fleet(candidate_fleet)
+            and profile is not None
+            and requirements is not None
+        ):
+            fleet_backend_offers = await get_offers_by_requirements(
+                project=project,
+                profile=profile,
+                requirements=requirements,
+                exclude_not_available=True,
+                multinode=multinode,
+                master_job_provisioning_data=master_job_provisioning_data,
+                volumes=volumes,
+                privileged=job.job_spec.privileged,
+                instance_mounts=check_run_spec_requires_instance_mounts(run_spec),
+            )
+
+        fleet_cheapest_backend_offer = math.inf
+        if len(fleet_backend_offers) > 0:
+            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
+
+        fleet_priority = (
+            not fleet_has_available_capacity,
+            fleet_cheapest_pool_offer,
+            fleet_cheapest_backend_offer,
+        )
         candidate_fleets_with_offers.append(
             (
                 candidate_fleet_model,
-                fleet_instances_with_offers,
-                len(fleet_available_offers),
+                fleet_instances_with_pool_offers,
+                len(fleet_instances_with_pool_offers),
+                len(fleet_backend_offers),
                 fleet_priority,
             )
         )
     if run_spec.merged_profile.fleets is None and all(
-        t[2] == 0 for t in candidate_fleets_with_offers
+        t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available offers, create a new fleet.
+        # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
@@ -573,7 +616,7 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num
 
 
-def _get_fleet_instances_with_offers(
+def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
     job: Job,
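
The fleet selection priority grows from `(not has_capacity, cheapest_pool_offer)` to a three-element tuple that also considers the cheapest backend offer a fleet could provision. Assuming the surrounding (unchanged) code still picks the candidate with the smallest priority tuple, Python's element-wise tuple comparison gives pool capacity precedence over price, as in this toy example with made-up prices:

    # Toy illustration of the new fleet_priority ordering; values are invented for the example.
    import math

    candidates = [
        # (fleet name, (not has_pool_capacity, cheapest_pool_offer, cheapest_backend_offer))
        ("fleet-a", (True, math.inf, 2.50)),   # no idle instances, can provision at $2.50/h
        ("fleet-b", (False, 1.80, 1.20)),      # has idle instances at $1.80/h
        ("fleet-c", (True, math.inf, 0.90)),   # no idle instances, can provision at $0.90/h
    ]
    best = min(candidates, key=lambda c: c[1])
    assert best[0] == "fleet-b"  # capacity wins first; between a and c, c would win on backend price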

dstack/_internal/server/services/backends/__init__.py
@@ -345,7 +345,7 @@ async def get_instance_offers(
     Returns list of instances satisfying minimal resource requirements sorted by price
     """
     logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends])
-    tasks = [run_async(backend.compute().get_offers_cached, requirements) for backend in backends]
+    tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends]
     offers_by_backend = []
     for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
         if isinstance(result, BackendError):