dstack 0.19.27__py3-none-any.whl → 0.19.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +11 -8
- dstack/_internal/cli/commands/apply.py +6 -3
- dstack/_internal/cli/commands/completion.py +3 -1
- dstack/_internal/cli/commands/config.py +1 -0
- dstack/_internal/cli/commands/init.py +2 -2
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/commands/project.py +1 -0
- dstack/_internal/cli/commands/server.py +2 -2
- dstack/_internal/cli/main.py +1 -1
- dstack/_internal/cli/services/configurators/base.py +2 -4
- dstack/_internal/cli/services/configurators/fleet.py +4 -5
- dstack/_internal/cli/services/configurators/gateway.py +3 -5
- dstack/_internal/cli/services/configurators/run.py +51 -27
- dstack/_internal/cli/services/configurators/volume.py +3 -5
- dstack/_internal/core/backends/aws/compute.py +51 -36
- dstack/_internal/core/backends/azure/compute.py +10 -7
- dstack/_internal/core/backends/base/compute.py +96 -14
- dstack/_internal/core/backends/base/offers.py +34 -4
- dstack/_internal/core/backends/cloudrift/compute.py +5 -7
- dstack/_internal/core/backends/cudo/compute.py +4 -2
- dstack/_internal/core/backends/datacrunch/compute.py +13 -11
- dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
- dstack/_internal/core/backends/gcp/compute.py +12 -7
- dstack/_internal/core/backends/hotaisle/compute.py +4 -7
- dstack/_internal/core/backends/kubernetes/compute.py +6 -4
- dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
- dstack/_internal/core/backends/local/compute.py +1 -3
- dstack/_internal/core/backends/nebius/compute.py +10 -7
- dstack/_internal/core/backends/oci/compute.py +10 -7
- dstack/_internal/core/backends/runpod/compute.py +15 -6
- dstack/_internal/core/backends/template/compute.py.jinja +3 -1
- dstack/_internal/core/backends/tensordock/compute.py +1 -3
- dstack/_internal/core/backends/tensordock/models.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +7 -3
- dstack/_internal/core/backends/vultr/compute.py +5 -5
- dstack/_internal/core/compatibility/runs.py +2 -0
- dstack/_internal/core/models/common.py +67 -43
- dstack/_internal/core/models/configurations.py +88 -62
- dstack/_internal/core/models/fleets.py +41 -24
- dstack/_internal/core/models/instances.py +5 -5
- dstack/_internal/core/models/profiles.py +66 -47
- dstack/_internal/core/models/projects.py +8 -0
- dstack/_internal/core/models/repos/remote.py +21 -16
- dstack/_internal/core/models/resources.py +69 -65
- dstack/_internal/core/models/runs.py +17 -9
- dstack/_internal/server/app.py +5 -0
- dstack/_internal/server/background/tasks/process_fleets.py +8 -0
- dstack/_internal/server/background/tasks/process_instances.py +3 -2
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +97 -34
- dstack/_internal/server/models.py +6 -5
- dstack/_internal/server/schemas/gateways.py +10 -9
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/backends/handlers.py +2 -0
- dstack/_internal/server/services/docker.py +8 -7
- dstack/_internal/server/services/projects.py +63 -4
- dstack/_internal/server/services/runs.py +2 -0
- dstack/_internal/server/settings.py +46 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
- dstack/_internal/server/statics/{main-4eecc75fbe64067eb1bc.js → main-c51afa7f243e24d3e446.js} +61115 -49101
- dstack/_internal/server/statics/{main-4eecc75fbe64067eb1bc.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
- dstack/_internal/utils/env.py +85 -11
- dstack/version.py +1 -1
- {dstack-0.19.27.dist-info → dstack-0.19.29.dist-info}/METADATA +1 -1
- {dstack-0.19.27.dist-info → dstack-0.19.29.dist-info}/RECORD +68 -73
- dstack/_internal/core/backends/tensordock/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/api_client.py +0 -104
- dstack/_internal/core/backends/tensordock/backend.py +0 -16
- dstack/_internal/core/backends/tensordock/configurator.py +0 -74
- dstack/_internal/server/statics/main-56191c63d516fd0041c4.css +0 -3
- dstack/_internal/server/statics/static/media/github.1f7102513534c83a9d8d735d2b8c12a2.svg +0 -3
- {dstack-0.19.27.dist-info → dstack-0.19.29.dist-info}/WHEEL +0 -0
- {dstack-0.19.27.dist-info → dstack-0.19.29.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.27.dist-info → dstack-0.19.29.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -7,7 +7,7 @@ from abc import ABC, abstractmethod
|
|
|
7
7
|
from collections.abc import Iterable
|
|
8
8
|
from functools import lru_cache
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Dict, List, Literal, Optional
|
|
10
|
+
from typing import Callable, Dict, List, Literal, Optional
|
|
11
11
|
|
|
12
12
|
import git
|
|
13
13
|
import requests
|
|
@@ -15,6 +15,7 @@ import yaml
|
|
|
15
15
|
from cachetools import TTLCache, cachedmethod
|
|
16
16
|
|
|
17
17
|
from dstack._internal import settings
|
|
18
|
+
from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
|
|
18
19
|
from dstack._internal.core.consts import (
|
|
19
20
|
DSTACK_RUNNER_HTTP_PORT,
|
|
20
21
|
DSTACK_RUNNER_SSH_PORT,
|
|
@@ -57,14 +58,8 @@ class Compute(ABC):
|
|
|
57
58
|
If a compute supports additional features, it must also subclass `ComputeWith*` classes.
|
|
58
59
|
"""
|
|
59
60
|
|
|
60
|
-
def __init__(self):
|
|
61
|
-
self._offers_cache_lock = threading.Lock()
|
|
62
|
-
self._offers_cache = TTLCache(maxsize=10, ttl=180)
|
|
63
|
-
|
|
64
61
|
@abstractmethod
|
|
65
|
-
def get_offers(
|
|
66
|
-
self, requirements: Optional[Requirements] = None
|
|
67
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
62
|
+
def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
|
|
68
63
|
"""
|
|
69
64
|
Returns offers with availability matching `requirements`.
|
|
70
65
|
If the provider is added to gpuhunt, typically gets offers using `base.offers.get_catalog_offers()`
|
|
@@ -121,10 +116,97 @@ class Compute(ABC):
|
|
|
121
116
|
"""
|
|
122
117
|
pass
|
|
123
118
|
|
|
124
|
-
|
|
119
|
+
|
|
120
|
+
class ComputeWithAllOffersCached(ABC):
|
|
121
|
+
"""
|
|
122
|
+
Provides common `get_offers()` implementation for backends
|
|
123
|
+
whose offers do not depend on requirements.
|
|
124
|
+
It caches all offers with availability and post-filters by requirements.
|
|
125
|
+
"""
|
|
126
|
+
|
|
127
|
+
def __init__(self) -> None:
|
|
128
|
+
super().__init__()
|
|
129
|
+
self._offers_cache_lock = threading.Lock()
|
|
130
|
+
self._offers_cache = TTLCache(maxsize=1, ttl=180)
|
|
131
|
+
|
|
132
|
+
@abstractmethod
|
|
133
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
134
|
+
"""
|
|
135
|
+
Returns all backend offers with availability.
|
|
136
|
+
"""
|
|
137
|
+
pass
|
|
138
|
+
|
|
139
|
+
def get_offers_modifier(
|
|
140
|
+
self, requirements: Requirements
|
|
141
|
+
) -> Optional[
|
|
142
|
+
Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
|
|
143
|
+
]:
|
|
144
|
+
"""
|
|
145
|
+
Returns a modifier function that modifies offers before they are filtered by requirements.
|
|
146
|
+
Can return `None` to exclude the offer.
|
|
147
|
+
E.g. can be used to set appropriate disk size based on requirements.
|
|
148
|
+
"""
|
|
149
|
+
return None
|
|
150
|
+
|
|
151
|
+
def get_offers_post_filter(
|
|
152
|
+
self, requirements: Requirements
|
|
153
|
+
) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
|
|
154
|
+
"""
|
|
155
|
+
Returns a filter function to apply to offers based on requirements.
|
|
156
|
+
This allows backends to implement custom post-filtering logic for specific requirements.
|
|
157
|
+
"""
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
|
|
161
|
+
offers = self._get_all_offers_with_availability_cached()
|
|
162
|
+
modifier = self.get_offers_modifier(requirements)
|
|
163
|
+
if modifier is not None:
|
|
164
|
+
modified_offers = []
|
|
165
|
+
for o in offers:
|
|
166
|
+
modified_offer = modifier(o)
|
|
167
|
+
if modified_offer is not None:
|
|
168
|
+
modified_offers.append(modified_offer)
|
|
169
|
+
offers = modified_offers
|
|
170
|
+
offers = filter_offers_by_requirements(offers, requirements)
|
|
171
|
+
post_filter = self.get_offers_post_filter(requirements)
|
|
172
|
+
if post_filter is not None:
|
|
173
|
+
offers = [o for o in offers if post_filter(o)]
|
|
174
|
+
return offers
|
|
175
|
+
|
|
176
|
+
@cachedmethod(
|
|
177
|
+
cache=lambda self: self._offers_cache,
|
|
178
|
+
lock=lambda self: self._offers_cache_lock,
|
|
179
|
+
)
|
|
180
|
+
def _get_all_offers_with_availability_cached(self) -> List[InstanceOfferWithAvailability]:
|
|
181
|
+
return self.get_all_offers_with_availability()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class ComputeWithFilteredOffersCached(ABC):
|
|
185
|
+
"""
|
|
186
|
+
Provides common `get_offers()` implementation for backends
|
|
187
|
+
whose offers depend on requirements.
|
|
188
|
+
It caches offers using requirements as key.
|
|
189
|
+
"""
|
|
190
|
+
|
|
191
|
+
def __init__(self) -> None:
|
|
192
|
+
super().__init__()
|
|
193
|
+
self._offers_cache_lock = threading.Lock()
|
|
194
|
+
self._offers_cache = TTLCache(maxsize=10, ttl=180)
|
|
195
|
+
|
|
196
|
+
@abstractmethod
|
|
197
|
+
def get_offers_by_requirements(
|
|
198
|
+
self, requirements: Requirements
|
|
199
|
+
) -> List[InstanceOfferWithAvailability]:
|
|
200
|
+
"""
|
|
201
|
+
Returns backend offers with availability matching requirements.
|
|
202
|
+
"""
|
|
203
|
+
pass
|
|
204
|
+
|
|
205
|
+
def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
|
|
206
|
+
return self._get_offers_cached(requirements)
|
|
207
|
+
|
|
208
|
+
def _get_offers_cached_key(self, requirements: Requirements) -> int:
|
|
125
209
|
# Requirements is not hashable, so we use a hack to get arguments hash
|
|
126
|
-
if requirements is None:
|
|
127
|
-
return hash(None)
|
|
128
210
|
return hash(requirements.json())
|
|
129
211
|
|
|
130
212
|
@cachedmethod(
|
|
@@ -132,10 +214,10 @@ class Compute(ABC):
|
|
|
132
214
|
key=_get_offers_cached_key,
|
|
133
215
|
lock=lambda self: self._offers_cache_lock,
|
|
134
216
|
)
|
|
135
|
-
def get_offers_cached(
|
|
136
|
-
self, requirements: Optional[Requirements] = None
|
|
217
|
+
def _get_offers_cached(
|
|
218
|
+
self, requirements: Requirements
|
|
137
219
|
) -> List[InstanceOfferWithAvailability]:
|
|
138
|
-
return self.get_offers(requirements)
|
|
220
|
+
return self.get_offers_by_requirements(requirements)
|
|
139
221
|
|
|
140
222
|
|
|
141
223
|
class ComputeWithCreateInstanceSupport(ABC):
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from dataclasses import asdict
|
|
2
|
-
from typing import Callable, List, Optional
|
|
2
|
+
from typing import Callable, List, Optional, TypeVar
|
|
3
3
|
|
|
4
4
|
import gpuhunt
|
|
5
5
|
from pydantic import parse_obj_as
|
|
@@ -9,11 +9,13 @@ from dstack._internal.core.models.instances import (
|
|
|
9
9
|
Disk,
|
|
10
10
|
Gpu,
|
|
11
11
|
InstanceOffer,
|
|
12
|
+
InstanceOfferWithAvailability,
|
|
12
13
|
InstanceType,
|
|
13
14
|
Resources,
|
|
14
15
|
)
|
|
15
16
|
from dstack._internal.core.models.resources import DEFAULT_DISK, CPUSpec, Memory, Range
|
|
16
17
|
from dstack._internal.core.models.runs import Requirements
|
|
18
|
+
from dstack._internal.utils.common import get_or_error
|
|
17
19
|
|
|
18
20
|
# Offers not supported by all dstack versions are hidden behind one or more flags.
|
|
19
21
|
# This list enables the flags that are currently supported.
|
|
@@ -163,9 +165,13 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFi
|
|
|
163
165
|
return q
|
|
164
166
|
|
|
165
167
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
168
|
+
InstanceOfferT = TypeVar("InstanceOfferT", InstanceOffer, InstanceOfferWithAvailability)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def filter_offers_by_requirements(
|
|
172
|
+
offers: List[InstanceOfferT],
|
|
173
|
+
requirements: Optional[Requirements],
|
|
174
|
+
) -> List[InstanceOfferT]:
|
|
169
175
|
query_filter = requirements_to_query_filter(requirements)
|
|
170
176
|
filtered_offers = []
|
|
171
177
|
for offer in offers:
|
|
@@ -190,3 +196,27 @@ def choose_disk_size_mib(
|
|
|
190
196
|
disk_size_gib = disk_size_range.min
|
|
191
197
|
|
|
192
198
|
return round(disk_size_gib * 1024)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def get_offers_disk_modifier(
|
|
202
|
+
configurable_disk_size: Range[Memory], requirements: Requirements
|
|
203
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
204
|
+
"""
|
|
205
|
+
Returns a func that modifies offers disk by setting min value that satisfies both
|
|
206
|
+
`configurable_disk_size` and `requirements`.
|
|
207
|
+
"""
|
|
208
|
+
|
|
209
|
+
def modifier(offer: InstanceOfferWithAvailability) -> Optional[InstanceOfferWithAvailability]:
|
|
210
|
+
requirements_disk_range = DEFAULT_DISK.size
|
|
211
|
+
if requirements.resources.disk is not None:
|
|
212
|
+
requirements_disk_range = requirements.resources.disk.size
|
|
213
|
+
disk_size_range = requirements_disk_range.intersect(configurable_disk_size)
|
|
214
|
+
if disk_size_range is None:
|
|
215
|
+
return None
|
|
216
|
+
offer_copy = offer.copy(deep=True)
|
|
217
|
+
offer_copy.instance.resources.disk = Disk(
|
|
218
|
+
size_mib=get_or_error(disk_size_range.min) * 1024
|
|
219
|
+
)
|
|
220
|
+
return offer_copy
|
|
221
|
+
|
|
222
|
+
return modifier
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from typing import Dict, List, Optional
|
|
2
2
|
|
|
3
|
-
from dstack._internal.core.backends.base.backend import Compute
|
|
4
3
|
from dstack._internal.core.backends.base.compute import (
|
|
4
|
+
Compute,
|
|
5
|
+
ComputeWithAllOffersCached,
|
|
5
6
|
ComputeWithCreateInstanceSupport,
|
|
6
7
|
get_shim_commands,
|
|
7
8
|
)
|
|
@@ -17,13 +18,14 @@ from dstack._internal.core.models.instances import (
|
|
|
17
18
|
InstanceOfferWithAvailability,
|
|
18
19
|
)
|
|
19
20
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
20
|
-
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
21
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
21
22
|
from dstack._internal.utils.logging import get_logger
|
|
22
23
|
|
|
23
24
|
logger = get_logger(__name__)
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
class CloudRiftCompute(
|
|
28
|
+
ComputeWithAllOffersCached,
|
|
27
29
|
ComputeWithCreateInstanceSupport,
|
|
28
30
|
Compute,
|
|
29
31
|
):
|
|
@@ -32,15 +34,11 @@ class CloudRiftCompute(
|
|
|
32
34
|
self.config = config
|
|
33
35
|
self.client = RiftClient(self.config.creds.api_key)
|
|
34
36
|
|
|
35
|
-
def get_offers(
|
|
36
|
-
self, requirements: Optional[Requirements] = None
|
|
37
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
37
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
38
38
|
offers = get_catalog_offers(
|
|
39
39
|
backend=BackendType.CLOUDRIFT,
|
|
40
40
|
locations=self.config.regions or None,
|
|
41
|
-
requirements=requirements,
|
|
42
41
|
)
|
|
43
|
-
|
|
44
42
|
offers_with_availabilities = self._get_offers_with_availability(offers)
|
|
45
43
|
return offers_with_availabilities
|
|
46
44
|
|
|
@@ -5,6 +5,7 @@ import requests
|
|
|
5
5
|
from dstack._internal.core.backends.base.backend import Compute
|
|
6
6
|
from dstack._internal.core.backends.base.compute import (
|
|
7
7
|
ComputeWithCreateInstanceSupport,
|
|
8
|
+
ComputeWithFilteredOffersCached,
|
|
8
9
|
generate_unique_instance_name,
|
|
9
10
|
get_shim_commands,
|
|
10
11
|
)
|
|
@@ -29,6 +30,7 @@ MAX_RESOURCE_NAME_LEN = 30
|
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
class CudoCompute(
|
|
33
|
+
ComputeWithFilteredOffersCached,
|
|
32
34
|
ComputeWithCreateInstanceSupport,
|
|
33
35
|
Compute,
|
|
34
36
|
):
|
|
@@ -37,8 +39,8 @@ class CudoCompute(
|
|
|
37
39
|
self.config = config
|
|
38
40
|
self.api_client = CudoApiClient(config.creds.api_key)
|
|
39
41
|
|
|
40
|
-
def get_offers(
|
|
41
|
-
self, requirements: Optional[Requirements] = None
|
|
42
|
+
def get_offers_by_requirements(
|
|
43
|
+
self, requirements: Requirements
|
|
42
44
|
) -> List[InstanceOfferWithAvailability]:
|
|
43
45
|
offers = get_catalog_offers(
|
|
44
46
|
backend=BackendType.CUDO,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Dict, List, Optional
|
|
1
|
+
from typing import Callable, Dict, List, Optional
|
|
2
2
|
|
|
3
3
|
from datacrunch import DataCrunchClient
|
|
4
4
|
from datacrunch.exceptions import APIException
|
|
@@ -6,11 +6,12 @@ from datacrunch.instances.instances import Instance
|
|
|
6
6
|
|
|
7
7
|
from dstack._internal.core.backends.base.backend import Compute
|
|
8
8
|
from dstack._internal.core.backends.base.compute import (
|
|
9
|
+
ComputeWithAllOffersCached,
|
|
9
10
|
ComputeWithCreateInstanceSupport,
|
|
10
11
|
generate_unique_instance_name,
|
|
11
12
|
get_shim_commands,
|
|
12
13
|
)
|
|
13
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
14
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
14
15
|
from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig
|
|
15
16
|
from dstack._internal.core.errors import NoCapacityError
|
|
16
17
|
from dstack._internal.core.models.backends.base import BackendType
|
|
@@ -36,6 +37,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=IMAGE_SIZE, max=None)
|
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
class DataCrunchCompute(
|
|
40
|
+
ComputeWithAllOffersCached,
|
|
39
41
|
ComputeWithCreateInstanceSupport,
|
|
40
42
|
Compute,
|
|
41
43
|
):
|
|
@@ -47,18 +49,19 @@ class DataCrunchCompute(
|
|
|
47
49
|
client_secret=self.config.creds.client_secret,
|
|
48
50
|
)
|
|
49
51
|
|
|
50
|
-
def get_offers(
|
|
51
|
-
self, requirements: Optional[Requirements] = None
|
|
52
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
52
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
53
53
|
offers = get_catalog_offers(
|
|
54
54
|
backend=BackendType.DATACRUNCH,
|
|
55
55
|
locations=self.config.regions,
|
|
56
|
-
requirements=requirements,
|
|
57
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
58
56
|
)
|
|
59
57
|
offers_with_availability = self._get_offers_with_availability(offers)
|
|
60
58
|
return offers_with_availability
|
|
61
59
|
|
|
60
|
+
def get_offers_modifier(
|
|
61
|
+
self, requirements: Requirements
|
|
62
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
63
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
64
|
+
|
|
62
65
|
def _get_offers_with_availability(
|
|
63
66
|
self, offers: List[InstanceOffer]
|
|
64
67
|
) -> List[InstanceOfferWithAvailability]:
|
|
@@ -182,10 +185,9 @@ class DataCrunchCompute(
|
|
|
182
185
|
|
|
183
186
|
def _get_vm_image_id(instance_offer: InstanceOfferWithAvailability) -> str:
|
|
184
187
|
# https://api.datacrunch.io/v1/images
|
|
185
|
-
if (
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
):
|
|
188
|
+
if len(instance_offer.instance.resources.gpus) > 0 and instance_offer.instance.resources.gpus[
|
|
189
|
+
0
|
|
190
|
+
].name in ["V100", "A6000"]:
|
|
189
191
|
# Ubuntu 22.04 + CUDA 12.0 + Docker
|
|
190
192
|
return "2088da25-bb0d-41cc-a191-dccae45d96fd"
|
|
191
193
|
# Ubuntu 24.04 + CUDA 12.8 Open + Docker
|
|
@@ -5,6 +5,7 @@ from gpuhunt.providers.digitalocean import DigitalOceanProvider
|
|
|
5
5
|
|
|
6
6
|
from dstack._internal.core.backends.base.backend import Compute
|
|
7
7
|
from dstack._internal.core.backends.base.compute import (
|
|
8
|
+
ComputeWithAllOffersCached,
|
|
8
9
|
ComputeWithCreateInstanceSupport,
|
|
9
10
|
generate_unique_instance_name,
|
|
10
11
|
get_user_data,
|
|
@@ -20,7 +21,7 @@ from dstack._internal.core.models.instances import (
|
|
|
20
21
|
InstanceOfferWithAvailability,
|
|
21
22
|
)
|
|
22
23
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
23
|
-
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
24
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
24
25
|
from dstack._internal.utils.logging import get_logger
|
|
25
26
|
|
|
26
27
|
logger = get_logger(__name__)
|
|
@@ -37,6 +38,7 @@ DOCKER_INSTALL_COMMANDS = [
|
|
|
37
38
|
|
|
38
39
|
|
|
39
40
|
class BaseDigitalOceanCompute(
|
|
41
|
+
ComputeWithAllOffersCached,
|
|
40
42
|
ComputeWithCreateInstanceSupport,
|
|
41
43
|
Compute,
|
|
42
44
|
):
|
|
@@ -50,13 +52,10 @@ class BaseDigitalOceanCompute(
|
|
|
50
52
|
DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url)
|
|
51
53
|
)
|
|
52
54
|
|
|
53
|
-
def get_offers(
|
|
54
|
-
self, requirements: Optional[Requirements] = None
|
|
55
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
55
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
56
56
|
offers = get_catalog_offers(
|
|
57
57
|
backend=self.BACKEND_TYPE,
|
|
58
58
|
locations=self.config.regions,
|
|
59
|
-
requirements=requirements,
|
|
60
59
|
catalog=self.catalog,
|
|
61
60
|
)
|
|
62
61
|
return [
|
|
@@ -17,6 +17,7 @@ import dstack._internal.core.backends.gcp.resources as gcp_resources
|
|
|
17
17
|
from dstack import version
|
|
18
18
|
from dstack._internal.core.backends.base.compute import (
|
|
19
19
|
Compute,
|
|
20
|
+
ComputeWithAllOffersCached,
|
|
20
21
|
ComputeWithCreateInstanceSupport,
|
|
21
22
|
ComputeWithGatewaySupport,
|
|
22
23
|
ComputeWithMultinodeSupport,
|
|
@@ -31,7 +32,10 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
31
32
|
get_user_data,
|
|
32
33
|
merge_tags,
|
|
33
34
|
)
|
|
34
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
35
|
+
from dstack._internal.core.backends.base.offers import (
|
|
36
|
+
get_catalog_offers,
|
|
37
|
+
get_offers_disk_modifier,
|
|
38
|
+
)
|
|
35
39
|
from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
|
|
36
40
|
from dstack._internal.core.backends.gcp.models import GCPConfig
|
|
37
41
|
from dstack._internal.core.errors import (
|
|
@@ -82,6 +86,7 @@ class GCPVolumeDiskBackendData(CoreModel):
|
|
|
82
86
|
|
|
83
87
|
|
|
84
88
|
class GCPCompute(
|
|
89
|
+
ComputeWithAllOffersCached,
|
|
85
90
|
ComputeWithCreateInstanceSupport,
|
|
86
91
|
ComputeWithMultinodeSupport,
|
|
87
92
|
ComputeWithPlacementGroupSupport,
|
|
@@ -107,14 +112,10 @@ class GCPCompute(
|
|
|
107
112
|
self._extra_subnets_cache_lock = threading.Lock()
|
|
108
113
|
self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)
|
|
109
114
|
|
|
110
|
-
def get_offers(
|
|
111
|
-
self, requirements: Optional[Requirements] = None
|
|
112
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
115
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
113
116
|
regions = get_or_error(self.config.regions)
|
|
114
117
|
offers = get_catalog_offers(
|
|
115
118
|
backend=BackendType.GCP,
|
|
116
|
-
requirements=requirements,
|
|
117
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
118
119
|
extra_filter=_supported_instances_and_zones(regions),
|
|
119
120
|
)
|
|
120
121
|
quotas: Dict[str, Dict[str, float]] = defaultdict(dict)
|
|
@@ -142,9 +143,13 @@ class GCPCompute(
|
|
|
142
143
|
offer_keys_to_offers[key] = offer_with_availability
|
|
143
144
|
offers_with_availability.append(offer_with_availability)
|
|
144
145
|
offers_with_availability[-1].region = region
|
|
145
|
-
|
|
146
146
|
return offers_with_availability
|
|
147
147
|
|
|
148
|
+
def get_offers_modifier(
|
|
149
|
+
self, requirements: Requirements
|
|
150
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
151
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
152
|
+
|
|
148
153
|
def terminate_instance(
|
|
149
154
|
self, instance_id: str, region: str, backend_data: Optional[str] = None
|
|
150
155
|
) -> None:
|
|
@@ -9,6 +9,7 @@ from gpuhunt.providers.hotaisle import HotAisleProvider
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.backends.base.compute import (
|
|
11
11
|
Compute,
|
|
12
|
+
ComputeWithAllOffersCached,
|
|
12
13
|
ComputeWithCreateInstanceSupport,
|
|
13
14
|
get_shim_commands,
|
|
14
15
|
)
|
|
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
|
|
|
23
24
|
InstanceOfferWithAvailability,
|
|
24
25
|
)
|
|
25
26
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
26
|
-
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
27
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
27
28
|
from dstack._internal.utils.logging import get_logger
|
|
28
29
|
|
|
29
30
|
logger = get_logger(__name__)
|
|
@@ -44,6 +45,7 @@ INSTANCE_TYPE_SPECS = {
|
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
class HotAisleCompute(
|
|
48
|
+
ComputeWithAllOffersCached,
|
|
47
49
|
ComputeWithCreateInstanceSupport,
|
|
48
50
|
Compute,
|
|
49
51
|
):
|
|
@@ -56,16 +58,12 @@ class HotAisleCompute(
|
|
|
56
58
|
HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
|
|
57
59
|
)
|
|
58
60
|
|
|
59
|
-
def get_offers(
|
|
60
|
-
self, requirements: Optional[Requirements] = None
|
|
61
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
61
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
62
62
|
offers = get_catalog_offers(
|
|
63
63
|
backend=BackendType.HOTAISLE,
|
|
64
64
|
locations=self.config.regions or None,
|
|
65
|
-
requirements=requirements,
|
|
66
65
|
catalog=self.catalog,
|
|
67
66
|
)
|
|
68
|
-
|
|
69
67
|
supported_offers = []
|
|
70
68
|
for offer in offers:
|
|
71
69
|
if offer.instance.name in INSTANCE_TYPE_SPECS:
|
|
@@ -78,7 +76,6 @@ class HotAisleCompute(
|
|
|
78
76
|
logger.warning(
|
|
79
77
|
f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
|
|
80
78
|
)
|
|
81
|
-
|
|
82
79
|
return supported_offers
|
|
83
80
|
|
|
84
81
|
def get_payload_from_offer(self, instance_type) -> dict:
|
|
@@ -9,13 +9,14 @@ from kubernetes import client
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.backends.base.compute import (
|
|
11
11
|
Compute,
|
|
12
|
+
ComputeWithFilteredOffersCached,
|
|
12
13
|
ComputeWithGatewaySupport,
|
|
13
14
|
generate_unique_gateway_instance_name,
|
|
14
15
|
generate_unique_instance_name_for_job,
|
|
15
16
|
get_docker_commands,
|
|
16
17
|
get_dstack_gateway_commands,
|
|
17
18
|
)
|
|
18
|
-
from dstack._internal.core.backends.base.offers import match_requirements
|
|
19
|
+
from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
|
|
19
20
|
from dstack._internal.core.backends.kubernetes.models import (
|
|
20
21
|
KubernetesConfig,
|
|
21
22
|
KubernetesNetworkingConfig,
|
|
@@ -58,6 +59,7 @@ NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
|
|
|
58
59
|
|
|
59
60
|
|
|
60
61
|
class KubernetesCompute(
|
|
62
|
+
ComputeWithFilteredOffersCached,
|
|
61
63
|
ComputeWithGatewaySupport,
|
|
62
64
|
Compute,
|
|
63
65
|
):
|
|
@@ -70,8 +72,8 @@ class KubernetesCompute(
|
|
|
70
72
|
self.networking_config = networking_config
|
|
71
73
|
self.api = get_api_from_config_data(config.kubeconfig.data)
|
|
72
74
|
|
|
73
|
-
def get_offers(
|
|
74
|
-
self, requirements: Optional[Requirements] = None
|
|
75
|
+
def get_offers_by_requirements(
|
|
76
|
+
self, requirements: Requirements
|
|
75
77
|
) -> List[InstanceOfferWithAvailability]:
|
|
76
78
|
nodes = self.api.list_node()
|
|
77
79
|
instance_offers = []
|
|
@@ -99,7 +101,7 @@ class KubernetesCompute(
|
|
|
99
101
|
availability=InstanceAvailability.AVAILABLE,
|
|
100
102
|
instance_runtime=InstanceRuntime.RUNNER,
|
|
101
103
|
)
|
|
102
|
-
instance_offers.extend(match_requirements([instance_offer], requirements))
|
|
104
|
+
instance_offers.extend(filter_offers_by_requirements([instance_offer], requirements))
|
|
103
105
|
return instance_offers
|
|
104
106
|
|
|
105
107
|
def run_job(
|
|
@@ -7,6 +7,7 @@ from typing import Dict, List, Optional
|
|
|
7
7
|
|
|
8
8
|
from dstack._internal.core.backends.base.compute import (
|
|
9
9
|
Compute,
|
|
10
|
+
ComputeWithAllOffersCached,
|
|
10
11
|
ComputeWithCreateInstanceSupport,
|
|
11
12
|
generate_unique_instance_name,
|
|
12
13
|
get_shim_commands,
|
|
@@ -22,12 +23,13 @@ from dstack._internal.core.models.instances import (
|
|
|
22
23
|
InstanceOfferWithAvailability,
|
|
23
24
|
)
|
|
24
25
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
25
|
-
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
26
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
26
27
|
|
|
27
28
|
MAX_INSTANCE_NAME_LEN = 60
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
class LambdaCompute(
|
|
32
|
+
ComputeWithAllOffersCached,
|
|
31
33
|
ComputeWithCreateInstanceSupport,
|
|
32
34
|
Compute,
|
|
33
35
|
):
|
|
@@ -36,13 +38,10 @@ class LambdaCompute(
|
|
|
36
38
|
self.config = config
|
|
37
39
|
self.api_client = LambdaAPIClient(config.creds.api_key)
|
|
38
40
|
|
|
39
|
-
def get_offers(
|
|
40
|
-
self, requirements: Optional[Requirements] = None
|
|
41
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
41
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
42
42
|
offers = get_catalog_offers(
|
|
43
43
|
backend=BackendType.LAMBDA,
|
|
44
44
|
locations=self.config.regions or None,
|
|
45
|
-
requirements=requirements,
|
|
46
45
|
)
|
|
47
46
|
offers_with_availability = self._get_offers_with_availability(offers)
|
|
48
47
|
return offers_with_availability
|
|
@@ -28,9 +28,7 @@ class LocalCompute(
|
|
|
28
28
|
ComputeWithVolumeSupport,
|
|
29
29
|
Compute,
|
|
30
30
|
):
|
|
31
|
-
def get_offers(
|
|
32
|
-
self, requirements: Optional[Requirements] = None
|
|
33
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
31
|
+
def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
|
|
34
32
|
return [
|
|
35
33
|
InstanceOfferWithAvailability(
|
|
36
34
|
backend=BackendType.LOCAL,
|
|
@@ -3,7 +3,7 @@ import random
|
|
|
3
3
|
import shlex
|
|
4
4
|
import time
|
|
5
5
|
from functools import cached_property
|
|
6
|
-
from typing import List, Optional
|
|
6
|
+
from typing import Callable, List, Optional
|
|
7
7
|
|
|
8
8
|
from nebius.aio.operation import Operation as SDKOperation
|
|
9
9
|
from nebius.aio.service_error import RequestError, StatusCode
|
|
@@ -12,13 +12,14 @@ from nebius.sdk import SDK
|
|
|
12
12
|
|
|
13
13
|
from dstack._internal.core.backends.base.backend import Compute
|
|
14
14
|
from dstack._internal.core.backends.base.compute import (
|
|
15
|
+
ComputeWithAllOffersCached,
|
|
15
16
|
ComputeWithCreateInstanceSupport,
|
|
16
17
|
ComputeWithMultinodeSupport,
|
|
17
18
|
ComputeWithPlacementGroupSupport,
|
|
18
19
|
generate_unique_instance_name,
|
|
19
20
|
get_user_data,
|
|
20
21
|
)
|
|
21
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
22
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
22
23
|
from dstack._internal.core.backends.nebius import resources
|
|
23
24
|
from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
|
|
24
25
|
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
|
|
@@ -76,6 +77,7 @@ SUPPORTED_PLATFORMS = [
|
|
|
76
77
|
|
|
77
78
|
|
|
78
79
|
class NebiusCompute(
|
|
80
|
+
ComputeWithAllOffersCached,
|
|
79
81
|
ComputeWithCreateInstanceSupport,
|
|
80
82
|
ComputeWithMultinodeSupport,
|
|
81
83
|
ComputeWithPlacementGroupSupport,
|
|
@@ -106,15 +108,11 @@ class NebiusCompute(
|
|
|
106
108
|
).metadata.id
|
|
107
109
|
return self._subnet_id_cache[region]
|
|
108
110
|
|
|
109
|
-
def get_offers(
|
|
110
|
-
self, requirements: Optional[Requirements] = None
|
|
111
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
111
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
112
112
|
offers = get_catalog_offers(
|
|
113
113
|
backend=BackendType.NEBIUS,
|
|
114
114
|
locations=list(self._region_to_project_id),
|
|
115
|
-
requirements=requirements,
|
|
116
115
|
extra_filter=_supported_instances,
|
|
117
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
118
116
|
)
|
|
119
117
|
return [
|
|
120
118
|
InstanceOfferWithAvailability(
|
|
@@ -124,6 +122,11 @@ class NebiusCompute(
|
|
|
124
122
|
for offer in offers
|
|
125
123
|
]
|
|
126
124
|
|
|
125
|
+
def get_offers_modifier(
|
|
126
|
+
self, requirements: Requirements
|
|
127
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
128
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
129
|
+
|
|
127
130
|
def create_instance(
|
|
128
131
|
self,
|
|
129
132
|
instance_offer: InstanceOfferWithAvailability,
|