dstack 0.19.32__py3-none-any.whl → 0.19.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +151 -6
- dstack/_internal/core/backends/gcp/models.py +10 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +11 -1
- dstack/_internal/core/backends/kubernetes/compute.py +161 -83
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +9 -6
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +14 -7
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/compatibility/runs.py +25 -4
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/profiles.py +1 -1
- dstack/_internal/core/models/runs.py +4 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/core/services/ssh/key_manager.py +56 -0
- dstack/_internal/server/background/tasks/process_instances.py +5 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +6 -0
- dstack/_internal/server/routers/metrics.py +6 -2
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +21 -2
- dstack/_internal/server/services/jobs/__init__.py +18 -9
- dstack/_internal/server/services/offers.py +1 -0
- dstack/_internal/server/services/runs.py +13 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-e79754c136f1d8e4e7e6.js} +12632 -8039
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-e79754c136f1d8e4e7e6.js.map} +1 -1
- dstack/_internal/server/testing/common.py +4 -0
- dstack/api/_public/__init__.py +8 -11
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +61 -9
- dstack/api/server/__init__.py +4 -0
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/METADATA +2 -2
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/RECORD +53 -51
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/WHEEL +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.32.dist-info → dstack-0.19.34.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -62,7 +62,6 @@ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInte
|
|
|
62
62
|
from dstack._internal.utils.logging import get_logger
|
|
63
63
|
from dstack._internal.utils.nested_list import NestedList, NestedListItem
|
|
64
64
|
from dstack._internal.utils.path import is_absolute_posix_path
|
|
65
|
-
from dstack.api._public.repos import get_ssh_keypair
|
|
66
65
|
from dstack.api._public.runs import Run
|
|
67
66
|
from dstack.api.server import APIClient
|
|
68
67
|
from dstack.api.utils import load_profile
|
|
@@ -135,10 +134,6 @@ class BaseRunConfigurator(
|
|
|
135
134
|
|
|
136
135
|
config_manager = ConfigManager()
|
|
137
136
|
repo = self.get_repo(conf, configuration_path, configurator_args, config_manager)
|
|
138
|
-
self.api.ssh_identity_file = get_ssh_keypair(
|
|
139
|
-
configurator_args.ssh_identity_file,
|
|
140
|
-
config_manager.dstack_key_path,
|
|
141
|
-
)
|
|
142
137
|
profile = load_profile(Path.cwd(), configurator_args.profile)
|
|
143
138
|
with console.status("Getting apply plan..."):
|
|
144
139
|
run_plan = self.api.runs.get_run_plan(
|
|
@@ -146,6 +141,7 @@ class BaseRunConfigurator(
|
|
|
146
141
|
repo=repo,
|
|
147
142
|
configuration_path=configuration_path,
|
|
148
143
|
profile=profile,
|
|
144
|
+
ssh_identity_file=configurator_args.ssh_identity_file,
|
|
149
145
|
)
|
|
150
146
|
|
|
151
147
|
print_run_plan(run_plan, max_offers=configurator_args.max_offers)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import threading
|
|
2
|
+
from collections.abc import Iterable
|
|
2
3
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
3
4
|
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
4
5
|
|
|
@@ -34,7 +35,11 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
34
35
|
get_user_data,
|
|
35
36
|
merge_tags,
|
|
36
37
|
)
|
|
37
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
38
|
+
from dstack._internal.core.backends.base.offers import (
|
|
39
|
+
OfferModifier,
|
|
40
|
+
get_catalog_offers,
|
|
41
|
+
get_offers_disk_modifier,
|
|
42
|
+
)
|
|
38
43
|
from dstack._internal.core.errors import (
|
|
39
44
|
ComputeError,
|
|
40
45
|
NoCapacityError,
|
|
@@ -159,10 +164,8 @@ class AWSCompute(
|
|
|
159
164
|
)
|
|
160
165
|
return availability_offers
|
|
161
166
|
|
|
162
|
-
def get_offers_modifier(
|
|
163
|
-
self, requirements: Requirements
|
164
|
-
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
165
|
-
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
167
|
+
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
|
|
168
|
+
return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
|
|
166
169
|
|
|
167
170
|
def _get_offers_cached_key(self, requirements: Requirements) -> int:
|
|
168
171
|
# Requirements is not hashable, so we use a hack to get arguments hash
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import base64
|
|
2
2
|
import enum
|
|
3
3
|
import re
|
|
4
|
+
from collections.abc import Iterable
|
|
4
5
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
5
|
-
from typing import Callable, Dict, List, Optional, Tuple
|
|
6
|
+
from typing import Dict, List, Optional, Tuple
|
|
6
7
|
|
|
7
8
|
from azure.core.credentials import TokenCredential
|
|
8
9
|
from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
|
|
@@ -51,7 +52,11 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
51
52
|
merge_tags,
|
|
52
53
|
requires_nvidia_proprietary_kernel_modules,
|
|
53
54
|
)
|
|
54
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
55
|
+
from dstack._internal.core.backends.base.offers import (
|
|
56
|
+
OfferModifier,
|
|
57
|
+
get_catalog_offers,
|
|
58
|
+
get_offers_disk_modifier,
|
|
59
|
+
)
|
|
55
60
|
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
|
|
56
61
|
from dstack._internal.core.errors import ComputeError, NoCapacityError
|
|
57
62
|
from dstack._internal.core.models.backends.base import BackendType
|
|
@@ -108,10 +113,8 @@ class AzureCompute(
|
|
|
108
113
|
)
|
|
109
114
|
return offers_with_availability
|
|
110
115
|
|
|
111
|
-
def get_offers_modifier(
|
|
112
|
-
self, requirements: Requirements
|
113
|
-
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
114
|
-
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
116
|
+
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
|
|
117
|
+
return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
|
|
115
118
|
|
|
116
119
|
def create_instance(
|
|
117
120
|
self,
|
|
@@ -17,12 +17,13 @@ from cachetools import TTLCache, cachedmethod
|
|
|
17
17
|
from gpuhunt import CPUArchitecture
|
|
18
18
|
|
|
19
19
|
from dstack._internal import settings
|
|
20
|
-
from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
|
|
20
|
+
from dstack._internal.core.backends.base.offers import OfferModifier, filter_offers_by_requirements
|
|
21
21
|
from dstack._internal.core.consts import (
|
|
22
22
|
DSTACK_RUNNER_HTTP_PORT,
|
|
23
23
|
DSTACK_RUNNER_SSH_PORT,
|
|
24
24
|
DSTACK_SHIM_HTTP_PORT,
|
|
25
25
|
)
|
|
26
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
26
27
|
from dstack._internal.core.models.configurations import LEGACY_REPO_DIR
|
|
27
28
|
from dstack._internal.core.models.gateways import (
|
|
28
29
|
GatewayComputeConfiguration,
|
|
@@ -168,17 +169,13 @@ class ComputeWithAllOffersCached(ABC):
|
|
|
168
169
|
"""
|
|
169
170
|
pass
|
|
170
171
|
|
|
171
|
-
def get_offers_modifier(
|
|
172
|
-
self, requirements: Requirements
|
|
173
|
-
) -> Optional[
|
|
174
|
-
Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
|
|
175
|
-
]:
|
|
172
|
+
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
|
|
176
173
|
"""
|
|
177
|
-
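Returns a modifier function that modifies offers before they are filtered by requirements.
|
|
178
|
-
Can return `None` to exclude the offer.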
|
174
|
+
Returns functions that modify offers before they are filtered by requirements.
|
|
175
|
+
A modifier function can return `None` to exclude the offer.
|
|
179
176
|
E.g. can be used to set appropriate disk size based on requirements.
|
|
180
177
|
"""
|
|
181
|
-
return None
|
|
178
|
+
return []
|
|
182
179
|
|
|
183
180
|
def get_offers_post_filter(
|
|
184
181
|
self, requirements: Requirements
|
|
@@ -191,14 +188,7 @@ class ComputeWithAllOffersCached(ABC):
|
|
|
191
188
|
|
|
192
189
|
def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
|
|
193
190
|
offers = self._get_all_offers_with_availability_cached()
|
|
194
|
-
modifier = self.get_offers_modifier(requirements)
|
195
|
-
if modifier is not None:
|
|
196
|
-
modified_offers = []
|
|
197
|
-
for o in offers:
|
|
198
|
-
modified_offer = modifier(o)
|
|
199
|
-
if modified_offer is not None:
|
|
200
|
-
modified_offers.append(modified_offer)
|
|
201
|
-
offers = modified_offers
|
|
191
|
+
offers = self.__apply_modifiers(offers, self.get_offers_modifiers(requirements))
|
|
202
192
|
offers = filter_offers_by_requirements(offers, requirements)
|
|
203
193
|
post_filter = self.get_offers_post_filter(requirements)
|
|
204
194
|
if post_filter is not None:
|
|
@@ -212,6 +202,20 @@ class ComputeWithAllOffersCached(ABC):
|
|
|
212
202
|
def _get_all_offers_with_availability_cached(self) -> List[InstanceOfferWithAvailability]:
|
|
213
203
|
return self.get_all_offers_with_availability()
|
|
214
204
|
|
|
205
|
+
@staticmethod
|
|
206
|
+
def __apply_modifiers(
|
|
207
|
+
offers: Iterable[InstanceOfferWithAvailability], modifiers: Iterable[OfferModifier]
|
|
208
|
+
) -> list[InstanceOfferWithAvailability]:
|
|
209
|
+
modified_offers = []
|
|
210
|
+
for offer in offers:
|
|
211
|
+
for modifier in modifiers:
|
|
212
|
+
offer = modifier(offer)
|
|
213
|
+
if offer is None:
|
|
214
|
+
break
|
|
215
|
+
else:
|
|
216
|
+
modified_offers.append(offer)
|
|
217
|
+
return modified_offers
|
|
218
|
+
|
|
215
219
|
|
|
216
220
|
class ComputeWithFilteredOffersCached(ABC):
|
|
217
221
|
"""
|
|
@@ -341,6 +345,15 @@ class ComputeWithMultinodeSupport:
|
|
|
341
345
|
class ComputeWithReservationSupport:
|
|
342
346
|
"""
|
|
343
347
|
Must be subclassed to support provisioning from reservations.
|
|
348
|
+
|
|
349
|
+
The following is expected from a backend that supports reservations:
|
|
350
|
+
|
|
351
|
+
- `get_offers` respects `Requirements.reservation` if set, and only returns
|
|
352
|
+
offers that can be provisioned in the configured reservation. It can
|
|
353
|
+
adjust some offer properties such as `availability` and
|
|
354
|
+
`availability_zones` if necessary.
|
|
355
|
+
- `create_instance` respects `InstanceConfig.reservation` if set, and
|
|
356
|
+
provisions the instance in the configured reservation.
|
|
344
357
|
"""
|
|
345
358
|
|
|
346
359
|
pass
|
|
@@ -391,6 +404,16 @@ class ComputeWithPlacementGroupSupport(ABC):
|
|
|
391
404
|
"""
|
|
392
405
|
pass
|
|
393
406
|
|
|
407
|
+
def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
|
|
408
|
+
"""
|
|
409
|
+
Whether placement groups can be used for instances provisioned in reservations.
|
|
410
|
+
|
|
411
|
+
Arguments:
|
|
412
|
+
backend_type: matches the backend type of this compute, unless this compute is a proxy
|
|
413
|
+
for other backends (dstack Sky)
|
|
414
|
+
"""
|
|
415
|
+
return True
|
|
416
|
+
|
|
394
417
|
|
|
395
418
|
class ComputeWithGatewaySupport(ABC):
|
|
396
419
|
"""
|
|
@@ -23,6 +23,8 @@ SUPPORTED_GPUHUNT_FLAGS = [
|
|
|
23
23
|
"oci-spot",
|
|
24
24
|
"lambda-arm",
|
|
25
25
|
"gcp-a4",
|
|
26
|
+
"gcp-g4",
|
|
27
|
+
"gcp-dws-calendar-mode",
|
|
26
28
|
]
|
|
27
29
|
|
|
28
30
|
|
|
@@ -93,6 +95,7 @@ def catalog_item_to_offer(
|
|
|
93
95
|
),
|
|
94
96
|
region=item.location,
|
|
95
97
|
price=item.price,
|
|
98
|
+
backend_data=item.provider_data,
|
|
96
99
|
)
|
|
97
100
|
|
|
98
101
|
|
|
@@ -199,9 +202,12 @@ def choose_disk_size_mib(
|
|
|
199
202
|
return round(disk_size_gib * 1024)
|
|
200
203
|
|
|
201
204
|
|
|
205
|
+
OfferModifier = Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
|
|
206
|
+
|
|
207
|
+
|
|
202
208
|
def get_offers_disk_modifier(
|
|
203
209
|
configurable_disk_size: Range[Memory], requirements: Requirements
|
|
204
|
-
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
210
|
+
) -> OfferModifier:
|
|
205
211
|
"""
|
|
206
212
|
Returns a func that modifies offers disk by setting min value that satisfies both
|
|
207
213
|
`configurable_disk_size` and `requirements`.
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from typing import Callable, Dict, List, Optional
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
from typing import Dict, List, Optional
|
|
2
3
|
|
|
3
4
|
from datacrunch import DataCrunchClient
|
|
4
5
|
from datacrunch.exceptions import APIException
|
|
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
12
13
|
generate_unique_instance_name,
|
|
13
14
|
get_shim_commands,
|
|
14
15
|
)
|
|
15
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
16
|
+
from dstack._internal.core.backends.base.offers import (
|
|
17
|
+
OfferModifier,
|
|
18
|
+
get_catalog_offers,
|
|
19
|
+
get_offers_disk_modifier,
|
|
20
|
+
)
|
|
16
21
|
from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig
|
|
17
22
|
from dstack._internal.core.errors import NoCapacityError
|
|
18
23
|
from dstack._internal.core.models.backends.base import BackendType
|
|
@@ -59,10 +64,8 @@ class DataCrunchCompute(
|
|
|
59
64
|
offers_with_availability = self._get_offers_with_availability(offers)
|
|
60
65
|
return offers_with_availability
|
|
61
66
|
|
|
62
|
-
def get_offers_modifier(
|
|
63
|
-
self, requirements: Requirements
|
64
|
-
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
65
|
-
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
67
|
+
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
|
|
68
|
+
return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
|
|
66
69
|
|
|
67
70
|
def _get_offers_with_availability(
|
|
68
71
|
self, offers: List[InstanceOffer]
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
2
|
import json
|
|
3
|
+
import re
|
|
3
4
|
import threading
|
|
4
5
|
from collections import defaultdict
|
|
6
|
+
from collections.abc import Iterable
|
|
5
7
|
from dataclasses import dataclass
|
|
6
8
|
from typing import Callable, Dict, List, Literal, Optional, Tuple
|
|
7
9
|
|
|
@@ -24,6 +26,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
24
26
|
ComputeWithPlacementGroupSupport,
|
|
25
27
|
ComputeWithPrivateGatewaySupport,
|
|
26
28
|
ComputeWithPrivilegedSupport,
|
|
29
|
+
ComputeWithReservationSupport,
|
|
27
30
|
ComputeWithVolumeSupport,
|
|
28
31
|
generate_unique_gateway_instance_name,
|
|
29
32
|
generate_unique_instance_name,
|
|
@@ -35,6 +38,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
35
38
|
requires_nvidia_proprietary_kernel_modules,
|
|
36
39
|
)
|
|
37
40
|
from dstack._internal.core.backends.base.offers import (
|
|
41
|
+
OfferModifier,
|
|
38
42
|
get_catalog_offers,
|
|
39
43
|
get_offers_disk_modifier,
|
|
40
44
|
)
|
|
@@ -78,9 +82,16 @@ logger = get_logger(__name__)
|
|
|
78
82
|
# pd-balanced disks can be 10GB-64TB, but dstack images are 20GB and cannot grow larger
|
|
79
83
|
# than 32TB because of filesystem settings
|
|
80
84
|
CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("20GB"), max=Memory.parse("32TB"))
|
|
85
|
+
# Pattern from https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_a_specific_reservation
|
|
86
|
+
RESERVATION_PATTERN = re.compile(
|
|
87
|
+
r"projects/(?P<project_id>[a-z0-9-]+)/reservations/(?P<reservation_name>[a-z0-9-]+)"
|
|
88
|
+
)
|
|
89
|
+
RESOURCE_NAME_PATTERN = re.compile(r"[a-z0-9-]+")
|
|
90
|
+
TPU_VERSIONS = [tpu.name for tpu in KNOWN_TPUS]
|
|
81
91
|
|
|
82
92
|
|
|
83
|
-
|
|
93
|
+
class GCPOfferBackendData(CoreModel):
|
|
94
|
+
is_dws_calendar_mode: bool = False
|
|
84
95
|
|
|
85
96
|
|
|
86
97
|
class GCPVolumeDiskBackendData(CoreModel):
|
|
@@ -93,6 +104,7 @@ class GCPCompute(
|
|
|
93
104
|
ComputeWithCreateInstanceSupport,
|
|
94
105
|
ComputeWithPrivilegedSupport,
|
|
95
106
|
ComputeWithMultinodeSupport,
|
|
107
|
+
ComputeWithReservationSupport,
|
|
96
108
|
ComputeWithPlacementGroupSupport,
|
|
97
109
|
ComputeWithGatewaySupport,
|
|
98
110
|
ComputeWithPrivateGatewaySupport,
|
|
@@ -113,8 +125,12 @@ class GCPCompute(
|
|
|
113
125
|
self.resource_policies_client = compute_v1.ResourcePoliciesClient(
|
|
114
126
|
credentials=self.credentials
|
|
115
127
|
)
|
|
128
|
+
self.reservations_client = compute_v1.ReservationsClient(credentials=self.credentials)
|
|
116
129
|
self._usable_subnets_cache_lock = threading.Lock()
|
|
117
130
|
self._usable_subnets_cache = TTLCache(maxsize=1, ttl=120)
|
|
131
|
+
self._find_reservation_cache_lock = threading.Lock()
|
|
132
|
+
# smaller TTL, since we check the reservation's in_use_count, which can change often
|
|
133
|
+
self._find_reservation_cache = TTLCache(maxsize=8, ttl=20)
|
|
118
134
|
|
|
119
135
|
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
120
136
|
regions = get_or_error(self.config.regions)
|
|
@@ -149,10 +165,57 @@ class GCPCompute(
|
|
|
149
165
|
offers_with_availability[-1].region = region
|
|
150
166
|
return offers_with_availability
|
|
151
167
|
|
|
152
|
-
def get_offers_modifier(
|
|
168
|
+
def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
|
|
169
|
+
modifiers = []
|
|
170
|
+
|
|
171
|
+
if requirements.reservation:
|
|
172
|
+
zone_to_reservation = self._find_reservation(requirements.reservation)
|
|
173
|
+
|
|
174
|
+
def reservation_modifier(
|
|
175
|
+
offer: InstanceOfferWithAvailability,
|
|
176
|
+
) -> Optional[InstanceOfferWithAvailability]:
|
|
177
|
+
if offer.instance.resources.spot:
|
|
178
|
+
return None
|
|
179
|
+
assert offer.availability_zones is not None
|
|
180
|
+
matching_zones = []
|
|
181
|
+
zones_with_capacity = []
|
|
182
|
+
for zone in offer.availability_zones:
|
|
183
|
+
reservation = zone_to_reservation.get(zone)
|
|
184
|
+
if reservation is not None and _offer_matches_reservation(offer, reservation):
|
|
185
|
+
matching_zones.append(zone)
|
|
186
|
+
if _reservation_has_capacity(reservation):
|
|
187
|
+
zones_with_capacity.append(zone)
|
|
188
|
+
if not matching_zones:
|
|
189
|
+
return None
|
|
190
|
+
offer = offer.copy(deep=True)
|
|
191
|
+
if zones_with_capacity:
|
|
192
|
+
offer.availability_zones = zones_with_capacity
|
|
193
|
+
else:
|
|
194
|
+
offer.availability_zones = matching_zones
|
|
195
|
+
offer.availability = InstanceAvailability.NOT_AVAILABLE
|
|
196
|
+
return offer
|
|
197
|
+
|
|
198
|
+
modifiers.append(reservation_modifier)
|
|
199
|
+
|
|
200
|
+
modifiers.append(get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements))
|
|
201
|
+
return modifiers
|
|
202
|
+
|
|
203
|
+
def get_offers_post_filter(
|
|
153
204
|
self, requirements: Requirements
|
|
154
|
-
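) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
155
|
-
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)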
|
205
|
+
) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
|
|
206
|
+
if requirements.reservation is None:
|
|
207
|
+
|
|
208
|
+
def reserved_offers_filter(offer: InstanceOfferWithAvailability) -> bool:
|
|
209
|
+
"""Remove reserved-only offers"""
|
|
210
|
+
if GCPOfferBackendData.__response__.parse_obj(
|
|
211
|
+
offer.backend_data
|
|
212
|
+
).is_dws_calendar_mode:
|
|
213
|
+
return False
|
|
214
|
+
return True
|
|
215
|
+
|
|
216
|
+
return reserved_offers_filter
|
|
217
|
+
|
|
218
|
+
return None
|
|
156
219
|
|
|
157
220
|
def terminate_instance(
|
|
158
221
|
self, instance_id: str, region: str, backend_data: Optional[str] = None
|
|
@@ -305,6 +368,16 @@ class GCPCompute(
|
|
|
305
368
|
)
|
|
306
369
|
|
|
307
370
|
for zone in zones:
|
|
371
|
+
reservation = None
|
|
372
|
+
if instance_config.reservation:
|
|
373
|
+
reservation = self._find_reservation(instance_config.reservation).get(zone)
|
|
374
|
+
if reservation is None:
|
|
375
|
+
logger.warning(
|
|
376
|
+
"Reservation %s no longer exists in zone %s",
|
|
377
|
+
instance_config.reservation,
|
|
378
|
+
zone,
|
|
379
|
+
)
|
|
380
|
+
continue
|
|
308
381
|
request = compute_v1.InsertInstanceRequest()
|
|
309
382
|
request.zone = zone
|
|
310
383
|
request.project = self.config.project_id
|
|
@@ -335,6 +408,7 @@ class GCPCompute(
|
|
|
335
408
|
roce_subnetworks=roce_subnets,
|
|
336
409
|
allocate_public_ip=allocate_public_ip,
|
|
337
410
|
placement_policy=placement_policy,
|
|
411
|
+
reservation=reservation,
|
|
338
412
|
)
|
|
339
413
|
try:
|
|
340
414
|
# GCP needs some time to return an error in case of no capacity (< 30s).
|
|
@@ -475,6 +549,11 @@ class GCPCompute(
|
|
|
475
549
|
) -> bool:
|
|
476
550
|
return placement_group.configuration.region == instance_offer.region
|
|
477
551
|
|
|
552
|
+
def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
|
|
553
|
+
# Cannot use our own placement policies when provisioning in a reservation.
|
|
554
|
+
# Instead, we use the placement policy defined in reservation settings.
|
|
555
|
+
return False
|
|
556
|
+
|
|
478
557
|
def create_gateway(
|
|
479
558
|
self,
|
|
480
559
|
configuration: GatewayComputeConfiguration,
|
|
@@ -880,6 +959,26 @@ class GCPCompute(
|
|
|
880
959
|
usable_subnets=self._list_usable_subnets(),
|
|
881
960
|
)
|
|
882
961
|
|
|
962
|
+
@cachedmethod(
|
|
963
|
+
cache=lambda self: self._find_reservation_cache,
|
|
964
|
+
lock=lambda self: self._find_reservation_cache_lock,
|
|
965
|
+
)
|
|
966
|
+
def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reservation]:
|
|
967
|
+
if match := RESERVATION_PATTERN.fullmatch(configured_name):
|
|
968
|
+
project_id = match.group("project_id")
|
|
969
|
+
name = match.group("reservation_name")
|
|
970
|
+
elif RESOURCE_NAME_PATTERN.fullmatch(configured_name):
|
|
971
|
+
project_id = self.config.project_id
|
|
972
|
+
name = configured_name
|
|
973
|
+
else:
|
|
974
|
+
# misconfigured or non-GCP
|
|
975
|
+
return {}
|
|
976
|
+
return gcp_resources.find_reservation(
|
|
977
|
+
reservations_client=self.reservations_client,
|
|
978
|
+
project_id=project_id,
|
|
979
|
+
name=name,
|
|
980
|
+
)
|
|
981
|
+
|
|
883
982
|
|
|
884
983
|
def _supported_instances_and_zones(
|
|
885
984
|
regions: List[str],
|
|
@@ -922,8 +1021,8 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
|
|
|
922
1021
|
gpu = resources.gpus[0]
|
|
923
1022
|
if _is_tpu(gpu.name):
|
|
924
1023
|
return True
|
|
925
|
-
if gpu.name in ["B200", "H100"]:
|
|
926
|
-
# B200, H100 and H100_MEGA quotas are not returned by `regions_client.list`
|
|
1024
|
+
if gpu.name in ["B200", "H100", "RTXPRO6000"]:
|
|
1025
|
+
# B200, H100, H100_MEGA, and RTXPRO6000 quotas are not returned by `regions_client.list`
|
|
927
1026
|
return True
|
|
928
1027
|
quota_name = f"NVIDIA_{gpu.name}_GPUS"
|
|
929
1028
|
if gpu.name == "A100" and gpu.memory_mib == 80 * 1024:
|
|
@@ -933,6 +1032,52 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
|
|
|
933
1032
|
return len(resources.gpus) <= quotas.get(quota_name, 0)
|
|
934
1033
|
|
|
935
1034
|
|
|
1035
|
+
def _offer_matches_reservation(
|
|
1036
|
+
offer: InstanceOfferWithAvailability, reservation: compute_v1.Reservation
|
|
1037
|
+
) -> bool:
|
|
1038
|
+
if (
|
|
1039
|
+
reservation.specific_reservation is None
|
|
1040
|
+
or reservation.specific_reservation.instance_properties is None
|
|
1041
|
+
):
|
|
1042
|
+
return False
|
|
1043
|
+
properties = reservation.specific_reservation.instance_properties
|
|
1044
|
+
if properties.machine_type != offer.instance.name:
|
|
1045
|
+
return False
|
|
1046
|
+
accelerators = properties.guest_accelerators or []
|
|
1047
|
+
if not accelerators and offer.instance.resources.gpus:
|
|
1048
|
+
return False
|
|
1049
|
+
if len(accelerators) > 1:
|
|
1050
|
+
logger.warning(
|
|
1051
|
+
"Expected 0 or 1 accelerator types per instance,"
|
|
1052
|
+
f" but {properties.machine_type} has {len(accelerators)}."
|
|
1053
|
+
f" Ignoring reservation {reservation.self_link}"
|
|
1054
|
+
)
|
|
1055
|
+
return False
|
|
1056
|
+
if accelerators:
|
|
1057
|
+
if accelerators[0].accelerator_count != len(offer.instance.resources.gpus):
|
|
1058
|
+
return False
|
|
1059
|
+
if (
|
|
1060
|
+
offer.instance.resources.gpus
|
|
1061
|
+
and gcp_resources.find_accelerator_name(
|
|
1062
|
+
offer.instance.resources.gpus[0].name,
|
|
1063
|
+
offer.instance.resources.gpus[0].memory_mib,
|
|
1064
|
+
)
|
|
1065
|
+
!= accelerators[0].accelerator_type
|
|
1066
|
+
):
|
|
1067
|
+
return False
|
|
1068
|
+
return True
|
|
1069
|
+
|
|
1070
|
+
|
|
1071
|
+
def _reservation_has_capacity(reservation: compute_v1.Reservation) -> bool:
|
|
1072
|
+
return (
|
|
1073
|
+
reservation.specific_reservation is not None
|
|
1074
|
+
and reservation.specific_reservation.in_use_count is not None
|
|
1075
|
+
and reservation.specific_reservation.assured_count is not None
|
|
1076
|
+
and reservation.specific_reservation.in_use_count
|
|
1077
|
+
< reservation.specific_reservation.assured_count
|
|
1078
|
+
)
|
|
1079
|
+
|
|
1080
|
+
|
|
936
1081
|
def _unique_instance_name(instance: InstanceType) -> str:
|
|
937
1082
|
if instance.resources.spot:
|
|
938
1083
|
name = f"{instance.name}-spot"
|
|
@@ -89,6 +89,16 @@ class GCPBackendConfig(CoreModel):
|
|
|
89
89
|
description="The tags (labels) that will be assigned to resources created by `dstack`"
|
|
90
90
|
),
|
|
91
91
|
] = None
|
|
92
|
+
preview_features: Annotated[
|
|
93
|
+
Optional[List[Literal["g4"]]],
|
|
94
|
+
Field(
|
|
95
|
+
description=(
|
|
96
|
+
"The list of preview GCP features to enable."
|
|
97
|
+
" There are currently no preview features"
|
|
98
|
+
),
|
|
99
|
+
max_items=1,
|
|
100
|
+
),
|
|
101
|
+
] = None
|
|
92
102
|
|
|
93
103
|
|
|
94
104
|
class GCPBackendConfigWithCreds(GCPBackendConfig):
|