dstack 0.19.31 → 0.19.33 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/offer.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +1 -5
- dstack/_internal/core/backends/aws/compute.py +8 -5
- dstack/_internal/core/backends/azure/compute.py +9 -6
- dstack/_internal/core/backends/base/compute.py +40 -17
- dstack/_internal/core/backends/base/offers.py +5 -1
- dstack/_internal/core/backends/datacrunch/compute.py +9 -6
- dstack/_internal/core/backends/gcp/compute.py +137 -7
- dstack/_internal/core/backends/gcp/models.py +7 -0
- dstack/_internal/core/backends/gcp/resources.py +87 -5
- dstack/_internal/core/backends/hotaisle/compute.py +30 -0
- dstack/_internal/core/backends/kubernetes/compute.py +218 -77
- dstack/_internal/core/backends/kubernetes/models.py +4 -2
- dstack/_internal/core/backends/nebius/compute.py +24 -6
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/backends/oci/compute.py +9 -6
- dstack/_internal/core/backends/runpod/compute.py +10 -6
- dstack/_internal/core/backends/vastai/compute.py +3 -1
- dstack/_internal/core/backends/vastai/configurator.py +0 -1
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/models/fleets.py +1 -1
- dstack/_internal/core/models/profiles.py +12 -5
- dstack/_internal/core/models/runs.py +3 -2
- dstack/_internal/core/models/users.py +10 -0
- dstack/_internal/core/services/configs/__init__.py +1 -0
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +6 -4
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +63 -20
- dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
- dstack/_internal/server/models.py +3 -0
- dstack/_internal/server/routers/runs.py +5 -1
- dstack/_internal/server/routers/users.py +14 -2
- dstack/_internal/server/services/runs.py +9 -4
- dstack/_internal/server/services/users.py +35 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
- dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
- dstack/api/_public/__init__.py +9 -12
- dstack/api/_public/repos.py +0 -21
- dstack/api/_public/runs.py +64 -9
- dstack/api/server/_users.py +17 -2
- dstack/version.py +2 -2
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/METADATA +12 -14
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/RECORD +52 -51
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.31.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/services/configurators/run.py
@@ -62,7 +62,6 @@ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInte
 from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.nested_list import NestedList, NestedListItem
 from dstack._internal.utils.path import is_absolute_posix_path
-from dstack.api._public.repos import get_ssh_keypair
 from dstack.api._public.runs import Run
 from dstack.api.server import APIClient
 from dstack.api.utils import load_profile
@@ -135,10 +134,6 @@ class BaseRunConfigurator(
 
         config_manager = ConfigManager()
         repo = self.get_repo(conf, configuration_path, configurator_args, config_manager)
-        self.api.ssh_identity_file = get_ssh_keypair(
-            configurator_args.ssh_identity_file,
-            config_manager.dstack_key_path,
-        )
         profile = load_profile(Path.cwd(), configurator_args.profile)
         with console.status("Getting apply plan..."):
             run_plan = self.api.runs.get_run_plan(
@@ -146,6 +141,7 @@ class BaseRunConfigurator(
                 repo=repo,
                 configuration_path=configuration_path,
                 profile=profile,
+                ssh_identity_file=configurator_args.ssh_identity_file,
             )
 
         print_run_plan(run_plan, max_offers=configurator_args.max_offers)

dstack/_internal/core/backends/aws/compute.py
@@ -1,4 +1,5 @@
 import threading
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
@@ -34,7 +35,11 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.errors import (
     ComputeError,
     NoCapacityError,
@@ -159,10 +164,8 @@ class AWSCompute(
         )
         return availability_offers
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def _get_offers_cached_key(self, requirements: Requirements) -> int:
         # Requirements is not hashable, so we use a hack to get arguments hash

dstack/_internal/core/backends/azure/compute.py
@@ -1,8 +1,9 @@
 import base64
 import enum
 import re
+from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 from azure.core.credentials import TokenCredential
 from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
@@ -51,7 +52,11 @@ from dstack._internal.core.backends.base.compute import (
     merge_tags,
     requires_nvidia_proprietary_kernel_modules,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
 from dstack._internal.core.errors import ComputeError, NoCapacityError
 from dstack._internal.core.models.backends.base import BackendType
@@ -108,10 +113,8 @@ class AzureCompute(
         )
         return offers_with_availability
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def create_instance(
         self,

dstack/_internal/core/backends/base/compute.py
@@ -17,12 +17,13 @@ from cachetools import TTLCache, cachedmethod
 from gpuhunt import CPUArchitecture
 
 from dstack._internal import settings
-from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
+from dstack._internal.core.backends.base.offers import OfferModifier, filter_offers_by_requirements
 from dstack._internal.core.consts import (
     DSTACK_RUNNER_HTTP_PORT,
     DSTACK_RUNNER_SSH_PORT,
     DSTACK_SHIM_HTTP_PORT,
 )
+from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.configurations import LEGACY_REPO_DIR
 from dstack._internal.core.models.gateways import (
     GatewayComputeConfiguration,
@@ -168,17 +169,13 @@ class ComputeWithAllOffersCached(ABC):
         """
         pass
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Optional[
-        Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
-    ]:
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
         """
-        Returns a func that modifies offers before they are filtered by requirements.
-
+        Returns functions that modify offers before they are filtered by requirements.
+        A modifier function can return `None` to exclude the offer.
         E.g. can be used to set appropriate disk size based on requirements.
         """
-        return None
+        return []
 
     def get_offers_post_filter(
         self, requirements: Requirements
@@ -191,14 +188,7 @@ class ComputeWithAllOffersCached(ABC):
 
     def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         offers = self._get_all_offers_with_availability_cached()
-        modifier = self.get_offers_modifier(requirements)
-        if modifier is not None:
-            modified_offers = []
-            for o in offers:
-                modified_offer = modifier(o)
-                if modified_offer is not None:
-                    modified_offers.append(modified_offer)
-            offers = modified_offers
+        offers = self.__apply_modifiers(offers, self.get_offers_modifiers(requirements))
         offers = filter_offers_by_requirements(offers, requirements)
         post_filter = self.get_offers_post_filter(requirements)
         if post_filter is not None:
@@ -212,6 +202,20 @@ class ComputeWithAllOffersCached(ABC):
     def _get_all_offers_with_availability_cached(self) -> List[InstanceOfferWithAvailability]:
         return self.get_all_offers_with_availability()
 
+    @staticmethod
+    def __apply_modifiers(
+        offers: Iterable[InstanceOfferWithAvailability], modifiers: Iterable[OfferModifier]
+    ) -> list[InstanceOfferWithAvailability]:
+        modified_offers = []
+        for offer in offers:
+            for modifier in modifiers:
+                offer = modifier(offer)
+                if offer is None:
+                    break
+            else:
+                modified_offers.append(offer)
+        return modified_offers
+
 
 class ComputeWithFilteredOffersCached(ABC):
     """
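
The hunks above replace the single optional offer modifier with a chain: `get_offers_modifiers` returns a list, and the private `__apply_modifiers` helper runs the modifiers left to right, dropping an offer as soon as any modifier returns `None`. The same control flow in a self-contained sketch (the `Offer`, `drop_spot`, and `grow_disk_to` names are illustrative stand-ins, not dstack APIs):

from dataclasses import dataclass, replace
from typing import Callable, Iterable, List, Optional


@dataclass(frozen=True)
class Offer:
    # Stand-in for dstack's InstanceOfferWithAvailability, just enough for the demo.
    name: str
    spot: bool
    disk_gb: int


OfferModifier = Callable[[Offer], Optional[Offer]]


def drop_spot(offer: Offer) -> Optional[Offer]:
    # Returning None excludes the offer entirely.
    return None if offer.spot else offer


def grow_disk_to(min_gb: int) -> OfferModifier:
    # Adjusts a field instead of filtering, like get_offers_disk_modifier does for disk size.
    return lambda offer: replace(offer, disk_gb=max(offer.disk_gb, min_gb))


def apply_modifiers(offers: Iterable[Offer], modifiers: Iterable[OfferModifier]) -> List[Offer]:
    # Same for/else structure as ComputeWithAllOffersCached.__apply_modifiers:
    # the first modifier that returns None breaks out and the offer is skipped.
    result = []
    for offer in offers:
        for modifier in modifiers:
            offer = modifier(offer)
            if offer is None:
                break
        else:
            result.append(offer)
    return result


offers = [Offer("g2-standard-4", spot=False, disk_gb=50), Offer("g2-spot", spot=True, disk_gb=50)]
print(apply_modifiers(offers, [drop_spot, grow_disk_to(100)]))
# [Offer(name='g2-standard-4', spot=False, disk_gb=100)]
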
@@ -341,6 +345,15 @@ class ComputeWithMultinodeSupport:
 class ComputeWithReservationSupport:
     """
     Must be subclassed to support provisioning from reservations.
+
+    The following is expected from a backend that supports reservations:
+
+    - `get_offers` respects `Requirements.reservation` if set, and only returns
+      offers that can be provisioned in the configured reservation. It can
+      adjust some offer properties such as `availability` and
+      `availability_zones` if necessary.
+    - `create_instance` respects `InstanceConfig.reservation` if set, and
+      provisions the instance in the configured reservation.
     """
 
     pass
@@ -391,6 +404,16 @@ class ComputeWithPlacementGroupSupport(ABC):
         """
         pass
 
+    def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
+        """
+        Whether placement groups can be used for instances provisioned in reservations.
+
+        Arguments:
+            backend_type: matches the backend type of this compute, unless this compute is a proxy
+                for other backends (dstack Sky)
+        """
+        return True
+
 
 class ComputeWithGatewaySupport(ABC):
     """

dstack/_internal/core/backends/base/offers.py
@@ -23,6 +23,7 @@ SUPPORTED_GPUHUNT_FLAGS = [
     "oci-spot",
     "lambda-arm",
     "gcp-a4",
+    "gcp-g4-preview",
 ]
 
 
@@ -199,9 +200,12 @@ def choose_disk_size_mib(
     return round(disk_size_gib * 1024)
 
 
+OfferModifier = Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
+
+
 def get_offers_disk_modifier(
     configurable_disk_size: Range[Memory], requirements: Requirements
-) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+) -> OfferModifier:
     """
     Returns a func that modifies offers disk by setting min value that satisfies both
     `configurable_disk_size` and `requirements`.
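
Per its docstring, `get_offers_disk_modifier` sets each offer's disk to the smallest size that satisfies both the backend's configurable range and the run's requirements, excluding the offer when no such size exists. A simplified standalone illustration of that rule, using plain integers in GB instead of dstack's `Range[Memory]` (the `choose_min_disk_gb` helper is hypothetical, not the library function):

from typing import Optional


def choose_min_disk_gb(
    backend_min_gb: int,
    backend_max_gb: int,
    requested_min_gb: Optional[int],
) -> Optional[int]:
    # Smallest disk the backend allows that still meets the requested minimum;
    # None signals that the offer should be excluded, mirroring OfferModifier semantics.
    candidate = max(backend_min_gb, requested_min_gb or 0)
    if candidate > backend_max_gb:
        return None
    return candidate


print(choose_min_disk_gb(20, 32_000, None))    # 20 -> backend minimum is enough
print(choose_min_disk_gb(20, 32_000, 100))     # 100 -> grow to the requested minimum
print(choose_min_disk_gb(20, 32_000, 64_000))  # None -> request exceeds the backend maximum
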

dstack/_internal/core/backends/datacrunch/compute.py
@@ -1,4 +1,5 @@
-from typing import Callable, Dict, List, Optional
+from collections.abc import Iterable
+from typing import Dict, List, Optional
 
 from datacrunch import DataCrunchClient
 from datacrunch.exceptions import APIException
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
     generate_unique_instance_name,
     get_shim_commands,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
+    get_catalog_offers,
+    get_offers_disk_modifier,
+)
 from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig
 from dstack._internal.core.errors import NoCapacityError
 from dstack._internal.core.models.backends.base import BackendType
@@ -59,10 +64,8 @@ class DataCrunchCompute(
         offers_with_availability = self._get_offers_with_availability(offers)
         return offers_with_availability
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
 
     def _get_offers_with_availability(
         self, offers: List[InstanceOffer]

dstack/_internal/core/backends/gcp/compute.py
@@ -1,7 +1,9 @@
 import concurrent.futures
 import json
+import re
 import threading
 from collections import defaultdict
+from collections.abc import Iterable
 from dataclasses import dataclass
 from typing import Callable, Dict, List, Literal, Optional, Tuple
 
@@ -24,6 +26,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPlacementGroupSupport,
     ComputeWithPrivateGatewaySupport,
     ComputeWithPrivilegedSupport,
+    ComputeWithReservationSupport,
     ComputeWithVolumeSupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name,
@@ -35,6 +38,7 @@ from dstack._internal.core.backends.base.compute import (
     requires_nvidia_proprietary_kernel_modules,
 )
 from dstack._internal.core.backends.base.offers import (
+    OfferModifier,
     get_catalog_offers,
     get_offers_disk_modifier,
 )
@@ -78,8 +82,11 @@ logger = get_logger(__name__)
 # pd-balanced disks can be 10GB-64TB, but dstack images are 20GB and cannot grow larger
 # than 32TB because of filesystem settings
 CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("20GB"), max=Memory.parse("32TB"))
-
-
+# Pattern from https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_a_specific_reservation
+RESERVATION_PATTERN = re.compile(
+    r"projects/(?P<project_id>[a-z0-9-]+)/reservations/(?P<reservation_name>[a-z0-9-]+)"
+)
+RESOURCE_NAME_PATTERN = re.compile(r"[a-z0-9-]+")
 TPU_VERSIONS = [tpu.name for tpu in KNOWN_TPUS]
 
 
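
The two module-level patterns added above distinguish a fully qualified reservation path (the cross-project form from the linked GCP docs) from a bare reservation name in the backend's own project; anything else is treated as misconfigured or as a reservation belonging to another backend. A quick standalone check of that branching (the `classify` helper is illustrative only):

import re

RESERVATION_PATTERN = re.compile(
    r"projects/(?P<project_id>[a-z0-9-]+)/reservations/(?P<reservation_name>[a-z0-9-]+)"
)
RESOURCE_NAME_PATTERN = re.compile(r"[a-z0-9-]+")


def classify(configured_name: str) -> str:
    if m := RESERVATION_PATTERN.fullmatch(configured_name):
        return f"project={m['project_id']} reservation={m['reservation_name']}"
    if RESOURCE_NAME_PATTERN.fullmatch(configured_name):
        return "bare name, looked up in the backend's own project"
    return "misconfigured or non-GCP reservation"


print(classify("projects/my-proj/reservations/my-res"))  # project=my-proj reservation=my-res
print(classify("my-res"))                                # bare name, looked up in the backend's own project
print(classify("arn:aws:ec2:..."))                       # misconfigured or non-GCP reservation
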
@@ -93,6 +100,7 @@ class GCPCompute(
     ComputeWithCreateInstanceSupport,
     ComputeWithPrivilegedSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithReservationSupport,
     ComputeWithPlacementGroupSupport,
     ComputeWithGatewaySupport,
     ComputeWithPrivateGatewaySupport,
@@ -113,8 +121,12 @@ class GCPCompute(
         self.resource_policies_client = compute_v1.ResourcePoliciesClient(
             credentials=self.credentials
         )
+        self.reservations_client = compute_v1.ReservationsClient(credentials=self.credentials)
         self._usable_subnets_cache_lock = threading.Lock()
         self._usable_subnets_cache = TTLCache(maxsize=1, ttl=120)
+        self._find_reservation_cache_lock = threading.Lock()
+        # smaller TTL, since we check the reservation's in_use_count, which can change often
+        self._find_reservation_cache = TTLCache(maxsize=8, ttl=20)
 
     def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         regions = get_or_error(self.config.regions)
@@ -130,13 +142,19 @@ class GCPCompute(
         offer_keys_to_offers = {}
         offers_with_availability = []
         for offer in offers:
+            preview = False
+            if offer.instance.name.startswith("g4-standard-"):
+                if self.config.preview_features and "g4" in self.config.preview_features:
+                    preview = True
+                else:
+                    continue
             region = offer.region[:-2]  # strip zone
             key = (_unique_instance_name(offer.instance), region)
             if key in offer_keys_to_offers:
                 offer_keys_to_offers[key].availability_zones.append(offer.region)
                 continue
             availability = InstanceAvailability.NO_QUOTA
-            if _has_gpu_quota(quotas[region], offer.instance.resources):
+            if preview or _has_gpu_quota(quotas[region], offer.instance.resources):
                 availability = InstanceAvailability.UNKNOWN
             # todo quotas: cpu, memory, global gpu, tpu
             offer_with_availability = InstanceOfferWithAvailability(
@@ -149,10 +167,40 @@ class GCPCompute(
             offers_with_availability[-1].region = region
         return offers_with_availability
 
-    def get_offers_modifier(
-        self, requirements: Requirements
-    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
-        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+    def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
+        modifiers = []
+
+        if requirements.reservation:
+            zone_to_reservation = self._find_reservation(requirements.reservation)
+
+            def reservation_modifier(
+                offer: InstanceOfferWithAvailability,
+            ) -> Optional[InstanceOfferWithAvailability]:
+                if offer.instance.resources.spot:
+                    return None
+                assert offer.availability_zones is not None
+                matching_zones = []
+                zones_with_capacity = []
+                for zone in offer.availability_zones:
+                    reservation = zone_to_reservation.get(zone)
+                    if reservation is not None and _offer_matches_reservation(offer, reservation):
+                        matching_zones.append(zone)
+                        if _reservation_has_capacity(reservation):
+                            zones_with_capacity.append(zone)
+                if not matching_zones:
+                    return None
+                offer = offer.copy(deep=True)
+                if zones_with_capacity:
+                    offer.availability_zones = zones_with_capacity
+                else:
+                    offer.availability_zones = matching_zones
+                    offer.availability = InstanceAvailability.NOT_AVAILABLE
+                return offer
+
+            modifiers.append(reservation_modifier)
+
+        modifiers.append(get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements))
+        return modifiers
 
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
@@ -305,6 +353,16 @@ class GCPCompute(
         )
 
         for zone in zones:
+            reservation = None
+            if instance_config.reservation:
+                reservation = self._find_reservation(instance_config.reservation).get(zone)
+                if reservation is None:
+                    logger.warning(
+                        "Reservation %s no longer exists in zone %s",
+                        instance_config.reservation,
+                        zone,
+                    )
+                    continue
             request = compute_v1.InsertInstanceRequest()
             request.zone = zone
             request.project = self.config.project_id
@@ -335,6 +393,7 @@ class GCPCompute(
                 roce_subnetworks=roce_subnets,
                 allocate_public_ip=allocate_public_ip,
                 placement_policy=placement_policy,
+                reservation=reservation,
             )
             try:
                 # GCP needs some time to return an error in case of no capacity (< 30s).
@@ -475,6 +534,11 @@ class GCPCompute(
     ) -> bool:
         return placement_group.configuration.region == instance_offer.region
 
+    def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
+        # Cannot use our own placement policies when provisioning in a reservation.
+        # Instead, we use the placement policy defined in reservation settings.
+        return False
+
     def create_gateway(
         self,
         configuration: GatewayComputeConfiguration,
@@ -880,6 +944,26 @@ class GCPCompute(
             usable_subnets=self._list_usable_subnets(),
         )
 
+    @cachedmethod(
+        cache=lambda self: self._find_reservation_cache,
+        lock=lambda self: self._find_reservation_cache_lock,
+    )
+    def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reservation]:
+        if match := RESERVATION_PATTERN.fullmatch(configured_name):
+            project_id = match.group("project_id")
+            name = match.group("reservation_name")
+        elif RESOURCE_NAME_PATTERN.fullmatch(configured_name):
+            project_id = self.config.project_id
+            name = configured_name
+        else:
+            # misconfigured or non-GCP
+            return {}
+        return gcp_resources.find_reservation(
+            reservations_client=self.reservations_client,
+            project_id=project_id,
+            name=name,
+        )
+
 
 def _supported_instances_and_zones(
     regions: List[str],
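
`_find_reservation` above is wrapped in `cachetools.cachedmethod` with a small, short-lived `TTLCache` because the reservation's `in_use_count` changes as instances are provisioned and released. The caching pattern in isolation, with the GCP aggregated-list call replaced by a placeholder lookup:

import threading

from cachetools import TTLCache, cachedmethod


class ReservationLookup:
    def __init__(self) -> None:
        self._cache_lock = threading.Lock()
        # Short TTL: cached results go stale quickly as reservations fill up.
        self._cache = TTLCache(maxsize=8, ttl=20)
        self.calls = 0

    @cachedmethod(cache=lambda self: self._cache, lock=lambda self: self._cache_lock)
    def expensive_lookup(self, name: str) -> dict:
        # Placeholder for the aggregated-list API call; returns zone -> reservation.
        self.calls += 1
        return {"us-central1-a": f"reservation:{name}"}


lookup = ReservationLookup()
lookup.expensive_lookup("my-res")
lookup.expensive_lookup("my-res")  # served from the TTL cache
print(lookup.calls)  # 1
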
@@ -933,6 +1017,52 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
     return len(resources.gpus) <= quotas.get(quota_name, 0)
 
 
+def _offer_matches_reservation(
+    offer: InstanceOfferWithAvailability, reservation: compute_v1.Reservation
+) -> bool:
+    if (
+        reservation.specific_reservation is None
+        or reservation.specific_reservation.instance_properties is None
+    ):
+        return False
+    properties = reservation.specific_reservation.instance_properties
+    if properties.machine_type != offer.instance.name:
+        return False
+    accelerators = properties.guest_accelerators or []
+    if not accelerators and offer.instance.resources.gpus:
+        return False
+    if len(accelerators) > 1:
+        logger.warning(
+            "Expected 0 or 1 accelerator types per instance,"
+            f" but {properties.machine_type} has {len(accelerators)}."
+            f" Ignoring reservation {reservation.self_link}"
+        )
+        return False
+    if accelerators:
+        if accelerators[0].accelerator_count != len(offer.instance.resources.gpus):
+            return False
+        if (
+            offer.instance.resources.gpus
+            and gcp_resources.find_accelerator_name(
+                offer.instance.resources.gpus[0].name,
+                offer.instance.resources.gpus[0].memory_mib,
+            )
+            != accelerators[0].accelerator_type
+        ):
+            return False
+    return True
+
+
+def _reservation_has_capacity(reservation: compute_v1.Reservation) -> bool:
+    return (
+        reservation.specific_reservation is not None
+        and reservation.specific_reservation.in_use_count is not None
+        and reservation.specific_reservation.assured_count is not None
+        and reservation.specific_reservation.in_use_count
+        < reservation.specific_reservation.assured_count
+    )
+
+
 def _unique_instance_name(instance: InstanceType) -> str:
     if instance.resources.spot:
         name = f"{instance.name}-spot"

dstack/_internal/core/backends/gcp/models.py
@@ -89,6 +89,13 @@ class GCPBackendConfig(CoreModel):
             description="The tags (labels) that will be assigned to resources created by `dstack`"
         ),
     ] = None
+    preview_features: Annotated[
+        Optional[List[Literal["g4"]]],
+        Field(
+            description=("The list of preview GCP features to enable. Supported values: `g4`"),
+            max_items=1,
+        ),
+    ] = None
 
 
 class GCPBackendConfigWithCreds(GCPBackendConfig):

dstack/_internal/core/backends/gcp/resources.py
@@ -26,9 +26,35 @@ supported_accelerators = [
     {"accelerator_name": "nvidia-tesla-t4", "gpu_name": "T4", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-v100", "gpu_name": "V100", "memory_mb": 1024 * 16},
     {"accelerator_name": "nvidia-tesla-p100", "gpu_name": "P100", "memory_mb": 1024 * 16},
+    {"accelerator_name": "nvidia-rtx-pro-6000", "gpu_name": "RTXPRO6000", "memory_mb": 1024 * 96},
 ]
 
 
+def find_accelerator_name(gpu_name: str, memory_mib: int) -> Optional[str]:
+    for acc in supported_accelerators:
+        if gpu_name == acc["gpu_name"] and memory_mib == acc["memory_mb"]:
+            return acc["accelerator_name"]
+    return None
+
+
+def sanitize_filter_value(value: str) -> str:
+    """
+    Escape characters that could break the Compute Engine API filter string.
+    """
+    return value.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def get_resource_project(resource_url: str) -> str:
+    """
+    Extract the project ID from a URL like
+    https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name
+    """
+    matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url)
+    if not matches:
+        raise BackendError(f"Invalid resource URL {resource_url}")
+    return matches[0]
+
+
 def get_availability_zones(
     regions_client: compute_v1.RegionsClient,
     project_id: str,
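
The behaviour of the new helpers can be seen with a couple of inputs. The snippet below re-implements `sanitize_filter_value` and `get_resource_project` inline so it runs without the GCP client libraries, and substitutes `ValueError` for dstack's `BackendError`; it is a sketch of the expected behaviour, not the module itself:

import re


def sanitize_filter_value(value: str) -> str:
    # Escape backslashes first, then double quotes, so the escaping added for
    # quotes is not itself re-escaped.
    return value.replace("\\", "\\\\").replace('"', '\\"')


def get_resource_project(resource_url: str) -> str:
    # ValueError stands in for dstack's BackendError here.
    matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url)
    if not matches:
        raise ValueError(f"Invalid resource URL {resource_url}")
    return matches[0]


print(sanitize_filter_value('res"name'))  # res\"name
print(
    get_resource_project(
        "https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name"
    )
)  # proj-id
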
@@ -123,6 +149,7 @@ def create_instance_struct(
     roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
     allocate_public_ip: bool = True,
     placement_policy: Optional[str] = None,
+    reservation: Optional[compute_v1.Reservation] = None,
 ) -> compute_v1.Instance:
     instance = compute_v1.Instance()
     instance.name = instance_name
@@ -147,6 +174,25 @@ def create_instance_struct(
         initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced"
     disk.initialize_params = initialize_params
     instance.disks = [disk]
+    if (
+        reservation is not None
+        and reservation.specific_reservation is not None
+        and reservation.specific_reservation.instance_properties is not None
+        and reservation.specific_reservation.instance_properties.local_ssds is not None
+    ):
+        for local_ssd in reservation.specific_reservation.instance_properties.local_ssds:
+            instance.disks.append(
+                compute_v1.AttachedDisk(
+                    auto_delete=True,
+                    boot=False,
+                    type_="SCRATCH",
+                    initialize_params=compute_v1.AttachedDiskInitializeParams(
+                        disk_type=f"zones/{zone}/diskTypes/local-ssd",
+                        disk_size_gb=local_ssd.disk_size_gb,
+                    ),
+                    interface=local_ssd.interface,
+                )
+            )
 
     if accelerators:
         instance.guest_accelerators = accelerators
@@ -162,6 +208,8 @@ def create_instance_struct(
 
     if placement_policy is not None:
         instance.resource_policies = [placement_policy]
+    elif reservation is not None and "placement" in reservation.resource_policies:
+        instance.resource_policies = [reservation.resource_policies["placement"]]
 
     if spot:
         instance.scheduling = compute_v1.Scheduling()
@@ -187,6 +235,17 @@ def create_instance_struct(
         )
     ]
 
+    if reservation is not None:
+        reservation_project = get_resource_project(reservation.self_link)
+        instance.reservation_affinity = compute_v1.ReservationAffinity()
+        instance.reservation_affinity.consume_reservation_type = (
+            compute_v1.ReservationAffinity.ConsumeReservationType.SPECIFIC_RESERVATION.name
+        )
+        instance.reservation_affinity.key = "compute.googleapis.com/reservation-name"
+        instance.reservation_affinity.values = [
+            f"projects/{reservation_project}/reservations/{reservation.name}"
+        ]
+
     return instance
 
 
@@ -350,11 +409,8 @@ def get_accelerators(
         return []
     accelerator_config = compute_v1.AcceleratorConfig()
     accelerator_config.accelerator_count = len(gpus)
-    for acc in supported_accelerators:
-        if gpus[0].name == acc["gpu_name"] and gpus[0].memory_mib == acc["memory_mb"]:
-            accelerator_name = acc["accelerator_name"]
-            break
-    else:
+    accelerator_name = find_accelerator_name(gpus[0].name, gpus[0].memory_mib)
+    if accelerator_name is None:
         raise ValueError(f"Unsupported GPU: {gpus[0].name} {gpus[0].memory_mib} MiB")
     accelerator_config.accelerator_type = (
         f"projects/{project_id}/zones/{zone}/acceleratorTypes/{accelerator_name}"
@@ -362,6 +418,31 @@ def get_accelerators(
     return [accelerator_config]
 
 
+def find_reservation(
+    reservations_client: compute_v1.ReservationsClient,
+    project_id: str,
+    name: str,
+) -> dict[str, compute_v1.Reservation]:
+    request = compute_v1.AggregatedListReservationsRequest(
+        project=project_id,
+        filter=(
+            f'(name = "{sanitize_filter_value(name)}")'
+            ' AND (status = "READY")'
+            " AND (specificReservationRequired = true)"
+        ),
+    )
+    try:
+        aggregated_reservations = reservations_client.aggregated_list(request=request)
+    except (google.api_core.exceptions.NotFound, google.api_core.exceptions.Forbidden) as e:
+        logger.warning("Could not find reservation: %s", e)
+        return {}
+    zone_to_reservation = {}
+    for zone, zone_reservations in aggregated_reservations:
+        if zone_reservations.reservations:
+            zone_to_reservation[zone.split("/")[-1]] = zone_reservations.reservations[0]
+    return zone_to_reservation
+
+
 def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
     filtered_labels = {}
     for k, v in labels.items():
@@ -499,5 +580,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
             "h3-",
             "v6e",
             "a4-",
+            "g4-",
         ]
     )