dstack: dstack-0.19.32-py3-none-any.whl → dstack-0.19.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. More details are available on the original page (the link was not preserved in this text).

Files changed (46):
  1. dstack/_internal/cli/commands/offer.py +1 -1
  2. dstack/_internal/cli/services/configurators/run.py +1 -5
  3. dstack/_internal/core/backends/aws/compute.py +8 -5
  4. dstack/_internal/core/backends/azure/compute.py +9 -6
  5. dstack/_internal/core/backends/base/compute.py +40 -17
  6. dstack/_internal/core/backends/base/offers.py +5 -1
  7. dstack/_internal/core/backends/datacrunch/compute.py +9 -6
  8. dstack/_internal/core/backends/gcp/compute.py +137 -7
  9. dstack/_internal/core/backends/gcp/models.py +7 -0
  10. dstack/_internal/core/backends/gcp/resources.py +87 -5
  11. dstack/_internal/core/backends/hotaisle/compute.py +11 -1
  12. dstack/_internal/core/backends/kubernetes/compute.py +161 -83
  13. dstack/_internal/core/backends/kubernetes/models.py +4 -2
  14. dstack/_internal/core/backends/nebius/compute.py +9 -6
  15. dstack/_internal/core/backends/oci/compute.py +9 -6
  16. dstack/_internal/core/backends/runpod/compute.py +10 -6
  17. dstack/_internal/core/backends/vastai/compute.py +3 -1
  18. dstack/_internal/core/backends/vastai/configurator.py +0 -1
  19. dstack/_internal/core/models/fleets.py +1 -1
  20. dstack/_internal/core/models/profiles.py +1 -1
  21. dstack/_internal/core/models/runs.py +3 -2
  22. dstack/_internal/core/models/users.py +10 -0
  23. dstack/_internal/core/services/configs/__init__.py +1 -0
  24. dstack/_internal/server/background/tasks/process_instances.py +5 -1
  25. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -0
  26. dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py +34 -0
  27. dstack/_internal/server/models.py +3 -0
  28. dstack/_internal/server/routers/runs.py +5 -1
  29. dstack/_internal/server/routers/users.py +14 -2
  30. dstack/_internal/server/services/runs.py +9 -4
  31. dstack/_internal/server/services/users.py +35 -2
  32. dstack/_internal/server/statics/index.html +1 -1
  33. dstack/_internal/server/statics/main-720ce3a11140daa480cc.css +3 -0
  34. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js → main-97c7e184573ca23f9fe4.js} +12218 -7625
  35. dstack/_internal/server/statics/{main-c51afa7f243e24d3e446.js.map → main-97c7e184573ca23f9fe4.js.map} +1 -1
  36. dstack/api/_public/__init__.py +9 -12
  37. dstack/api/_public/repos.py +0 -21
  38. dstack/api/_public/runs.py +64 -9
  39. dstack/api/server/_users.py +17 -2
  40. dstack/version.py +2 -2
  41. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/METADATA +2 -2
  42. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/RECORD +45 -44
  43. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +0 -3
  44. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/WHEEL +0 -0
  45. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/entry_points.txt +0 -0
  46. {dstack-0.19.32.dist-info → dstack-0.19.33.dist-info}/licenses/LICENSE.md +0 -0
@@ -104,8 +104,8 @@ class OfferCommand(APIBaseCommand):
104
104
 
105
105
  run_spec = RunSpec(
106
106
  configuration=conf,
107
- ssh_key_pub="(dummy)",
108
107
  profile=profile,
108
+ ssh_key_pub="(dummy)", # TODO: Remove since 0.19.40
109
109
  )
110
110
 
111
111
  if args.group_by:
@@ -62,7 +62,6 @@ from dstack._internal.utils.interpolator import InterpolatorError, VariablesInte
62
62
  from dstack._internal.utils.logging import get_logger
63
63
  from dstack._internal.utils.nested_list import NestedList, NestedListItem
64
64
  from dstack._internal.utils.path import is_absolute_posix_path
65
- from dstack.api._public.repos import get_ssh_keypair
66
65
  from dstack.api._public.runs import Run
67
66
  from dstack.api.server import APIClient
68
67
  from dstack.api.utils import load_profile
@@ -135,10 +134,6 @@ class BaseRunConfigurator(
135
134
 
136
135
  config_manager = ConfigManager()
137
136
  repo = self.get_repo(conf, configuration_path, configurator_args, config_manager)
138
- self.api.ssh_identity_file = get_ssh_keypair(
139
- configurator_args.ssh_identity_file,
140
- config_manager.dstack_key_path,
141
- )
142
137
  profile = load_profile(Path.cwd(), configurator_args.profile)
143
138
  with console.status("Getting apply plan..."):
144
139
  run_plan = self.api.runs.get_run_plan(
@@ -146,6 +141,7 @@ class BaseRunConfigurator(
146
141
  repo=repo,
147
142
  configuration_path=configuration_path,
148
143
  profile=profile,
144
+ ssh_identity_file=configurator_args.ssh_identity_file,
149
145
  )
150
146
 
151
147
  print_run_plan(run_plan, max_offers=configurator_args.max_offers)
@@ -1,4 +1,5 @@
1
1
  import threading
2
+ from collections.abc import Iterable
2
3
  from concurrent.futures import ThreadPoolExecutor, as_completed
3
4
  from typing import Any, Callable, Dict, List, Optional, Tuple
4
5
 
@@ -34,7 +35,11 @@ from dstack._internal.core.backends.base.compute import (
34
35
  get_user_data,
35
36
  merge_tags,
36
37
  )
37
- from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
38
+ from dstack._internal.core.backends.base.offers import (
39
+ OfferModifier,
40
+ get_catalog_offers,
41
+ get_offers_disk_modifier,
42
+ )
38
43
  from dstack._internal.core.errors import (
39
44
  ComputeError,
40
45
  NoCapacityError,
@@ -159,10 +164,8 @@ class AWSCompute(
159
164
  )
160
165
  return availability_offers
161
166
 
162
- def get_offers_modifier(
163
- self, requirements: Requirements
164
- ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
165
- return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
167
+ def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
168
+ return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
166
169
 
167
170
  def _get_offers_cached_key(self, requirements: Requirements) -> int:
168
171
  # Requirements is not hashable, so we use a hack to get arguments hash
@@ -1,8 +1,9 @@
1
1
  import base64
2
2
  import enum
3
3
  import re
4
+ from collections.abc import Iterable
4
5
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
- from typing import Callable, Dict, List, Optional, Tuple
6
+ from typing import Dict, List, Optional, Tuple
6
7
 
7
8
  from azure.core.credentials import TokenCredential
8
9
  from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
@@ -51,7 +52,11 @@ from dstack._internal.core.backends.base.compute import (
51
52
  merge_tags,
52
53
  requires_nvidia_proprietary_kernel_modules,
53
54
  )
54
- from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
55
+ from dstack._internal.core.backends.base.offers import (
56
+ OfferModifier,
57
+ get_catalog_offers,
58
+ get_offers_disk_modifier,
59
+ )
55
60
  from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
56
61
  from dstack._internal.core.errors import ComputeError, NoCapacityError
57
62
  from dstack._internal.core.models.backends.base import BackendType
@@ -108,10 +113,8 @@ class AzureCompute(
108
113
  )
109
114
  return offers_with_availability
110
115
 
111
- def get_offers_modifier(
112
- self, requirements: Requirements
113
- ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
114
- return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
116
+ def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
117
+ return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
115
118
 
116
119
  def create_instance(
117
120
  self,
@@ -17,12 +17,13 @@ from cachetools import TTLCache, cachedmethod
17
17
  from gpuhunt import CPUArchitecture
18
18
 
19
19
  from dstack._internal import settings
20
- from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
20
+ from dstack._internal.core.backends.base.offers import OfferModifier, filter_offers_by_requirements
21
21
  from dstack._internal.core.consts import (
22
22
  DSTACK_RUNNER_HTTP_PORT,
23
23
  DSTACK_RUNNER_SSH_PORT,
24
24
  DSTACK_SHIM_HTTP_PORT,
25
25
  )
26
+ from dstack._internal.core.models.backends.base import BackendType
26
27
  from dstack._internal.core.models.configurations import LEGACY_REPO_DIR
27
28
  from dstack._internal.core.models.gateways import (
28
29
  GatewayComputeConfiguration,
@@ -168,17 +169,13 @@ class ComputeWithAllOffersCached(ABC):
168
169
  """
169
170
  pass
170
171
 
171
- def get_offers_modifier(
172
- self, requirements: Requirements
173
- ) -> Optional[
174
- Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
175
- ]:
172
+ def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
176
173
  """
177
- Returns a modifier function that modifies offers before they are filtered by requirements.
178
- Can return `None` to exclude the offer.
174
+ Returns functions that modify offers before they are filtered by requirements.
175
+ A modifier function can return `None` to exclude the offer.
179
176
  E.g. can be used to set appropriate disk size based on requirements.
180
177
  """
181
- return None
178
+ return []
182
179
 
183
180
  def get_offers_post_filter(
184
181
  self, requirements: Requirements
@@ -191,14 +188,7 @@ class ComputeWithAllOffersCached(ABC):
191
188
 
192
189
  def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
193
190
  offers = self._get_all_offers_with_availability_cached()
194
- modifier = self.get_offers_modifier(requirements)
195
- if modifier is not None:
196
- modified_offers = []
197
- for o in offers:
198
- modified_offer = modifier(o)
199
- if modified_offer is not None:
200
- modified_offers.append(modified_offer)
201
- offers = modified_offers
191
+ offers = self.__apply_modifiers(offers, self.get_offers_modifiers(requirements))
202
192
  offers = filter_offers_by_requirements(offers, requirements)
203
193
  post_filter = self.get_offers_post_filter(requirements)
204
194
  if post_filter is not None:
@@ -212,6 +202,20 @@ class ComputeWithAllOffersCached(ABC):
212
202
  def _get_all_offers_with_availability_cached(self) -> List[InstanceOfferWithAvailability]:
213
203
  return self.get_all_offers_with_availability()
214
204
 
205
+ @staticmethod
206
+ def __apply_modifiers(
207
+ offers: Iterable[InstanceOfferWithAvailability], modifiers: Iterable[OfferModifier]
208
+ ) -> list[InstanceOfferWithAvailability]:
209
+ modified_offers = []
210
+ for offer in offers:
211
+ for modifier in modifiers:
212
+ offer = modifier(offer)
213
+ if offer is None:
214
+ break
215
+ else:
216
+ modified_offers.append(offer)
217
+ return modified_offers
218
+
215
219
 
216
220
  class ComputeWithFilteredOffersCached(ABC):
217
221
  """
@@ -341,6 +345,15 @@ class ComputeWithMultinodeSupport:
341
345
  class ComputeWithReservationSupport:
342
346
  """
343
347
  Must be subclassed to support provisioning from reservations.
348
+
349
+ The following is expected from a backend that supports reservations:
350
+
351
+ - `get_offers` respects `Requirements.reservation` if set, and only returns
352
+ offers that can be provisioned in the configured reservation. It can
353
+ adjust some offer properties such as `availability` and
354
+ `availability_zones` if necessary.
355
+ - `create_instance` respects `InstanceConfig.reservation` if set, and
356
+ provisions the instance in the configured reservation.
344
357
  """
345
358
 
346
359
  pass
@@ -391,6 +404,16 @@ class ComputeWithPlacementGroupSupport(ABC):
391
404
  """
392
405
  pass
393
406
 
407
+ def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
408
+ """
409
+ Whether placement groups can be used for instances provisioned in reservations.
410
+
411
+ Arguments:
412
+ backend_type: matches the backend type of this compute, unless this compute is a proxy
413
+ for other backends (dstack Sky)
414
+ """
415
+ return True
416
+
394
417
 
395
418
  class ComputeWithGatewaySupport(ABC):
396
419
  """
@@ -23,6 +23,7 @@ SUPPORTED_GPUHUNT_FLAGS = [
23
23
  "oci-spot",
24
24
  "lambda-arm",
25
25
  "gcp-a4",
26
+ "gcp-g4-preview",
26
27
  ]
27
28
 
28
29
 
@@ -199,9 +200,12 @@ def choose_disk_size_mib(
199
200
  return round(disk_size_gib * 1024)
200
201
 
201
202
 
203
+ OfferModifier = Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
204
+
205
+
202
206
  def get_offers_disk_modifier(
203
207
  configurable_disk_size: Range[Memory], requirements: Requirements
204
- ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
208
+ ) -> OfferModifier:
205
209
  """
206
210
  Returns a func that modifies offers disk by setting min value that satisfies both
207
211
  `configurable_disk_size` and `requirements`.
@@ -1,4 +1,5 @@
1
- from typing import Callable, Dict, List, Optional
1
+ from collections.abc import Iterable
2
+ from typing import Dict, List, Optional
2
3
 
3
4
  from datacrunch import DataCrunchClient
4
5
  from datacrunch.exceptions import APIException
@@ -12,7 +13,11 @@ from dstack._internal.core.backends.base.compute import (
12
13
  generate_unique_instance_name,
13
14
  get_shim_commands,
14
15
  )
15
- from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
16
+ from dstack._internal.core.backends.base.offers import (
17
+ OfferModifier,
18
+ get_catalog_offers,
19
+ get_offers_disk_modifier,
20
+ )
16
21
  from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig
17
22
  from dstack._internal.core.errors import NoCapacityError
18
23
  from dstack._internal.core.models.backends.base import BackendType
@@ -59,10 +64,8 @@ class DataCrunchCompute(
59
64
  offers_with_availability = self._get_offers_with_availability(offers)
60
65
  return offers_with_availability
61
66
 
62
- def get_offers_modifier(
63
- self, requirements: Requirements
64
- ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
65
- return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
67
+ def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
68
+ return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
66
69
 
67
70
  def _get_offers_with_availability(
68
71
  self, offers: List[InstanceOffer]
@@ -1,7 +1,9 @@
1
1
  import concurrent.futures
2
2
  import json
3
+ import re
3
4
  import threading
4
5
  from collections import defaultdict
6
+ from collections.abc import Iterable
5
7
  from dataclasses import dataclass
6
8
  from typing import Callable, Dict, List, Literal, Optional, Tuple
7
9
 
@@ -24,6 +26,7 @@ from dstack._internal.core.backends.base.compute import (
24
26
  ComputeWithPlacementGroupSupport,
25
27
  ComputeWithPrivateGatewaySupport,
26
28
  ComputeWithPrivilegedSupport,
29
+ ComputeWithReservationSupport,
27
30
  ComputeWithVolumeSupport,
28
31
  generate_unique_gateway_instance_name,
29
32
  generate_unique_instance_name,
@@ -35,6 +38,7 @@ from dstack._internal.core.backends.base.compute import (
35
38
  requires_nvidia_proprietary_kernel_modules,
36
39
  )
37
40
  from dstack._internal.core.backends.base.offers import (
41
+ OfferModifier,
38
42
  get_catalog_offers,
39
43
  get_offers_disk_modifier,
40
44
  )
@@ -78,8 +82,11 @@ logger = get_logger(__name__)
78
82
  # pd-balanced disks can be 10GB-64TB, but dstack images are 20GB and cannot grow larger
79
83
  # than 32TB because of filesystem settings
80
84
  CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("20GB"), max=Memory.parse("32TB"))
81
-
82
-
85
+ # Pattern from https://cloud.google.com/compute/docs/instances/reservations-consume#consuming_instances_from_a_specific_reservation
86
+ RESERVATION_PATTERN = re.compile(
87
+ r"projects/(?P<project_id>[a-z0-9-]+)/reservations/(?P<reservation_name>[a-z0-9-]+)"
88
+ )
89
+ RESOURCE_NAME_PATTERN = re.compile(r"[a-z0-9-]+")
83
90
  TPU_VERSIONS = [tpu.name for tpu in KNOWN_TPUS]
84
91
 
85
92
 
@@ -93,6 +100,7 @@ class GCPCompute(
93
100
  ComputeWithCreateInstanceSupport,
94
101
  ComputeWithPrivilegedSupport,
95
102
  ComputeWithMultinodeSupport,
103
+ ComputeWithReservationSupport,
96
104
  ComputeWithPlacementGroupSupport,
97
105
  ComputeWithGatewaySupport,
98
106
  ComputeWithPrivateGatewaySupport,
@@ -113,8 +121,12 @@ class GCPCompute(
113
121
  self.resource_policies_client = compute_v1.ResourcePoliciesClient(
114
122
  credentials=self.credentials
115
123
  )
124
+ self.reservations_client = compute_v1.ReservationsClient(credentials=self.credentials)
116
125
  self._usable_subnets_cache_lock = threading.Lock()
117
126
  self._usable_subnets_cache = TTLCache(maxsize=1, ttl=120)
127
+ self._find_reservation_cache_lock = threading.Lock()
128
+ # smaller TTL, since we check the reservation's in_use_count, which can change often
129
+ self._find_reservation_cache = TTLCache(maxsize=8, ttl=20)
118
130
 
119
131
  def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
120
132
  regions = get_or_error(self.config.regions)
@@ -130,13 +142,19 @@ class GCPCompute(
130
142
  offer_keys_to_offers = {}
131
143
  offers_with_availability = []
132
144
  for offer in offers:
145
+ preview = False
146
+ if offer.instance.name.startswith("g4-standard-"):
147
+ if self.config.preview_features and "g4" in self.config.preview_features:
148
+ preview = True
149
+ else:
150
+ continue
133
151
  region = offer.region[:-2] # strip zone
134
152
  key = (_unique_instance_name(offer.instance), region)
135
153
  if key in offer_keys_to_offers:
136
154
  offer_keys_to_offers[key].availability_zones.append(offer.region)
137
155
  continue
138
156
  availability = InstanceAvailability.NO_QUOTA
139
- if _has_gpu_quota(quotas[region], offer.instance.resources):
157
+ if preview or _has_gpu_quota(quotas[region], offer.instance.resources):
140
158
  availability = InstanceAvailability.UNKNOWN
141
159
  # todo quotas: cpu, memory, global gpu, tpu
142
160
  offer_with_availability = InstanceOfferWithAvailability(
@@ -149,10 +167,40 @@ class GCPCompute(
149
167
  offers_with_availability[-1].region = region
150
168
  return offers_with_availability
151
169
 
152
- def get_offers_modifier(
153
- self, requirements: Requirements
154
- ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
155
- return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
170
+ def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
171
+ modifiers = []
172
+
173
+ if requirements.reservation:
174
+ zone_to_reservation = self._find_reservation(requirements.reservation)
175
+
176
+ def reservation_modifier(
177
+ offer: InstanceOfferWithAvailability,
178
+ ) -> Optional[InstanceOfferWithAvailability]:
179
+ if offer.instance.resources.spot:
180
+ return None
181
+ assert offer.availability_zones is not None
182
+ matching_zones = []
183
+ zones_with_capacity = []
184
+ for zone in offer.availability_zones:
185
+ reservation = zone_to_reservation.get(zone)
186
+ if reservation is not None and _offer_matches_reservation(offer, reservation):
187
+ matching_zones.append(zone)
188
+ if _reservation_has_capacity(reservation):
189
+ zones_with_capacity.append(zone)
190
+ if not matching_zones:
191
+ return None
192
+ offer = offer.copy(deep=True)
193
+ if zones_with_capacity:
194
+ offer.availability_zones = zones_with_capacity
195
+ else:
196
+ offer.availability_zones = matching_zones
197
+ offer.availability = InstanceAvailability.NOT_AVAILABLE
198
+ return offer
199
+
200
+ modifiers.append(reservation_modifier)
201
+
202
+ modifiers.append(get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements))
203
+ return modifiers
156
204
 
157
205
  def terminate_instance(
158
206
  self, instance_id: str, region: str, backend_data: Optional[str] = None
@@ -305,6 +353,16 @@ class GCPCompute(
305
353
  )
306
354
 
307
355
  for zone in zones:
356
+ reservation = None
357
+ if instance_config.reservation:
358
+ reservation = self._find_reservation(instance_config.reservation).get(zone)
359
+ if reservation is None:
360
+ logger.warning(
361
+ "Reservation %s no longer exists in zone %s",
362
+ instance_config.reservation,
363
+ zone,
364
+ )
365
+ continue
308
366
  request = compute_v1.InsertInstanceRequest()
309
367
  request.zone = zone
310
368
  request.project = self.config.project_id
@@ -335,6 +393,7 @@ class GCPCompute(
335
393
  roce_subnetworks=roce_subnets,
336
394
  allocate_public_ip=allocate_public_ip,
337
395
  placement_policy=placement_policy,
396
+ reservation=reservation,
338
397
  )
339
398
  try:
340
399
  # GCP needs some time to return an error in case of no capacity (< 30s).
@@ -475,6 +534,11 @@ class GCPCompute(
475
534
  ) -> bool:
476
535
  return placement_group.configuration.region == instance_offer.region
477
536
 
537
+ def are_placement_groups_compatible_with_reservations(self, backend_type: BackendType) -> bool:
538
+ # Cannot use our own placement policies when provisioning in a reservation.
539
+ # Instead, we use the placement policy defined in reservation settings.
540
+ return False
541
+
478
542
  def create_gateway(
479
543
  self,
480
544
  configuration: GatewayComputeConfiguration,
@@ -880,6 +944,26 @@ class GCPCompute(
880
944
  usable_subnets=self._list_usable_subnets(),
881
945
  )
882
946
 
947
+ @cachedmethod(
948
+ cache=lambda self: self._find_reservation_cache,
949
+ lock=lambda self: self._find_reservation_cache_lock,
950
+ )
951
+ def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reservation]:
952
+ if match := RESERVATION_PATTERN.fullmatch(configured_name):
953
+ project_id = match.group("project_id")
954
+ name = match.group("reservation_name")
955
+ elif RESOURCE_NAME_PATTERN.fullmatch(configured_name):
956
+ project_id = self.config.project_id
957
+ name = configured_name
958
+ else:
959
+ # misconfigured or non-GCP
960
+ return {}
961
+ return gcp_resources.find_reservation(
962
+ reservations_client=self.reservations_client,
963
+ project_id=project_id,
964
+ name=name,
965
+ )
966
+
883
967
 
884
968
  def _supported_instances_and_zones(
885
969
  regions: List[str],
@@ -933,6 +1017,52 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
933
1017
  return len(resources.gpus) <= quotas.get(quota_name, 0)
934
1018
 
935
1019
 
1020
+ def _offer_matches_reservation(
1021
+ offer: InstanceOfferWithAvailability, reservation: compute_v1.Reservation
1022
+ ) -> bool:
1023
+ if (
1024
+ reservation.specific_reservation is None
1025
+ or reservation.specific_reservation.instance_properties is None
1026
+ ):
1027
+ return False
1028
+ properties = reservation.specific_reservation.instance_properties
1029
+ if properties.machine_type != offer.instance.name:
1030
+ return False
1031
+ accelerators = properties.guest_accelerators or []
1032
+ if not accelerators and offer.instance.resources.gpus:
1033
+ return False
1034
+ if len(accelerators) > 1:
1035
+ logger.warning(
1036
+ "Expected 0 or 1 accelerator types per instance,"
1037
+ f" but {properties.machine_type} has {len(accelerators)}."
1038
+ f" Ignoring reservation {reservation.self_link}"
1039
+ )
1040
+ return False
1041
+ if accelerators:
1042
+ if accelerators[0].accelerator_count != len(offer.instance.resources.gpus):
1043
+ return False
1044
+ if (
1045
+ offer.instance.resources.gpus
1046
+ and gcp_resources.find_accelerator_name(
1047
+ offer.instance.resources.gpus[0].name,
1048
+ offer.instance.resources.gpus[0].memory_mib,
1049
+ )
1050
+ != accelerators[0].accelerator_type
1051
+ ):
1052
+ return False
1053
+ return True
1054
+
1055
+
1056
+ def _reservation_has_capacity(reservation: compute_v1.Reservation) -> bool:
1057
+ return (
1058
+ reservation.specific_reservation is not None
1059
+ and reservation.specific_reservation.in_use_count is not None
1060
+ and reservation.specific_reservation.assured_count is not None
1061
+ and reservation.specific_reservation.in_use_count
1062
+ < reservation.specific_reservation.assured_count
1063
+ )
1064
+
1065
+
936
1066
  def _unique_instance_name(instance: InstanceType) -> str:
937
1067
  if instance.resources.spot:
938
1068
  name = f"{instance.name}-spot"
@@ -89,6 +89,13 @@ class GCPBackendConfig(CoreModel):
89
89
  description="The tags (labels) that will be assigned to resources created by `dstack`"
90
90
  ),
91
91
  ] = None
92
+ preview_features: Annotated[
93
+ Optional[List[Literal["g4"]]],
94
+ Field(
95
+ description=("The list of preview GCP features to enable. Supported values: `g4`"),
96
+ max_items=1,
97
+ ),
98
+ ] = None
92
99
 
93
100
 
94
101
  class GCPBackendConfigWithCreds(GCPBackendConfig):
@@ -26,9 +26,35 @@ supported_accelerators = [
26
26
  {"accelerator_name": "nvidia-tesla-t4", "gpu_name": "T4", "memory_mb": 1024 * 16},
27
27
  {"accelerator_name": "nvidia-tesla-v100", "gpu_name": "V100", "memory_mb": 1024 * 16},
28
28
  {"accelerator_name": "nvidia-tesla-p100", "gpu_name": "P100", "memory_mb": 1024 * 16},
29
+ {"accelerator_name": "nvidia-rtx-pro-6000", "gpu_name": "RTXPRO6000", "memory_mb": 1024 * 96},
29
30
  ]
30
31
 
31
32
 
33
+ def find_accelerator_name(gpu_name: str, memory_mib: int) -> Optional[str]:
34
+ for acc in supported_accelerators:
35
+ if gpu_name == acc["gpu_name"] and memory_mib == acc["memory_mb"]:
36
+ return acc["accelerator_name"]
37
+ return None
38
+
39
+
40
+ def sanitize_filter_value(value: str) -> str:
41
+ """
42
+ Escape characters that could break the Compute Engine API filter string.
43
+ """
44
+ return value.replace("\\", "\\\\").replace('"', '\\"')
45
+
46
+
47
+ def get_resource_project(resource_url: str) -> str:
48
+ """
49
+ Extract the project ID from a URL like
50
+ https://www.googleapis.com/compute/v1/projects/proj-id/zones/us-central1-a/instances/vm-name
51
+ """
52
+ matches = re.findall(r"/projects/(?P<project_id>[a-z0-9-]+)/", resource_url)
53
+ if not matches:
54
+ raise BackendError(f"Invalid resource URL {resource_url}")
55
+ return matches[0]
56
+
57
+
32
58
  def get_availability_zones(
33
59
  regions_client: compute_v1.RegionsClient,
34
60
  project_id: str,
@@ -123,6 +149,7 @@ def create_instance_struct(
123
149
  roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
124
150
  allocate_public_ip: bool = True,
125
151
  placement_policy: Optional[str] = None,
152
+ reservation: Optional[compute_v1.Reservation] = None,
126
153
  ) -> compute_v1.Instance:
127
154
  instance = compute_v1.Instance()
128
155
  instance.name = instance_name
@@ -147,6 +174,25 @@ def create_instance_struct(
147
174
  initialize_params.disk_type = f"zones/{zone}/diskTypes/hyperdisk-balanced"
148
175
  disk.initialize_params = initialize_params
149
176
  instance.disks = [disk]
177
+ if (
178
+ reservation is not None
179
+ and reservation.specific_reservation is not None
180
+ and reservation.specific_reservation.instance_properties is not None
181
+ and reservation.specific_reservation.instance_properties.local_ssds is not None
182
+ ):
183
+ for local_ssd in reservation.specific_reservation.instance_properties.local_ssds:
184
+ instance.disks.append(
185
+ compute_v1.AttachedDisk(
186
+ auto_delete=True,
187
+ boot=False,
188
+ type_="SCRATCH",
189
+ initialize_params=compute_v1.AttachedDiskInitializeParams(
190
+ disk_type=f"zones/{zone}/diskTypes/local-ssd",
191
+ disk_size_gb=local_ssd.disk_size_gb,
192
+ ),
193
+ interface=local_ssd.interface,
194
+ )
195
+ )
150
196
 
151
197
  if accelerators:
152
198
  instance.guest_accelerators = accelerators
@@ -162,6 +208,8 @@ def create_instance_struct(
162
208
 
163
209
  if placement_policy is not None:
164
210
  instance.resource_policies = [placement_policy]
211
+ elif reservation is not None and "placement" in reservation.resource_policies:
212
+ instance.resource_policies = [reservation.resource_policies["placement"]]
165
213
 
166
214
  if spot:
167
215
  instance.scheduling = compute_v1.Scheduling()
@@ -187,6 +235,17 @@ def create_instance_struct(
187
235
  )
188
236
  ]
189
237
 
238
+ if reservation is not None:
239
+ reservation_project = get_resource_project(reservation.self_link)
240
+ instance.reservation_affinity = compute_v1.ReservationAffinity()
241
+ instance.reservation_affinity.consume_reservation_type = (
242
+ compute_v1.ReservationAffinity.ConsumeReservationType.SPECIFIC_RESERVATION.name
243
+ )
244
+ instance.reservation_affinity.key = "compute.googleapis.com/reservation-name"
245
+ instance.reservation_affinity.values = [
246
+ f"projects/{reservation_project}/reservations/{reservation.name}"
247
+ ]
248
+
190
249
  return instance
191
250
 
192
251
 
@@ -350,11 +409,8 @@ def get_accelerators(
350
409
  return []
351
410
  accelerator_config = compute_v1.AcceleratorConfig()
352
411
  accelerator_config.accelerator_count = len(gpus)
353
- for acc in supported_accelerators:
354
- if gpus[0].name == acc["gpu_name"] and gpus[0].memory_mib == acc["memory_mb"]:
355
- accelerator_name = acc["accelerator_name"]
356
- break
357
- else:
412
+ accelerator_name = find_accelerator_name(gpus[0].name, gpus[0].memory_mib)
413
+ if accelerator_name is None:
358
414
  raise ValueError(f"Unsupported GPU: {gpus[0].name} {gpus[0].memory_mib} MiB")
359
415
  accelerator_config.accelerator_type = (
360
416
  f"projects/{project_id}/zones/{zone}/acceleratorTypes/{accelerator_name}"
@@ -362,6 +418,31 @@ def get_accelerators(
362
418
  return [accelerator_config]
363
419
 
364
420
 
421
+ def find_reservation(
422
+ reservations_client: compute_v1.ReservationsClient,
423
+ project_id: str,
424
+ name: str,
425
+ ) -> dict[str, compute_v1.Reservation]:
426
+ request = compute_v1.AggregatedListReservationsRequest(
427
+ project=project_id,
428
+ filter=(
429
+ f'(name = "{sanitize_filter_value(name)}")'
430
+ ' AND (status = "READY")'
431
+ " AND (specificReservationRequired = true)"
432
+ ),
433
+ )
434
+ try:
435
+ aggregated_reservations = reservations_client.aggregated_list(request=request)
436
+ except (google.api_core.exceptions.NotFound, google.api_core.exceptions.Forbidden) as e:
437
+ logger.warning("Could not find reservation: %s", e)
438
+ return {}
439
+ zone_to_reservation = {}
440
+ for zone, zone_reservations in aggregated_reservations:
441
+ if zone_reservations.reservations:
442
+ zone_to_reservation[zone.split("/")[-1]] = zone_reservations.reservations[0]
443
+ return zone_to_reservation
444
+
445
+
365
446
  def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
366
447
  filtered_labels = {}
367
448
  for k, v in labels.items():
@@ -499,5 +580,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
499
580
  "h3-",
500
581
  "v6e",
501
582
  "a4-",
583
+ "g4-",
502
584
  ]
503
585
  )