dstack 0.19.30rc1__py3-none-any.whl → 0.19.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +8 -0
- dstack/_internal/cli/commands/project.py +27 -20
- dstack/_internal/cli/commands/server.py +5 -0
- dstack/_internal/cli/services/configurators/fleet.py +20 -6
- dstack/_internal/cli/utils/gpu.py +2 -2
- dstack/_internal/core/backends/aws/compute.py +13 -5
- dstack/_internal/core/backends/aws/resources.py +11 -6
- dstack/_internal/core/backends/azure/compute.py +17 -6
- dstack/_internal/core/backends/base/compute.py +57 -9
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/cloudrift/compute.py +2 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/gcp/compute.py +87 -38
- dstack/_internal/core/backends/gcp/configurator.py +1 -1
- dstack/_internal/core/backends/gcp/models.py +14 -1
- dstack/_internal/core/backends/gcp/resources.py +35 -12
- dstack/_internal/core/backends/hotaisle/compute.py +22 -0
- dstack/_internal/core/backends/kubernetes/compute.py +531 -215
- dstack/_internal/core/backends/kubernetes/models.py +13 -16
- dstack/_internal/core/backends/kubernetes/utils.py +145 -8
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +17 -0
- dstack/_internal/core/backends/nebius/configurator.py +15 -0
- dstack/_internal/core/backends/nebius/models.py +57 -5
- dstack/_internal/core/backends/nebius/resources.py +45 -2
- dstack/_internal/core/backends/oci/compute.py +7 -1
- dstack/_internal/core/backends/oci/resources.py +8 -3
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +2 -0
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/consts.py +2 -0
- dstack/_internal/core/models/profiles.py +11 -4
- dstack/_internal/core/services/repos.py +101 -11
- dstack/_internal/server/background/tasks/common.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +75 -17
- dstack/_internal/server/background/tasks/process_instances.py +3 -5
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_runs.py +27 -23
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +107 -54
- dstack/_internal/server/services/offers.py +7 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/provisioning.py +3 -10
- dstack/_internal/utils/ssh.py +22 -2
- dstack/version.py +2 -2
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/METADATA +20 -18
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/RECORD +54 -54
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/WHEEL +0 -0
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.30rc1.dist-info → dstack-0.19.32.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -23,6 +23,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
23
23
|
ComputeWithMultinodeSupport,
|
|
24
24
|
ComputeWithPlacementGroupSupport,
|
|
25
25
|
ComputeWithPrivateGatewaySupport,
|
|
26
|
+
ComputeWithPrivilegedSupport,
|
|
26
27
|
ComputeWithVolumeSupport,
|
|
27
28
|
generate_unique_gateway_instance_name,
|
|
28
29
|
generate_unique_instance_name,
|
|
@@ -31,6 +32,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
31
32
|
get_shim_commands,
|
|
32
33
|
get_user_data,
|
|
33
34
|
merge_tags,
|
|
35
|
+
requires_nvidia_proprietary_kernel_modules,
|
|
34
36
|
)
|
|
35
37
|
from dstack._internal.core.backends.base.offers import (
|
|
36
38
|
get_catalog_offers,
|
|
@@ -38,6 +40,7 @@ from dstack._internal.core.backends.base.offers import (
|
|
|
38
40
|
)
|
|
39
41
|
from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
|
|
40
42
|
from dstack._internal.core.backends.gcp.models import GCPConfig
|
|
43
|
+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
|
|
41
44
|
from dstack._internal.core.errors import (
|
|
42
45
|
ComputeError,
|
|
43
46
|
ComputeResourceNotFoundError,
|
|
@@ -88,6 +91,7 @@ class GCPVolumeDiskBackendData(CoreModel):
|
|
|
88
91
|
class GCPCompute(
|
|
89
92
|
ComputeWithAllOffersCached,
|
|
90
93
|
ComputeWithCreateInstanceSupport,
|
|
94
|
+
ComputeWithPrivilegedSupport,
|
|
91
95
|
ComputeWithMultinodeSupport,
|
|
92
96
|
ComputeWithPlacementGroupSupport,
|
|
93
97
|
ComputeWithGatewaySupport,
|
|
@@ -109,8 +113,8 @@ class GCPCompute(
|
|
|
109
113
|
self.resource_policies_client = compute_v1.ResourcePoliciesClient(
|
|
110
114
|
credentials=self.credentials
|
|
111
115
|
)
|
|
112
|
-
self.
|
|
113
|
-
self.
|
|
116
|
+
self._usable_subnets_cache_lock = threading.Lock()
|
|
117
|
+
self._usable_subnets_cache = TTLCache(maxsize=1, ttl=120)
|
|
114
118
|
|
|
115
119
|
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
116
120
|
regions = get_or_error(self.config.regions)
|
|
@@ -201,12 +205,12 @@ class GCPCompute(
|
|
|
201
205
|
disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
|
|
202
206
|
# Choose any usable subnet in a VPC.
|
|
203
207
|
# Configuring a specific subnet per region is not supported yet.
|
|
204
|
-
subnetwork = _get_vpc_subnet(
|
|
205
|
-
|
|
206
|
-
config=self.config,
|
|
208
|
+
subnetwork = self._get_vpc_subnet(instance_offer.region)
|
|
209
|
+
extra_subnets = self._get_extra_subnets(
|
|
207
210
|
region=instance_offer.region,
|
|
211
|
+
instance_type_name=instance_offer.instance.name,
|
|
208
212
|
)
|
|
209
|
-
|
|
213
|
+
roce_subnets = self._get_roce_subnets(
|
|
210
214
|
region=instance_offer.region,
|
|
211
215
|
instance_type_name=instance_offer.instance.name,
|
|
212
216
|
)
|
|
@@ -293,7 +297,11 @@ class GCPCompute(
|
|
|
293
297
|
|
|
294
298
|
image = _get_image(
|
|
295
299
|
instance_type_name=instance_offer.instance.name,
|
|
296
|
-
|
|
300
|
+
gpu_name=(
|
|
301
|
+
instance_offer.instance.resources.gpus[0].name
|
|
302
|
+
if len(instance_offer.instance.resources.gpus) > 0
|
|
303
|
+
else None
|
|
304
|
+
),
|
|
297
305
|
)
|
|
298
306
|
|
|
299
307
|
for zone in zones:
|
|
@@ -324,6 +332,7 @@ class GCPCompute(
|
|
|
324
332
|
network=self.config.vpc_resource_name,
|
|
325
333
|
subnetwork=subnetwork,
|
|
326
334
|
extra_subnetworks=extra_subnets,
|
|
335
|
+
roce_subnetworks=roce_subnets,
|
|
327
336
|
allocate_public_ip=allocate_public_ip,
|
|
328
337
|
placement_policy=placement_policy,
|
|
329
338
|
)
|
|
@@ -333,6 +342,13 @@ class GCPCompute(
|
|
|
333
342
|
# If the request succeeds, we'll probably timeout and update_provisioning_data() will get hostname.
|
|
334
343
|
operation = self.instances_client.insert(request=request)
|
|
335
344
|
gcp_resources.wait_for_extended_operation(operation, timeout=30)
|
|
345
|
+
except google.api_core.exceptions.BadRequest as e:
|
|
346
|
+
if "Network profile only allows resource creation in location" in e.message:
|
|
347
|
+
# A hack to find the correct RoCE VPC zone by trial and error.
|
|
348
|
+
# Could be better to find it via the API.
|
|
349
|
+
logger.debug("Got GCP error when provisioning a VM: %s", e)
|
|
350
|
+
continue
|
|
351
|
+
raise
|
|
336
352
|
except (
|
|
337
353
|
google.api_core.exceptions.ServiceUnavailable,
|
|
338
354
|
google.api_core.exceptions.NotFound,
|
|
@@ -481,11 +497,7 @@ class GCPCompute(
|
|
|
481
497
|
)
|
|
482
498
|
# Choose any usable subnet in a VPC.
|
|
483
499
|
# Configuring a specific subnet per region is not supported yet.
|
|
484
|
-
subnetwork = _get_vpc_subnet(
|
|
485
|
-
subnetworks_client=self.subnetworks_client,
|
|
486
|
-
config=self.config,
|
|
487
|
-
region=configuration.region,
|
|
488
|
-
)
|
|
500
|
+
subnetwork = self._get_vpc_subnet(configuration.region)
|
|
489
501
|
|
|
490
502
|
labels = {
|
|
491
503
|
"owner": "dstack",
|
|
@@ -787,10 +799,6 @@ class GCPCompute(
|
|
|
787
799
|
instance_id,
|
|
788
800
|
)
|
|
789
801
|
|
|
790
|
-
@cachedmethod(
|
|
791
|
-
cache=lambda self: self._extra_subnets_cache,
|
|
792
|
-
lock=lambda self: self._extra_subnets_cache_lock,
|
|
793
|
-
)
|
|
794
802
|
def _get_extra_subnets(
|
|
795
803
|
self,
|
|
796
804
|
region: str,
|
|
@@ -802,15 +810,16 @@ class GCPCompute(
|
|
|
802
810
|
subnets_num = 8
|
|
803
811
|
elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
|
|
804
812
|
subnets_num = 4
|
|
813
|
+
elif instance_type_name == "a4-highgpu-8g":
|
|
814
|
+
subnets_num = 1 # 1 main + 1 extra + 8 RoCE
|
|
805
815
|
else:
|
|
806
816
|
return []
|
|
807
817
|
extra_subnets = []
|
|
808
818
|
for vpc_name in self.config.extra_vpcs[:subnets_num]:
|
|
809
819
|
subnet = gcp_resources.get_vpc_subnet_or_error(
|
|
810
|
-
subnetworks_client=self.subnetworks_client,
|
|
811
|
-
vpc_project_id=self.config.vpc_project_id or self.config.project_id,
|
|
812
820
|
vpc_name=vpc_name,
|
|
813
821
|
region=region,
|
|
822
|
+
usable_subnets=self._list_usable_subnets(),
|
|
814
823
|
)
|
|
815
824
|
vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
|
|
816
825
|
project_id=self.config.vpc_project_id or self.config.project_id,
|
|
@@ -819,6 +828,58 @@ class GCPCompute(
|
|
|
819
828
|
extra_subnets.append((vpc_resource_name, subnet))
|
|
820
829
|
return extra_subnets
|
|
821
830
|
|
|
831
|
+
def _get_roce_subnets(
|
|
832
|
+
self,
|
|
833
|
+
region: str,
|
|
834
|
+
instance_type_name: str,
|
|
835
|
+
) -> List[Tuple[str, str]]:
|
|
836
|
+
if not self.config.roce_vpcs:
|
|
837
|
+
return []
|
|
838
|
+
if instance_type_name == "a4-highgpu-8g":
|
|
839
|
+
nics_num = 8
|
|
840
|
+
else:
|
|
841
|
+
return []
|
|
842
|
+
roce_vpc = self.config.roce_vpcs[0] # roce_vpcs is validated to have at most 1 item
|
|
843
|
+
subnets = gcp_resources.get_vpc_subnets(
|
|
844
|
+
vpc_name=roce_vpc,
|
|
845
|
+
region=region,
|
|
846
|
+
usable_subnets=self._list_usable_subnets(),
|
|
847
|
+
)
|
|
848
|
+
if len(subnets) < nics_num:
|
|
849
|
+
raise ComputeError(
|
|
850
|
+
f"{instance_type_name} requires {nics_num} RoCE subnets,"
|
|
851
|
+
f" but only {len(subnets)} are available in VPC {roce_vpc}"
|
|
852
|
+
)
|
|
853
|
+
vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
|
|
854
|
+
project_id=self.config.vpc_project_id or self.config.project_id,
|
|
855
|
+
vpc_name=roce_vpc,
|
|
856
|
+
)
|
|
857
|
+
nic_subnets = []
|
|
858
|
+
for subnet in subnets[:nics_num]:
|
|
859
|
+
nic_subnets.append((vpc_resource_name, subnet))
|
|
860
|
+
return nic_subnets
|
|
861
|
+
|
|
862
|
+
@cachedmethod(
|
|
863
|
+
cache=lambda self: self._usable_subnets_cache,
|
|
864
|
+
lock=lambda self: self._usable_subnets_cache_lock,
|
|
865
|
+
)
|
|
866
|
+
def _list_usable_subnets(self) -> list[compute_v1.UsableSubnetwork]:
|
|
867
|
+
# To avoid hitting the `ListUsable requests per minute` system limit, we fetch all subnets
|
|
868
|
+
# at once and cache them
|
|
869
|
+
return gcp_resources.list_project_usable_subnets(
|
|
870
|
+
subnetworks_client=self.subnetworks_client,
|
|
871
|
+
project_id=self.config.vpc_project_id or self.config.project_id,
|
|
872
|
+
)
|
|
873
|
+
|
|
874
|
+
def _get_vpc_subnet(self, region: str) -> Optional[str]:
|
|
875
|
+
if self.config.vpc_name is None:
|
|
876
|
+
return None
|
|
877
|
+
return gcp_resources.get_vpc_subnet_or_error(
|
|
878
|
+
vpc_name=self.config.vpc_name,
|
|
879
|
+
region=region,
|
|
880
|
+
usable_subnets=self._list_usable_subnets(),
|
|
881
|
+
)
|
|
882
|
+
|
|
822
883
|
|
|
823
884
|
def _supported_instances_and_zones(
|
|
824
885
|
regions: List[str],
|
|
@@ -861,8 +922,8 @@ def _has_gpu_quota(quotas: Dict[str, float], resources: Resources) -> bool:
|
|
|
861
922
|
gpu = resources.gpus[0]
|
|
862
923
|
if _is_tpu(gpu.name):
|
|
863
924
|
return True
|
|
864
|
-
if gpu.name
|
|
865
|
-
# H100 and H100_MEGA quotas are not returned by `regions_client.list`
|
|
925
|
+
if gpu.name in ["B200", "H100"]:
|
|
926
|
+
# B200, H100 and H100_MEGA quotas are not returned by `regions_client.list`
|
|
866
927
|
return True
|
|
867
928
|
quota_name = f"NVIDIA_{gpu.name}_GPUS"
|
|
868
929
|
if gpu.name == "A100" and gpu.memory_mib == 80 * 1024:
|
|
@@ -883,28 +944,13 @@ def _unique_instance_name(instance: InstanceType) -> str:
|
|
|
883
944
|
return f"{name}-{gpu.name}-{gpu.memory_mib}"
|
|
884
945
|
|
|
885
946
|
|
|
886
|
-
def _get_vpc_subnet(
|
|
887
|
-
subnetworks_client: compute_v1.SubnetworksClient,
|
|
888
|
-
config: GCPConfig,
|
|
889
|
-
region: str,
|
|
890
|
-
) -> Optional[str]:
|
|
891
|
-
if config.vpc_name is None:
|
|
892
|
-
return None
|
|
893
|
-
return gcp_resources.get_vpc_subnet_or_error(
|
|
894
|
-
subnetworks_client=subnetworks_client,
|
|
895
|
-
vpc_project_id=config.vpc_project_id or config.project_id,
|
|
896
|
-
vpc_name=config.vpc_name,
|
|
897
|
-
region=region,
|
|
898
|
-
)
|
|
899
|
-
|
|
900
|
-
|
|
901
947
|
@dataclass
|
|
902
948
|
class GCPImage:
|
|
903
949
|
id: str
|
|
904
950
|
is_ufw_installed: bool
|
|
905
951
|
|
|
906
952
|
|
|
907
|
-
def _get_image(instance_type_name: str,
|
|
953
|
+
def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
|
|
908
954
|
if instance_type_name == "a3-megagpu-8g":
|
|
909
955
|
image_name = "dstack-a3mega-5"
|
|
910
956
|
is_ufw_installed = False
|
|
@@ -913,8 +959,11 @@ def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
|
|
|
913
959
|
id="projects/cos-cloud/global/images/cos-105-17412-535-78",
|
|
914
960
|
is_ufw_installed=False,
|
|
915
961
|
)
|
|
916
|
-
elif
|
|
917
|
-
|
|
962
|
+
elif gpu_name is not None:
|
|
963
|
+
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
|
|
964
|
+
image_name = f"dstack-cuda-{version.base_image}"
|
|
965
|
+
else:
|
|
966
|
+
image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
|
|
918
967
|
is_ufw_installed = True
|
|
919
968
|
else:
|
|
920
969
|
image_name = f"dstack-{version.base_image}"
|
|
@@ -202,5 +202,5 @@ class GCPConfigurator(
|
|
|
202
202
|
)
|
|
203
203
|
except BackendError as e:
|
|
204
204
|
raise ServerClientError(e.args[0])
|
|
205
|
-
# Not checking config.
|
|
205
|
+
# Not checking config.extra_vpcs and config.roce_vpcs so that users are not required to configure subnets for all regions
|
|
206
206
|
# but only for regions they intend to use. Validation will be done on provisioning.
|
|
@@ -41,11 +41,24 @@ class GCPBackendConfig(CoreModel):
|
|
|
41
41
|
Optional[List[str]],
|
|
42
42
|
Field(
|
|
43
43
|
description=(
|
|
44
|
-
"The names of additional VPCs used for
|
|
44
|
+
"The names of additional VPCs used for multi-NIC instances, such as those that support GPUDirect."
|
|
45
|
+
" Specify eight VPCs to maximize bandwidth in clusters with eight-GPU instances."
|
|
45
46
|
" Each VPC must have a subnet and a firewall rule allowing internal traffic across all subnets"
|
|
46
47
|
)
|
|
47
48
|
),
|
|
48
49
|
] = None
|
|
50
|
+
roce_vpcs: Annotated[
|
|
51
|
+
Optional[List[str]],
|
|
52
|
+
Field(
|
|
53
|
+
description=(
|
|
54
|
+
"The names of additional VPCs with the RoCE network profile."
|
|
55
|
+
" Used for RDMA on GPU instances that support the MRDMA interface type."
|
|
56
|
+
" A VPC should have eight subnets to maximize the bandwidth in clusters"
|
|
57
|
+
" with eight-GPU instances."
|
|
58
|
+
),
|
|
59
|
+
max_items=1, # The currently supported instance types only need one VPC with eight subnets.
|
|
60
|
+
),
|
|
61
|
+
] = None
|
|
49
62
|
vpc_project_id: Annotated[
|
|
50
63
|
Optional[str],
|
|
51
64
|
Field(description="The shared VPC hosted project ID. Required for shared VPC only"),
|
|
@@ -19,6 +19,7 @@ DSTACK_INSTANCE_TAG = "dstack-runner-instance"
|
|
|
19
19
|
DSTACK_GATEWAY_TAG = "dstack-gateway-instance"
|
|
20
20
|
|
|
21
21
|
supported_accelerators = [
|
|
22
|
+
{"accelerator_name": "nvidia-b200", "gpu_name": "B200", "memory_mb": 1024 * 180},
|
|
22
23
|
{"accelerator_name": "nvidia-a100-80gb", "gpu_name": "A100", "memory_mb": 1024 * 80},
|
|
23
24
|
{"accelerator_name": "nvidia-tesla-a100", "gpu_name": "A100", "memory_mb": 1024 * 40},
|
|
24
25
|
{"accelerator_name": "nvidia-l4", "gpu_name": "L4", "memory_mb": 1024 * 24},
|
|
@@ -58,8 +59,6 @@ def check_vpc(
|
|
|
58
59
|
)
|
|
59
60
|
for region in regions:
|
|
60
61
|
get_vpc_subnet_or_error(
|
|
61
|
-
subnetworks_client=subnetworks_client,
|
|
62
|
-
vpc_project_id=vpc_project_id,
|
|
63
62
|
vpc_name=vpc_name,
|
|
64
63
|
region=region,
|
|
65
64
|
usable_subnets=usable_subnets,
|
|
@@ -121,6 +120,7 @@ def create_instance_struct(
|
|
|
121
120
|
network: str = "global/networks/default",
|
|
122
121
|
subnetwork: Optional[str] = None,
|
|
123
122
|
extra_subnetworks: Optional[List[Tuple[str, str]]] = None,
|
|
123
|
+
roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
|
|
124
124
|
allocate_public_ip: bool = True,
|
|
125
125
|
placement_policy: Optional[str] = None,
|
|
126
126
|
) -> compute_v1.Instance:
|
|
@@ -132,6 +132,7 @@ def create_instance_struct(
|
|
|
132
132
|
subnetwork=subnetwork,
|
|
133
133
|
allocate_public_ip=allocate_public_ip,
|
|
134
134
|
extra_subnetworks=extra_subnetworks,
|
|
135
|
+
roce_subnetworks=roce_subnetworks,
|
|
135
136
|
)
|
|
136
137
|
|
|
137
138
|
disk = compute_v1.AttachedDisk()
|
|
@@ -194,6 +195,7 @@ def _get_network_interfaces(
|
|
|
194
195
|
subnetwork: Optional[str],
|
|
195
196
|
allocate_public_ip: bool,
|
|
196
197
|
extra_subnetworks: Optional[List[Tuple[str, str]]],
|
|
198
|
+
roce_subnetworks: Optional[List[Tuple[str, str]]],
|
|
197
199
|
) -> List[compute_v1.NetworkInterface]:
|
|
198
200
|
network_interface = compute_v1.NetworkInterface()
|
|
199
201
|
network_interface.network = network
|
|
@@ -221,6 +223,14 @@ def _get_network_interfaces(
|
|
|
221
223
|
nic_type=compute_v1.NetworkInterface.NicType.GVNIC.name,
|
|
222
224
|
)
|
|
223
225
|
)
|
|
226
|
+
for network, subnetwork in roce_subnetworks or []:
|
|
227
|
+
network_interfaces.append(
|
|
228
|
+
compute_v1.NetworkInterface(
|
|
229
|
+
network=network,
|
|
230
|
+
subnetwork=subnetwork,
|
|
231
|
+
nic_type=compute_v1.NetworkInterface.NicType.MRDMA.name,
|
|
232
|
+
)
|
|
233
|
+
)
|
|
224
234
|
return network_interfaces
|
|
225
235
|
|
|
226
236
|
|
|
@@ -233,29 +243,41 @@ def list_project_usable_subnets(
|
|
|
233
243
|
|
|
234
244
|
|
|
235
245
|
def get_vpc_subnet_or_error(
|
|
236
|
-
subnetworks_client: compute_v1.SubnetworksClient,
|
|
237
|
-
vpc_project_id: str,
|
|
238
246
|
vpc_name: str,
|
|
239
247
|
region: str,
|
|
240
|
-
usable_subnets:
|
|
248
|
+
usable_subnets: list[compute_v1.UsableSubnetwork],
|
|
241
249
|
) -> str:
|
|
242
250
|
"""
|
|
243
251
|
Returns resource name of any usable subnet in a given VPC
|
|
244
252
|
(e.g. "projects/example-project/regions/europe-west4/subnetworks/example-subnet")
|
|
245
253
|
"""
|
|
246
|
-
|
|
247
|
-
|
|
254
|
+
vpc_subnets = get_vpc_subnets(vpc_name, region, usable_subnets)
|
|
255
|
+
if vpc_subnets:
|
|
256
|
+
return vpc_subnets[0]
|
|
257
|
+
raise ComputeError(
|
|
258
|
+
f"No usable subnetwork found in region {region} for VPC {vpc_name}."
|
|
259
|
+
f" Ensure that VPC {vpc_name} exists and has usable subnetworks."
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_vpc_subnets(
|
|
264
|
+
vpc_name: str,
|
|
265
|
+
region: str,
|
|
266
|
+
usable_subnets: list[compute_v1.UsableSubnetwork],
|
|
267
|
+
) -> list[str]:
|
|
268
|
+
"""
|
|
269
|
+
Returns resource names of all usable subnets in a given VPC
|
|
270
|
+
(e.g. ["projects/example-project/regions/europe-west4/subnetworks/example-subnet"])
|
|
271
|
+
"""
|
|
272
|
+
result = []
|
|
248
273
|
for subnet in usable_subnets:
|
|
249
274
|
network_name = subnet.network.split("/")[-1]
|
|
250
275
|
subnet_url = subnet.subnetwork
|
|
251
276
|
subnet_resource_name = remove_prefix(subnet_url, "https://www.googleapis.com/compute/v1/")
|
|
252
277
|
subnet_region = subnet_resource_name.split("/")[3]
|
|
253
278
|
if network_name == vpc_name and subnet_region == region:
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
f"No usable subnetwork found in region {region} for VPC {vpc_name} in project {vpc_project_id}."
|
|
257
|
-
f" Ensure that VPC {vpc_name} exists and has usable subnetworks."
|
|
258
|
-
)
|
|
279
|
+
result.append(subnet_resource_name)
|
|
280
|
+
return result
|
|
259
281
|
|
|
260
282
|
|
|
261
283
|
def create_runner_firewall_rules(
|
|
@@ -476,5 +498,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
|
|
|
476
498
|
"n4-",
|
|
477
499
|
"h3-",
|
|
478
500
|
"v6e",
|
|
501
|
+
"a4-",
|
|
479
502
|
]
|
|
480
503
|
)
|
|
@@ -11,6 +11,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
11
11
|
Compute,
|
|
12
12
|
ComputeWithAllOffersCached,
|
|
13
13
|
ComputeWithCreateInstanceSupport,
|
|
14
|
+
ComputeWithPrivilegedSupport,
|
|
14
15
|
get_shim_commands,
|
|
15
16
|
)
|
|
16
17
|
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
@@ -41,12 +42,33 @@ INSTANCE_TYPE_SPECS = {
|
|
|
41
42
|
"cpu_frequency": 2000000000,
|
|
42
43
|
"cpu_manufacturer": "Intel",
|
|
43
44
|
},
|
|
45
|
+
"2x MI300X 26x Xeon Platinum 8470": {
|
|
46
|
+
"cpu_model": "Xeon Platinum 8470",
|
|
47
|
+
"cpu_frequency": 2000000000,
|
|
48
|
+
"cpu_manufacturer": "Intel",
|
|
49
|
+
},
|
|
50
|
+
"2x MI300X 26x Xeon Platinum 8462Y+": {
|
|
51
|
+
"cpu_model": "Xeon Platinum 8462Y+",
|
|
52
|
+
"cpu_frequency": 2800000000,
|
|
53
|
+
"cpu_manufacturer": "Intel",
|
|
54
|
+
},
|
|
55
|
+
"4x MI300X 52x Xeon Platinum 8462Y": {
|
|
56
|
+
"cpu_model": "Xeon Platinum 8470",
|
|
57
|
+
"cpu_frequency": 2000000000,
|
|
58
|
+
"cpu_manufacturer": "Intel",
|
|
59
|
+
},
|
|
60
|
+
"4x MI300X 52x Xeon Platinum 8462Y+": {
|
|
61
|
+
"cpu_model": "Xeon Platinum 8462Y+",
|
|
62
|
+
"cpu_frequency": 2800000000,
|
|
63
|
+
"cpu_manufacturer": "Intel",
|
|
64
|
+
},
|
|
44
65
|
}
|
|
45
66
|
|
|
46
67
|
|
|
47
68
|
class HotAisleCompute(
|
|
48
69
|
ComputeWithAllOffersCached,
|
|
49
70
|
ComputeWithCreateInstanceSupport,
|
|
71
|
+
ComputeWithPrivilegedSupport,
|
|
50
72
|
Compute,
|
|
51
73
|
):
|
|
52
74
|
def __init__(self, config: HotAisleConfig):
|