dstack 0.19.30__py3-none-any.whl → 0.19.31__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/__init__.py +8 -0
- dstack/_internal/cli/commands/project.py +27 -20
- dstack/_internal/cli/commands/server.py +5 -0
- dstack/_internal/cli/main.py +1 -3
- dstack/_internal/core/backends/aws/compute.py +2 -0
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +32 -9
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/cloudrift/compute.py +2 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/digitalocean_base/compute.py +2 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/gcp/compute.py +74 -34
- dstack/_internal/core/backends/gcp/configurator.py +1 -1
- dstack/_internal/core/backends/gcp/models.py +14 -1
- dstack/_internal/core/backends/gcp/resources.py +35 -12
- dstack/_internal/core/backends/hotaisle/compute.py +2 -0
- dstack/_internal/core/backends/kubernetes/compute.py +466 -213
- dstack/_internal/core/backends/kubernetes/models.py +13 -16
- dstack/_internal/core/backends/kubernetes/utils.py +145 -8
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -0
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +2 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +2 -0
- dstack/_internal/server/background/tasks/common.py +2 -0
- dstack/_internal/server/background/tasks/process_instances.py +2 -2
- dstack/_internal/server/services/offers.py +7 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/provisioning.py +3 -10
- dstack/version.py +1 -1
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/METADATA +11 -9
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/RECORD +39 -39
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/WHEEL +0 -0
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.30.dist-info → dstack-0.19.31.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -19,6 +19,7 @@ DSTACK_INSTANCE_TAG = "dstack-runner-instance"
|
|
|
19
19
|
DSTACK_GATEWAY_TAG = "dstack-gateway-instance"
|
|
20
20
|
|
|
21
21
|
supported_accelerators = [
|
|
22
|
+
{"accelerator_name": "nvidia-b200", "gpu_name": "B200", "memory_mb": 1024 * 180},
|
|
22
23
|
{"accelerator_name": "nvidia-a100-80gb", "gpu_name": "A100", "memory_mb": 1024 * 80},
|
|
23
24
|
{"accelerator_name": "nvidia-tesla-a100", "gpu_name": "A100", "memory_mb": 1024 * 40},
|
|
24
25
|
{"accelerator_name": "nvidia-l4", "gpu_name": "L4", "memory_mb": 1024 * 24},
|
|
@@ -58,8 +59,6 @@ def check_vpc(
|
|
|
58
59
|
)
|
|
59
60
|
for region in regions:
|
|
60
61
|
get_vpc_subnet_or_error(
|
|
61
|
-
subnetworks_client=subnetworks_client,
|
|
62
|
-
vpc_project_id=vpc_project_id,
|
|
63
62
|
vpc_name=vpc_name,
|
|
64
63
|
region=region,
|
|
65
64
|
usable_subnets=usable_subnets,
|
|
@@ -121,6 +120,7 @@ def create_instance_struct(
|
|
|
121
120
|
network: str = "global/networks/default",
|
|
122
121
|
subnetwork: Optional[str] = None,
|
|
123
122
|
extra_subnetworks: Optional[List[Tuple[str, str]]] = None,
|
|
123
|
+
roce_subnetworks: Optional[List[Tuple[str, str]]] = None,
|
|
124
124
|
allocate_public_ip: bool = True,
|
|
125
125
|
placement_policy: Optional[str] = None,
|
|
126
126
|
) -> compute_v1.Instance:
|
|
@@ -132,6 +132,7 @@ def create_instance_struct(
|
|
|
132
132
|
subnetwork=subnetwork,
|
|
133
133
|
allocate_public_ip=allocate_public_ip,
|
|
134
134
|
extra_subnetworks=extra_subnetworks,
|
|
135
|
+
roce_subnetworks=roce_subnetworks,
|
|
135
136
|
)
|
|
136
137
|
|
|
137
138
|
disk = compute_v1.AttachedDisk()
|
|
@@ -194,6 +195,7 @@ def _get_network_interfaces(
|
|
|
194
195
|
subnetwork: Optional[str],
|
|
195
196
|
allocate_public_ip: bool,
|
|
196
197
|
extra_subnetworks: Optional[List[Tuple[str, str]]],
|
|
198
|
+
roce_subnetworks: Optional[List[Tuple[str, str]]],
|
|
197
199
|
) -> List[compute_v1.NetworkInterface]:
|
|
198
200
|
network_interface = compute_v1.NetworkInterface()
|
|
199
201
|
network_interface.network = network
|
|
@@ -221,6 +223,14 @@ def _get_network_interfaces(
|
|
|
221
223
|
nic_type=compute_v1.NetworkInterface.NicType.GVNIC.name,
|
|
222
224
|
)
|
|
223
225
|
)
|
|
226
|
+
for network, subnetwork in roce_subnetworks or []:
|
|
227
|
+
network_interfaces.append(
|
|
228
|
+
compute_v1.NetworkInterface(
|
|
229
|
+
network=network,
|
|
230
|
+
subnetwork=subnetwork,
|
|
231
|
+
nic_type=compute_v1.NetworkInterface.NicType.MRDMA.name,
|
|
232
|
+
)
|
|
233
|
+
)
|
|
224
234
|
return network_interfaces
|
|
225
235
|
|
|
226
236
|
|
|
@@ -233,29 +243,41 @@ def list_project_usable_subnets(
|
|
|
233
243
|
|
|
234
244
|
|
|
235
245
|
def get_vpc_subnet_or_error(
|
|
236
|
-
subnetworks_client: compute_v1.SubnetworksClient,
|
|
237
|
-
vpc_project_id: str,
|
|
238
246
|
vpc_name: str,
|
|
239
247
|
region: str,
|
|
240
|
-
usable_subnets:
|
|
248
|
+
usable_subnets: list[compute_v1.UsableSubnetwork],
|
|
241
249
|
) -> str:
|
|
242
250
|
"""
|
|
243
251
|
Returns resource name of any usable subnet in a given VPC
|
|
244
252
|
(e.g. "projects/example-project/regions/europe-west4/subnetworks/example-subnet")
|
|
245
253
|
"""
|
|
246
|
-
|
|
247
|
-
|
|
254
|
+
vpc_subnets = get_vpc_subnets(vpc_name, region, usable_subnets)
|
|
255
|
+
if vpc_subnets:
|
|
256
|
+
return vpc_subnets[0]
|
|
257
|
+
raise ComputeError(
|
|
258
|
+
f"No usable subnetwork found in region {region} for VPC {vpc_name}."
|
|
259
|
+
f" Ensure that VPC {vpc_name} exists and has usable subnetworks."
|
|
260
|
+
)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_vpc_subnets(
|
|
264
|
+
vpc_name: str,
|
|
265
|
+
region: str,
|
|
266
|
+
usable_subnets: list[compute_v1.UsableSubnetwork],
|
|
267
|
+
) -> list[str]:
|
|
268
|
+
"""
|
|
269
|
+
Returns resource names of all usable subnets in a given VPC
|
|
270
|
+
(e.g. ["projects/example-project/regions/europe-west4/subnetworks/example-subnet"])
|
|
271
|
+
"""
|
|
272
|
+
result = []
|
|
248
273
|
for subnet in usable_subnets:
|
|
249
274
|
network_name = subnet.network.split("/")[-1]
|
|
250
275
|
subnet_url = subnet.subnetwork
|
|
251
276
|
subnet_resource_name = remove_prefix(subnet_url, "https://www.googleapis.com/compute/v1/")
|
|
252
277
|
subnet_region = subnet_resource_name.split("/")[3]
|
|
253
278
|
if network_name == vpc_name and subnet_region == region:
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
f"No usable subnetwork found in region {region} for VPC {vpc_name} in project {vpc_project_id}."
|
|
257
|
-
f" Ensure that VPC {vpc_name} exists and has usable subnetworks."
|
|
258
|
-
)
|
|
279
|
+
result.append(subnet_resource_name)
|
|
280
|
+
return result
|
|
259
281
|
|
|
260
282
|
|
|
261
283
|
def create_runner_firewall_rules(
|
|
@@ -476,5 +498,6 @@ def instance_type_supports_persistent_disk(instance_type_name: str) -> bool:
|
|
|
476
498
|
"n4-",
|
|
477
499
|
"h3-",
|
|
478
500
|
"v6e",
|
|
501
|
+
"a4-",
|
|
479
502
|
]
|
|
480
503
|
)
|
|
@@ -11,6 +11,7 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
11
11
|
Compute,
|
|
12
12
|
ComputeWithAllOffersCached,
|
|
13
13
|
ComputeWithCreateInstanceSupport,
|
|
14
|
+
ComputeWithPrivilegedSupport,
|
|
14
15
|
get_shim_commands,
|
|
15
16
|
)
|
|
16
17
|
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
@@ -47,6 +48,7 @@ INSTANCE_TYPE_SPECS = {
|
|
|
47
48
|
class HotAisleCompute(
|
|
48
49
|
ComputeWithAllOffersCached,
|
|
49
50
|
ComputeWithCreateInstanceSupport,
|
|
51
|
+
ComputeWithPrivilegedSupport,
|
|
50
52
|
Compute,
|
|
51
53
|
):
|
|
52
54
|
def __init__(self, config: HotAisleConfig):
|