dstack 0.19.6rc1__py3-none-any.whl → 0.19.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/fleet.py +3 -2
- dstack/_internal/cli/services/configurators/run.py +50 -4
- dstack/_internal/cli/utils/fleet.py +3 -1
- dstack/_internal/cli/utils/run.py +25 -28
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/gcp/resources.py +6 -1
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +47 -0
- dstack/_internal/core/backends/nebius/models.py +8 -0
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +2 -1
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/fleets.py +2 -0
- dstack/_internal/core/models/instances.py +4 -3
- dstack/_internal/core/models/resources.py +80 -3
- dstack/_internal/core/models/runs.py +10 -3
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +1 -1
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/gateways.py +2 -1
- dstack/_internal/server/services/config.py +7 -2
- dstack/_internal/server/services/fleets.py +24 -26
- dstack/_internal/server/services/gateways/__init__.py +17 -2
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/plugins.py +77 -0
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runs.py +41 -17
- dstack/_internal/server/services/volumes.py +10 -1
- dstack/_internal/server/testing/common.py +35 -26
- dstack/_internal/utils/common.py +22 -9
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/__init__.py +8 -1
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +44 -3
- dstack/plugins/__init__.py +8 -0
- dstack/plugins/_base.py +72 -0
- dstack/plugins/_models.py +8 -0
- dstack/plugins/_utils.py +19 -0
- dstack/version.py +1 -1
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/METADATA +14 -2
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/RECORD +69 -62
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/WHEEL +0 -0
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.6rc1.dist-info → dstack-0.19.8.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/nebius/compute.py CHANGED

@@ -1,4 +1,5 @@
 import json
+import random
 import shlex
 import time
 from functools import cached_property
@@ -13,13 +14,19 @@ from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
     generate_unique_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius import resources
+from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
-from dstack._internal.core.errors import BackendError, ProvisioningError
+from dstack._internal.core.errors import (
+    BackendError,
+    NotYetTerminated,
+    ProvisioningError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.instances import (
@@ -28,6 +35,11 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import (
+    PlacementGroup,
+    PlacementGroupProvisioningData,
+    PlacementStrategy,
+)
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
@@ -72,6 +84,7 @@ SUPPORTED_PLATFORMS = [
 class NebiusCompute(
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
     Compute,
 ):
     def __init__(self, config: NebiusConfig):
@@ -121,6 +134,7 @@ class NebiusCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         # NOTE: This method can block for a long time as it waits for the boot disk to be created
         # and the instance to enter the STARTING state. This has to be done in create_instance so
@@ -128,6 +142,14 @@ class NebiusCompute(
         # instance.
         instance_name = generate_unique_instance_name(instance_config)
         platform, preset = instance_offer.instance.name.split()
+        cluster_id = None
+        if placement_group:
+            assert placement_group.provisioning_data is not None
+            backend_data = NebiusPlacementGroupBackendData.load(
+                placement_group.provisioning_data.backend_data
+            )
+            if backend_data.cluster is not None:
+                cluster_id = backend_data.cluster.id
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
             name=instance_name,
@@ -155,6 +177,7 @@ class NebiusCompute(
             ),
             platform=platform,
             preset=preset,
+            cluster_id=cluster_id,
             disk_id=create_disk_op.resource_id,
             subnet_id=self._get_subnet_id(instance_offer.region),
         )
@@ -230,6 +253,63 @@ class NebiusCompute(
         with resources.ignore_errors([StatusCode.NOT_FOUND]):
             resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)

+    def create_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        master_instance_offer: InstanceOffer,
+    ) -> PlacementGroupProvisioningData:
+        assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
+        backend_data = NebiusPlacementGroupBackendData(cluster=None)
+        # Only create a Nebius cluster if the instance supports it.
+        # For other instances, return dummy PlacementGroupProvisioningData.
+        if fabrics := get_suitable_infiniband_fabrics(
+            master_instance_offer, allowed_fabrics=self.config.fabrics
+        ):
+            fabric = random.choice(fabrics)
+            op = resources.create_cluster(
+                self._sdk,
+                name=placement_group.name,
+                project_id=self._region_to_project_id[placement_group.configuration.region],
+                fabric=fabric,
+            )
+            backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
+        return PlacementGroupProvisioningData(
+            backend=BackendType.NEBIUS,
+            backend_data=backend_data.json(),
+        )
+
+    def delete_placement_group(self, placement_group: PlacementGroup) -> None:
+        assert placement_group.provisioning_data is not None
+        backend_data = NebiusPlacementGroupBackendData.load(
+            placement_group.provisioning_data.backend_data
+        )
+        if backend_data.cluster is not None:
+            with resources.ignore_errors([StatusCode.NOT_FOUND]):
+                resources.delete_cluster(self._sdk, backend_data.cluster.id)
+
+    def is_suitable_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        instance_offer: InstanceOffer,
+    ) -> bool:
+        if not (
+            placement_group.configuration.backend == BackendType.NEBIUS
+            and placement_group.configuration.region == instance_offer.region
+        ):
+            return False
+        assert placement_group.provisioning_data is not None
+        backend_data = NebiusPlacementGroupBackendData.load(
+            placement_group.provisioning_data.backend_data
+        )
+        return (
+            backend_data.cluster is None
+            or backend_data.cluster.fabric
+            in get_suitable_infiniband_fabrics(
+                instance_offer,
+                allowed_fabrics=None,  # enforced at cluster creation time, no need to enforce here
+            )
+        )
+

 class NebiusInstanceBackendData(CoreModel):
     boot_disk_id: str
@@ -240,6 +320,20 @@ class NebiusInstanceBackendData(CoreModel):
         return cls.__response__.parse_raw(raw)


+class NebiusClusterBackendData(CoreModel):
+    id: str
+    fabric: str
+
+
+class NebiusPlacementGroupBackendData(CoreModel):
+    cluster: Optional[NebiusClusterBackendData]
+
+    @classmethod
+    def load(cls, raw: Optional[str]) -> "NebiusPlacementGroupBackendData":
+        assert raw is not None
+        return cls.__response__.parse_raw(raw)
+
+
 def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
     start = time.monotonic()
     while True:
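Note: taken together, these hunks implement the ComputeWithPlacementGroupSupport interface for Nebius by mapping dstack placement groups onto Nebius GPU clusters. A minimal sketch of the lifecycle, assuming a configured `compute` (NebiusCompute) plus a server-prepared `offer` and `instance_config`; the PlacementGroupConfiguration name and the model construction are inferred from the hunks and simplified, and in practice these calls are driven by the server's background tasks rather than user code:

    from dstack._internal.core.models.backends.base import BackendType
    from dstack._internal.core.models.placement import (
        PlacementGroup,
        PlacementGroupConfiguration,
        PlacementStrategy,
    )

    pg = PlacementGroup(
        name="fleet-abc12",  # hypothetical name
        configuration=PlacementGroupConfiguration(
            backend=BackendType.NEBIUS,
            region="eu-north1",
            placement_strategy=PlacementStrategy.CLUSTER,
        ),
    )
    # Creates a GPU cluster only if the master offer supports an InfiniBand
    # fabric; otherwise the provisioning data carries cluster=None.
    pg.provisioning_data = compute.create_placement_group(pg, master_instance_offer=offer)
    # Later offers are checked against the group's fabric before reuse.
    if compute.is_suitable_placement_group(pg, offer):
        jpd = compute.create_instance(offer, instance_config, placement_group=pg)
    # On teardown the cluster, if any, is deleted, ignoring NOT_FOUND.
    compute.delete_placement_group(pg)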
dstack/_internal/core/backends/nebius/configurator.py CHANGED

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.configurator import (
 )
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.backend import NebiusBackend
+from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import (
     AnyNebiusBackendConfig,
     NebiusBackendConfig,
@@ -38,6 +39,16 @@ class NebiusConfigurator(Configurator):
                 fields=[["creds"]],
                 details=str(e),
             )
+        valid_fabrics = get_all_infiniband_fabrics()
+        if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
+            raise_invalid_credentials_error(
+                fields=[["fabrics"]],
+                details=(
+                    "These InfiniBand fabrics do not exist or are not known to dstack:"
+                    f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
+                    f" some of the valid options: {sorted(valid_fabrics)}"
+                ),
+            )

     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds
dstack/_internal/core/backends/nebius/fabrics.py ADDED

@@ -0,0 +1,47 @@
+from collections.abc import Container
+from dataclasses import dataclass
+from typing import Optional
+
+from dstack._internal.core.models.instances import InstanceOffer
+
+
+@dataclass(frozen=True)
+class InfinibandFabric:
+    name: str
+    platform: str
+    region: str
+
+
+# https://docs.nebius.com/compute/clusters/gpu#fabrics
+INFINIBAND_FABRICS = [
+    InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
+    InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+]
+
+
+def get_suitable_infiniband_fabrics(
+    offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
+) -> list[str]:
+    if len(offer.instance.resources.gpus) < 8:
+        # From the create VM page in the Nebius Console:
+        # > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
+        # > can be added to the cluster
+        return []
+    platform, _ = offer.instance.name.split()
+    return [
+        f.name
+        for f in INFINIBAND_FABRICS
+        if (
+            f.platform == platform
+            and f.region == offer.region
+            and (allowed_fabrics is None or f.name in allowed_fabrics)
+        )
+    ]
+
+
+def get_all_infiniband_fabrics() -> set[str]:
+    return {f.name for f in INFINIBAND_FABRICS}
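Note: fabric matching keys off the platform prefix of the offer name (the same `"<platform> <preset>"` split used in compute.py) plus the region. As an illustration, a hypothetical 8x H100 offer in eu-north1 would match fabric-2, fabric-3, fabric-4, and fabric-6; the preset name, price, and field set below are assumptions, and the real models may require additional fields:

    import gpuhunt

    from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
    from dstack._internal.core.models.backends.base import BackendType
    from dstack._internal.core.models.instances import Gpu, InstanceOffer, InstanceType, Resources

    offer = InstanceOffer(
        backend=BackendType.NEBIUS,
        instance=InstanceType(
            name="gpu-h100-sxm 8gpu-128vcpu-1600gb",  # "<platform> <preset>", preset hypothetical
            resources=Resources(
                cpus=128,
                memory_mib=1600 * 1024,
                spot=False,
                gpus=[
                    Gpu(name="H100", memory_mib=81920, vendor=gpuhunt.AcceleratorVendor.NVIDIA)
                ] * 8,
            ),
        ),
        region="eu-north1",
        price=23.9,  # hypothetical
    )
    get_suitable_infiniband_fabrics(offer, allowed_fabrics=None)
    # -> ["fabric-2", "fabric-3", "fabric-4", "fabric-6"]
    get_suitable_infiniband_fabrics(offer, allowed_fabrics={"fabric-3"})
    # -> ["fabric-3"]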
dstack/_internal/core/backends/nebius/models.py CHANGED

@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
         Optional[list[str]],
         Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
     ] = None
+    fabrics: Annotated[
+        Optional[list[str]],
+        Field(
+            description=(
+                "The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
+            )
+        ),
+    ] = None


 class NebiusBackendConfigWithCreds(NebiusBackendConfig):
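With the new field, a backend config that pins cluster placement to specific fabrics might look like this sketch (credentials omitted; they are validated separately, and the model may carry more fields than shown):

    from dstack._internal.core.backends.nebius.models import NebiusBackendConfig

    config = NebiusBackendConfig(
        regions=["eu-north1"],
        fabrics=["fabric-2", "fabric-4"],  # must be a subset of get_all_infiniband_fabrics()
    )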
dstack/_internal/core/backends/nebius/resources.py CHANGED

@@ -15,14 +15,19 @@ from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
 from nebius.api.nebius.compute.v1 import (
     AttachedDiskSpec,
     CreateDiskRequest,
+    CreateGpuClusterRequest,
     CreateInstanceRequest,
     DeleteDiskRequest,
+    DeleteGpuClusterRequest,
     DeleteInstanceRequest,
     DiskServiceClient,
     DiskSpec,
     ExistingDisk,
     GetInstanceRequest,
+    GpuClusterServiceClient,
+    GpuClusterSpec,
     Instance,
+    InstanceGpuClusterSpec,
     InstanceServiceClient,
     InstanceSpec,
     IPAddress,
@@ -275,6 +280,7 @@ def create_instance(
     user_data: str,
     platform: str,
     preset: str,
+    cluster_id: Optional[str],
     disk_id: str,
     subnet_id: str,
 ) -> SDKOperation[Operation]:
@@ -287,6 +293,7 @@ def create_instance(
         spec=InstanceSpec(
             cloud_init_user_data=user_data,
             resources=ResourcesSpec(platform=platform, preset=preset),
+            gpu_cluster=InstanceGpuClusterSpec(id=cluster_id) if cluster_id is not None else None,
             boot_disk=AttachedDiskSpec(
                 attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
                 existing_disk=ExistingDisk(id=disk_id),
@@ -319,3 +326,25 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
             DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
         )
     )
+
+
+def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]:
+    with wrap_capacity_errors():
+        return LOOP.await_(
+            GpuClusterServiceClient(sdk).create(
+                CreateGpuClusterRequest(
+                    metadata=ResourceMetadata(name=name, parent_id=project_id),
+                    spec=GpuClusterSpec(infiniband_fabric=fabric),
+                ),
+                timeout=REQUEST_TIMEOUT,
+                metadata=REQUEST_MD,
+            )
+        )
+
+
+def delete_cluster(sdk: SDK, cluster_id: str) -> None:
+    return LOOP.await_(
+        GpuClusterServiceClient(sdk).delete(
+            DeleteGpuClusterRequest(id=cluster_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )
dstack/_internal/core/backends/oci/compute.py CHANGED

@@ -23,6 +23,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements

@@ -105,6 +106,7 @@ class OCICompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         region = self.regions[instance_offer.region]

dstack/_internal/core/backends/remote/provisioning.py CHANGED

@@ -6,8 +6,9 @@ from textwrap import dedent
 from typing import Any, Dict, Generator, List, Optional

 import paramiko
-from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
+from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib

+from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT

 # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -36,6 +37,22 @@ DSTACK_SHIM_ENV_FILE = "shim.env"
 HOST_INFO_FILE = "host_info.json"


+def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType:
+    cmd = "uname -m"
+    try:
+        _, stdout, stderr = client.exec_command(cmd, timeout=20)
+    except (paramiko.SSHException, OSError) as e:
+        raise ProvisioningError(f"detect_cpu_arch: {e}") from e
+    out = stdout.read().strip().decode()
+    err = stderr.read().strip().decode()
+    if err:
+        raise ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}")
+    try:
+        return normalize_arch(out)
+    except ValueError as e:
+        raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e
+
+
 def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None:
     try:
         sftp = client.open_sftp()
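Note: detect_cpu_arch runs `uname -m` over SSH on the remote host and returns a Go-style architecture string (GoArchType; the branches in host_info_to_instance_type below show the two expected values, "amd64" and "arm64"). normalize_arch lives in base/compute.py, which changed in this release but is not rendered here; roughly, it is expected to perform a mapping like the following sketch, though the real implementation may accept more aliases:

    # Sketch only, not the actual dstack implementation.
    _UNAME_TO_GO_ARCH = {
        "x86_64": "amd64",
        "amd64": "amd64",
        "aarch64": "arm64",
        "arm64": "arm64",
    }

    def normalize_arch_sketch(uname_m: str) -> str:
        try:
            return _UNAME_TO_GO_ARCH[uname_m.strip().lower()]
        except KeyError:
            raise ValueError(f"Unsupported CPU architecture: {uname_m}")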
@@ -226,7 +243,14 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
         raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e


-def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
+def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
+    _cpu_arch: CPUArchitecture
+    if cpu_arch == "amd64":
+        _cpu_arch = CPUArchitecture.X86
+    elif cpu_arch == "arm64":
+        _cpu_arch = CPUArchitecture.ARM
+    else:
+        raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
     gpu_count = host_info.get("gpu_count", 0)
     if gpu_count > 0:
         gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -251,6 +275,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
     instance_type = InstanceType(
         name="instance",
         resources=Resources(
+            cpu_arch=_cpu_arch,
             cpus=host_info["cpus"],
             memory_mib=host_info["memory"] / 1024 / 1024,
             spot=False,
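host_info_to_instance_type now threads the detected architecture into Resources for SSH fleets. A usage sketch with hypothetical values (the host_info keys are taken from the hunk above; "memory" is in bytes, hence the division to MiB):

    from gpuhunt import CPUArchitecture

    from dstack._internal.core.backends.remote.provisioning import host_info_to_instance_type

    host_info = {"cpus": 16, "memory": 64 * 1024**3, "gpu_count": 0}  # hypothetical
    instance_type = host_info_to_instance_type(host_info, cpu_arch="arm64")
    assert instance_type.resources.cpu_arch == CPUArchitecture.ARM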
dstack/_internal/core/backends/template/compute.py.jinja CHANGED

@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
     InstanceConfiguration,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.utils.logging import get_logger
@@ -64,6 +65,7 @@ class {{ backend_name }}Compute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         # TODO: Implement if backend supports creating instances (VM-based).
         # Delete if backend can only run jobs (container-based).
dstack/_internal/core/backends/tensordock/compute.py CHANGED

@@ -19,6 +19,7 @@ from dstack._internal.core.models.instances import (
     InstanceConfiguration,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger

@@ -57,6 +58,7 @@ class TensorDockCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
dstack/_internal/core/backends/vultr/compute.py CHANGED

@@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger

@@ -58,7 +59,10 @@ class VultrCompute(
         return offers

     def create_instance(
-        self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
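Note: the same signature change is applied across the backends with create-instance support listed in the file table above (aws, azure, cudo, datacrunch, gcp, lambdalabs, local, nebius, oci, tensordock, vastai, vultr, and the backend template), while InstanceConfiguration loses its placement_group_name field (see the models/instances.py section below). Reconstructed from the hunks, the shape of the updated method is approximately the following; the authoritative base class is in dstack/_internal/core/backends/base/compute.py, which changed +101 -27 in this diff but is not rendered here:

    from abc import ABC, abstractmethod
    from typing import Optional

    from dstack._internal.core.models.instances import (
        InstanceConfiguration,
        InstanceOfferWithAvailability,
    )
    from dstack._internal.core.models.placement import PlacementGroup
    from dstack._internal.core.models.runs import JobProvisioningData

    class ComputeWithCreateInstanceSupport(ABC):
        @abstractmethod
        def create_instance(
            self,
            instance_offer: InstanceOfferWithAvailability,
            instance_config: InstanceConfiguration,
            placement_group: Optional[PlacementGroup],  # replaces placement_group_name
        ) -> JobProvisioningData: ...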
dstack/_internal/core/models/fleets.py CHANGED

@@ -269,6 +269,8 @@ class FleetSpec(CoreModel):
     configuration_path: Optional[str] = None
     profile: Profile
     autocreated: bool = False
+    # merged_profile stores profile parameters merged from profile and configuration.
+    # Read profile parameters from merged_profile instead of profile directly.
     # TODO: make merged_profile a computed field after migrating to pydanticV2
     merged_profile: Annotated[Profile, Field(exclude=True)] = None

dstack/_internal/core/models/instances.py CHANGED

@@ -49,15 +49,17 @@ class Resources(CoreModel):
     spot: bool
     disk: Disk = Disk(size_mib=102400)  # the default value (100GB) for backward compatibility
     description: str = ""
+    cpu_arch: Optional[gpuhunt.CPUArchitecture] = None

     def pretty_format(self, include_spot: bool = False) -> str:
         resources = {}
         if self.cpus > 0:
             resources["cpus"] = self.cpus
+            resources["cpu_arch"] = self.cpu_arch
         if self.memory_mib > 0:
             resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
         if self.disk.size_mib > 0:
-            resources["disk_size"] = f"{self.disk.size_mib / 1024:.
+            resources["disk_size"] = f"{self.disk.size_mib / 1024:.0f}GB"
         if self.gpus:
             gpu = self.gpus[0]
             resources["gpu_name"] = gpu.name
@@ -66,7 +68,7 @@ class Resources(CoreModel):
             resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB"
         output = pretty_resources(**resources)
         if include_spot and self.spot:
-            output += "
+            output += " (spot)"
         return output


@@ -105,7 +107,6 @@ class InstanceConfiguration(CoreModel):
     user: str  # dstack user name
     ssh_keys: List[SSHKey]
     instance_id: Optional[str] = None
-    placement_group_name: Optional[str] = None
     reservation: Optional[str] = None
     volumes: Optional[List[Volume]] = None
     tags: Optional[Dict[str, str]] = None
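Resources now records the CPU architecture and includes it in pretty-printed output. A small sketch with hypothetical values (the exact output string depends on pretty_resources, which is not rendered in this diff):

    import gpuhunt

    from dstack._internal.core.models.instances import Resources

    res = Resources(
        cpus=16,
        memory_mib=64 * 1024,
        gpus=[],
        spot=True,
        cpu_arch=gpuhunt.CPUArchitecture.ARM,
    )
    print(res.pretty_format(include_spot=True))  # arch now appears alongside the cpu count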
dstack/_internal/core/models/resources.py CHANGED

@@ -1,8 +1,9 @@
 import math
+from collections.abc import Mapping
 from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union

 import gpuhunt
-from pydantic import Field, root_validator, validator
+from pydantic import Field, parse_obj_as, root_validator, validator
 from pydantic.generics import GenericModel
 from typing_extensions import Annotated

@@ -128,6 +129,67 @@ DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
 DEFAULT_GPU_COUNT = Range[int](min=1, max=1)


+class CPUSpec(CoreModel):
+    class Config:
+        @staticmethod
+        def schema_extra(schema: Dict[str, Any]):
+            add_extra_schema_types(
+                schema["properties"]["count"],
+                extra_types=[{"type": "integer"}, {"type": "string"}],
+            )
+
+    arch: Annotated[
+        Optional[gpuhunt.CPUArchitecture],
+        Field(description="The CPU architecture, one of: `x86`, `arm`"),
+    ] = None
+    count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+
+    @classmethod
+    def __get_validators__(cls):
+        yield cls.parse
+        yield cls.validate
+
+    @classmethod
+    def parse(cls, v: Any) -> Any:
+        if isinstance(v, int):
+            v = str(v)
+        if isinstance(v, str):
+            tokens = v.replace(" ", "").split(":")
+            spec = {}
+            for token in tokens:
+                if not token:
+                    raise ValueError(f"CPU spec contains empty token: {v}")
+                if ".." in token or token.isdigit():
+                    if "count" in spec:
+                        raise ValueError(f"CPU spec count conflict: {v}")
+                    spec["count"] = token
+                else:
+                    try:
+                        arch = gpuhunt.CPUArchitecture.cast(token)
+                    except ValueError:
+                        raise ValueError(f"Invalid CPU architecture: {v}")
+                    if "arch" in spec:
+                        raise ValueError(f"CPU spec arch conflict: {v}")
+                    spec["arch"] = arch
+            return spec
+        # Range and min/max dict - for backward compatibility
+        if isinstance(v, Range):
+            return {"arch": None, "count": v}
+        if isinstance(v, Mapping) and v.keys() == {"min", "max"}:
+            return {"arch": None, "count": v}
+        return v
+
+    @validator("arch", pre=True)
+    def _validate_arch(cls, v: Any) -> Any:
+        if v is None:
+            return None
+        if isinstance(v, gpuhunt.CPUArchitecture):
+            return v
+        if isinstance(v, str):
+            return gpuhunt.CPUArchitecture.cast(v)
+        return v
+
+
 class GPUSpec(CoreModel):
     class Config:
         @staticmethod
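The new CPUSpec accepts plain ints, strings of colon-separated tokens (a count or count range plus an optional architecture, in either order), and the legacy Range/min-max forms. For example, using parse_obj_as, the same entry point ResourcesSpec.pretty_format uses below:

    from pydantic import parse_obj_as

    from dstack._internal.core.models.resources import CPUSpec

    parse_obj_as(CPUSpec, 4)                     # arch=None, count=4
    parse_obj_as(CPUSpec, "arm:4..8")            # arch=ARM, count in [4, 8]
    parse_obj_as(CPUSpec, "8..:x86")             # arch=X86; token order does not matter
    parse_obj_as(CPUSpec, {"min": 2, "max": 8})  # legacy min/max dict -> arch=None, count=2..8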
@@ -246,6 +308,8 @@ class GPUSpec(CoreModel):
         v = v.lower()
         if v == "tpu":
             return gpuhunt.AcceleratorVendor.GOOGLE
+        if v == "tt":
+            return gpuhunt.AcceleratorVendor.TENSTORRENT
         return gpuhunt.AcceleratorVendor.cast(v)


@@ -300,7 +364,10 @@ class ResourcesSpec(CoreModel):
             extra_types=[{"type": "integer"}, {"type": "string"}],
         )

-    cpu: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+    # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
+    cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
+        CPUSpec()
+    )
     memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
         DEFAULT_MEMORY_SIZE
     )
@@ -315,8 +382,18 @@ class ResourcesSpec(CoreModel):
     gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
     disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK

+    # TODO: Remove in 0.20. Added for backward compatibility.
+    @root_validator
+    def _post_validate(cls, values):
+        cpu = values.get("cpu")
+        if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
+            values["cpu"] = cpu.count
+        return values
+
     def pretty_format(self) -> str:
-        resources = dict(cpus=self.cpu, memory=self.memory)
+        # TODO: Remove in 0.20. Use self.cpu directly
+        cpu = parse_obj_as(CPUSpec, self.cpu)
+        resources: Dict[str, Any] = dict(cpu_arch=cpu.arch, cpus=cpu.count, memory=self.memory)
         if self.gpu:
             gpu = self.gpu
             resources.update(
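The _post_validate hook keeps serialized specs backward compatible: when the architecture is unset or x86 (the only architecture older clients understand), the cpu field is collapsed back to a plain count range, and only non-default architectures keep the full CPUSpec. A sketch of the resulting behavior:

    from dstack._internal.core.models.resources import CPUSpec, Range, ResourcesSpec

    assert isinstance(ResourcesSpec(cpu="x86:2..").cpu, Range)  # collapsed to Range[int]
    assert isinstance(ResourcesSpec(cpu=4).cpu, Range)          # arch=None also collapses
    assert isinstance(ResourcesSpec(cpu="arm:8").cpu, CPUSpec)  # arm is preserved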
dstack/_internal/core/models/runs.py CHANGED

@@ -162,7 +162,7 @@ class Requirements(CoreModel):
         if self.spot is not None:
             res += f", {'spot' if self.spot else 'on-demand'}"
         if self.max_price is not None:
-            res += f" under ${self.max_price:
+            res += f" under ${self.max_price:3f}".rstrip("0").rstrip(".") + " per hour"
         return res


@@ -357,6 +357,8 @@ class RunSpec(CoreModel):
             description="The contents of the SSH public key that will be used to connect to the run."
         ),
     ]
+    # merged_profile stores profile parameters merged from profile and configuration.
+    # Read profile parameters from merged_profile instead of profile directly.
     # TODO: make merged_profile a computed field after migrating to pydanticV2
     merged_profile: Annotated[Profile, Field(exclude=True)] = None

@@ -437,9 +439,14 @@ class Run(CoreModel):

     @root_validator
     def _error(cls, values) -> Dict:
+        try:
+            termination_reason = values["termination_reason"]
+            jobs = values["jobs"]
+        except KeyError:
+            return values
         values["error"] = _get_run_error(
-            run_termination_reason=values["termination_reason"],
-            run_jobs=values["jobs"],
+            run_termination_reason=termination_reason,
+            run_jobs=jobs,
         )
         return values

dstack/_internal/core/models/volumes.py CHANGED

@@ -159,7 +159,7 @@ class VolumeMountPoint(CoreModel):
         description=(
             "The network volume name or the list of network volume names to mount."
             " If a list is specified, one of the volumes in the list will be mounted."
-            " Specify volumes from different backends/regions to increase availability
+            " Specify volumes from different backends/regions to increase availability"
         )
     ),
 ]