dstack-0.19.7-py3-none-any.whl → dstack-0.19.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +56 -13
- dstack/_internal/cli/utils/run.py +10 -5
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +3 -1
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +48 -0
- dstack/_internal/core/backends/nebius/models.py +9 -1
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/resources.py +79 -4
- dstack/_internal/core/models/runs.py +26 -9
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_metrics.py +26 -9
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/fleets.py +9 -26
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +18 -8
- dstack/_internal/server/settings.py +20 -1
- dstack/_internal/server/testing/common.py +37 -26
- dstack/_internal/utils/common.py +13 -1
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +48 -3
- dstack/version.py +1 -1
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/base/offers.py

@@ -2,6 +2,7 @@ from dataclasses import asdict
 from typing import Callable, List, Optional
 
 import gpuhunt
+from pydantic import parse_obj_as
 
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
@@ -11,13 +12,14 @@ from dstack._internal.core.models.instances import (
     InstanceType,
     Resources,
 )
-from dstack._internal.core.models.resources import DEFAULT_DISK, Memory, Range
+from dstack._internal.core.models.resources import DEFAULT_DISK, CPUSpec, Memory, Range
 from dstack._internal.core.models.runs import Requirements
 
 # Offers not supported by all dstack versions are hidden behind one or more flags.
 # This list enables the flags that are currently supported.
 SUPPORTED_GPUHUNT_FLAGS = [
     "oci-spot",
+    "lambda-arm",
 ]
 
 
@@ -71,6 +73,7 @@ def catalog_item_to_offer(
     if disk_size_mib is None:
         return None
     resources = Resources(
+        cpu_arch=item.cpu_arch,
        cpus=item.cpu,
        memory_mib=round(item.memory * 1024),
        gpus=gpus,
@@ -90,6 +93,9 @@ def catalog_item_to_offer(
 
 
 def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
+    cpu_arch = offer.instance.resources.cpu_arch
+    if cpu_arch is None:
+        cpu_arch = gpuhunt.CPUArchitecture.X86
     gpu_count = len(offer.instance.resources.gpus)
     gpu_vendor = None
     gpu_name = None
@@ -104,6 +110,7 @@ def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
         instance_name=offer.instance.name,
         location=offer.region,
         price=offer.price,
+        cpu_arch=cpu_arch,
         cpu=offer.instance.resources.cpus,
         memory=offer.instance.resources.memory_mib / 1024,
         gpu_count=gpu_count,
@@ -125,8 +132,11 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFilter:
 
     res = req.resources
     if res.cpu:
-
-
+        # TODO: Remove in 0.20. Use res.cpu directly
+        cpu = parse_obj_as(CPUSpec, res.cpu)
+        q.cpu_arch = cpu.arch
+        q.min_cpu = cpu.count.min
+        q.max_cpu = cpu.count.max
     if res.memory:
         q.min_memory = res.memory.min
         q.max_memory = res.memory.max
dstack/_internal/core/backends/cudo/compute.py

@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
     InstanceConfiguration,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
 
@@ -58,6 +59,7 @@ class CudoCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         public_keys = instance_config.get_public_keys()
@@ -145,7 +147,7 @@ class CudoCompute(
 
 
 def _get_image_id(cuda: bool) -> str:
-    image_name = "ubuntu-2204-nvidia-535-docker-
+    image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204"
     return image_name
 
 
dstack/_internal/core/backends/datacrunch/compute.py

@@ -20,6 +20,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
@@ -85,6 +86,7 @@ class DataCrunchCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
dstack/_internal/core/backends/gcp/auth.py

@@ -19,7 +19,7 @@ def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[
     credentials, credentials_project_id = get_credentials(creds)
     if project_id is None:
         # If project_id is not specified explicitly, try using credentials' project_id.
-        # Explicit project_id takes precedence
+        # Explicit project_id takes precedence because credentials' project_id may be irrelevant.
         # For example, with Workload Identity Federation for GKE, it's cluster project_id.
         project_id = credentials_project_id
     if project_id is None:
dstack/_internal/core/backends/gcp/compute.py

@@ -1,10 +1,12 @@
 import concurrent.futures
 import json
+import threading
 from collections import defaultdict
 from typing import Callable, Dict, List, Literal, Optional, Tuple
 
 import google.api_core.exceptions
 import google.cloud.compute_v1 as compute_v1
+from cachetools import TTLCache, cachedmethod
 from google.cloud import tpu_v2
 from gpuhunt import KNOWN_TPUS
 
@@ -98,6 +100,8 @@ class GCPCompute(
         self.resource_policies_client = compute_v1.ResourcePoliciesClient(
             credentials=self.credentials
         )
+        self._extra_subnets_cache_lock = threading.Lock()
+        self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)
 
     def get_offers(
         self, requirements: Optional[Requirements] = None
@@ -166,6 +170,7 @@ class GCPCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
@@ -192,18 +197,16 @@ class GCPCompute(
            config=self.config,
            region=instance_offer.region,
         )
-        extra_subnets = _get_extra_subnets(
-            subnetworks_client=self.subnetworks_client,
-            config=self.config,
+        extra_subnets = self._get_extra_subnets(
            region=instance_offer.region,
            instance_type_name=instance_offer.instance.name,
         )
         placement_policy = None
-        if
+        if placement_group is not None:
             placement_policy = gcp_resources.get_placement_policy_resource_name(
                 project_id=self.config.project_id,
                 region=instance_offer.region,
-                placement_policy=
+                placement_policy=placement_group.name,
             )
         labels = {
             "owner": "dstack",
@@ -406,6 +409,7 @@ class GCPCompute(
     def create_placement_group(
         self,
         placement_group: PlacementGroup,
+        master_instance_offer: InstanceOffer,
     ) -> PlacementGroupProvisioningData:
         policy = compute_v1.ResourcePolicy(
             name=placement_group.name,
@@ -440,6 +444,16 @@ class GCPCompute(
                raise PlacementGroupInUseError()
            raise
 
+    def is_suitable_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        instance_offer: InstanceOffer,
+    ) -> bool:
+        return (
+            placement_group.configuration.backend == BackendType.GCP
+            and placement_group.configuration.region == instance_offer.region
+        )
+
     def create_gateway(
         self,
         configuration: GatewayComputeConfiguration,
@@ -757,6 +771,38 @@ class GCPCompute(
            instance_id,
         )
 
+    @cachedmethod(
+        cache=lambda self: self._extra_subnets_cache,
+        lock=lambda self: self._extra_subnets_cache_lock,
+    )
+    def _get_extra_subnets(
+        self,
+        region: str,
+        instance_type_name: str,
+    ) -> List[Tuple[str, str]]:
+        if self.config.extra_vpcs is None:
+            return []
+        if instance_type_name == "a3-megagpu-8g":
+            subnets_num = 8
+        elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
+            subnets_num = 4
+        else:
+            return []
+        extra_subnets = []
+        for vpc_name in self.config.extra_vpcs[:subnets_num]:
+            subnet = gcp_resources.get_vpc_subnet_or_error(
+                subnetworks_client=self.subnetworks_client,
+                vpc_project_id=self.config.vpc_project_id or self.config.project_id,
+                vpc_name=vpc_name,
+                region=region,
+            )
+            vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
+                project_id=self.config.vpc_project_id or self.config.project_id,
+                vpc_name=vpc_name,
+            )
+            extra_subnets.append((vpc_resource_name, subnet))
+        return extra_subnets
+
 
 def _supported_instances_and_zones(
     regions: List[str],
@@ -831,36 +877,6 @@ def _get_vpc_subnet(
     )
 
 
-def _get_extra_subnets(
-    subnetworks_client: compute_v1.SubnetworksClient,
-    config: GCPConfig,
-    region: str,
-    instance_type_name: str,
-) -> List[Tuple[str, str]]:
-    if config.extra_vpcs is None:
-        return []
-    if instance_type_name == "a3-megagpu-8g":
-        subnets_num = 8
-    elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
-        subnets_num = 4
-    else:
-        return []
-    extra_subnets = []
-    for vpc_name in config.extra_vpcs[:subnets_num]:
-        subnet = gcp_resources.get_vpc_subnet_or_error(
-            subnetworks_client=subnetworks_client,
-            vpc_project_id=config.vpc_project_id or config.project_id,
-            vpc_name=vpc_name,
-            region=region,
-        )
-        vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
-            project_id=config.vpc_project_id or config.project_id,
-            vpc_name=vpc_name,
-        )
-        extra_subnets.append((vpc_resource_name, subnet))
-    return extra_subnets
-
-
 def _get_image_id(instance_type_name: str, cuda: bool) -> str:
     if instance_type_name == "a3-megagpu-8g":
         image_name = "dstack-a3mega-5"
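The relocated `_get_extra_subnets` helper is now a method whose results are memoized per `GCPCompute` instance. Below is a standalone sketch of the same `cachetools` pattern, a per-object `TTLCache` guarded by a `threading.Lock`; the `SubnetLookup` class and return values are hypothetical stand-ins for illustration.

import threading
import time

from cachetools import TTLCache, cachedmethod


class SubnetLookup:
    def __init__(self) -> None:
        self._cache_lock = threading.Lock()
        # Same parameters as GCPCompute above: at most 30 entries, each expiring after 60 seconds.
        self._cache = TTLCache(maxsize=30, ttl=60)

    @cachedmethod(cache=lambda self: self._cache, lock=lambda self: self._cache_lock)
    def lookup(self, region: str, instance_type_name: str) -> str:
        time.sleep(1)  # stands in for the slow subnetworks API calls
        return f"{region}/{instance_type_name}"


lookup = SubnetLookup()
lookup.lookup("us-central1", "a3-megagpu-8g")  # slow: computed, then cached
lookup.lookup("us-central1", "a3-megagpu-8g")  # fast: served from the TTL cache for up to 60 s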
dstack/_internal/core/backends/lambdalabs/compute.py

@@ -20,6 +20,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 
 MAX_INSTANCE_NAME_LEN = 60
@@ -46,7 +47,10 @@ class LambdaCompute(
         return offers_with_availability
 
     def create_instance(
-        self,
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -89,7 +93,10 @@ class LambdaCompute(
         instance_info = _get_instance_info(self.api_client, provisioning_data.instance_id)
         if instance_info is not None and instance_info["status"] != "booting":
             provisioning_data.hostname = instance_info["ip"]
-            commands = get_shim_commands(
+            commands = get_shim_commands(
+                authorized_keys=[project_ssh_public_key],
+                arch=provisioning_data.instance_type.resources.cpu_arch,
+            )
             # shim is assumed to be run under root
             launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
             thread = Thread(
@@ -179,13 +186,18 @@ def _setup_instance(
     ssh_private_key: str,
 ):
     setup_commands = (
-        "mkdir /home/ubuntu/.dstack
-        "sudo apt-get update
-        "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit
-        "sudo
-
+        "mkdir /home/ubuntu/.dstack",
+        "sudo apt-get update",
+        "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit",
+        "sudo install -d -m 0755 /etc/docker",
+        # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+        """echo '{"exec-opts":["native.cgroupdriver=cgroupfs"]}' | sudo tee /etc/docker/daemon.json""",
+        "sudo nvidia-ctk runtime configure --runtime=docker",
+        "sudo systemctl restart docker.service",  # `systemctl reload` (`kill -HUP`) won't work
+    )
+    _run_ssh_command(
+        hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands)
     )
-    _run_ssh_command(hostname=hostname, ssh_private_key=ssh_private_key, command=setup_commands)
 
 
 def _launch_runner(
dstack/_internal/core/backends/local/compute.py

@@ -15,6 +15,7 @@ from dstack._internal.core.models.instances import (
     InstanceType,
     Resources,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
 from dstack._internal.utils.logging import get_logger
@@ -53,6 +54,7 @@ class LocalCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         return JobProvisioningData(
             backend=instance_offer.backend,
dstack/_internal/core/backends/nebius/compute.py

@@ -1,4 +1,5 @@
 import json
+import random
 import shlex
 import time
 from functools import cached_property
@@ -13,13 +14,19 @@ from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
     generate_unique_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius import resources
+from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
-from dstack._internal.core.errors import
+from dstack._internal.core.errors import (
+    BackendError,
+    NotYetTerminated,
+    ProvisioningError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.instances import (
@@ -28,6 +35,11 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import (
+    PlacementGroup,
+    PlacementGroupProvisioningData,
+    PlacementStrategy,
+)
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
@@ -72,6 +84,7 @@ SUPPORTED_PLATFORMS = [
 class NebiusCompute(
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
     Compute,
 ):
     def __init__(self, config: NebiusConfig):
@@ -121,6 +134,7 @@ class NebiusCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         # NOTE: This method can block for a long time as it waits for the boot disk to be created
         # and the instance to enter the STARTING state. This has to be done in create_instance so
@@ -128,6 +142,14 @@ class NebiusCompute(
         # instance.
         instance_name = generate_unique_instance_name(instance_config)
         platform, preset = instance_offer.instance.name.split()
+        cluster_id = None
+        if placement_group:
+            assert placement_group.provisioning_data is not None
+            backend_data = NebiusPlacementGroupBackendData.load(
+                placement_group.provisioning_data.backend_data
+            )
+            if backend_data.cluster is not None:
+                cluster_id = backend_data.cluster.id
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
             name=instance_name,
@@ -155,6 +177,7 @@ class NebiusCompute(
            ),
            platform=platform,
            preset=preset,
+           cluster_id=cluster_id,
            disk_id=create_disk_op.resource_id,
            subnet_id=self._get_subnet_id(instance_offer.region),
         )
@@ -230,6 +253,63 @@ class NebiusCompute(
         with resources.ignore_errors([StatusCode.NOT_FOUND]):
             resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)
 
+    def create_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        master_instance_offer: InstanceOffer,
+    ) -> PlacementGroupProvisioningData:
+        assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
+        backend_data = NebiusPlacementGroupBackendData(cluster=None)
+        # Only create a Nebius cluster if the instance supports it.
+        # For other instances, return dummy PlacementGroupProvisioningData.
+        if fabrics := get_suitable_infiniband_fabrics(
+            master_instance_offer, allowed_fabrics=self.config.fabrics
+        ):
+            fabric = random.choice(fabrics)
+            op = resources.create_cluster(
+                self._sdk,
+                name=placement_group.name,
+                project_id=self._region_to_project_id[placement_group.configuration.region],
+                fabric=fabric,
+            )
+            backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
+        return PlacementGroupProvisioningData(
+            backend=BackendType.NEBIUS,
+            backend_data=backend_data.json(),
+        )
+
+    def delete_placement_group(self, placement_group: PlacementGroup) -> None:
+        assert placement_group.provisioning_data is not None
+        backend_data = NebiusPlacementGroupBackendData.load(
+            placement_group.provisioning_data.backend_data
+        )
+        if backend_data.cluster is not None:
+            with resources.ignore_errors([StatusCode.NOT_FOUND]):
+                resources.delete_cluster(self._sdk, backend_data.cluster.id)
+
+    def is_suitable_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        instance_offer: InstanceOffer,
+    ) -> bool:
+        if not (
+            placement_group.configuration.backend == BackendType.NEBIUS
+            and placement_group.configuration.region == instance_offer.region
+        ):
+            return False
+        assert placement_group.provisioning_data is not None
+        backend_data = NebiusPlacementGroupBackendData.load(
+            placement_group.provisioning_data.backend_data
+        )
+        return (
+            backend_data.cluster is None
+            or backend_data.cluster.fabric
+            in get_suitable_infiniband_fabrics(
+                instance_offer,
+                allowed_fabrics=None,  # enforced at cluster creation time, no need to enforce here
+            )
+        )
+
 
 class NebiusInstanceBackendData(CoreModel):
     boot_disk_id: str
@@ -240,6 +320,20 @@ class NebiusInstanceBackendData(CoreModel):
         return cls.__response__.parse_raw(raw)
 
 
+class NebiusClusterBackendData(CoreModel):
+    id: str
+    fabric: str
+
+
+class NebiusPlacementGroupBackendData(CoreModel):
+    cluster: Optional[NebiusClusterBackendData]
+
+    @classmethod
+    def load(cls, raw: Optional[str]) -> "NebiusPlacementGroupBackendData":
+        assert raw is not None
+        return cls.__response__.parse_raw(raw)
+
+
 def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
     start = time.monotonic()
     while True:
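A minimal round-trip sketch of the new backend-data models from the hunk above, assuming dstack 0.19.9 with its Nebius dependencies is installed (the classes live in the same module as `NebiusCompute`; the cluster id and fabric values are hypothetical):

from dstack._internal.core.backends.nebius.compute import (
    NebiusClusterBackendData,
    NebiusPlacementGroupBackendData,
)

# What create_placement_group stores in PlacementGroupProvisioningData.backend_data...
data = NebiusPlacementGroupBackendData(
    cluster=NebiusClusterBackendData(id="cluster-123", fabric="fabric-2")
)
raw = data.json()

# ...and what create_instance / delete_placement_group read back via .load().
restored = NebiusPlacementGroupBackendData.load(raw)
assert restored.cluster is not None and restored.cluster.fabric == "fabric-2"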
dstack/_internal/core/backends/nebius/configurator.py

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.configurator import (
 )
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.backend import NebiusBackend
+from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import (
     AnyNebiusBackendConfig,
     NebiusBackendConfig,
@@ -38,6 +39,16 @@ class NebiusConfigurator(Configurator):
                fields=[["creds"]],
                details=str(e),
            )
+        valid_fabrics = get_all_infiniband_fabrics()
+        if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
+            raise_invalid_credentials_error(
+                fields=[["fabrics"]],
+                details=(
+                    "These InfiniBand fabrics do not exist or are not known to dstack:"
+                    f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
+                    f" some of the valid options: {sorted(valid_fabrics)}"
+                ),
+            )
 
     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds
dstack/_internal/core/backends/nebius/fabrics.py (new file)

@@ -0,0 +1,48 @@
+from collections.abc import Container
+from dataclasses import dataclass
+from typing import Optional
+
+from dstack._internal.core.models.instances import InstanceOffer
+
+
+@dataclass(frozen=True)
+class InfinibandFabric:
+    name: str
+    platform: str
+    region: str
+
+
+# https://docs.nebius.com/compute/clusters/gpu#fabrics
+INFINIBAND_FABRICS = [
+    InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
+    InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+    InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
+]
+
+
+def get_suitable_infiniband_fabrics(
+    offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
+) -> list[str]:
+    if len(offer.instance.resources.gpus) < 8:
+        # From the create VM page in the Nebius Console:
+        # > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
+        # > can be added to the cluster
+        return []
+    platform, _ = offer.instance.name.split()
+    return [
+        f.name
+        for f in INFINIBAND_FABRICS
+        if (
+            f.platform == platform
+            and f.region == offer.region
+            and (allowed_fabrics is None or f.name in allowed_fabrics)
+        )
+    ]
+
+
+def get_all_infiniband_fabrics() -> set[str]:
+    return {f.name for f in INFINIBAND_FABRICS}
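A small usage sketch of the new helper, mirroring the `fabrics` validation added in the configurator above; the fabric names passed to the check are hypothetical user input:

from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics


def check_fabrics(requested: list[str]) -> None:
    # Same check the configurator performs on the `fabrics` backend setting.
    valid = get_all_infiniband_fabrics()
    if invalid := set(requested) - valid:
        raise ValueError(f"Unknown fabrics {sorted(invalid)}; valid options: {sorted(valid)}")


check_fabrics(["fabric-2", "fabric-5"])  # passes
# check_fabrics(["fabric-1"])            # would raise: not in INFINIBAND_FABRICS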
dstack/_internal/core/backends/nebius/models.py

@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
 from dstack._internal.core.backends.base.models import fill_data
 from dstack._internal.core.models.common import CoreModel
 
-DEFAULT_PROJECT_NAME_PREFIX = "default
+DEFAULT_PROJECT_NAME_PREFIX = "default"
 
 
 class NebiusServiceAccountCreds(CoreModel):
@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
         Optional[list[str]],
         Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
     ] = None
+    fabrics: Annotated[
+        Optional[list[str]],
+        Field(
+            description=(
+                "The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
+            )
+        ),
+    ] = None
 
 
 class NebiusBackendConfigWithCreds(NebiusBackendConfig):
dstack/_internal/core/backends/nebius/resources.py

@@ -15,14 +15,19 @@ from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
 from nebius.api.nebius.compute.v1 import (
     AttachedDiskSpec,
     CreateDiskRequest,
+    CreateGpuClusterRequest,
     CreateInstanceRequest,
     DeleteDiskRequest,
+    DeleteGpuClusterRequest,
     DeleteInstanceRequest,
     DiskServiceClient,
     DiskSpec,
     ExistingDisk,
     GetInstanceRequest,
+    GpuClusterServiceClient,
+    GpuClusterSpec,
     Instance,
+    InstanceGpuClusterSpec,
     InstanceServiceClient,
     InstanceSpec,
     IPAddress,
@@ -275,6 +280,7 @@ def create_instance(
     user_data: str,
     platform: str,
     preset: str,
+    cluster_id: Optional[str],
     disk_id: str,
     subnet_id: str,
 ) -> SDKOperation[Operation]:
@@ -287,6 +293,7 @@ def create_instance(
                spec=InstanceSpec(
                    cloud_init_user_data=user_data,
                    resources=ResourcesSpec(platform=platform, preset=preset),
+                   gpu_cluster=InstanceGpuClusterSpec(id=cluster_id) if cluster_id is not None else None,
                    boot_disk=AttachedDiskSpec(
                        attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
                        existing_disk=ExistingDisk(id=disk_id),
@@ -319,3 +326,25 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
            DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
        )
     )
+
+
+def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]:
+    with wrap_capacity_errors():
+        return LOOP.await_(
+            GpuClusterServiceClient(sdk).create(
+                CreateGpuClusterRequest(
+                    metadata=ResourceMetadata(name=name, parent_id=project_id),
+                    spec=GpuClusterSpec(infiniband_fabric=fabric),
+                ),
+                timeout=REQUEST_TIMEOUT,
+                metadata=REQUEST_MD,
+            )
+        )
+
+
+def delete_cluster(sdk: SDK, cluster_id: str) -> None:
+    return LOOP.await_(
+        GpuClusterServiceClient(sdk).delete(
+            DeleteGpuClusterRequest(id=cluster_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )
dstack/_internal/core/backends/oci/compute.py

@@ -23,6 +23,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 
@@ -105,6 +106,7 @@ class OCICompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         region = self.regions[instance_offer.region]
 