dstack 0.19.7__py3-none-any.whl → 0.19.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (52)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +38 -2
  3. dstack/_internal/cli/utils/run.py +3 -3
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +2 -0
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +47 -0
  19. dstack/_internal/core/backends/nebius/models.py +8 -0
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +78 -3
  28. dstack/_internal/core/models/runs.py +7 -2
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  33. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  34. dstack/_internal/server/models.py +1 -0
  35. dstack/_internal/server/services/fleets.py +9 -26
  36. dstack/_internal/server/services/instances.py +0 -2
  37. dstack/_internal/server/services/offers.py +15 -0
  38. dstack/_internal/server/services/placement.py +27 -6
  39. dstack/_internal/server/services/resources.py +21 -0
  40. dstack/_internal/server/services/runs.py +16 -6
  41. dstack/_internal/server/testing/common.py +35 -26
  42. dstack/_internal/utils/common.py +13 -1
  43. dstack/_internal/utils/json_schema.py +6 -3
  44. dstack/api/__init__.py +1 -0
  45. dstack/api/server/_fleets.py +16 -0
  46. dstack/api/server/_runs.py +44 -3
  47. dstack/version.py +1 -1
  48. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/METADATA +3 -1
  49. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/RECORD +52 -50
  50. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/WHEEL +0 -0
  51. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/entry_points.txt +0 -0
  52. {dstack-0.19.7.dist-info → dstack-0.19.8.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/gcp/compute.py

@@ -1,10 +1,12 @@
  import concurrent.futures
  import json
+ import threading
  from collections import defaultdict
  from typing import Callable, Dict, List, Literal, Optional, Tuple

  import google.api_core.exceptions
  import google.cloud.compute_v1 as compute_v1
+ from cachetools import TTLCache, cachedmethod
  from google.cloud import tpu_v2
  from gpuhunt import KNOWN_TPUS

@@ -98,6 +100,8 @@ class GCPCompute(
  self.resource_policies_client = compute_v1.ResourcePoliciesClient(
  credentials=self.credentials
  )
+ self._extra_subnets_cache_lock = threading.Lock()
+ self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)

  def get_offers(
  self, requirements: Optional[Requirements] = None
@@ -166,6 +170,7 @@ class GCPCompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  instance_name = generate_unique_instance_name(
  instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
@@ -192,18 +197,16 @@ class GCPCompute(
  config=self.config,
  region=instance_offer.region,
  )
- extra_subnets = _get_extra_subnets(
- subnetworks_client=self.subnetworks_client,
- config=self.config,
+ extra_subnets = self._get_extra_subnets(
  region=instance_offer.region,
  instance_type_name=instance_offer.instance.name,
  )
  placement_policy = None
- if instance_config.placement_group_name is not None:
+ if placement_group is not None:
  placement_policy = gcp_resources.get_placement_policy_resource_name(
  project_id=self.config.project_id,
  region=instance_offer.region,
- placement_policy=instance_config.placement_group_name,
+ placement_policy=placement_group.name,
  )
  labels = {
  "owner": "dstack",
@@ -406,6 +409,7 @@ class GCPCompute(
  def create_placement_group(
  self,
  placement_group: PlacementGroup,
+ master_instance_offer: InstanceOffer,
  ) -> PlacementGroupProvisioningData:
  policy = compute_v1.ResourcePolicy(
  name=placement_group.name,
@@ -440,6 +444,16 @@ class GCPCompute(
  raise PlacementGroupInUseError()
  raise

+ def is_suitable_placement_group(
+ self,
+ placement_group: PlacementGroup,
+ instance_offer: InstanceOffer,
+ ) -> bool:
+ return (
+ placement_group.configuration.backend == BackendType.GCP
+ and placement_group.configuration.region == instance_offer.region
+ )
+
  def create_gateway(
  self,
  configuration: GatewayComputeConfiguration,
@@ -757,6 +771,38 @@ class GCPCompute(
  instance_id,
  )

+ @cachedmethod(
+ cache=lambda self: self._extra_subnets_cache,
+ lock=lambda self: self._extra_subnets_cache_lock,
+ )
+ def _get_extra_subnets(
+ self,
+ region: str,
+ instance_type_name: str,
+ ) -> List[Tuple[str, str]]:
+ if self.config.extra_vpcs is None:
+ return []
+ if instance_type_name == "a3-megagpu-8g":
+ subnets_num = 8
+ elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
+ subnets_num = 4
+ else:
+ return []
+ extra_subnets = []
+ for vpc_name in self.config.extra_vpcs[:subnets_num]:
+ subnet = gcp_resources.get_vpc_subnet_or_error(
+ subnetworks_client=self.subnetworks_client,
+ vpc_project_id=self.config.vpc_project_id or self.config.project_id,
+ vpc_name=vpc_name,
+ region=region,
+ )
+ vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
+ project_id=self.config.vpc_project_id or self.config.project_id,
+ vpc_name=vpc_name,
+ )
+ extra_subnets.append((vpc_resource_name, subnet))
+ return extra_subnets
+

  def _supported_instances_and_zones(
  regions: List[str],
@@ -831,36 +877,6 @@ def _get_vpc_subnet(
  )


- def _get_extra_subnets(
- subnetworks_client: compute_v1.SubnetworksClient,
- config: GCPConfig,
- region: str,
- instance_type_name: str,
- ) -> List[Tuple[str, str]]:
- if config.extra_vpcs is None:
- return []
- if instance_type_name == "a3-megagpu-8g":
- subnets_num = 8
- elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
- subnets_num = 4
- else:
- return []
- extra_subnets = []
- for vpc_name in config.extra_vpcs[:subnets_num]:
- subnet = gcp_resources.get_vpc_subnet_or_error(
- subnetworks_client=subnetworks_client,
- vpc_project_id=config.vpc_project_id or config.project_id,
- vpc_name=vpc_name,
- region=region,
- )
- vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
- project_id=config.vpc_project_id or config.project_id,
- vpc_name=vpc_name,
- )
- extra_subnets.append((vpc_resource_name, subnet))
- return extra_subnets
-
-
  def _get_image_id(instance_type_name: str, cuda: bool) -> str:
  if instance_type_name == "a3-megagpu-8g":
  image_name = "dstack-a3mega-5"
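
The caching added above uses cachetools' cachedmethod with a per-instance TTLCache guarded by a lock. A minimal, self-contained sketch of the same pattern, with illustrative names that are not taken from dstack:

import threading
import time

from cachetools import TTLCache, cachedmethod


class SubnetLookup:
    """Illustrative only: memoize an expensive per-(region, instance type) lookup for 60s."""

    def __init__(self) -> None:
        self._cache_lock = threading.Lock()
        self._cache = TTLCache(maxsize=30, ttl=60)

    @cachedmethod(cache=lambda self: self._cache, lock=lambda self: self._cache_lock)
    def get_extra_subnets(self, region: str, instance_type_name: str) -> list[str]:
        time.sleep(1)  # stands in for the real subnetwork API calls
        return [f"{region}-extra-subnet-{i}" for i in range(4)]


lookup = SubnetLookup()
lookup.get_extra_subnets("us-central1", "a3-highgpu-8g")  # computed
lookup.get_extra_subnets("us-central1", "a3-highgpu-8g")  # served from the cache for 60 seconds

Because the default cache key ignores self, each (region, instance_type_name) pair gets its own entry, so repeated provisioning in the same region skips the repeated subnetwork lookups.
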

dstack/_internal/core/backends/lambdalabs/compute.py

@@ -20,6 +20,7 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements

  MAX_INSTANCE_NAME_LEN = 60
@@ -46,7 +47,10 @@ class LambdaCompute(
  return offers_with_availability

  def create_instance(
- self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
+ self,
+ instance_offer: InstanceOfferWithAvailability,
+ instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  instance_name = generate_unique_instance_name(
  instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -89,7 +93,10 @@ class LambdaCompute(
  instance_info = _get_instance_info(self.api_client, provisioning_data.instance_id)
  if instance_info is not None and instance_info["status"] != "booting":
  provisioning_data.hostname = instance_info["ip"]
- commands = get_shim_commands(authorized_keys=[project_ssh_public_key])
+ commands = get_shim_commands(
+ authorized_keys=[project_ssh_public_key],
+ arch=provisioning_data.instance_type.resources.cpu_arch,
+ )
  # shim is assumed to be run under root
  launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
  thread = Thread(
@@ -179,13 +186,18 @@ def _setup_instance(
  ssh_private_key: str,
  ):
  setup_commands = (
- "mkdir /home/ubuntu/.dstack && "
- "sudo apt-get update && "
- "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit && "
- "sudo nvidia-ctk runtime configure --runtime=docker && "
- "sudo pkill -SIGHUP dockerd"
+ "mkdir /home/ubuntu/.dstack",
+ "sudo apt-get update",
+ "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit",
+ "sudo install -d -m 0755 /etc/docker",
+ # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+ """echo '{"exec-opts":["native.cgroupdriver=cgroupfs"]}' | sudo tee /etc/docker/daemon.json""",
+ "sudo nvidia-ctk runtime configure --runtime=docker",
+ "sudo systemctl restart docker.service", # `systemctl reload` (`kill -HUP`) won't work
+ )
+ _run_ssh_command(
+ hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands)
  )
- _run_ssh_command(hostname=hostname, ssh_private_key=ssh_private_key, command=setup_commands)


  def _launch_runner(
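
The setup commands above are now kept as a tuple and chained with " && " into one SSH command; per the inline comment, a full systemctl restart docker.service is needed because a reload (kill -HUP) does not pick up the new daemon.json. A rough paramiko sketch of running such a chained command on a freshly booted host; the function name and connection parameters are illustrative, not dstack's:

import paramiko


def run_setup_over_ssh(hostname: str, key_path: str, commands: tuple[str, ...]) -> None:
    # Illustrative only: connect as the default user and run the chained setup commands.
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname, username="ubuntu", key_filename=key_path, timeout=30)
    try:
        _, stdout, stderr = client.exec_command(" && ".join(commands), timeout=600)
        # Chaining with && stops at the first failing command; surface its stderr.
        if stdout.channel.recv_exit_status() != 0:
            raise RuntimeError(stderr.read().decode())
    finally:
        client.close()
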

dstack/_internal/core/backends/local/compute.py

@@ -15,6 +15,7 @@ from dstack._internal.core.models.instances import (
  InstanceType,
  Resources,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
  from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
  from dstack._internal.utils.logging import get_logger
@@ -53,6 +54,7 @@ class LocalCompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  return JobProvisioningData(
  backend=instance_offer.backend,

dstack/_internal/core/backends/nebius/compute.py

@@ -1,4 +1,5 @@
  import json
+ import random
  import shlex
  import time
  from functools import cached_property
@@ -13,13 +14,19 @@ from dstack._internal.core.backends.base.backend import Compute
  from dstack._internal.core.backends.base.compute import (
  ComputeWithCreateInstanceSupport,
  ComputeWithMultinodeSupport,
+ ComputeWithPlacementGroupSupport,
  generate_unique_instance_name,
  get_user_data,
  )
  from dstack._internal.core.backends.base.offers import get_catalog_offers
  from dstack._internal.core.backends.nebius import resources
+ from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
  from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
- from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
+ from dstack._internal.core.errors import (
+ BackendError,
+ NotYetTerminated,
+ ProvisioningError,
+ )
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.common import CoreModel
  from dstack._internal.core.models.instances import (
@@ -28,6 +35,11 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import (
+ PlacementGroup,
+ PlacementGroupProvisioningData,
+ PlacementStrategy,
+ )
  from dstack._internal.core.models.resources import Memory, Range
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.utils.logging import get_logger
@@ -72,6 +84,7 @@ SUPPORTED_PLATFORMS = [
  class NebiusCompute(
  ComputeWithCreateInstanceSupport,
  ComputeWithMultinodeSupport,
+ ComputeWithPlacementGroupSupport,
  Compute,
  ):
  def __init__(self, config: NebiusConfig):
@@ -121,6 +134,7 @@ class NebiusCompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  # NOTE: This method can block for a long time as it waits for the boot disk to be created
  # and the instance to enter the STARTING state. This has to be done in create_instance so
@@ -128,6 +142,14 @@ class NebiusCompute(
  # instance.
  instance_name = generate_unique_instance_name(instance_config)
  platform, preset = instance_offer.instance.name.split()
+ cluster_id = None
+ if placement_group:
+ assert placement_group.provisioning_data is not None
+ backend_data = NebiusPlacementGroupBackendData.load(
+ placement_group.provisioning_data.backend_data
+ )
+ if backend_data.cluster is not None:
+ cluster_id = backend_data.cluster.id
  create_disk_op = resources.create_disk(
  sdk=self._sdk,
  name=instance_name,
@@ -155,6 +177,7 @@ class NebiusCompute(
  ),
  platform=platform,
  preset=preset,
+ cluster_id=cluster_id,
  disk_id=create_disk_op.resource_id,
  subnet_id=self._get_subnet_id(instance_offer.region),
  )
@@ -230,6 +253,63 @@ class NebiusCompute(
  with resources.ignore_errors([StatusCode.NOT_FOUND]):
  resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)

+ def create_placement_group(
+ self,
+ placement_group: PlacementGroup,
+ master_instance_offer: InstanceOffer,
+ ) -> PlacementGroupProvisioningData:
+ assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
+ backend_data = NebiusPlacementGroupBackendData(cluster=None)
+ # Only create a Nebius cluster if the instance supports it.
+ # For other instances, return dummy PlacementGroupProvisioningData.
+ if fabrics := get_suitable_infiniband_fabrics(
+ master_instance_offer, allowed_fabrics=self.config.fabrics
+ ):
+ fabric = random.choice(fabrics)
+ op = resources.create_cluster(
+ self._sdk,
+ name=placement_group.name,
+ project_id=self._region_to_project_id[placement_group.configuration.region],
+ fabric=fabric,
+ )
+ backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
+ return PlacementGroupProvisioningData(
+ backend=BackendType.NEBIUS,
+ backend_data=backend_data.json(),
+ )
+
+ def delete_placement_group(self, placement_group: PlacementGroup) -> None:
+ assert placement_group.provisioning_data is not None
+ backend_data = NebiusPlacementGroupBackendData.load(
+ placement_group.provisioning_data.backend_data
+ )
+ if backend_data.cluster is not None:
+ with resources.ignore_errors([StatusCode.NOT_FOUND]):
+ resources.delete_cluster(self._sdk, backend_data.cluster.id)
+
+ def is_suitable_placement_group(
+ self,
+ placement_group: PlacementGroup,
+ instance_offer: InstanceOffer,
+ ) -> bool:
+ if not (
+ placement_group.configuration.backend == BackendType.NEBIUS
+ and placement_group.configuration.region == instance_offer.region
+ ):
+ return False
+ assert placement_group.provisioning_data is not None
+ backend_data = NebiusPlacementGroupBackendData.load(
+ placement_group.provisioning_data.backend_data
+ )
+ return (
+ backend_data.cluster is None
+ or backend_data.cluster.fabric
+ in get_suitable_infiniband_fabrics(
+ instance_offer,
+ allowed_fabrics=None, # enforced at cluster creation time, no need to enforce here
+ )
+ )
+

  class NebiusInstanceBackendData(CoreModel):
  boot_disk_id: str
@@ -240,6 +320,20 @@ class NebiusInstanceBackendData(CoreModel):
  return cls.__response__.parse_raw(raw)


+ class NebiusClusterBackendData(CoreModel):
+ id: str
+ fabric: str
+
+
+ class NebiusPlacementGroupBackendData(CoreModel):
+ cluster: Optional[NebiusClusterBackendData]
+
+ @classmethod
+ def load(cls, raw: Optional[str]) -> "NebiusPlacementGroupBackendData":
+ assert raw is not None
+ return cls.__response__.parse_raw(raw)
+
+
  def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
  start = time.monotonic()
  while True:
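
The placement group metadata above round-trips through JSON: create_placement_group serializes NebiusPlacementGroupBackendData into PlacementGroupProvisioningData.backend_data, and load() parses it back in create_instance, delete_placement_group, and is_suitable_placement_group. dstack's CoreModel is pydantic-based; a standalone sketch of the same round-trip using plain pydantic v1-style models (class names and values here are illustrative):

from typing import Optional

from pydantic import BaseModel


class ClusterData(BaseModel):
    id: str
    fabric: str


class PlacementGroupData(BaseModel):
    cluster: Optional[ClusterData] = None


# Serialized when the placement group is created...
raw = PlacementGroupData(cluster=ClusterData(id="cluster-123", fabric="fabric-2")).json()

# ...and parsed back when checking suitability or deleting the group.
parsed = PlacementGroupData.parse_raw(raw)
assert parsed.cluster is not None and parsed.cluster.fabric == "fabric-2"
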

dstack/_internal/core/backends/nebius/configurator.py

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.configurator import (
  )
  from dstack._internal.core.backends.nebius import resources
  from dstack._internal.core.backends.nebius.backend import NebiusBackend
+ from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
  from dstack._internal.core.backends.nebius.models import (
  AnyNebiusBackendConfig,
  NebiusBackendConfig,
@@ -38,6 +39,16 @@ class NebiusConfigurator(Configurator):
  fields=[["creds"]],
  details=str(e),
  )
+ valid_fabrics = get_all_infiniband_fabrics()
+ if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
+ raise_invalid_credentials_error(
+ fields=[["fabrics"]],
+ details=(
+ "These InfiniBand fabrics do not exist or are not known to dstack:"
+ f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
+ f" some of the valid options: {sorted(valid_fabrics)}"
+ ),
+ )

  def create_backend(
  self, project_name: str, config: NebiusBackendConfigWithCreds

dstack/_internal/core/backends/nebius/fabrics.py

@@ -0,0 +1,47 @@
+ from collections.abc import Container
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from dstack._internal.core.models.instances import InstanceOffer
+
+
+ @dataclass(frozen=True)
+ class InfinibandFabric:
+ name: str
+ platform: str
+ region: str
+
+
+ # https://docs.nebius.com/compute/clusters/gpu#fabrics
+ INFINIBAND_FABRICS = [
+ InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
+ InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
+ InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+ ]
+
+
+ def get_suitable_infiniband_fabrics(
+ offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
+ ) -> list[str]:
+ if len(offer.instance.resources.gpus) < 8:
+ # From the create VM page in the Nebius Console:
+ # > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
+ # > can be added to the cluster
+ return []
+ platform, _ = offer.instance.name.split()
+ return [
+ f.name
+ for f in INFINIBAND_FABRICS
+ if (
+ f.platform == platform
+ and f.region == offer.region
+ and (allowed_fabrics is None or f.name in allowed_fabrics)
+ )
+ ]
+
+
+ def get_all_infiniband_fabrics() -> set[str]:
+ return {f.name for f in INFINIBAND_FABRICS}
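
As a quick rehearsal of the matching rules above (at least 8 GPUs, platform and region must match, plus an optional allow-list), here is a standalone version that works on plain arguments instead of an InstanceOffer; it is illustrative only and lists just a subset of the fabrics:

from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class Fabric:
    name: str
    platform: str
    region: str


FABRICS = [
    Fabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
    Fabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
]


def suitable(platform: str, region: str, gpu_count: int, allowed: Optional[set[str]] = None) -> list[str]:
    # Same filtering rules as get_suitable_infiniband_fabrics, minus the InstanceOffer model.
    if gpu_count < 8:
        return []
    return [
        f.name
        for f in FABRICS
        if f.platform == platform and f.region == region and (allowed is None or f.name in allowed)
    ]


assert suitable("gpu-h100-sxm", "eu-north1", gpu_count=8) == ["fabric-2"]
assert suitable("gpu-h100-sxm", "eu-north1", gpu_count=1) == []
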

dstack/_internal/core/backends/nebius/models.py

@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
  Optional[list[str]],
  Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
  ] = None
+ fabrics: Annotated[
+ Optional[list[str]],
+ Field(
+ description=(
+ "The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
+ )
+ ),
+ ] = None


  class NebiusBackendConfigWithCreds(NebiusBackendConfig):

dstack/_internal/core/backends/nebius/resources.py

@@ -15,14 +15,19 @@ from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
  from nebius.api.nebius.compute.v1 import (
  AttachedDiskSpec,
  CreateDiskRequest,
+ CreateGpuClusterRequest,
  CreateInstanceRequest,
  DeleteDiskRequest,
+ DeleteGpuClusterRequest,
  DeleteInstanceRequest,
  DiskServiceClient,
  DiskSpec,
  ExistingDisk,
  GetInstanceRequest,
+ GpuClusterServiceClient,
+ GpuClusterSpec,
  Instance,
+ InstanceGpuClusterSpec,
  InstanceServiceClient,
  InstanceSpec,
  IPAddress,
@@ -275,6 +280,7 @@ def create_instance(
  user_data: str,
  platform: str,
  preset: str,
+ cluster_id: Optional[str],
  disk_id: str,
  subnet_id: str,
  ) -> SDKOperation[Operation]:
@@ -287,6 +293,7 @@ def create_instance(
  spec=InstanceSpec(
  cloud_init_user_data=user_data,
  resources=ResourcesSpec(platform=platform, preset=preset),
+ gpu_cluster=InstanceGpuClusterSpec(id=cluster_id) if cluster_id is not None else None,
  boot_disk=AttachedDiskSpec(
  attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
  existing_disk=ExistingDisk(id=disk_id),
@@ -319,3 +326,25 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
  DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
  )
  )
+
+
+ def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]:
+ with wrap_capacity_errors():
+ return LOOP.await_(
+ GpuClusterServiceClient(sdk).create(
+ CreateGpuClusterRequest(
+ metadata=ResourceMetadata(name=name, parent_id=project_id),
+ spec=GpuClusterSpec(infiniband_fabric=fabric),
+ ),
+ timeout=REQUEST_TIMEOUT,
+ metadata=REQUEST_MD,
+ )
+ )
+
+
+ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
+ return LOOP.await_(
+ GpuClusterServiceClient(sdk).delete(
+ DeleteGpuClusterRequest(id=cluster_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+ )
+ )

dstack/_internal/core/backends/oci/compute.py

@@ -23,6 +23,7 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.resources import Memory, Range
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements

@@ -105,6 +106,7 @@ class OCICompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  region = self.regions[instance_offer.region]


dstack/_internal/core/backends/remote/provisioning.py

@@ -6,8 +6,9 @@ from textwrap import dedent
  from typing import Any, Dict, Generator, List, Optional

  import paramiko
- from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
+ from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib

+ from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
  from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT

  # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -36,6 +37,22 @@ DSTACK_SHIM_ENV_FILE = "shim.env"
  HOST_INFO_FILE = "host_info.json"


+ def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType:
+ cmd = "uname -m"
+ try:
+ _, stdout, stderr = client.exec_command(cmd, timeout=20)
+ except (paramiko.SSHException, OSError) as e:
+ raise ProvisioningError(f"detect_cpu_arch: {e}") from e
+ out = stdout.read().strip().decode()
+ err = stderr.read().strip().decode()
+ if err:
+ raise ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}")
+ try:
+ return normalize_arch(out)
+ except ValueError as e:
+ raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e
+
+
  def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None:
  try:
  sftp = client.open_sftp()
@@ -226,7 +243,14 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
  raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e


- def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
+ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
+ _cpu_arch: CPUArchitecture
+ if cpu_arch == "amd64":
+ _cpu_arch = CPUArchitecture.X86
+ elif cpu_arch == "arm64":
+ _cpu_arch = CPUArchitecture.ARM
+ else:
+ raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
  gpu_count = host_info.get("gpu_count", 0)
  if gpu_count > 0:
  gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -251,6 +275,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
  instance_type = InstanceType(
  name="instance",
  resources=Resources(
+ cpu_arch=_cpu_arch,
  cpus=host_info["cpus"],
  memory_mib=host_info["memory"] / 1024 / 1024,
  spot=False,
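
detect_cpu_arch above runs uname -m on the remote host and maps the result to a Go-style architecture name via normalize_arch from base/compute.py, which is not shown in this diff. A local sketch of that mapping, under the assumption that it follows the usual x86_64/aarch64 conventions:

import platform


def normalize_arch_sketch(machine: str) -> str:
    # Hypothetical stand-in for dstack's normalize_arch(): map `uname -m` /
    # platform.machine() output to the Go-style arch names used by the shim.
    machine = machine.strip().lower()
    if machine in ("x86_64", "amd64"):
        return "amd64"
    if machine in ("aarch64", "arm64"):
        return "arm64"
    raise ValueError(f"Unsupported architecture: {machine}")


print(normalize_arch_sketch(platform.machine()))

host_info_to_instance_type then converts the Go-style name to gpuhunt.CPUArchitecture (amd64 to X86, arm64 to ARM), as shown in the hunk above.
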

dstack/_internal/core/backends/template/compute.py.jinja

@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
  InstanceConfiguration,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
  from dstack._internal.core.models.volumes import Volume
  from dstack._internal.utils.logging import get_logger
@@ -64,6 +65,7 @@ class {{ backend_name }}Compute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  # TODO: Implement if backend supports creating instances (VM-based).
  # Delete if backend can only run jobs (container-based).

dstack/_internal/core/backends/tensordock/compute.py

@@ -19,6 +19,7 @@ from dstack._internal.core.models.instances import (
  InstanceConfiguration,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.utils.logging import get_logger

@@ -57,6 +58,7 @@ class TensorDockCompute(
  self,
  instance_offer: InstanceOfferWithAvailability,
  instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  instance_name = generate_unique_instance_name(
  instance_config, max_length=MAX_INSTANCE_NAME_LEN

dstack/_internal/core/backends/vultr/compute.py

@@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import (
  InstanceOffer,
  InstanceOfferWithAvailability,
  )
+ from dstack._internal.core.models.placement import PlacementGroup
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
  from dstack._internal.utils.logging import get_logger

@@ -58,7 +59,10 @@ class VultrCompute(
  return offers

  def create_instance(
- self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
+ self,
+ instance_offer: InstanceOfferWithAvailability,
+ instance_config: InstanceConfiguration,
+ placement_group: Optional[PlacementGroup],
  ) -> JobProvisioningData:
  instance_name = generate_unique_instance_name(
  instance_config, max_length=MAX_INSTANCE_NAME_LEN

dstack/_internal/core/models/instances.py

@@ -49,11 +49,13 @@ class Resources(CoreModel):
  spot: bool
  disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
  description: str = ""
+ cpu_arch: Optional[gpuhunt.CPUArchitecture] = None

  def pretty_format(self, include_spot: bool = False) -> str:
  resources = {}
  if self.cpus > 0:
  resources["cpus"] = self.cpus
+ resources["cpu_arch"] = self.cpu_arch
  if self.memory_mib > 0:
  resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
  if self.disk.size_mib > 0:
@@ -105,7 +107,6 @@ class InstanceConfiguration(CoreModel):
  user: str # dstack user name
  ssh_keys: List[SSHKey]
  instance_id: Optional[str] = None
- placement_group_name: Optional[str] = None
  reservation: Optional[str] = None
  volumes: Optional[List[Volume]] = None
  tags: Optional[Dict[str, str]] = None