dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.

This version of dstack is flagged as potentially problematic by the registry.

Files changed (60)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +56 -13
  3. dstack/_internal/cli/utils/run.py +10 -5
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +3 -1
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +48 -0
  19. dstack/_internal/core/backends/nebius/models.py +9 -1
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +79 -4
  28. dstack/_internal/core/models/runs.py +26 -9
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_metrics.py +26 -9
  33. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  34. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  36. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  37. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  38. dstack/_internal/server/models.py +6 -1
  39. dstack/_internal/server/schemas/runner.py +41 -8
  40. dstack/_internal/server/services/fleets.py +9 -26
  41. dstack/_internal/server/services/instances.py +0 -2
  42. dstack/_internal/server/services/jobs/__init__.py +1 -0
  43. dstack/_internal/server/services/offers.py +15 -0
  44. dstack/_internal/server/services/placement.py +27 -6
  45. dstack/_internal/server/services/resources.py +21 -0
  46. dstack/_internal/server/services/runner/client.py +7 -4
  47. dstack/_internal/server/services/runs.py +18 -8
  48. dstack/_internal/server/settings.py +20 -1
  49. dstack/_internal/server/testing/common.py +37 -26
  50. dstack/_internal/utils/common.py +13 -1
  51. dstack/_internal/utils/json_schema.py +6 -3
  52. dstack/api/__init__.py +1 -0
  53. dstack/api/server/_fleets.py +16 -0
  54. dstack/api/server/_runs.py +48 -3
  55. dstack/version.py +1 -1
  56. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
  57. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
  58. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/base/offers.py

@@ -2,6 +2,7 @@ from dataclasses import asdict
 from typing import Callable, List, Optional

 import gpuhunt
+from pydantic import parse_obj_as

 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.instances import (
@@ -11,13 +12,14 @@ from dstack._internal.core.models.instances import (
     InstanceType,
     Resources,
 )
-from dstack._internal.core.models.resources import DEFAULT_DISK, Memory, Range
+from dstack._internal.core.models.resources import DEFAULT_DISK, CPUSpec, Memory, Range
 from dstack._internal.core.models.runs import Requirements

 # Offers not supported by all dstack versions are hidden behind one or more flags.
 # This list enables the flags that are currently supported.
 SUPPORTED_GPUHUNT_FLAGS = [
     "oci-spot",
+    "lambda-arm",
 ]


@@ -71,6 +73,7 @@ def catalog_item_to_offer(
     if disk_size_mib is None:
         return None
     resources = Resources(
+        cpu_arch=item.cpu_arch,
         cpus=item.cpu,
         memory_mib=round(item.memory * 1024),
         gpus=gpus,
@@ -90,6 +93,9 @@ def catalog_item_to_offer(


 def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
+    cpu_arch = offer.instance.resources.cpu_arch
+    if cpu_arch is None:
+        cpu_arch = gpuhunt.CPUArchitecture.X86
     gpu_count = len(offer.instance.resources.gpus)
     gpu_vendor = None
     gpu_name = None
@@ -104,6 +110,7 @@ def offer_to_catalog_item(offer: InstanceOffer) -> gpuhunt.CatalogItem:
         instance_name=offer.instance.name,
         location=offer.region,
         price=offer.price,
+        cpu_arch=cpu_arch,
         cpu=offer.instance.resources.cpus,
         memory=offer.instance.resources.memory_mib / 1024,
         gpu_count=gpu_count,
@@ -125,8 +132,11 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFi

     res = req.resources
     if res.cpu:
-        q.min_cpu = res.cpu.min
-        q.max_cpu = res.cpu.max
+        # TODO: Remove in 0.20. Use res.cpu directly
+        cpu = parse_obj_as(CPUSpec, res.cpu)
+        q.cpu_arch = cpu.arch
+        q.min_cpu = cpu.count.min
+        q.max_cpu = cpu.count.max
     if res.memory:
         q.min_memory = res.memory.min
         q.max_memory = res.memory.max
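
The last hunk above reads the CPU requirement through parse_obj_as(CPUSpec, res.cpu) so that both the legacy plain count range and the new structured CPU spec (which adds an arch field) keep producing a valid gpuhunt query until the fallback is removed in 0.20. Below is a minimal, self-contained sketch of that normalization pattern using the pydantic v1-style API that appears in the diff; CPUSpecSketch and IntRange are hypothetical stand-ins, not dstack's real CPUSpec and Range models.

from typing import Optional, Union

from pydantic import BaseModel, parse_obj_as, validator


class IntRange(BaseModel):
    min: Optional[int] = None
    max: Optional[int] = None


class CPUSpecSketch(BaseModel):
    # Hypothetical stand-in: dstack's real CPUSpec exposes `arch` and `count`.
    arch: Optional[str] = None
    count: IntRange = IntRange()

    @validator("count", pre=True)
    def _coerce_count(cls, v: Union[int, dict, IntRange]) -> Union[dict, IntRange]:
        # Accept a bare integer and widen it into a min/max range.
        if isinstance(v, int):
            return IntRange(min=v, max=v)
        return v


# A legacy value that only carried a count range still parses; `arch` stays
# unset, so the query filter treats it as "any architecture".
legacy = {"count": {"min": 2, "max": 8}}
spec = parse_obj_as(CPUSpecSketch, legacy)
assert spec.arch is None and spec.count.min == 2 and spec.count.max == 8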

dstack/_internal/core/backends/cudo/compute.py

@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
     InstanceConfiguration,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger

@@ -58,6 +59,7 @@ class CudoCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
         public_keys = instance_config.get_public_keys()
@@ -145,7 +147,7 @@ class CudoCompute(


 def _get_image_id(cuda: bool) -> str:
-    image_name = "ubuntu-2204-nvidia-535-docker-v20240214" if cuda else "ubuntu-2204"
+    image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204"
     return image_name


dstack/_internal/core/backends/datacrunch/compute.py

@@ -20,6 +20,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
@@ -85,6 +86,7 @@ class DataCrunchCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN

dstack/_internal/core/backends/gcp/auth.py

@@ -19,7 +19,7 @@ def authenticate(creds: AnyGCPCreds, project_id: Optional[str] = None) -> Tuple[
     credentials, credentials_project_id = get_credentials(creds)
     if project_id is None:
         # If project_id is not specified explicitly, try using credentials' project_id.
-        # Explicit project_id takes precedence bacause credentials' project_id may be irrelevant.
+        # Explicit project_id takes precedence because credentials' project_id may be irrelevant.
         # For example, with Workload Identity Federation for GKE, it's cluster project_id.
         project_id = credentials_project_id
     if project_id is None:

dstack/_internal/core/backends/gcp/compute.py

@@ -1,10 +1,12 @@
 import concurrent.futures
 import json
+import threading
 from collections import defaultdict
 from typing import Callable, Dict, List, Literal, Optional, Tuple

 import google.api_core.exceptions
 import google.cloud.compute_v1 as compute_v1
+from cachetools import TTLCache, cachedmethod
 from google.cloud import tpu_v2
 from gpuhunt import KNOWN_TPUS

@@ -98,6 +100,8 @@ class GCPCompute(
         self.resource_policies_client = compute_v1.ResourcePoliciesClient(
             credentials=self.credentials
         )
+        self._extra_subnets_cache_lock = threading.Lock()
+        self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)

     def get_offers(
         self, requirements: Optional[Requirements] = None
@@ -166,6 +170,7 @@ class GCPCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=gcp_resources.MAX_RESOURCE_NAME_LEN
@@ -192,18 +197,16 @@ class GCPCompute(
             config=self.config,
             region=instance_offer.region,
         )
-        extra_subnets = _get_extra_subnets(
-            subnetworks_client=self.subnetworks_client,
-            config=self.config,
+        extra_subnets = self._get_extra_subnets(
             region=instance_offer.region,
             instance_type_name=instance_offer.instance.name,
         )
         placement_policy = None
-        if instance_config.placement_group_name is not None:
+        if placement_group is not None:
             placement_policy = gcp_resources.get_placement_policy_resource_name(
                 project_id=self.config.project_id,
                 region=instance_offer.region,
-                placement_policy=instance_config.placement_group_name,
+                placement_policy=placement_group.name,
             )
         labels = {
             "owner": "dstack",
@@ -406,6 +409,7 @@ class GCPCompute(
     def create_placement_group(
         self,
         placement_group: PlacementGroup,
+        master_instance_offer: InstanceOffer,
     ) -> PlacementGroupProvisioningData:
         policy = compute_v1.ResourcePolicy(
             name=placement_group.name,
@@ -440,6 +444,16 @@ class GCPCompute(
                 raise PlacementGroupInUseError()
             raise

+    def is_suitable_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        instance_offer: InstanceOffer,
+    ) -> bool:
+        return (
+            placement_group.configuration.backend == BackendType.GCP
+            and placement_group.configuration.region == instance_offer.region
+        )
+
     def create_gateway(
         self,
         configuration: GatewayComputeConfiguration,
@@ -757,6 +771,38 @@ class GCPCompute(
             instance_id,
         )

+    @cachedmethod(
+        cache=lambda self: self._extra_subnets_cache,
+        lock=lambda self: self._extra_subnets_cache_lock,
+    )
+    def _get_extra_subnets(
+        self,
+        region: str,
+        instance_type_name: str,
+    ) -> List[Tuple[str, str]]:
+        if self.config.extra_vpcs is None:
+            return []
+        if instance_type_name == "a3-megagpu-8g":
+            subnets_num = 8
+        elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
+            subnets_num = 4
+        else:
+            return []
+        extra_subnets = []
+        for vpc_name in self.config.extra_vpcs[:subnets_num]:
+            subnet = gcp_resources.get_vpc_subnet_or_error(
+                subnetworks_client=self.subnetworks_client,
+                vpc_project_id=self.config.vpc_project_id or self.config.project_id,
+                vpc_name=vpc_name,
+                region=region,
+            )
+            vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
+                project_id=self.config.vpc_project_id or self.config.project_id,
+                vpc_name=vpc_name,
+            )
+            extra_subnets.append((vpc_resource_name, subnet))
+        return extra_subnets
+

 def _supported_instances_and_zones(
     regions: List[str],
@@ -831,36 +877,6 @@ def _get_vpc_subnet(
     )


-def _get_extra_subnets(
-    subnetworks_client: compute_v1.SubnetworksClient,
-    config: GCPConfig,
-    region: str,
-    instance_type_name: str,
-) -> List[Tuple[str, str]]:
-    if config.extra_vpcs is None:
-        return []
-    if instance_type_name == "a3-megagpu-8g":
-        subnets_num = 8
-    elif instance_type_name in ["a3-edgegpu-8g", "a3-highgpu-8g"]:
-        subnets_num = 4
-    else:
-        return []
-    extra_subnets = []
-    for vpc_name in config.extra_vpcs[:subnets_num]:
-        subnet = gcp_resources.get_vpc_subnet_or_error(
-            subnetworks_client=subnetworks_client,
-            vpc_project_id=config.vpc_project_id or config.project_id,
-            vpc_name=vpc_name,
-            region=region,
-        )
-        vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
-            project_id=config.vpc_project_id or config.project_id,
-            vpc_name=vpc_name,
-        )
-        extra_subnets.append((vpc_resource_name, subnet))
-    return extra_subnets
-
-
 def _get_image_id(instance_type_name: str, cuda: bool) -> str:
     if instance_type_name == "a3-megagpu-8g":
         image_name = "dstack-a3mega-5"
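
In the gcp/compute.py changes above, _get_extra_subnets becomes a GCPCompute method memoized with a per-instance TTLCache behind a lock, so repeated provisioning calls for the same region and instance type reuse the subnet lookup for 60 seconds instead of querying the GCP API every time. A minimal sketch of this cachetools pattern, using an illustrative SubnetLookup class rather than dstack's GCPCompute:

import threading

from cachetools import TTLCache, cachedmethod


class SubnetLookup:
    def __init__(self) -> None:
        self._cache_lock = threading.Lock()
        self._cache = TTLCache(maxsize=30, ttl=60)
        self.api_calls = 0

    @cachedmethod(cache=lambda self: self._cache, lock=lambda self: self._cache_lock)
    def get_extra_subnets(self, region: str, instance_type_name: str) -> list:
        # Stands in for an expensive per-region cloud API lookup.
        self.api_calls += 1
        return [f"{region}/{instance_type_name}/subnet-{i}" for i in range(2)]


lookup = SubnetLookup()
lookup.get_extra_subnets("us-central1", "a3-highgpu-8g")
lookup.get_extra_subnets("us-central1", "a3-highgpu-8g")  # served from the 60-second cache
assert lookup.api_calls == 1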

dstack/_internal/core/backends/lambdalabs/compute.py

@@ -20,6 +20,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements

 MAX_INSTANCE_NAME_LEN = 60
@@ -46,7 +47,10 @@ class LambdaCompute(
         return offers_with_availability

     def create_instance(
-        self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -89,7 +93,10 @@ class LambdaCompute(
         instance_info = _get_instance_info(self.api_client, provisioning_data.instance_id)
         if instance_info is not None and instance_info["status"] != "booting":
             provisioning_data.hostname = instance_info["ip"]
-            commands = get_shim_commands(authorized_keys=[project_ssh_public_key])
+            commands = get_shim_commands(
+                authorized_keys=[project_ssh_public_key],
+                arch=provisioning_data.instance_type.resources.cpu_arch,
+            )
             # shim is assumed to be run under root
             launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
             thread = Thread(
@@ -179,13 +186,18 @@ def _setup_instance(
     ssh_private_key: str,
 ):
     setup_commands = (
-        "mkdir /home/ubuntu/.dstack && "
-        "sudo apt-get update && "
-        "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit && "
-        "sudo nvidia-ctk runtime configure --runtime=docker && "
-        "sudo pkill -SIGHUP dockerd"
+        "mkdir /home/ubuntu/.dstack",
+        "sudo apt-get update",
+        "sudo apt-get install -y --no-install-recommends nvidia-container-toolkit",
+        "sudo install -d -m 0755 /etc/docker",
+        # Workaround for https://github.com/NVIDIA/nvidia-container-toolkit/issues/48
+        """echo '{"exec-opts":["native.cgroupdriver=cgroupfs"]}' | sudo tee /etc/docker/daemon.json""",
+        "sudo nvidia-ctk runtime configure --runtime=docker",
+        "sudo systemctl restart docker.service",  # `systemctl reload` (`kill -HUP`) won't work
+    )
+    _run_ssh_command(
+        hostname=hostname, ssh_private_key=ssh_private_key, command=" && ".join(setup_commands)
     )
-    _run_ssh_command(hostname=hostname, ssh_private_key=ssh_private_key, command=setup_commands)


 def _launch_runner(

dstack/_internal/core/backends/local/compute.py

@@ -15,6 +15,7 @@ from dstack._internal.core.models.instances import (
     InstanceType,
     Resources,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
 from dstack._internal.utils.logging import get_logger
@@ -53,6 +54,7 @@ class LocalCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         return JobProvisioningData(
             backend=instance_offer.backend,

dstack/_internal/core/backends/nebius/compute.py

@@ -1,4 +1,5 @@
 import json
+import random
 import shlex
 import time
 from functools import cached_property
@@ -13,13 +14,19 @@ from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
     generate_unique_instance_name,
     get_user_data,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers
 from dstack._internal.core.backends.nebius import resources
+from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
-from dstack._internal.core.errors import BackendError, NotYetTerminated, ProvisioningError
+from dstack._internal.core.errors import (
+    BackendError,
+    NotYetTerminated,
+    ProvisioningError,
+)
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.instances import (
@@ -28,6 +35,11 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import (
+    PlacementGroup,
+    PlacementGroupProvisioningData,
+    PlacementStrategy,
+)
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
@@ -72,6 +84,7 @@ SUPPORTED_PLATFORMS = [
 class NebiusCompute(
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
+    ComputeWithPlacementGroupSupport,
     Compute,
 ):
     def __init__(self, config: NebiusConfig):
@@ -121,6 +134,7 @@ class NebiusCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         # NOTE: This method can block for a long time as it waits for the boot disk to be created
         # and the instance to enter the STARTING state. This has to be done in create_instance so
@@ -128,6 +142,14 @@ class NebiusCompute(
         # instance.
         instance_name = generate_unique_instance_name(instance_config)
         platform, preset = instance_offer.instance.name.split()
+        cluster_id = None
+        if placement_group:
+            assert placement_group.provisioning_data is not None
+            backend_data = NebiusPlacementGroupBackendData.load(
+                placement_group.provisioning_data.backend_data
+            )
+            if backend_data.cluster is not None:
+                cluster_id = backend_data.cluster.id
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
             name=instance_name,
@@ -155,6 +177,7 @@ class NebiusCompute(
             ),
             platform=platform,
             preset=preset,
+            cluster_id=cluster_id,
             disk_id=create_disk_op.resource_id,
             subnet_id=self._get_subnet_id(instance_offer.region),
         )
@@ -230,6 +253,63 @@ class NebiusCompute(
         with resources.ignore_errors([StatusCode.NOT_FOUND]):
             resources.delete_disk(self._sdk, backend_data_parsed.boot_disk_id)

+    def create_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        master_instance_offer: InstanceOffer,
+    ) -> PlacementGroupProvisioningData:
+        assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER
+        backend_data = NebiusPlacementGroupBackendData(cluster=None)
+        # Only create a Nebius cluster if the instance supports it.
+        # For other instances, return dummy PlacementGroupProvisioningData.
+        if fabrics := get_suitable_infiniband_fabrics(
+            master_instance_offer, allowed_fabrics=self.config.fabrics
+        ):
+            fabric = random.choice(fabrics)
+            op = resources.create_cluster(
+                self._sdk,
+                name=placement_group.name,
+                project_id=self._region_to_project_id[placement_group.configuration.region],
+                fabric=fabric,
+            )
+            backend_data.cluster = NebiusClusterBackendData(id=op.resource_id, fabric=fabric)
+        return PlacementGroupProvisioningData(
+            backend=BackendType.NEBIUS,
+            backend_data=backend_data.json(),
+        )
+
+    def delete_placement_group(self, placement_group: PlacementGroup) -> None:
+        assert placement_group.provisioning_data is not None
+        backend_data = NebiusPlacementGroupBackendData.load(
+            placement_group.provisioning_data.backend_data
+        )
+        if backend_data.cluster is not None:
+            with resources.ignore_errors([StatusCode.NOT_FOUND]):
+                resources.delete_cluster(self._sdk, backend_data.cluster.id)
+
+    def is_suitable_placement_group(
+        self,
+        placement_group: PlacementGroup,
+        instance_offer: InstanceOffer,
+    ) -> bool:
+        if not (
+            placement_group.configuration.backend == BackendType.NEBIUS
+            and placement_group.configuration.region == instance_offer.region
+        ):
+            return False
+        assert placement_group.provisioning_data is not None
+        backend_data = NebiusPlacementGroupBackendData.load(
+            placement_group.provisioning_data.backend_data
+        )
+        return (
+            backend_data.cluster is None
+            or backend_data.cluster.fabric
+            in get_suitable_infiniband_fabrics(
+                instance_offer,
+                allowed_fabrics=None,  # enforced at cluster creation time, no need to enforce here
+            )
+        )
+

 class NebiusInstanceBackendData(CoreModel):
     boot_disk_id: str
@@ -240,6 +320,20 @@ class NebiusInstanceBackendData(CoreModel):
         return cls.__response__.parse_raw(raw)


+class NebiusClusterBackendData(CoreModel):
+    id: str
+    fabric: str
+
+
+class NebiusPlacementGroupBackendData(CoreModel):
+    cluster: Optional[NebiusClusterBackendData]
+
+    @classmethod
+    def load(cls, raw: Optional[str]) -> "NebiusPlacementGroupBackendData":
+        assert raw is not None
+        return cls.__response__.parse_raw(raw)
+
+
 def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
     start = time.monotonic()
     while True:
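
The new NebiusPlacementGroupBackendData above is serialized with .json() into PlacementGroupProvisioningData.backend_data when the placement group is created, and parsed back with load() whenever create_instance or is_suitable_placement_group needs the GPU cluster id or fabric. A minimal round-trip sketch of that pattern, using plain pydantic models and a made-up cluster id in place of dstack's CoreModel:

from typing import Optional

from pydantic import BaseModel


class ClusterData(BaseModel):
    id: str
    fabric: str


class PlacementGroupData(BaseModel):
    cluster: Optional[ClusterData] = None

    @classmethod
    def load(cls, raw: Optional[str]) -> "PlacementGroupData":
        # backend_data is an optional string on the wire; it must be set for Nebius groups.
        assert raw is not None
        return cls.parse_raw(raw)


# Stored at create_placement_group time, restored later to attach instances
# to the right GPU cluster.
stored = PlacementGroupData(cluster=ClusterData(id="gpucluster-example", fabric="fabric-2")).json()
restored = PlacementGroupData.load(stored)
assert restored.cluster is not None and restored.cluster.fabric == "fabric-2"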

dstack/_internal/core/backends/nebius/configurator.py

@@ -9,6 +9,7 @@ from dstack._internal.core.backends.base.configurator import (
 )
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.backend import NebiusBackend
+from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import (
     AnyNebiusBackendConfig,
     NebiusBackendConfig,
@@ -38,6 +39,16 @@ class NebiusConfigurator(Configurator):
                 fields=[["creds"]],
                 details=str(e),
             )
+        valid_fabrics = get_all_infiniband_fabrics()
+        if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
+            raise_invalid_credentials_error(
+                fields=[["fabrics"]],
+                details=(
+                    "These InfiniBand fabrics do not exist or are not known to dstack:"
+                    f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
+                    f" some of the valid options: {sorted(valid_fabrics)}"
+                ),
+            )

     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds

dstack/_internal/core/backends/nebius/fabrics.py (new file)

@@ -0,0 +1,48 @@
+from collections.abc import Container
+from dataclasses import dataclass
+from typing import Optional
+
+from dstack._internal.core.models.instances import InstanceOffer
+
+
+@dataclass(frozen=True)
+class InfinibandFabric:
+    name: str
+    platform: str
+    region: str
+
+
+# https://docs.nebius.com/compute/clusters/gpu#fabrics
+INFINIBAND_FABRICS = [
+    InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
+    InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
+    InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+    InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
+]
+
+
+def get_suitable_infiniband_fabrics(
+    offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
+) -> list[str]:
+    if len(offer.instance.resources.gpus) < 8:
+        # From the create VM page in the Nebius Console:
+        # > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
+        # > can be added to the cluster
+        return []
+    platform, _ = offer.instance.name.split()
+    return [
+        f.name
+        for f in INFINIBAND_FABRICS
+        if (
+            f.platform == platform
+            and f.region == offer.region
+            and (allowed_fabrics is None or f.name in allowed_fabrics)
+        )
+    ]
+
+
+def get_all_infiniband_fabrics() -> set[str]:
+    return {f.name for f in INFINIBAND_FABRICS}
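
A quick usage sketch of the new module's selection rules (it assumes dstack 0.19.9 is importable; the offers are duck-typed SimpleNamespace stand-ins with made-up preset names, not dstack's real InstanceOffer objects): an 8-GPU H100 offer in eu-north1 matches every H100 fabric in that region, the backend's fabrics setting narrows the candidates, and offers with fewer than 8 GPUs never match.

from types import SimpleNamespace

from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics


def fake_offer(platform: str, preset: str, region: str, gpu_count: int) -> SimpleNamespace:
    # Duck-typed stand-in for InstanceOffer; the function only reads these attributes.
    return SimpleNamespace(
        region=region,
        instance=SimpleNamespace(
            name=f"{platform} {preset}",
            resources=SimpleNamespace(gpus=[object()] * gpu_count),
        ),
    )


h100_8gpu = fake_offer("gpu-h100-sxm", "8gpu-128vcpu", "eu-north1", 8)
h100_1gpu = fake_offer("gpu-h100-sxm", "1gpu-16vcpu", "eu-north1", 1)

print(get_suitable_infiniband_fabrics(h100_8gpu, allowed_fabrics=None))
# ['fabric-2', 'fabric-3', 'fabric-4', 'fabric-6']
print(get_suitable_infiniband_fabrics(h100_8gpu, allowed_fabrics={"fabric-3"}))
# ['fabric-3']
print(get_suitable_infiniband_fabrics(h100_1gpu, allowed_fabrics=None))
# []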

dstack/_internal/core/backends/nebius/models.py

@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
 from dstack._internal.core.backends.base.models import fill_data
 from dstack._internal.core.models.common import CoreModel

-DEFAULT_PROJECT_NAME_PREFIX = "default-project"
+DEFAULT_PROJECT_NAME_PREFIX = "default"


 class NebiusServiceAccountCreds(CoreModel):
@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
         Optional[list[str]],
         Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
     ] = None
+    fabrics: Annotated[
+        Optional[list[str]],
+        Field(
+            description=(
+                "The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
+            )
+        ),
+    ] = None


 class NebiusBackendConfigWithCreds(NebiusBackendConfig):

dstack/_internal/core/backends/nebius/resources.py

@@ -15,14 +15,19 @@ from nebius.api.nebius.common.v1 import Operation, ResourceMetadata
 from nebius.api.nebius.compute.v1 import (
     AttachedDiskSpec,
     CreateDiskRequest,
+    CreateGpuClusterRequest,
     CreateInstanceRequest,
     DeleteDiskRequest,
+    DeleteGpuClusterRequest,
     DeleteInstanceRequest,
     DiskServiceClient,
     DiskSpec,
     ExistingDisk,
     GetInstanceRequest,
+    GpuClusterServiceClient,
+    GpuClusterSpec,
     Instance,
+    InstanceGpuClusterSpec,
     InstanceServiceClient,
     InstanceSpec,
     IPAddress,
@@ -275,6 +280,7 @@ def create_instance(
     user_data: str,
     platform: str,
     preset: str,
+    cluster_id: Optional[str],
     disk_id: str,
     subnet_id: str,
 ) -> SDKOperation[Operation]:
@@ -287,6 +293,7 @@ def create_instance(
                     spec=InstanceSpec(
                         cloud_init_user_data=user_data,
                         resources=ResourcesSpec(platform=platform, preset=preset),
+                        gpu_cluster=InstanceGpuClusterSpec(id=cluster_id) if cluster_id is not None else None,
                         boot_disk=AttachedDiskSpec(
                             attach_mode=AttachedDiskSpec.AttachMode.READ_WRITE,
                             existing_disk=ExistingDisk(id=disk_id),
@@ -319,3 +326,25 @@ def delete_instance(sdk: SDK, instance_id: str) -> SDKOperation[Operation]:
             DeleteInstanceRequest(id=instance_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
         )
     )
+
+
+def create_cluster(sdk: SDK, name: str, project_id: str, fabric: str) -> SDKOperation[Operation]:
+    with wrap_capacity_errors():
+        return LOOP.await_(
+            GpuClusterServiceClient(sdk).create(
+                CreateGpuClusterRequest(
+                    metadata=ResourceMetadata(name=name, parent_id=project_id),
+                    spec=GpuClusterSpec(infiniband_fabric=fabric),
+                ),
+                timeout=REQUEST_TIMEOUT,
+                metadata=REQUEST_MD,
+            )
+        )
+
+
+def delete_cluster(sdk: SDK, cluster_id: str) -> None:
+    return LOOP.await_(
+        GpuClusterServiceClient(sdk).delete(
+            DeleteGpuClusterRequest(id=cluster_id), timeout=REQUEST_TIMEOUT, metadata=REQUEST_MD
+        )
+    )

dstack/_internal/core/backends/oci/compute.py

@@ -23,6 +23,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements

@@ -105,6 +106,7 @@ class OCICompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         region = self.regions[instance_offer.region]