dstack 0.19.1__py3-none-any.whl → 0.19.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (68)
  1. dstack/_internal/cli/commands/metrics.py +138 -0
  2. dstack/_internal/cli/commands/stats.py +5 -119
  3. dstack/_internal/cli/main.py +2 -0
  4. dstack/_internal/cli/services/profile.py +9 -0
  5. dstack/_internal/core/backends/aws/configurator.py +1 -0
  6. dstack/_internal/core/backends/base/compute.py +4 -1
  7. dstack/_internal/core/backends/base/models.py +7 -7
  8. dstack/_internal/core/backends/configurators.py +9 -0
  9. dstack/_internal/core/backends/cudo/compute.py +2 -0
  10. dstack/_internal/core/backends/cudo/configurator.py +0 -13
  11. dstack/_internal/core/backends/datacrunch/compute.py +118 -32
  12. dstack/_internal/core/backends/datacrunch/configurator.py +16 -11
  13. dstack/_internal/core/backends/gcp/compute.py +140 -26
  14. dstack/_internal/core/backends/gcp/configurator.py +2 -0
  15. dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
  16. dstack/_internal/core/backends/gcp/features/tcpx.py +34 -0
  17. dstack/_internal/core/backends/gcp/models.py +13 -1
  18. dstack/_internal/core/backends/gcp/resources.py +64 -27
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -4
  20. dstack/_internal/core/backends/lambdalabs/configurator.py +0 -21
  21. dstack/_internal/core/backends/models.py +8 -0
  22. dstack/_internal/core/backends/nebius/__init__.py +0 -0
  23. dstack/_internal/core/backends/nebius/backend.py +16 -0
  24. dstack/_internal/core/backends/nebius/compute.py +272 -0
  25. dstack/_internal/core/backends/nebius/configurator.py +74 -0
  26. dstack/_internal/core/backends/nebius/models.py +108 -0
  27. dstack/_internal/core/backends/nebius/resources.py +240 -0
  28. dstack/_internal/core/backends/tensordock/api_client.py +5 -4
  29. dstack/_internal/core/backends/tensordock/compute.py +2 -15
  30. dstack/_internal/core/errors.py +14 -0
  31. dstack/_internal/core/models/backends/base.py +2 -0
  32. dstack/_internal/core/models/profiles.py +3 -0
  33. dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
  34. dstack/_internal/server/background/tasks/process_instances.py +12 -7
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +20 -0
  36. dstack/_internal/server/background/tasks/process_submitted_jobs.py +3 -2
  37. dstack/_internal/server/routers/prometheus.py +5 -0
  38. dstack/_internal/server/security/permissions.py +19 -1
  39. dstack/_internal/server/services/instances.py +14 -6
  40. dstack/_internal/server/services/jobs/__init__.py +3 -3
  41. dstack/_internal/server/services/offers.py +4 -2
  42. dstack/_internal/server/services/runs.py +0 -2
  43. dstack/_internal/server/statics/index.html +1 -1
  44. dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-8f9c66f404e9c7e7e020.css} +1 -1
  45. dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js → main-e190de603dc1e9f485ec.js} +7306 -149
  46. dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js.map → main-e190de603dc1e9f485ec.js.map} +1 -1
  47. dstack/_internal/utils/common.py +8 -2
  48. dstack/_internal/utils/event_loop.py +30 -0
  49. dstack/_internal/utils/ignore.py +2 -0
  50. dstack/api/server/_fleets.py +3 -5
  51. dstack/api/server/_runs.py +6 -7
  52. dstack/version.py +1 -1
  53. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/METADATA +27 -11
  54. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/RECORD +67 -57
  55. tests/_internal/core/backends/datacrunch/test_configurator.py +6 -2
  56. tests/_internal/server/background/tasks/test_process_instances.py +4 -2
  57. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +29 -0
  58. tests/_internal/server/routers/test_backends.py +116 -0
  59. tests/_internal/server/routers/test_fleets.py +2 -0
  60. tests/_internal/server/routers/test_prometheus.py +21 -0
  61. tests/_internal/server/routers/test_runs.py +4 -0
  62. tests/_internal/utils/test_common.py +16 -1
  63. tests/_internal/utils/test_event_loop.py +18 -0
  64. dstack/_internal/core/backends/datacrunch/api_client.py +0 -77
  65. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/LICENSE.md +0 -0
  66. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/WHEEL +0 -0
  67. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/entry_points.txt +0 -0
  68. {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,12 @@
1
1
  import json
2
2
 
3
+ from datacrunch import DataCrunchClient
4
+ from datacrunch.exceptions import APIException
5
+
3
6
  from dstack._internal.core.backends.base.configurator import (
4
7
  BackendRecord,
5
8
  Configurator,
9
+ raise_invalid_credentials_error,
6
10
  )
7
11
  from dstack._internal.core.backends.datacrunch.backend import DataCrunchBackend
8
12
  from dstack._internal.core.backends.datacrunch.models import (
@@ -17,13 +21,6 @@ from dstack._internal.core.models.backends.base import (
17
21
  BackendType,
18
22
  )
19
23
 
20
- REGIONS = [
21
- "FIN-01",
22
- "ICE-01",
23
- ]
24
-
25
- DEFAULT_REGION = "FIN-01"
26
-
27
24
 
28
25
  class DataCrunchConfigurator(Configurator):
29
26
  TYPE = BackendType.DATACRUNCH
@@ -32,14 +29,11 @@ class DataCrunchConfigurator(Configurator):
32
29
  def validate_config(
33
30
  self, config: DataCrunchBackendConfigWithCreds, default_creds_enabled: bool
34
31
  ):
35
- # FIXME: validate datacrunch creds
36
- return
32
+ self._validate_creds(config.creds)
37
33
 
38
34
  def create_backend(
39
35
  self, project_name: str, config: DataCrunchBackendConfigWithCreds
40
36
  ) -> BackendRecord:
41
- if config.regions is None:
42
- config.regions = REGIONS
43
37
  return BackendRecord(
44
38
  config=DataCrunchStoredConfig(
45
39
  **DataCrunchBackendConfig.__response__.parse_obj(config).dict()
@@ -64,3 +58,14 @@ class DataCrunchConfigurator(Configurator):
64
58
  **json.loads(record.config),
65
59
  creds=DataCrunchCreds.parse_raw(record.auth),
66
60
  )
61
+
62
+ def _validate_creds(self, creds: DataCrunchCreds):
63
+ try:
64
+ DataCrunchClient(
65
+ client_id=creds.client_id,
66
+ client_secret=creds.client_secret,
67
+ )
68
+ except APIException as e:
69
+ if e.code == "unauthorized_request":
70
+ raise_invalid_credentials_error(fields=[["creds", "api_key"]])
71
+ raise
@@ -1,7 +1,7 @@
1
1
  import concurrent.futures
2
2
  import json
3
3
  from collections import defaultdict
4
- from typing import Callable, Dict, List, Literal, Optional
4
+ from typing import Callable, Dict, List, Literal, Optional, Tuple
5
5
 
6
6
  import google.api_core.exceptions
7
7
  import google.cloud.compute_v1 as compute_v1
@@ -10,11 +10,13 @@ from gpuhunt import KNOWN_TPUS
10
10
 
11
11
  import dstack._internal.core.backends.gcp.auth as auth
12
12
  import dstack._internal.core.backends.gcp.resources as gcp_resources
13
+ from dstack import version
13
14
  from dstack._internal.core.backends.base.compute import (
14
15
  Compute,
15
16
  ComputeWithCreateInstanceSupport,
16
17
  ComputeWithGatewaySupport,
17
18
  ComputeWithMultinodeSupport,
19
+ ComputeWithPlacementGroupSupport,
18
20
  ComputeWithVolumeSupport,
19
21
  generate_unique_gateway_instance_name,
20
22
  generate_unique_instance_name,
@@ -25,11 +27,13 @@ from dstack._internal.core.backends.base.compute import (
25
27
  merge_tags,
26
28
  )
27
29
  from dstack._internal.core.backends.base.offers import get_catalog_offers
30
+ from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
28
31
  from dstack._internal.core.backends.gcp.models import GCPConfig
29
32
  from dstack._internal.core.errors import (
30
33
  ComputeError,
31
34
  ComputeResourceNotFoundError,
32
35
  NoCapacityError,
36
+ PlacementGroupInUseError,
33
37
  ProvisioningError,
34
38
  )
35
39
  from dstack._internal.core.models.backends.base import BackendType
@@ -46,6 +50,7 @@ from dstack._internal.core.models.instances import (
46
50
  InstanceType,
47
51
  Resources,
48
52
  )
53
+ from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData
49
54
  from dstack._internal.core.models.resources import Memory, Range
50
55
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
51
56
  from dstack._internal.core.models.volumes import (
@@ -74,6 +79,7 @@ class GCPVolumeDiskBackendData(CoreModel):
74
79
  class GCPCompute(
75
80
  ComputeWithCreateInstanceSupport,
76
81
  ComputeWithMultinodeSupport,
82
+ ComputeWithPlacementGroupSupport,
77
83
  ComputeWithGatewaySupport,
78
84
  ComputeWithVolumeSupport,
79
85
  Compute,
@@ -89,6 +95,9 @@ class GCPCompute(
89
95
  self.routers_client = compute_v1.RoutersClient(credentials=self.credentials)
90
96
  self.tpu_client = tpu_v2.TpuClient(credentials=self.credentials)
91
97
  self.disk_client = compute_v1.DisksClient(credentials=self.credentials)
98
+ self.resource_policies_client = compute_v1.ResourcePoliciesClient(
99
+ credentials=self.credentials
100
+ )
92
101
 
93
102
  def get_offers(
94
103
  self, requirements: Optional[Requirements] = None
@@ -183,6 +192,19 @@ class GCPCompute(
183
192
  config=self.config,
184
193
  region=instance_offer.region,
185
194
  )
195
+ extra_subnets = _get_extra_subnets(
196
+ subnetworks_client=self.subnetworks_client,
197
+ config=self.config,
198
+ region=instance_offer.region,
199
+ instance_type_name=instance_offer.instance.name,
200
+ )
201
+ placement_policy = None
202
+ if instance_config.placement_group_name is not None:
203
+ placement_policy = gcp_resources.get_placement_policy_resource_name(
204
+ project_id=self.config.project_id,
205
+ region=instance_offer.region,
206
+ placement_policy=instance_config.placement_group_name,
207
+ )
186
208
  labels = {
187
209
  "owner": "dstack",
188
210
  "dstack_project": instance_config.project_name.lower(),
@@ -259,8 +281,9 @@ class GCPCompute(
259
281
  request.project = self.config.project_id
260
282
  request.instance_resource = gcp_resources.create_instance_struct(
261
283
  disk_size=disk_size,
262
- image_id=gcp_resources.get_image_id(
263
- len(instance_offer.instance.resources.gpus) > 0,
284
+ image_id=_get_image_id(
285
+ instance_type_name=instance_offer.instance.name,
286
+ cuda=len(instance_offer.instance.resources.gpus) > 0,
264
287
  ),
265
288
  machine_type=instance_offer.instance.name,
266
289
  accelerators=gcp_resources.get_accelerators(
@@ -269,7 +292,12 @@ class GCPCompute(
269
292
  gpus=instance_offer.instance.resources.gpus,
270
293
  ),
271
294
  spot=instance_offer.instance.resources.spot,
272
- user_data=get_user_data(authorized_keys),
295
+ user_data=get_user_data(
296
+ authorized_keys,
297
+ backend_specific_commands=_get_backend_specific_commands(
298
+ instance_offer.instance.name
299
+ ),
300
+ ),
273
301
  authorized_keys=authorized_keys,
274
302
  labels=labels,
275
303
  tags=[gcp_resources.DSTACK_INSTANCE_TAG],
@@ -278,7 +306,9 @@ class GCPCompute(
278
306
  service_account=self.config.vm_service_account,
279
307
  network=self.config.vpc_resource_name,
280
308
  subnetwork=subnetwork,
309
+ extra_subnetworks=extra_subnets,
281
310
  allocate_public_ip=allocate_public_ip,
311
+ placement_policy=placement_policy,
282
312
  )
283
313
  try:
284
314
  # GCP needs some time to return an error in case of no capacity (< 30s).
@@ -371,6 +401,43 @@ class GCPCompute(
371
401
  f"Failed to get instance IP address. Instance status: {instance.status}"
372
402
  )
373
403
 
404
+ def create_placement_group(
405
+ self,
406
+ placement_group: PlacementGroup,
407
+ ) -> PlacementGroupProvisioningData:
408
+ policy = compute_v1.ResourcePolicy(
409
+ name=placement_group.name,
410
+ region=placement_group.configuration.region,
411
+ group_placement_policy=compute_v1.ResourcePolicyGroupPlacementPolicy(
412
+ availability_domain_count=1,
413
+ collocation="COLLOCATED",
414
+ ),
415
+ )
416
+ self.resource_policies_client.insert(
417
+ project=self.config.project_id,
418
+ region=placement_group.configuration.region,
419
+ resource_policy_resource=policy,
420
+ )
421
+ return PlacementGroupProvisioningData(backend=BackendType.GCP)
422
+
423
+ def delete_placement_group(
424
+ self,
425
+ placement_group: PlacementGroup,
426
+ ):
427
+ try:
428
+ operation = self.resource_policies_client.delete(
429
+ project=self.config.project_id,
430
+ region=placement_group.configuration.region,
431
+ resource_policy=placement_group.name,
432
+ )
433
+ operation.result() # Wait for operation to complete
434
+ except google.api_core.exceptions.NotFound:
435
+ logger.debug("Placement group %s not found", placement_group.name)
436
+ except google.api_core.exceptions.BadRequest as e:
437
+ if "is already being used by" in e.message:
438
+ raise PlacementGroupInUseError()
439
+ raise
440
+
374
441
  def create_gateway(
375
442
  self,
376
443
  configuration: GatewayComputeConfiguration,
@@ -412,7 +479,7 @@ class GCPCompute(
412
479
  request.project = self.config.project_id
413
480
  request.instance_resource = gcp_resources.create_instance_struct(
414
481
  disk_size=10,
415
- image_id=gcp_resources.get_gateway_image_id(),
482
+ image_id=_get_gateway_image_id(),
416
483
  machine_type="e2-small",
417
484
  accelerators=[],
418
485
  spot=False,
@@ -681,21 +748,6 @@ class GCPCompute(
681
748
  )
682
749
 
683
750
 
684
- def _get_vpc_subnet(
685
- subnetworks_client: compute_v1.SubnetworksClient,
686
- config: GCPConfig,
687
- region: str,
688
- ) -> Optional[str]:
689
- if config.vpc_name is None:
690
- return None
691
- return gcp_resources.get_vpc_subnet_or_error(
692
- subnetworks_client=subnetworks_client,
693
- vpc_project_id=config.vpc_project_id or config.project_id,
694
- vpc_name=config.vpc_name,
695
- region=region,
696
- )
697
-
698
-
699
751
  def _supported_instances_and_zones(
700
752
  regions: List[str],
701
753
  ) -> Optional[Callable[[InstanceOffer], bool]]:
@@ -754,6 +806,74 @@ def _unique_instance_name(instance: InstanceType) -> str:
754
806
  return f"{name}-{gpu.name}-{gpu.memory_mib}"
755
807
 
756
808
 
809
+ def _get_vpc_subnet(
810
+ subnetworks_client: compute_v1.SubnetworksClient,
811
+ config: GCPConfig,
812
+ region: str,
813
+ ) -> Optional[str]:
814
+ if config.vpc_name is None:
815
+ return None
816
+ return gcp_resources.get_vpc_subnet_or_error(
817
+ subnetworks_client=subnetworks_client,
818
+ vpc_project_id=config.vpc_project_id or config.project_id,
819
+ vpc_name=config.vpc_name,
820
+ region=region,
821
+ )
822
+
823
+
824
+ def _get_extra_subnets(
825
+ subnetworks_client: compute_v1.SubnetworksClient,
826
+ config: GCPConfig,
827
+ region: str,
828
+ instance_type_name: str,
829
+ ) -> List[Tuple[str, str]]:
830
+ if config.extra_vpcs is None:
831
+ return []
832
+ if instance_type_name != "a3-megagpu-8g":
833
+ return []
834
+ extra_subnets = []
835
+ for vpc_name in config.extra_vpcs:
836
+ subnet = gcp_resources.get_vpc_subnet_or_error(
837
+ subnetworks_client=subnetworks_client,
838
+ vpc_project_id=config.vpc_project_id or config.project_id,
839
+ vpc_name=vpc_name,
840
+ region=region,
841
+ )
842
+ vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
843
+ project_id=config.vpc_project_id or config.project_id,
844
+ vpc_name=vpc_name,
845
+ )
846
+ extra_subnets.append((vpc_resource_name, subnet))
847
+ return extra_subnets[:8]
848
+
849
+
850
+ def _get_image_id(instance_type_name: str, cuda: bool) -> str:
851
+ if instance_type_name == "a3-megagpu-8g":
852
+ image_name = "dstack-a3mega-5"
853
+ elif cuda:
854
+ image_name = f"dstack-cuda-{version.base_image}"
855
+ else:
856
+ image_name = f"dstack-{version.base_image}"
857
+ image_name = image_name.replace(".", "-")
858
+ return f"projects/dstack/global/images/{image_name}"
859
+
860
+
861
+ def _get_gateway_image_id() -> str:
862
+ return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
863
+
864
+
865
+ def _get_backend_specific_commands(instance_type_name: str) -> List[str]:
866
+ if instance_type_name == "a3-megagpu-8g":
867
+ return tcpx_features.get_backend_specific_commands_tcpxo()
868
+ return []
869
+
870
+
871
+ def _get_volume_price(size: int) -> float:
872
+ # https://cloud.google.com/compute/disks-image-pricing#persistentdisk
873
+ # The price is different in different regions. Take max across supported regions.
874
+ return size * 0.12
875
+
876
+
757
877
  def _get_tpu_startup_script(authorized_keys: List[str]) -> str:
758
878
  commands = get_shim_commands(
759
879
  authorized_keys=authorized_keys, is_privileged=True, pjrt_device="TPU"
@@ -805,12 +925,6 @@ def _is_single_host_tpu(instance_name: str) -> bool:
805
925
  return False
806
926
 
807
927
 
808
- def _get_volume_price(size: int) -> float:
809
- # https://cloud.google.com/compute/disks-image-pricing#persistentdisk
810
- # The price is different in different regions. Take max across supported regions.
811
- return size * 0.12
812
-
813
-
814
928
  def _get_tpu_data_disks(
815
929
  project_id: str, volumes: Optional[List[Volume]]
816
930
  ) -> List[tpu_v2.AttachedDisk]:
@@ -199,3 +199,5 @@ class GCPConfigurator(Configurator):
199
199
  )
200
200
  except BackendError as e:
201
201
  raise ServerClientError(e.args[0])
202
+ # Not checking config.extra_vpcs so that users are not required to configure subnets for all regions
203
+ # but only for regions they intend to use. Validation will be done on provisioning.
@@ -0,0 +1,34 @@
1
+ from typing import List
2
+
3
+
4
+ def get_backend_specific_commands_tcpxo() -> List[str]:
5
+ return [
6
+ "modprobe import-helper",
7
+ "gcloud -q auth configure-docker us-docker.pkg.dev",
8
+ # Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/.
9
+ (
10
+ "docker run --rm "
11
+ "--name nccl-installer "
12
+ "--pull=never "
13
+ "--network=host "
14
+ "--volume /var/lib:/var/lib "
15
+ "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 "
16
+ "install --install-nccl"
17
+ ),
18
+ # Start FasTrak receive-datapath-manager
19
+ (
20
+ "docker run "
21
+ "--name receive-datapath-manager "
22
+ "--detach "
23
+ "--pull=never "
24
+ "--cap-add=NET_ADMIN "
25
+ "--network=host "
26
+ "--privileged "
27
+ "--gpus all "
28
+ "--volume /usr/lib32:/usr/local/nvidia/lib64 "
29
+ "--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper "
30
+ "--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu "
31
+ "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14 "
32
+ "--num_hops=2 --num_nics=8 --uid= --alsologtostderr"
33
+ ),
34
+ ]
@@ -33,7 +33,19 @@ class GCPBackendConfig(CoreModel):
33
33
  regions: Annotated[
34
34
  Optional[List[str]], Field(description="The list of GCP regions. Omit to use all regions")
35
35
  ] = None
36
- vpc_name: Annotated[Optional[str], Field(description="The name of a custom VPC")] = None
36
+ vpc_name: Annotated[
37
+ Optional[str],
38
+ Field(description="The name of a custom VPC. If not specified, the default VPC is used"),
39
+ ] = None
40
+ extra_vpcs: Annotated[
41
+ Optional[List[str]],
42
+ Field(
43
+ description=(
44
+ "The names of additional VPCs used for GPUDirect. Specify eight VPCs to maximize bandwidth."
45
+ " Each VPC must have a subnet and a firewall rule allowing internal traffic across all subnets"
46
+ )
47
+ ),
48
+ ] = None
37
49
  vpc_project_id: Annotated[
38
50
  Optional[str],
39
51
  Field(description="The shared VPC hosted project ID. Required for shared VPC only"),
@@ -1,6 +1,6 @@
1
1
  import concurrent.futures
2
2
  import re
3
- from typing import Dict, List, Optional
3
+ from typing import Dict, List, Optional, Tuple
4
4
 
5
5
  import google.api_core.exceptions
6
6
  import google.cloud.compute_v1 as compute_v1
@@ -8,7 +8,6 @@ from google.api_core.extended_operation import ExtendedOperation
8
8
  from google.api_core.operation import Operation
9
9
  from google.cloud import tpu_v2
10
10
 
11
- import dstack.version as version
12
11
  from dstack._internal.core.errors import BackendError, ComputeError
13
12
  from dstack._internal.core.models.instances import Gpu
14
13
  from dstack._internal.utils.common import remove_prefix
@@ -54,12 +53,16 @@ def check_vpc(
54
53
  if shared_vpc_project_id:
55
54
  vpc_project_id = shared_vpc_project_id
56
55
  try:
56
+ usable_subnets = list_project_usable_subnets(
57
+ subnetworks_client=subnetworks_client, project_id=vpc_project_id
58
+ )
57
59
  for region in regions:
58
60
  get_vpc_subnet_or_error(
59
61
  subnetworks_client=subnetworks_client,
60
62
  vpc_project_id=vpc_project_id,
61
63
  vpc_name=vpc_name,
62
64
  region=region,
65
+ usable_subnets=usable_subnets,
63
66
  )
64
67
  except google.api_core.exceptions.NotFound:
65
68
  raise ComputeError(f"Failed to find VPC project {vpc_project_id}")
@@ -117,26 +120,19 @@ def create_instance_struct(
117
120
  service_account: Optional[str] = None,
118
121
  network: str = "global/networks/default",
119
122
  subnetwork: Optional[str] = None,
123
+ extra_subnetworks: Optional[List[Tuple[str, str]]] = None,
120
124
  allocate_public_ip: bool = True,
125
+ placement_policy: Optional[str] = None,
121
126
  ) -> compute_v1.Instance:
122
- network_interface = compute_v1.NetworkInterface()
123
- network_interface.network = network
124
- if subnetwork is not None:
125
- network_interface.subnetwork = subnetwork
126
-
127
- if allocate_public_ip:
128
- access = compute_v1.AccessConfig()
129
- access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name
130
- access.name = "External NAT"
131
- access.network_tier = access.NetworkTier.PREMIUM.name
132
- network_interface.access_configs = [access]
133
- else:
134
- network_interface.access_configs = []
135
-
136
127
  instance = compute_v1.Instance()
137
- instance.network_interfaces = [network_interface]
138
128
  instance.name = instance_name
139
129
  instance.machine_type = f"zones/{zone}/machineTypes/{machine_type}"
130
+ instance.network_interfaces = _get_network_interfaces(
131
+ network=network,
132
+ subnetwork=subnetwork,
133
+ allocate_public_ip=allocate_public_ip,
134
+ extra_subnetworks=extra_subnetworks,
135
+ )
140
136
 
141
137
  disk = compute_v1.AttachedDisk()
142
138
  disk.auto_delete = True
@@ -160,6 +156,9 @@ def create_instance_struct(
160
156
  # Attachable GPUs, H100, A100, and L4
161
157
  instance.scheduling.on_host_maintenance = "TERMINATE"
162
158
 
159
+ if placement_policy is not None:
160
+ instance.resource_policies = [placement_policy]
161
+
163
162
  if spot:
164
163
  instance.scheduling = compute_v1.Scheduling()
165
164
  instance.scheduling.provisioning_model = compute_v1.Scheduling.ProvisioningModel.SPOT.name
@@ -187,18 +186,42 @@ def create_instance_struct(
187
186
  return instance
188
187
 
189
188
 
190
- def get_image_id(cuda: bool) -> str:
191
- if not cuda:
192
- image_name = f"dstack-{version.base_image}"
189
+ def _get_network_interfaces(
190
+ network: str,
191
+ subnetwork: Optional[str],
192
+ allocate_public_ip: bool,
193
+ extra_subnetworks: Optional[List[Tuple[str, str]]],
194
+ ) -> List[compute_v1.NetworkInterface]:
195
+ network_interface = compute_v1.NetworkInterface()
196
+ network_interface.network = network
197
+ if subnetwork is not None:
198
+ network_interface.subnetwork = subnetwork
199
+ if allocate_public_ip:
200
+ access = compute_v1.AccessConfig()
201
+ access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name
202
+ access.name = "External NAT"
203
+ access.network_tier = access.NetworkTier.PREMIUM.name
204
+ network_interface.access_configs = [access]
193
205
  else:
194
- image_name = f"dstack-cuda-{version.base_image}"
195
- image_name = image_name.replace(".", "-")
206
+ network_interface.access_configs = []
196
207
 
197
- return f"projects/dstack/global/images/{image_name}"
208
+ network_interfaces = [network_interface]
209
+ for network, subnetwork in extra_subnetworks or []:
210
+ network_interfaces.append(
211
+ compute_v1.NetworkInterface(
212
+ network=network,
213
+ subnetwork=subnetwork,
214
+ )
215
+ )
216
+ return network_interfaces
198
217
 
199
218
 
200
- def get_gateway_image_id() -> str:
201
- return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
219
+ def list_project_usable_subnets(
220
+ subnetworks_client: compute_v1.SubnetworksClient,
221
+ project_id: str,
222
+ ) -> List[compute_v1.UsableSubnetwork]:
223
+ request = compute_v1.ListUsableSubnetworksRequest(project=project_id)
224
+ return [s for s in subnetworks_client.list_usable(request=request)]
202
225
 
203
226
 
204
227
  def get_vpc_subnet_or_error(
@@ -206,13 +229,15 @@ def get_vpc_subnet_or_error(
206
229
  vpc_project_id: str,
207
230
  vpc_name: str,
208
231
  region: str,
232
+ usable_subnets: Optional[List[compute_v1.UsableSubnetwork]] = None,
209
233
  ) -> str:
210
234
  """
211
235
  Returns resource name of any usable subnet in a given VPC
212
236
  (e.g. "projects/example-project/regions/europe-west4/subnetworks/example-subnet")
213
237
  """
214
- request = compute_v1.ListUsableSubnetworksRequest(project=vpc_project_id)
215
- for subnet in subnetworks_client.list_usable(request=request):
238
+ if usable_subnets is None:
239
+ usable_subnets = list_project_usable_subnets(subnetworks_client, vpc_project_id)
240
+ for subnet in usable_subnets:
216
241
  network_name = subnet.network.split("/")[-1]
217
242
  subnet_url = subnet.subnetwork
218
243
  subnet_resource_name = remove_prefix(subnet_url, "https://www.googleapis.com/compute/v1/")
@@ -410,3 +435,15 @@ def wait_for_operation(operation: Operation, verbose_name: str = "operation", ti
410
435
 
411
436
  def full_resource_name_to_name(full_resource_name: str) -> str:
412
437
  return full_resource_name.split("/")[-1]
438
+
439
+
440
+ def vpc_name_to_vpc_resource_name(project_id: str, vpc_name: str) -> str:
441
+ return f"projects/{project_id}/global/networks/{vpc_name}"
442
+
443
+
444
+ def get_placement_policy_resource_name(
445
+ project_id: str,
446
+ region: str,
447
+ placement_policy: str,
448
+ ) -> str:
449
+ return f"projects/{project_id}/regions/{region}/resourcePolicies/{placement_policy}"
@@ -39,7 +39,7 @@ class LambdaCompute(
39
39
  ) -> List[InstanceOfferWithAvailability]:
40
40
  offers = get_catalog_offers(
41
41
  backend=BackendType.LAMBDA,
42
- locations=self.config.regions,
42
+ locations=self.config.regions or None,
43
43
  requirements=requirements,
44
44
  )
45
45
  offers_with_availability = self._get_offers_with_availability(offers)
@@ -90,7 +90,7 @@ class LambdaCompute(
90
90
  if instance_info is not None and instance_info["status"] != "booting":
91
91
  provisioning_data.hostname = instance_info["ip"]
92
92
  commands = get_shim_commands(authorized_keys=[project_ssh_public_key])
93
- # shim is asssumed to be run under root
93
+ # shim is assumed to be run under root
94
94
  launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
95
95
  thread = Thread(
96
96
  target=_start_runner,
@@ -119,8 +119,6 @@ class LambdaCompute(
119
119
  }
120
120
  availability_offers = []
121
121
  for offer in offers:
122
- if offer.region not in self.config.regions:
123
- continue
124
122
  availability = InstanceAvailability.NOT_AVAILABLE
125
123
  if offer.region in instance_availability.get(offer.instance.name, []):
126
124
  availability = InstanceAvailability.AVAILABLE
@@ -19,25 +19,6 @@ from dstack._internal.core.models.backends.base import (
19
19
  BackendType,
20
20
  )
21
21
 
22
- REGIONS = [
23
- "us-south-1",
24
- "us-south-2",
25
- "us-south-3",
26
- "us-west-2",
27
- "us-west-1",
28
- "us-midwest-1",
29
- "us-west-3",
30
- "us-east-1",
31
- "us-east-2",
32
- "europe-central-1",
33
- "asia-south-1",
34
- "me-west-1",
35
- "asia-northeast-1",
36
- "asia-northeast-2",
37
- ]
38
-
39
- DEFAULT_REGION = "us-east-1"
40
-
41
22
 
42
23
  class LambdaConfigurator(Configurator):
43
24
  TYPE = BackendType.LAMBDA
@@ -49,8 +30,6 @@ class LambdaConfigurator(Configurator):
49
30
  def create_backend(
50
31
  self, project_name: str, config: LambdaBackendConfigWithCreds
51
32
  ) -> BackendRecord:
52
- if config.regions is None:
53
- config.regions = REGIONS
54
33
  return BackendRecord(
55
34
  config=LambdaStoredConfig(
56
35
  **LambdaBackendConfig.__response__.parse_obj(config).dict()
@@ -34,6 +34,11 @@ from dstack._internal.core.backends.lambdalabs.models import (
34
34
  LambdaBackendConfig,
35
35
  LambdaBackendConfigWithCreds,
36
36
  )
37
+ from dstack._internal.core.backends.nebius.models import (
38
+ NebiusBackendConfig,
39
+ NebiusBackendConfigWithCreds,
40
+ NebiusBackendFileConfigWithCreds,
41
+ )
37
42
  from dstack._internal.core.backends.oci.models import (
38
43
  OCIBackendConfig,
39
44
  OCIBackendConfigWithCreds,
@@ -65,6 +70,7 @@ AnyBackendConfigWithoutCreds = Union[
65
70
  GCPBackendConfig,
66
71
  KubernetesBackendConfig,
67
72
  LambdaBackendConfig,
73
+ NebiusBackendConfig,
68
74
  OCIBackendConfig,
69
75
  RunpodBackendConfig,
70
76
  TensorDockBackendConfig,
@@ -86,6 +92,7 @@ AnyBackendConfigWithCreds = Union[
86
92
  KubernetesBackendConfigWithCreds,
87
93
  LambdaBackendConfigWithCreds,
88
94
  OCIBackendConfigWithCreds,
95
+ NebiusBackendConfigWithCreds,
89
96
  RunpodBackendConfigWithCreds,
90
97
  TensorDockBackendConfigWithCreds,
91
98
  VastAIBackendConfigWithCreds,
@@ -105,6 +112,7 @@ AnyBackendFileConfigWithCreds = Union[
105
112
  KubernetesBackendFileConfigWithCreds,
106
113
  LambdaBackendConfigWithCreds,
107
114
  OCIBackendConfigWithCreds,
115
+ NebiusBackendFileConfigWithCreds,
108
116
  RunpodBackendConfigWithCreds,
109
117
  TensorDockBackendConfigWithCreds,
110
118
  VastAIBackendConfigWithCreds,
File without changes
@@ -0,0 +1,16 @@
1
+ from dstack._internal.core.backends.base.backend import Backend
2
+ from dstack._internal.core.backends.nebius.compute import NebiusCompute
3
+ from dstack._internal.core.backends.nebius.models import NebiusConfig
4
+ from dstack._internal.core.models.backends.base import BackendType
5
+
6
+
7
+ class NebiusBackend(Backend):
8
+ TYPE = BackendType.NEBIUS
9
+ COMPUTE_CLASS = NebiusCompute
10
+
11
+ def __init__(self, config: NebiusConfig):
12
+ self.config = config
13
+ self._compute = NebiusCompute(self.config)
14
+
15
+ def compute(self) -> NebiusCompute:
16
+ return self._compute