dstack 0.19.1__py3-none-any.whl → 0.19.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/metrics.py +138 -0
- dstack/_internal/cli/commands/stats.py +5 -119
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/profile.py +9 -0
- dstack/_internal/core/backends/aws/configurator.py +1 -0
- dstack/_internal/core/backends/base/compute.py +4 -1
- dstack/_internal/core/backends/base/models.py +7 -7
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/cudo/compute.py +2 -0
- dstack/_internal/core/backends/cudo/configurator.py +0 -13
- dstack/_internal/core/backends/datacrunch/compute.py +118 -32
- dstack/_internal/core/backends/datacrunch/configurator.py +16 -11
- dstack/_internal/core/backends/gcp/compute.py +140 -26
- dstack/_internal/core/backends/gcp/configurator.py +2 -0
- dstack/_internal/core/backends/gcp/features/__init__.py +0 -0
- dstack/_internal/core/backends/gcp/features/tcpx.py +34 -0
- dstack/_internal/core/backends/gcp/models.py +13 -1
- dstack/_internal/core/backends/gcp/resources.py +64 -27
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -4
- dstack/_internal/core/backends/lambdalabs/configurator.py +0 -21
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/backends/nebius/__init__.py +0 -0
- dstack/_internal/core/backends/nebius/backend.py +16 -0
- dstack/_internal/core/backends/nebius/compute.py +272 -0
- dstack/_internal/core/backends/nebius/configurator.py +74 -0
- dstack/_internal/core/backends/nebius/models.py +108 -0
- dstack/_internal/core/backends/nebius/resources.py +240 -0
- dstack/_internal/core/backends/tensordock/api_client.py +5 -4
- dstack/_internal/core/backends/tensordock/compute.py +2 -15
- dstack/_internal/core/errors.py +14 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/profiles.py +3 -0
- dstack/_internal/proxy/lib/schemas/model_proxy.py +3 -3
- dstack/_internal/server/background/tasks/process_instances.py +12 -7
- dstack/_internal/server/background/tasks/process_running_jobs.py +20 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +3 -2
- dstack/_internal/server/routers/prometheus.py +5 -0
- dstack/_internal/server/security/permissions.py +19 -1
- dstack/_internal/server/services/instances.py +14 -6
- dstack/_internal/server/services/jobs/__init__.py +3 -3
- dstack/_internal/server/services/offers.py +4 -2
- dstack/_internal/server/services/runs.py +0 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-da9f8c06a69c20dac23e.css → main-8f9c66f404e9c7e7e020.css} +1 -1
- dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js → main-e190de603dc1e9f485ec.js} +7306 -149
- dstack/_internal/server/statics/{main-4a0fe83e84574654e397.js.map → main-e190de603dc1e9f485ec.js.map} +1 -1
- dstack/_internal/utils/common.py +8 -2
- dstack/_internal/utils/event_loop.py +30 -0
- dstack/_internal/utils/ignore.py +2 -0
- dstack/api/server/_fleets.py +3 -5
- dstack/api/server/_runs.py +6 -7
- dstack/version.py +1 -1
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/METADATA +27 -11
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/RECORD +67 -57
- tests/_internal/core/backends/datacrunch/test_configurator.py +6 -2
- tests/_internal/server/background/tasks/test_process_instances.py +4 -2
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +29 -0
- tests/_internal/server/routers/test_backends.py +116 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_prometheus.py +21 -0
- tests/_internal/server/routers/test_runs.py +4 -0
- tests/_internal/utils/test_common.py +16 -1
- tests/_internal/utils/test_event_loop.py +18 -0
- dstack/_internal/core/backends/datacrunch/api_client.py +0 -77
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/LICENSE.md +0 -0
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/WHEEL +0 -0
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.1.dist-info → dstack-0.19.3.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
2
|
|
|
3
|
+
from datacrunch import DataCrunchClient
|
|
4
|
+
from datacrunch.exceptions import APIException
|
|
5
|
+
|
|
3
6
|
from dstack._internal.core.backends.base.configurator import (
|
|
4
7
|
BackendRecord,
|
|
5
8
|
Configurator,
|
|
9
|
+
raise_invalid_credentials_error,
|
|
6
10
|
)
|
|
7
11
|
from dstack._internal.core.backends.datacrunch.backend import DataCrunchBackend
|
|
8
12
|
from dstack._internal.core.backends.datacrunch.models import (
|
|
@@ -17,13 +21,6 @@ from dstack._internal.core.models.backends.base import (
|
|
|
17
21
|
BackendType,
|
|
18
22
|
)
|
|
19
23
|
|
|
20
|
-
REGIONS = [
|
|
21
|
-
"FIN-01",
|
|
22
|
-
"ICE-01",
|
|
23
|
-
]
|
|
24
|
-
|
|
25
|
-
DEFAULT_REGION = "FIN-01"
|
|
26
|
-
|
|
27
24
|
|
|
28
25
|
class DataCrunchConfigurator(Configurator):
|
|
29
26
|
TYPE = BackendType.DATACRUNCH
|
|
@@ -32,14 +29,11 @@ class DataCrunchConfigurator(Configurator):
|
|
|
32
29
|
def validate_config(
|
|
33
30
|
self, config: DataCrunchBackendConfigWithCreds, default_creds_enabled: bool
|
|
34
31
|
):
|
|
35
|
-
|
|
36
|
-
return
|
|
32
|
+
self._validate_creds(config.creds)
|
|
37
33
|
|
|
38
34
|
def create_backend(
|
|
39
35
|
self, project_name: str, config: DataCrunchBackendConfigWithCreds
|
|
40
36
|
) -> BackendRecord:
|
|
41
|
-
if config.regions is None:
|
|
42
|
-
config.regions = REGIONS
|
|
43
37
|
return BackendRecord(
|
|
44
38
|
config=DataCrunchStoredConfig(
|
|
45
39
|
**DataCrunchBackendConfig.__response__.parse_obj(config).dict()
|
|
@@ -64,3 +58,14 @@ class DataCrunchConfigurator(Configurator):
|
|
|
64
58
|
**json.loads(record.config),
|
|
65
59
|
creds=DataCrunchCreds.parse_raw(record.auth),
|
|
66
60
|
)
|
|
61
|
+
|
|
62
|
+
def _validate_creds(self, creds: DataCrunchCreds):
|
|
63
|
+
try:
|
|
64
|
+
DataCrunchClient(
|
|
65
|
+
client_id=creds.client_id,
|
|
66
|
+
client_secret=creds.client_secret,
|
|
67
|
+
)
|
|
68
|
+
except APIException as e:
|
|
69
|
+
if e.code == "unauthorized_request":
|
|
70
|
+
raise_invalid_credentials_error(fields=[["creds", "api_key"]])
|
|
71
|
+
raise
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
2
|
import json
|
|
3
3
|
from collections import defaultdict
|
|
4
|
-
from typing import Callable, Dict, List, Literal, Optional
|
|
4
|
+
from typing import Callable, Dict, List, Literal, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
import google.api_core.exceptions
|
|
7
7
|
import google.cloud.compute_v1 as compute_v1
|
|
@@ -10,11 +10,13 @@ from gpuhunt import KNOWN_TPUS
|
|
|
10
10
|
|
|
11
11
|
import dstack._internal.core.backends.gcp.auth as auth
|
|
12
12
|
import dstack._internal.core.backends.gcp.resources as gcp_resources
|
|
13
|
+
from dstack import version
|
|
13
14
|
from dstack._internal.core.backends.base.compute import (
|
|
14
15
|
Compute,
|
|
15
16
|
ComputeWithCreateInstanceSupport,
|
|
16
17
|
ComputeWithGatewaySupport,
|
|
17
18
|
ComputeWithMultinodeSupport,
|
|
19
|
+
ComputeWithPlacementGroupSupport,
|
|
18
20
|
ComputeWithVolumeSupport,
|
|
19
21
|
generate_unique_gateway_instance_name,
|
|
20
22
|
generate_unique_instance_name,
|
|
@@ -25,11 +27,13 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
25
27
|
merge_tags,
|
|
26
28
|
)
|
|
27
29
|
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
30
|
+
from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
|
|
28
31
|
from dstack._internal.core.backends.gcp.models import GCPConfig
|
|
29
32
|
from dstack._internal.core.errors import (
|
|
30
33
|
ComputeError,
|
|
31
34
|
ComputeResourceNotFoundError,
|
|
32
35
|
NoCapacityError,
|
|
36
|
+
PlacementGroupInUseError,
|
|
33
37
|
ProvisioningError,
|
|
34
38
|
)
|
|
35
39
|
from dstack._internal.core.models.backends.base import BackendType
|
|
@@ -46,6 +50,7 @@ from dstack._internal.core.models.instances import (
|
|
|
46
50
|
InstanceType,
|
|
47
51
|
Resources,
|
|
48
52
|
)
|
|
53
|
+
from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData
|
|
49
54
|
from dstack._internal.core.models.resources import Memory, Range
|
|
50
55
|
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
51
56
|
from dstack._internal.core.models.volumes import (
|
|
@@ -74,6 +79,7 @@ class GCPVolumeDiskBackendData(CoreModel):
|
|
|
74
79
|
class GCPCompute(
|
|
75
80
|
ComputeWithCreateInstanceSupport,
|
|
76
81
|
ComputeWithMultinodeSupport,
|
|
82
|
+
ComputeWithPlacementGroupSupport,
|
|
77
83
|
ComputeWithGatewaySupport,
|
|
78
84
|
ComputeWithVolumeSupport,
|
|
79
85
|
Compute,
|
|
@@ -89,6 +95,9 @@ class GCPCompute(
|
|
|
89
95
|
self.routers_client = compute_v1.RoutersClient(credentials=self.credentials)
|
|
90
96
|
self.tpu_client = tpu_v2.TpuClient(credentials=self.credentials)
|
|
91
97
|
self.disk_client = compute_v1.DisksClient(credentials=self.credentials)
|
|
98
|
+
self.resource_policies_client = compute_v1.ResourcePoliciesClient(
|
|
99
|
+
credentials=self.credentials
|
|
100
|
+
)
|
|
92
101
|
|
|
93
102
|
def get_offers(
|
|
94
103
|
self, requirements: Optional[Requirements] = None
|
|
@@ -183,6 +192,19 @@ class GCPCompute(
|
|
|
183
192
|
config=self.config,
|
|
184
193
|
region=instance_offer.region,
|
|
185
194
|
)
|
|
195
|
+
extra_subnets = _get_extra_subnets(
|
|
196
|
+
subnetworks_client=self.subnetworks_client,
|
|
197
|
+
config=self.config,
|
|
198
|
+
region=instance_offer.region,
|
|
199
|
+
instance_type_name=instance_offer.instance.name,
|
|
200
|
+
)
|
|
201
|
+
placement_policy = None
|
|
202
|
+
if instance_config.placement_group_name is not None:
|
|
203
|
+
placement_policy = gcp_resources.get_placement_policy_resource_name(
|
|
204
|
+
project_id=self.config.project_id,
|
|
205
|
+
region=instance_offer.region,
|
|
206
|
+
placement_policy=instance_config.placement_group_name,
|
|
207
|
+
)
|
|
186
208
|
labels = {
|
|
187
209
|
"owner": "dstack",
|
|
188
210
|
"dstack_project": instance_config.project_name.lower(),
|
|
@@ -259,8 +281,9 @@ class GCPCompute(
|
|
|
259
281
|
request.project = self.config.project_id
|
|
260
282
|
request.instance_resource = gcp_resources.create_instance_struct(
|
|
261
283
|
disk_size=disk_size,
|
|
262
|
-
image_id=
|
|
263
|
-
|
|
284
|
+
image_id=_get_image_id(
|
|
285
|
+
instance_type_name=instance_offer.instance.name,
|
|
286
|
+
cuda=len(instance_offer.instance.resources.gpus) > 0,
|
|
264
287
|
),
|
|
265
288
|
machine_type=instance_offer.instance.name,
|
|
266
289
|
accelerators=gcp_resources.get_accelerators(
|
|
@@ -269,7 +292,12 @@ class GCPCompute(
|
|
|
269
292
|
gpus=instance_offer.instance.resources.gpus,
|
|
270
293
|
),
|
|
271
294
|
spot=instance_offer.instance.resources.spot,
|
|
272
|
-
user_data=get_user_data(
|
|
295
|
+
user_data=get_user_data(
|
|
296
|
+
authorized_keys,
|
|
297
|
+
backend_specific_commands=_get_backend_specific_commands(
|
|
298
|
+
instance_offer.instance.name
|
|
299
|
+
),
|
|
300
|
+
),
|
|
273
301
|
authorized_keys=authorized_keys,
|
|
274
302
|
labels=labels,
|
|
275
303
|
tags=[gcp_resources.DSTACK_INSTANCE_TAG],
|
|
@@ -278,7 +306,9 @@ class GCPCompute(
|
|
|
278
306
|
service_account=self.config.vm_service_account,
|
|
279
307
|
network=self.config.vpc_resource_name,
|
|
280
308
|
subnetwork=subnetwork,
|
|
309
|
+
extra_subnetworks=extra_subnets,
|
|
281
310
|
allocate_public_ip=allocate_public_ip,
|
|
311
|
+
placement_policy=placement_policy,
|
|
282
312
|
)
|
|
283
313
|
try:
|
|
284
314
|
# GCP needs some time to return an error in case of no capacity (< 30s).
|
|
@@ -371,6 +401,43 @@ class GCPCompute(
|
|
|
371
401
|
f"Failed to get instance IP address. Instance status: {instance.status}"
|
|
372
402
|
)
|
|
373
403
|
|
|
404
|
+
def create_placement_group(
|
|
405
|
+
self,
|
|
406
|
+
placement_group: PlacementGroup,
|
|
407
|
+
) -> PlacementGroupProvisioningData:
|
|
408
|
+
policy = compute_v1.ResourcePolicy(
|
|
409
|
+
name=placement_group.name,
|
|
410
|
+
region=placement_group.configuration.region,
|
|
411
|
+
group_placement_policy=compute_v1.ResourcePolicyGroupPlacementPolicy(
|
|
412
|
+
availability_domain_count=1,
|
|
413
|
+
collocation="COLLOCATED",
|
|
414
|
+
),
|
|
415
|
+
)
|
|
416
|
+
self.resource_policies_client.insert(
|
|
417
|
+
project=self.config.project_id,
|
|
418
|
+
region=placement_group.configuration.region,
|
|
419
|
+
resource_policy_resource=policy,
|
|
420
|
+
)
|
|
421
|
+
return PlacementGroupProvisioningData(backend=BackendType.GCP)
|
|
422
|
+
|
|
423
|
+
def delete_placement_group(
|
|
424
|
+
self,
|
|
425
|
+
placement_group: PlacementGroup,
|
|
426
|
+
):
|
|
427
|
+
try:
|
|
428
|
+
operation = self.resource_policies_client.delete(
|
|
429
|
+
project=self.config.project_id,
|
|
430
|
+
region=placement_group.configuration.region,
|
|
431
|
+
resource_policy=placement_group.name,
|
|
432
|
+
)
|
|
433
|
+
operation.result() # Wait for operation to complete
|
|
434
|
+
except google.api_core.exceptions.NotFound:
|
|
435
|
+
logger.debug("Placement group %s not found", placement_group.name)
|
|
436
|
+
except google.api_core.exceptions.BadRequest as e:
|
|
437
|
+
if "is already being used by" in e.message:
|
|
438
|
+
raise PlacementGroupInUseError()
|
|
439
|
+
raise
|
|
440
|
+
|
|
374
441
|
def create_gateway(
|
|
375
442
|
self,
|
|
376
443
|
configuration: GatewayComputeConfiguration,
|
|
@@ -412,7 +479,7 @@ class GCPCompute(
|
|
|
412
479
|
request.project = self.config.project_id
|
|
413
480
|
request.instance_resource = gcp_resources.create_instance_struct(
|
|
414
481
|
disk_size=10,
|
|
415
|
-
image_id=
|
|
482
|
+
image_id=_get_gateway_image_id(),
|
|
416
483
|
machine_type="e2-small",
|
|
417
484
|
accelerators=[],
|
|
418
485
|
spot=False,
|
|
@@ -681,21 +748,6 @@ class GCPCompute(
|
|
|
681
748
|
)
|
|
682
749
|
|
|
683
750
|
|
|
684
|
-
def _get_vpc_subnet(
|
|
685
|
-
subnetworks_client: compute_v1.SubnetworksClient,
|
|
686
|
-
config: GCPConfig,
|
|
687
|
-
region: str,
|
|
688
|
-
) -> Optional[str]:
|
|
689
|
-
if config.vpc_name is None:
|
|
690
|
-
return None
|
|
691
|
-
return gcp_resources.get_vpc_subnet_or_error(
|
|
692
|
-
subnetworks_client=subnetworks_client,
|
|
693
|
-
vpc_project_id=config.vpc_project_id or config.project_id,
|
|
694
|
-
vpc_name=config.vpc_name,
|
|
695
|
-
region=region,
|
|
696
|
-
)
|
|
697
|
-
|
|
698
|
-
|
|
699
751
|
def _supported_instances_and_zones(
|
|
700
752
|
regions: List[str],
|
|
701
753
|
) -> Optional[Callable[[InstanceOffer], bool]]:
|
|
@@ -754,6 +806,74 @@ def _unique_instance_name(instance: InstanceType) -> str:
|
|
|
754
806
|
return f"{name}-{gpu.name}-{gpu.memory_mib}"
|
|
755
807
|
|
|
756
808
|
|
|
809
|
+
def _get_vpc_subnet(
|
|
810
|
+
subnetworks_client: compute_v1.SubnetworksClient,
|
|
811
|
+
config: GCPConfig,
|
|
812
|
+
region: str,
|
|
813
|
+
) -> Optional[str]:
|
|
814
|
+
if config.vpc_name is None:
|
|
815
|
+
return None
|
|
816
|
+
return gcp_resources.get_vpc_subnet_or_error(
|
|
817
|
+
subnetworks_client=subnetworks_client,
|
|
818
|
+
vpc_project_id=config.vpc_project_id or config.project_id,
|
|
819
|
+
vpc_name=config.vpc_name,
|
|
820
|
+
region=region,
|
|
821
|
+
)
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
def _get_extra_subnets(
|
|
825
|
+
subnetworks_client: compute_v1.SubnetworksClient,
|
|
826
|
+
config: GCPConfig,
|
|
827
|
+
region: str,
|
|
828
|
+
instance_type_name: str,
|
|
829
|
+
) -> List[Tuple[str, str]]:
|
|
830
|
+
if config.extra_vpcs is None:
|
|
831
|
+
return []
|
|
832
|
+
if instance_type_name != "a3-megagpu-8g":
|
|
833
|
+
return []
|
|
834
|
+
extra_subnets = []
|
|
835
|
+
for vpc_name in config.extra_vpcs:
|
|
836
|
+
subnet = gcp_resources.get_vpc_subnet_or_error(
|
|
837
|
+
subnetworks_client=subnetworks_client,
|
|
838
|
+
vpc_project_id=config.vpc_project_id or config.project_id,
|
|
839
|
+
vpc_name=vpc_name,
|
|
840
|
+
region=region,
|
|
841
|
+
)
|
|
842
|
+
vpc_resource_name = gcp_resources.vpc_name_to_vpc_resource_name(
|
|
843
|
+
project_id=config.vpc_project_id or config.project_id,
|
|
844
|
+
vpc_name=vpc_name,
|
|
845
|
+
)
|
|
846
|
+
extra_subnets.append((vpc_resource_name, subnet))
|
|
847
|
+
return extra_subnets[:8]
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
def _get_image_id(instance_type_name: str, cuda: bool) -> str:
|
|
851
|
+
if instance_type_name == "a3-megagpu-8g":
|
|
852
|
+
image_name = "dstack-a3mega-5"
|
|
853
|
+
elif cuda:
|
|
854
|
+
image_name = f"dstack-cuda-{version.base_image}"
|
|
855
|
+
else:
|
|
856
|
+
image_name = f"dstack-{version.base_image}"
|
|
857
|
+
image_name = image_name.replace(".", "-")
|
|
858
|
+
return f"projects/dstack/global/images/{image_name}"
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def _get_gateway_image_id() -> str:
|
|
862
|
+
return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
def _get_backend_specific_commands(instance_type_name: str) -> List[str]:
|
|
866
|
+
if instance_type_name == "a3-megagpu-8g":
|
|
867
|
+
return tcpx_features.get_backend_specific_commands_tcpxo()
|
|
868
|
+
return []
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def _get_volume_price(size: int) -> float:
|
|
872
|
+
# https://cloud.google.com/compute/disks-image-pricing#persistentdisk
|
|
873
|
+
# The price is different in different regions. Take max across supported regions.
|
|
874
|
+
return size * 0.12
|
|
875
|
+
|
|
876
|
+
|
|
757
877
|
def _get_tpu_startup_script(authorized_keys: List[str]) -> str:
|
|
758
878
|
commands = get_shim_commands(
|
|
759
879
|
authorized_keys=authorized_keys, is_privileged=True, pjrt_device="TPU"
|
|
@@ -805,12 +925,6 @@ def _is_single_host_tpu(instance_name: str) -> bool:
|
|
|
805
925
|
return False
|
|
806
926
|
|
|
807
927
|
|
|
808
|
-
def _get_volume_price(size: int) -> float:
|
|
809
|
-
# https://cloud.google.com/compute/disks-image-pricing#persistentdisk
|
|
810
|
-
# The price is different in different regions. Take max across supported regions.
|
|
811
|
-
return size * 0.12
|
|
812
|
-
|
|
813
|
-
|
|
814
928
|
def _get_tpu_data_disks(
|
|
815
929
|
project_id: str, volumes: Optional[List[Volume]]
|
|
816
930
|
) -> List[tpu_v2.AttachedDisk]:
|
|
@@ -199,3 +199,5 @@ class GCPConfigurator(Configurator):
|
|
|
199
199
|
)
|
|
200
200
|
except BackendError as e:
|
|
201
201
|
raise ServerClientError(e.args[0])
|
|
202
|
+
# Not checking config.extra_vpc so that users are not required to configure subnets for all regions
|
|
203
|
+
# but only for regions they intend to use. Validation will be done on provisioning.
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_backend_specific_commands_tcpxo() -> List[str]:
|
|
5
|
+
return [
|
|
6
|
+
"modprobe import-helper",
|
|
7
|
+
"gcloud -q auth configure-docker us-docker.pkg.dev",
|
|
8
|
+
# Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/.
|
|
9
|
+
(
|
|
10
|
+
"docker run --rm "
|
|
11
|
+
"--name nccl-installer "
|
|
12
|
+
"--pull=never "
|
|
13
|
+
"--network=host "
|
|
14
|
+
"--volume /var/lib:/var/lib "
|
|
15
|
+
"us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 "
|
|
16
|
+
"install --install-nccl"
|
|
17
|
+
),
|
|
18
|
+
# Start FasTrak receive-datapath-manager
|
|
19
|
+
(
|
|
20
|
+
"docker run "
|
|
21
|
+
"--name receive-datapath-manager "
|
|
22
|
+
"--detach "
|
|
23
|
+
"--pull=never "
|
|
24
|
+
"--cap-add=NET_ADMIN "
|
|
25
|
+
"--network=host "
|
|
26
|
+
"--privileged "
|
|
27
|
+
"--gpus all "
|
|
28
|
+
"--volume /usr/lib32:/usr/local/nvidia/lib64 "
|
|
29
|
+
"--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper "
|
|
30
|
+
"--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu "
|
|
31
|
+
"us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14 "
|
|
32
|
+
"--num_hops=2 --num_nics=8 --uid= --alsologtostderr"
|
|
33
|
+
),
|
|
34
|
+
]
|
|
@@ -33,7 +33,19 @@ class GCPBackendConfig(CoreModel):
|
|
|
33
33
|
regions: Annotated[
|
|
34
34
|
Optional[List[str]], Field(description="The list of GCP regions. Omit to use all regions")
|
|
35
35
|
] = None
|
|
36
|
-
vpc_name: Annotated[
|
|
36
|
+
vpc_name: Annotated[
|
|
37
|
+
Optional[str],
|
|
38
|
+
Field(description="The name of a custom VPC. If not specified, the default VPC is used"),
|
|
39
|
+
] = None
|
|
40
|
+
extra_vpcs: Annotated[
|
|
41
|
+
Optional[List[str]],
|
|
42
|
+
Field(
|
|
43
|
+
description=(
|
|
44
|
+
"The names of additional VPCs used for GPUDirect. Specify eight VPCs to maximize bandwidth."
|
|
45
|
+
" Each VPC must have a subnet and a firewall rule allowing internal traffic across all subnets"
|
|
46
|
+
)
|
|
47
|
+
),
|
|
48
|
+
] = None
|
|
37
49
|
vpc_project_id: Annotated[
|
|
38
50
|
Optional[str],
|
|
39
51
|
Field(description="The shared VPC hosted project ID. Required for shared VPC only"),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import concurrent.futures
|
|
2
2
|
import re
|
|
3
|
-
from typing import Dict, List, Optional
|
|
3
|
+
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
import google.api_core.exceptions
|
|
6
6
|
import google.cloud.compute_v1 as compute_v1
|
|
@@ -8,7 +8,6 @@ from google.api_core.extended_operation import ExtendedOperation
|
|
|
8
8
|
from google.api_core.operation import Operation
|
|
9
9
|
from google.cloud import tpu_v2
|
|
10
10
|
|
|
11
|
-
import dstack.version as version
|
|
12
11
|
from dstack._internal.core.errors import BackendError, ComputeError
|
|
13
12
|
from dstack._internal.core.models.instances import Gpu
|
|
14
13
|
from dstack._internal.utils.common import remove_prefix
|
|
@@ -54,12 +53,16 @@ def check_vpc(
|
|
|
54
53
|
if shared_vpc_project_id:
|
|
55
54
|
vpc_project_id = shared_vpc_project_id
|
|
56
55
|
try:
|
|
56
|
+
usable_subnets = list_project_usable_subnets(
|
|
57
|
+
subnetworks_client=subnetworks_client, project_id=vpc_project_id
|
|
58
|
+
)
|
|
57
59
|
for region in regions:
|
|
58
60
|
get_vpc_subnet_or_error(
|
|
59
61
|
subnetworks_client=subnetworks_client,
|
|
60
62
|
vpc_project_id=vpc_project_id,
|
|
61
63
|
vpc_name=vpc_name,
|
|
62
64
|
region=region,
|
|
65
|
+
usable_subnets=usable_subnets,
|
|
63
66
|
)
|
|
64
67
|
except google.api_core.exceptions.NotFound:
|
|
65
68
|
raise ComputeError(f"Failed to find VPC project {vpc_project_id}")
|
|
@@ -117,26 +120,19 @@ def create_instance_struct(
|
|
|
117
120
|
service_account: Optional[str] = None,
|
|
118
121
|
network: str = "global/networks/default",
|
|
119
122
|
subnetwork: Optional[str] = None,
|
|
123
|
+
extra_subnetworks: Optional[List[Tuple[str, str]]] = None,
|
|
120
124
|
allocate_public_ip: bool = True,
|
|
125
|
+
placement_policy: Optional[str] = None,
|
|
121
126
|
) -> compute_v1.Instance:
|
|
122
|
-
network_interface = compute_v1.NetworkInterface()
|
|
123
|
-
network_interface.network = network
|
|
124
|
-
if subnetwork is not None:
|
|
125
|
-
network_interface.subnetwork = subnetwork
|
|
126
|
-
|
|
127
|
-
if allocate_public_ip:
|
|
128
|
-
access = compute_v1.AccessConfig()
|
|
129
|
-
access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name
|
|
130
|
-
access.name = "External NAT"
|
|
131
|
-
access.network_tier = access.NetworkTier.PREMIUM.name
|
|
132
|
-
network_interface.access_configs = [access]
|
|
133
|
-
else:
|
|
134
|
-
network_interface.access_configs = []
|
|
135
|
-
|
|
136
127
|
instance = compute_v1.Instance()
|
|
137
|
-
instance.network_interfaces = [network_interface]
|
|
138
128
|
instance.name = instance_name
|
|
139
129
|
instance.machine_type = f"zones/{zone}/machineTypes/{machine_type}"
|
|
130
|
+
instance.network_interfaces = _get_network_interfaces(
|
|
131
|
+
network=network,
|
|
132
|
+
subnetwork=subnetwork,
|
|
133
|
+
allocate_public_ip=allocate_public_ip,
|
|
134
|
+
extra_subnetworks=extra_subnetworks,
|
|
135
|
+
)
|
|
140
136
|
|
|
141
137
|
disk = compute_v1.AttachedDisk()
|
|
142
138
|
disk.auto_delete = True
|
|
@@ -160,6 +156,9 @@ def create_instance_struct(
|
|
|
160
156
|
# Attachable GPUs, H100, A100, and L4
|
|
161
157
|
instance.scheduling.on_host_maintenance = "TERMINATE"
|
|
162
158
|
|
|
159
|
+
if placement_policy is not None:
|
|
160
|
+
instance.resource_policies = [placement_policy]
|
|
161
|
+
|
|
163
162
|
if spot:
|
|
164
163
|
instance.scheduling = compute_v1.Scheduling()
|
|
165
164
|
instance.scheduling.provisioning_model = compute_v1.Scheduling.ProvisioningModel.SPOT.name
|
|
@@ -187,18 +186,42 @@ def create_instance_struct(
|
|
|
187
186
|
return instance
|
|
188
187
|
|
|
189
188
|
|
|
190
|
-
def
|
|
191
|
-
|
|
192
|
-
|
|
189
|
+
def _get_network_interfaces(
|
|
190
|
+
network: str,
|
|
191
|
+
subnetwork: Optional[str],
|
|
192
|
+
allocate_public_ip: bool,
|
|
193
|
+
extra_subnetworks: Optional[List[Tuple[str, str]]],
|
|
194
|
+
) -> List[compute_v1.NetworkInterface]:
|
|
195
|
+
network_interface = compute_v1.NetworkInterface()
|
|
196
|
+
network_interface.network = network
|
|
197
|
+
if subnetwork is not None:
|
|
198
|
+
network_interface.subnetwork = subnetwork
|
|
199
|
+
if allocate_public_ip:
|
|
200
|
+
access = compute_v1.AccessConfig()
|
|
201
|
+
access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name
|
|
202
|
+
access.name = "External NAT"
|
|
203
|
+
access.network_tier = access.NetworkTier.PREMIUM.name
|
|
204
|
+
network_interface.access_configs = [access]
|
|
193
205
|
else:
|
|
194
|
-
|
|
195
|
-
image_name = image_name.replace(".", "-")
|
|
206
|
+
network_interface.access_configs = []
|
|
196
207
|
|
|
197
|
-
|
|
208
|
+
network_interfaces = [network_interface]
|
|
209
|
+
for network, subnetwork in extra_subnetworks or []:
|
|
210
|
+
network_interfaces.append(
|
|
211
|
+
compute_v1.NetworkInterface(
|
|
212
|
+
network=network,
|
|
213
|
+
subnetwork=subnetwork,
|
|
214
|
+
)
|
|
215
|
+
)
|
|
216
|
+
return network_interfaces
|
|
198
217
|
|
|
199
218
|
|
|
200
|
-
def
|
|
201
|
-
|
|
219
|
+
def list_project_usable_subnets(
|
|
220
|
+
subnetworks_client: compute_v1.SubnetworksClient,
|
|
221
|
+
project_id: str,
|
|
222
|
+
) -> List[compute_v1.UsableSubnetwork]:
|
|
223
|
+
request = compute_v1.ListUsableSubnetworksRequest(project=project_id)
|
|
224
|
+
return [s for s in subnetworks_client.list_usable(request=request)]
|
|
202
225
|
|
|
203
226
|
|
|
204
227
|
def get_vpc_subnet_or_error(
|
|
@@ -206,13 +229,15 @@ def get_vpc_subnet_or_error(
|
|
|
206
229
|
vpc_project_id: str,
|
|
207
230
|
vpc_name: str,
|
|
208
231
|
region: str,
|
|
232
|
+
usable_subnets: Optional[List[compute_v1.UsableSubnetwork]] = None,
|
|
209
233
|
) -> str:
|
|
210
234
|
"""
|
|
211
235
|
Returns resource name of any usable subnet in a given VPC
|
|
212
236
|
(e.g. "projects/example-project/regions/europe-west4/subnetworks/example-subnet")
|
|
213
237
|
"""
|
|
214
|
-
|
|
215
|
-
|
|
238
|
+
if usable_subnets is None:
|
|
239
|
+
usable_subnets = list_project_usable_subnets(subnetworks_client, vpc_project_id)
|
|
240
|
+
for subnet in usable_subnets:
|
|
216
241
|
network_name = subnet.network.split("/")[-1]
|
|
217
242
|
subnet_url = subnet.subnetwork
|
|
218
243
|
subnet_resource_name = remove_prefix(subnet_url, "https://www.googleapis.com/compute/v1/")
|
|
@@ -410,3 +435,15 @@ def wait_for_operation(operation: Operation, verbose_name: str = "operation", ti
|
|
|
410
435
|
|
|
411
436
|
def full_resource_name_to_name(full_resource_name: str) -> str:
|
|
412
437
|
return full_resource_name.split("/")[-1]
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def vpc_name_to_vpc_resource_name(project_id: str, vpc_name: str) -> str:
|
|
441
|
+
return f"projects/{project_id}/global/networks/{vpc_name}"
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def get_placement_policy_resource_name(
|
|
445
|
+
project_id: str,
|
|
446
|
+
region: str,
|
|
447
|
+
placement_policy: str,
|
|
448
|
+
) -> str:
|
|
449
|
+
return f"projects/{project_id}/regions/{region}/resourcePolicies/{placement_policy}"
|
|
@@ -39,7 +39,7 @@ class LambdaCompute(
|
|
|
39
39
|
) -> List[InstanceOfferWithAvailability]:
|
|
40
40
|
offers = get_catalog_offers(
|
|
41
41
|
backend=BackendType.LAMBDA,
|
|
42
|
-
locations=self.config.regions,
|
|
42
|
+
locations=self.config.regions or None,
|
|
43
43
|
requirements=requirements,
|
|
44
44
|
)
|
|
45
45
|
offers_with_availability = self._get_offers_with_availability(offers)
|
|
@@ -90,7 +90,7 @@ class LambdaCompute(
|
|
|
90
90
|
if instance_info is not None and instance_info["status"] != "booting":
|
|
91
91
|
provisioning_data.hostname = instance_info["ip"]
|
|
92
92
|
commands = get_shim_commands(authorized_keys=[project_ssh_public_key])
|
|
93
|
-
# shim is
|
|
93
|
+
# shim is assumed to be run under root
|
|
94
94
|
launch_command = "sudo sh -c '" + "&& ".join(commands) + "'"
|
|
95
95
|
thread = Thread(
|
|
96
96
|
target=_start_runner,
|
|
@@ -119,8 +119,6 @@ class LambdaCompute(
|
|
|
119
119
|
}
|
|
120
120
|
availability_offers = []
|
|
121
121
|
for offer in offers:
|
|
122
|
-
if offer.region not in self.config.regions:
|
|
123
|
-
continue
|
|
124
122
|
availability = InstanceAvailability.NOT_AVAILABLE
|
|
125
123
|
if offer.region in instance_availability.get(offer.instance.name, []):
|
|
126
124
|
availability = InstanceAvailability.AVAILABLE
|
|
@@ -19,25 +19,6 @@ from dstack._internal.core.models.backends.base import (
|
|
|
19
19
|
BackendType,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
|
-
REGIONS = [
|
|
23
|
-
"us-south-1",
|
|
24
|
-
"us-south-2",
|
|
25
|
-
"us-south-3",
|
|
26
|
-
"us-west-2",
|
|
27
|
-
"us-west-1",
|
|
28
|
-
"us-midwest-1",
|
|
29
|
-
"us-west-3",
|
|
30
|
-
"us-east-1",
|
|
31
|
-
"us-east-2",
|
|
32
|
-
"europe-central-1",
|
|
33
|
-
"asia-south-1",
|
|
34
|
-
"me-west-1",
|
|
35
|
-
"asia-northeast-1",
|
|
36
|
-
"asia-northeast-2",
|
|
37
|
-
]
|
|
38
|
-
|
|
39
|
-
DEFAULT_REGION = "us-east-1"
|
|
40
|
-
|
|
41
22
|
|
|
42
23
|
class LambdaConfigurator(Configurator):
|
|
43
24
|
TYPE = BackendType.LAMBDA
|
|
@@ -49,8 +30,6 @@ class LambdaConfigurator(Configurator):
|
|
|
49
30
|
def create_backend(
|
|
50
31
|
self, project_name: str, config: LambdaBackendConfigWithCreds
|
|
51
32
|
) -> BackendRecord:
|
|
52
|
-
if config.regions is None:
|
|
53
|
-
config.regions = REGIONS
|
|
54
33
|
return BackendRecord(
|
|
55
34
|
config=LambdaStoredConfig(
|
|
56
35
|
**LambdaBackendConfig.__response__.parse_obj(config).dict()
|
|
@@ -34,6 +34,11 @@ from dstack._internal.core.backends.lambdalabs.models import (
|
|
|
34
34
|
LambdaBackendConfig,
|
|
35
35
|
LambdaBackendConfigWithCreds,
|
|
36
36
|
)
|
|
37
|
+
from dstack._internal.core.backends.nebius.models import (
|
|
38
|
+
NebiusBackendConfig,
|
|
39
|
+
NebiusBackendConfigWithCreds,
|
|
40
|
+
NebiusBackendFileConfigWithCreds,
|
|
41
|
+
)
|
|
37
42
|
from dstack._internal.core.backends.oci.models import (
|
|
38
43
|
OCIBackendConfig,
|
|
39
44
|
OCIBackendConfigWithCreds,
|
|
@@ -65,6 +70,7 @@ AnyBackendConfigWithoutCreds = Union[
|
|
|
65
70
|
GCPBackendConfig,
|
|
66
71
|
KubernetesBackendConfig,
|
|
67
72
|
LambdaBackendConfig,
|
|
73
|
+
NebiusBackendConfig,
|
|
68
74
|
OCIBackendConfig,
|
|
69
75
|
RunpodBackendConfig,
|
|
70
76
|
TensorDockBackendConfig,
|
|
@@ -86,6 +92,7 @@ AnyBackendConfigWithCreds = Union[
|
|
|
86
92
|
KubernetesBackendConfigWithCreds,
|
|
87
93
|
LambdaBackendConfigWithCreds,
|
|
88
94
|
OCIBackendConfigWithCreds,
|
|
95
|
+
NebiusBackendConfigWithCreds,
|
|
89
96
|
RunpodBackendConfigWithCreds,
|
|
90
97
|
TensorDockBackendConfigWithCreds,
|
|
91
98
|
VastAIBackendConfigWithCreds,
|
|
@@ -105,6 +112,7 @@ AnyBackendFileConfigWithCreds = Union[
|
|
|
105
112
|
KubernetesBackendFileConfigWithCreds,
|
|
106
113
|
LambdaBackendConfigWithCreds,
|
|
107
114
|
OCIBackendConfigWithCreds,
|
|
115
|
+
NebiusBackendFileConfigWithCreds,
|
|
108
116
|
RunpodBackendConfigWithCreds,
|
|
109
117
|
TensorDockBackendConfigWithCreds,
|
|
110
118
|
VastAIBackendConfigWithCreds,
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from dstack._internal.core.backends.base.backend import Backend
|
|
2
|
+
from dstack._internal.core.backends.nebius.compute import NebiusCompute
|
|
3
|
+
from dstack._internal.core.backends.nebius.models import NebiusConfig
|
|
4
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NebiusBackend(Backend):
|
|
8
|
+
TYPE = BackendType.NEBIUS
|
|
9
|
+
COMPUTE_CLASS = NebiusCompute
|
|
10
|
+
|
|
11
|
+
def __init__(self, config: NebiusConfig):
|
|
12
|
+
self.config = config
|
|
13
|
+
self._compute = NebiusCompute(self.config)
|
|
14
|
+
|
|
15
|
+
def compute(self) -> NebiusCompute:
|
|
16
|
+
return self._compute
|