dstack 0.19.28__py3-none-any.whl → 0.19.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/main.py +3 -1
- dstack/_internal/cli/services/configurators/fleet.py +20 -6
- dstack/_internal/cli/utils/gpu.py +2 -2
- dstack/_internal/core/backends/aws/compute.py +62 -41
- dstack/_internal/core/backends/aws/resources.py +11 -6
- dstack/_internal/core/backends/azure/compute.py +25 -13
- dstack/_internal/core/backends/base/compute.py +121 -14
- dstack/_internal/core/backends/base/offers.py +34 -4
- dstack/_internal/core/backends/cloudrift/compute.py +5 -7
- dstack/_internal/core/backends/cudo/compute.py +4 -2
- dstack/_internal/core/backends/datacrunch/compute.py +13 -11
- dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
- dstack/_internal/core/backends/gcp/compute.py +25 -11
- dstack/_internal/core/backends/hotaisle/compute.py +4 -7
- dstack/_internal/core/backends/kubernetes/compute.py +6 -4
- dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
- dstack/_internal/core/backends/local/compute.py +1 -3
- dstack/_internal/core/backends/nebius/compute.py +10 -7
- dstack/_internal/core/backends/oci/compute.py +15 -8
- dstack/_internal/core/backends/oci/resources.py +8 -3
- dstack/_internal/core/backends/runpod/compute.py +15 -6
- dstack/_internal/core/backends/template/compute.py.jinja +3 -1
- dstack/_internal/core/backends/tensordock/compute.py +1 -3
- dstack/_internal/core/backends/tensordock/models.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +7 -3
- dstack/_internal/core/backends/vultr/compute.py +5 -5
- dstack/_internal/core/consts.py +2 -0
- dstack/_internal/core/models/projects.py +8 -0
- dstack/_internal/core/services/repos.py +101 -10
- dstack/_internal/server/background/tasks/process_instances.py +3 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +100 -47
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/projects.py +11 -3
- dstack/_internal/server/services/runs.py +2 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
- dstack/_internal/utils/ssh.py +22 -2
- dstack/version.py +2 -2
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/METADATA +8 -6
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/RECORD +46 -50
- dstack/_internal/core/backends/tensordock/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/api_client.py +0 -104
- dstack/_internal/core/backends/tensordock/backend.py +0 -16
- dstack/_internal/core/backends/tensordock/configurator.py +0 -74
- dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/WHEEL +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from typing import Dict, List, Optional
|
|
2
2
|
|
|
3
|
-
from dstack._internal.core.backends.base.backend import Compute
|
|
4
3
|
from dstack._internal.core.backends.base.compute import (
|
|
4
|
+
Compute,
|
|
5
|
+
ComputeWithAllOffersCached,
|
|
5
6
|
ComputeWithCreateInstanceSupport,
|
|
6
7
|
get_shim_commands,
|
|
7
8
|
)
|
|
@@ -17,13 +18,14 @@ from dstack._internal.core.models.instances import (
|
|
|
17
18
|
InstanceOfferWithAvailability,
|
|
18
19
|
)
|
|
19
20
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
20
|
-
from dstack._internal.core.models.runs import JobProvisioningData
|
|
21
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
21
22
|
from dstack._internal.utils.logging import get_logger
|
|
22
23
|
|
|
23
24
|
logger = get_logger(__name__)
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
class CloudRiftCompute(
|
|
28
|
+
ComputeWithAllOffersCached,
|
|
27
29
|
ComputeWithCreateInstanceSupport,
|
|
28
30
|
Compute,
|
|
29
31
|
):
|
|
@@ -32,15 +34,11 @@ class CloudRiftCompute(
|
|
|
32
34
|
self.config = config
|
|
33
35
|
self.client = RiftClient(self.config.creds.api_key)
|
|
34
36
|
|
|
35
|
-
def
|
|
36
|
-
self, requirements: Optional[Requirements] = None
|
|
37
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
37
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
38
38
|
offers = get_catalog_offers(
|
|
39
39
|
backend=BackendType.CLOUDRIFT,
|
|
40
40
|
locations=self.config.regions or None,
|
|
41
|
-
requirements=requirements,
|
|
42
41
|
)
|
|
43
|
-
|
|
44
42
|
offers_with_availabilities = self._get_offers_with_availability(offers)
|
|
45
43
|
return offers_with_availabilities
|
|
46
44
|
|
|
@@ -5,6 +5,7 @@ import requests
|
|
|
5
5
|
from dstack._internal.core.backends.base.backend import Compute
|
|
6
6
|
from dstack._internal.core.backends.base.compute import (
|
|
7
7
|
ComputeWithCreateInstanceSupport,
|
|
8
|
+
ComputeWithFilteredOffersCached,
|
|
8
9
|
generate_unique_instance_name,
|
|
9
10
|
get_shim_commands,
|
|
10
11
|
)
|
|
@@ -29,6 +30,7 @@ MAX_RESOURCE_NAME_LEN = 30
|
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
class CudoCompute(
|
|
33
|
+
ComputeWithFilteredOffersCached,
|
|
32
34
|
ComputeWithCreateInstanceSupport,
|
|
33
35
|
Compute,
|
|
34
36
|
):
|
|
@@ -37,8 +39,8 @@ class CudoCompute(
|
|
|
37
39
|
self.config = config
|
|
38
40
|
self.api_client = CudoApiClient(config.creds.api_key)
|
|
39
41
|
|
|
40
|
-
def
|
|
41
|
-
self, requirements:
|
|
42
|
+
def get_offers_by_requirements(
|
|
43
|
+
self, requirements: Requirements
|
|
42
44
|
) -> List[InstanceOfferWithAvailability]:
|
|
43
45
|
offers = get_catalog_offers(
|
|
44
46
|
backend=BackendType.CUDO,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import Dict, List, Optional
|
|
1
|
+
from typing import Callable, Dict, List, Optional
|
|
2
2
|
|
|
3
3
|
from datacrunch import DataCrunchClient
|
|
4
4
|
from datacrunch.exceptions import APIException
|
|
@@ -6,11 +6,12 @@ from datacrunch.instances.instances import Instance
|
|
|
6
6
|
|
|
7
7
|
from dstack._internal.core.backends.base.backend import Compute
|
|
8
8
|
from dstack._internal.core.backends.base.compute import (
|
|
9
|
+
ComputeWithAllOffersCached,
|
|
9
10
|
ComputeWithCreateInstanceSupport,
|
|
10
11
|
generate_unique_instance_name,
|
|
11
12
|
get_shim_commands,
|
|
12
13
|
)
|
|
13
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
14
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
14
15
|
from dstack._internal.core.backends.datacrunch.models import DataCrunchConfig
|
|
15
16
|
from dstack._internal.core.errors import NoCapacityError
|
|
16
17
|
from dstack._internal.core.models.backends.base import BackendType
|
|
@@ -36,6 +37,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=IMAGE_SIZE, max=None)
|
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
class DataCrunchCompute(
|
|
40
|
+
ComputeWithAllOffersCached,
|
|
39
41
|
ComputeWithCreateInstanceSupport,
|
|
40
42
|
Compute,
|
|
41
43
|
):
|
|
@@ -47,18 +49,19 @@ class DataCrunchCompute(
|
|
|
47
49
|
client_secret=self.config.creds.client_secret,
|
|
48
50
|
)
|
|
49
51
|
|
|
50
|
-
def
|
|
51
|
-
self, requirements: Optional[Requirements] = None
|
|
52
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
52
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
53
53
|
offers = get_catalog_offers(
|
|
54
54
|
backend=BackendType.DATACRUNCH,
|
|
55
55
|
locations=self.config.regions,
|
|
56
|
-
requirements=requirements,
|
|
57
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
58
56
|
)
|
|
59
57
|
offers_with_availability = self._get_offers_with_availability(offers)
|
|
60
58
|
return offers_with_availability
|
|
61
59
|
|
|
60
|
+
def get_offers_modifier(
|
|
61
|
+
self, requirements: Requirements
|
|
62
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
63
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
64
|
+
|
|
62
65
|
def _get_offers_with_availability(
|
|
63
66
|
self, offers: List[InstanceOffer]
|
|
64
67
|
) -> List[InstanceOfferWithAvailability]:
|
|
@@ -182,10 +185,9 @@ class DataCrunchCompute(
|
|
|
182
185
|
|
|
183
186
|
def _get_vm_image_id(instance_offer: InstanceOfferWithAvailability) -> str:
|
|
184
187
|
# https://api.datacrunch.io/v1/images
|
|
185
|
-
if (
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
):
|
|
188
|
+
if len(instance_offer.instance.resources.gpus) > 0 and instance_offer.instance.resources.gpus[
|
|
189
|
+
0
|
|
190
|
+
].name in ["V100", "A6000"]:
|
|
189
191
|
# Ubuntu 22.04 + CUDA 12.0 + Docker
|
|
190
192
|
return "2088da25-bb0d-41cc-a191-dccae45d96fd"
|
|
191
193
|
# Ubuntu 24.04 + CUDA 12.8 Open + Docker
|
|
@@ -5,6 +5,7 @@ from gpuhunt.providers.digitalocean import DigitalOceanProvider
|
|
|
5
5
|
|
|
6
6
|
from dstack._internal.core.backends.base.backend import Compute
|
|
7
7
|
from dstack._internal.core.backends.base.compute import (
|
|
8
|
+
ComputeWithAllOffersCached,
|
|
8
9
|
ComputeWithCreateInstanceSupport,
|
|
9
10
|
generate_unique_instance_name,
|
|
10
11
|
get_user_data,
|
|
@@ -20,7 +21,7 @@ from dstack._internal.core.models.instances import (
|
|
|
20
21
|
InstanceOfferWithAvailability,
|
|
21
22
|
)
|
|
22
23
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
23
|
-
from dstack._internal.core.models.runs import JobProvisioningData
|
|
24
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
24
25
|
from dstack._internal.utils.logging import get_logger
|
|
25
26
|
|
|
26
27
|
logger = get_logger(__name__)
|
|
@@ -37,6 +38,7 @@ DOCKER_INSTALL_COMMANDS = [
|
|
|
37
38
|
|
|
38
39
|
|
|
39
40
|
class BaseDigitalOceanCompute(
|
|
41
|
+
ComputeWithAllOffersCached,
|
|
40
42
|
ComputeWithCreateInstanceSupport,
|
|
41
43
|
Compute,
|
|
42
44
|
):
|
|
@@ -50,13 +52,10 @@ class BaseDigitalOceanCompute(
|
|
|
50
52
|
DigitalOceanProvider(api_key=config.creds.api_key, api_url=api_url)
|
|
51
53
|
)
|
|
52
54
|
|
|
53
|
-
def
|
|
54
|
-
self, requirements: Optional[Requirements] = None
|
|
55
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
55
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
56
56
|
offers = get_catalog_offers(
|
|
57
57
|
backend=self.BACKEND_TYPE,
|
|
58
58
|
locations=self.config.regions,
|
|
59
|
-
requirements=requirements,
|
|
60
59
|
catalog=self.catalog,
|
|
61
60
|
)
|
|
62
61
|
return [
|
|
@@ -17,6 +17,7 @@ import dstack._internal.core.backends.gcp.resources as gcp_resources
|
|
|
17
17
|
from dstack import version
|
|
18
18
|
from dstack._internal.core.backends.base.compute import (
|
|
19
19
|
Compute,
|
|
20
|
+
ComputeWithAllOffersCached,
|
|
20
21
|
ComputeWithCreateInstanceSupport,
|
|
21
22
|
ComputeWithGatewaySupport,
|
|
22
23
|
ComputeWithMultinodeSupport,
|
|
@@ -30,10 +31,15 @@ from dstack._internal.core.backends.base.compute import (
|
|
|
30
31
|
get_shim_commands,
|
|
31
32
|
get_user_data,
|
|
32
33
|
merge_tags,
|
|
34
|
+
requires_nvidia_proprietary_kernel_modules,
|
|
35
|
+
)
|
|
36
|
+
from dstack._internal.core.backends.base.offers import (
|
|
37
|
+
get_catalog_offers,
|
|
38
|
+
get_offers_disk_modifier,
|
|
33
39
|
)
|
|
34
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
35
40
|
from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
|
|
36
41
|
from dstack._internal.core.backends.gcp.models import GCPConfig
|
|
42
|
+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
|
|
37
43
|
from dstack._internal.core.errors import (
|
|
38
44
|
ComputeError,
|
|
39
45
|
ComputeResourceNotFoundError,
|
|
@@ -82,6 +88,7 @@ class GCPVolumeDiskBackendData(CoreModel):
|
|
|
82
88
|
|
|
83
89
|
|
|
84
90
|
class GCPCompute(
|
|
91
|
+
ComputeWithAllOffersCached,
|
|
85
92
|
ComputeWithCreateInstanceSupport,
|
|
86
93
|
ComputeWithMultinodeSupport,
|
|
87
94
|
ComputeWithPlacementGroupSupport,
|
|
@@ -107,14 +114,10 @@ class GCPCompute(
|
|
|
107
114
|
self._extra_subnets_cache_lock = threading.Lock()
|
|
108
115
|
self._extra_subnets_cache = TTLCache(maxsize=30, ttl=60)
|
|
109
116
|
|
|
110
|
-
def
|
|
111
|
-
self, requirements: Optional[Requirements] = None
|
|
112
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
117
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
113
118
|
regions = get_or_error(self.config.regions)
|
|
114
119
|
offers = get_catalog_offers(
|
|
115
120
|
backend=BackendType.GCP,
|
|
116
|
-
requirements=requirements,
|
|
117
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
118
121
|
extra_filter=_supported_instances_and_zones(regions),
|
|
119
122
|
)
|
|
120
123
|
quotas: Dict[str, Dict[str, float]] = defaultdict(dict)
|
|
@@ -142,9 +145,13 @@ class GCPCompute(
|
|
|
142
145
|
offer_keys_to_offers[key] = offer_with_availability
|
|
143
146
|
offers_with_availability.append(offer_with_availability)
|
|
144
147
|
offers_with_availability[-1].region = region
|
|
145
|
-
|
|
146
148
|
return offers_with_availability
|
|
147
149
|
|
|
150
|
+
def get_offers_modifier(
|
|
151
|
+
self, requirements: Requirements
|
|
152
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
153
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
154
|
+
|
|
148
155
|
def terminate_instance(
|
|
149
156
|
self, instance_id: str, region: str, backend_data: Optional[str] = None
|
|
150
157
|
) -> None:
|
|
@@ -288,7 +295,11 @@ class GCPCompute(
|
|
|
288
295
|
|
|
289
296
|
image = _get_image(
|
|
290
297
|
instance_type_name=instance_offer.instance.name,
|
|
291
|
-
|
|
298
|
+
gpu_name=(
|
|
299
|
+
instance_offer.instance.resources.gpus[0].name
|
|
300
|
+
if len(instance_offer.instance.resources.gpus) > 0
|
|
301
|
+
else None
|
|
302
|
+
),
|
|
292
303
|
)
|
|
293
304
|
|
|
294
305
|
for zone in zones:
|
|
@@ -899,7 +910,7 @@ class GCPImage:
|
|
|
899
910
|
is_ufw_installed: bool
|
|
900
911
|
|
|
901
912
|
|
|
902
|
-
def _get_image(instance_type_name: str,
|
|
913
|
+
def _get_image(instance_type_name: str, gpu_name: Optional[str]) -> GCPImage:
|
|
903
914
|
if instance_type_name == "a3-megagpu-8g":
|
|
904
915
|
image_name = "dstack-a3mega-5"
|
|
905
916
|
is_ufw_installed = False
|
|
@@ -908,8 +919,11 @@ def _get_image(instance_type_name: str, cuda: bool) -> GCPImage:
|
|
|
908
919
|
id="projects/cos-cloud/global/images/cos-105-17412-535-78",
|
|
909
920
|
is_ufw_installed=False,
|
|
910
921
|
)
|
|
911
|
-
elif
|
|
912
|
-
|
|
922
|
+
elif gpu_name is not None:
|
|
923
|
+
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
|
|
924
|
+
image_name = f"dstack-cuda-{version.base_image}"
|
|
925
|
+
else:
|
|
926
|
+
image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
|
|
913
927
|
is_ufw_installed = True
|
|
914
928
|
else:
|
|
915
929
|
image_name = f"dstack-{version.base_image}"
|
|
@@ -9,6 +9,7 @@ from gpuhunt.providers.hotaisle import HotAisleProvider
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.backends.base.compute import (
|
|
11
11
|
Compute,
|
|
12
|
+
ComputeWithAllOffersCached,
|
|
12
13
|
ComputeWithCreateInstanceSupport,
|
|
13
14
|
get_shim_commands,
|
|
14
15
|
)
|
|
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
|
|
|
23
24
|
InstanceOfferWithAvailability,
|
|
24
25
|
)
|
|
25
26
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
26
|
-
from dstack._internal.core.models.runs import JobProvisioningData
|
|
27
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
27
28
|
from dstack._internal.utils.logging import get_logger
|
|
28
29
|
|
|
29
30
|
logger = get_logger(__name__)
|
|
@@ -44,6 +45,7 @@ INSTANCE_TYPE_SPECS = {
|
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
class HotAisleCompute(
|
|
48
|
+
ComputeWithAllOffersCached,
|
|
47
49
|
ComputeWithCreateInstanceSupport,
|
|
48
50
|
Compute,
|
|
49
51
|
):
|
|
@@ -56,16 +58,12 @@ class HotAisleCompute(
|
|
|
56
58
|
HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
|
|
57
59
|
)
|
|
58
60
|
|
|
59
|
-
def
|
|
60
|
-
self, requirements: Optional[Requirements] = None
|
|
61
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
61
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
62
62
|
offers = get_catalog_offers(
|
|
63
63
|
backend=BackendType.HOTAISLE,
|
|
64
64
|
locations=self.config.regions or None,
|
|
65
|
-
requirements=requirements,
|
|
66
65
|
catalog=self.catalog,
|
|
67
66
|
)
|
|
68
|
-
|
|
69
67
|
supported_offers = []
|
|
70
68
|
for offer in offers:
|
|
71
69
|
if offer.instance.name in INSTANCE_TYPE_SPECS:
|
|
@@ -78,7 +76,6 @@ class HotAisleCompute(
|
|
|
78
76
|
logger.warning(
|
|
79
77
|
f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
|
|
80
78
|
)
|
|
81
|
-
|
|
82
79
|
return supported_offers
|
|
83
80
|
|
|
84
81
|
def get_payload_from_offer(self, instance_type) -> dict:
|
|
@@ -9,13 +9,14 @@ from kubernetes import client
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.backends.base.compute import (
|
|
11
11
|
Compute,
|
|
12
|
+
ComputeWithFilteredOffersCached,
|
|
12
13
|
ComputeWithGatewaySupport,
|
|
13
14
|
generate_unique_gateway_instance_name,
|
|
14
15
|
generate_unique_instance_name_for_job,
|
|
15
16
|
get_docker_commands,
|
|
16
17
|
get_dstack_gateway_commands,
|
|
17
18
|
)
|
|
18
|
-
from dstack._internal.core.backends.base.offers import
|
|
19
|
+
from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
|
|
19
20
|
from dstack._internal.core.backends.kubernetes.models import (
|
|
20
21
|
KubernetesConfig,
|
|
21
22
|
KubernetesNetworkingConfig,
|
|
@@ -58,6 +59,7 @@ NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
|
|
|
58
59
|
|
|
59
60
|
|
|
60
61
|
class KubernetesCompute(
|
|
62
|
+
ComputeWithFilteredOffersCached,
|
|
61
63
|
ComputeWithGatewaySupport,
|
|
62
64
|
Compute,
|
|
63
65
|
):
|
|
@@ -70,8 +72,8 @@ class KubernetesCompute(
|
|
|
70
72
|
self.networking_config = networking_config
|
|
71
73
|
self.api = get_api_from_config_data(config.kubeconfig.data)
|
|
72
74
|
|
|
73
|
-
def
|
|
74
|
-
self, requirements:
|
|
75
|
+
def get_offers_by_requirements(
|
|
76
|
+
self, requirements: Requirements
|
|
75
77
|
) -> List[InstanceOfferWithAvailability]:
|
|
76
78
|
nodes = self.api.list_node()
|
|
77
79
|
instance_offers = []
|
|
@@ -99,7 +101,7 @@ class KubernetesCompute(
|
|
|
99
101
|
availability=InstanceAvailability.AVAILABLE,
|
|
100
102
|
instance_runtime=InstanceRuntime.RUNNER,
|
|
101
103
|
)
|
|
102
|
-
instance_offers.extend(
|
|
104
|
+
instance_offers.extend(filter_offers_by_requirements([instance_offer], requirements))
|
|
103
105
|
return instance_offers
|
|
104
106
|
|
|
105
107
|
def run_job(
|
|
@@ -7,6 +7,7 @@ from typing import Dict, List, Optional
|
|
|
7
7
|
|
|
8
8
|
from dstack._internal.core.backends.base.compute import (
|
|
9
9
|
Compute,
|
|
10
|
+
ComputeWithAllOffersCached,
|
|
10
11
|
ComputeWithCreateInstanceSupport,
|
|
11
12
|
generate_unique_instance_name,
|
|
12
13
|
get_shim_commands,
|
|
@@ -22,12 +23,13 @@ from dstack._internal.core.models.instances import (
|
|
|
22
23
|
InstanceOfferWithAvailability,
|
|
23
24
|
)
|
|
24
25
|
from dstack._internal.core.models.placement import PlacementGroup
|
|
25
|
-
from dstack._internal.core.models.runs import JobProvisioningData
|
|
26
|
+
from dstack._internal.core.models.runs import JobProvisioningData
|
|
26
27
|
|
|
27
28
|
MAX_INSTANCE_NAME_LEN = 60
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
class LambdaCompute(
|
|
32
|
+
ComputeWithAllOffersCached,
|
|
31
33
|
ComputeWithCreateInstanceSupport,
|
|
32
34
|
Compute,
|
|
33
35
|
):
|
|
@@ -36,13 +38,10 @@ class LambdaCompute(
|
|
|
36
38
|
self.config = config
|
|
37
39
|
self.api_client = LambdaAPIClient(config.creds.api_key)
|
|
38
40
|
|
|
39
|
-
def
|
|
40
|
-
self, requirements: Optional[Requirements] = None
|
|
41
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
41
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
42
42
|
offers = get_catalog_offers(
|
|
43
43
|
backend=BackendType.LAMBDA,
|
|
44
44
|
locations=self.config.regions or None,
|
|
45
|
-
requirements=requirements,
|
|
46
45
|
)
|
|
47
46
|
offers_with_availability = self._get_offers_with_availability(offers)
|
|
48
47
|
return offers_with_availability
|
|
@@ -28,9 +28,7 @@ class LocalCompute(
|
|
|
28
28
|
ComputeWithVolumeSupport,
|
|
29
29
|
Compute,
|
|
30
30
|
):
|
|
31
|
-
def get_offers(
|
|
32
|
-
self, requirements: Optional[Requirements] = None
|
|
33
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
31
|
+
def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
|
|
34
32
|
return [
|
|
35
33
|
InstanceOfferWithAvailability(
|
|
36
34
|
backend=BackendType.LOCAL,
|
|
@@ -3,7 +3,7 @@ import random
|
|
|
3
3
|
import shlex
|
|
4
4
|
import time
|
|
5
5
|
from functools import cached_property
|
|
6
|
-
from typing import List, Optional
|
|
6
|
+
from typing import Callable, List, Optional
|
|
7
7
|
|
|
8
8
|
from nebius.aio.operation import Operation as SDKOperation
|
|
9
9
|
from nebius.aio.service_error import RequestError, StatusCode
|
|
@@ -12,13 +12,14 @@ from nebius.sdk import SDK
|
|
|
12
12
|
|
|
13
13
|
from dstack._internal.core.backends.base.backend import Compute
|
|
14
14
|
from dstack._internal.core.backends.base.compute import (
|
|
15
|
+
ComputeWithAllOffersCached,
|
|
15
16
|
ComputeWithCreateInstanceSupport,
|
|
16
17
|
ComputeWithMultinodeSupport,
|
|
17
18
|
ComputeWithPlacementGroupSupport,
|
|
18
19
|
generate_unique_instance_name,
|
|
19
20
|
get_user_data,
|
|
20
21
|
)
|
|
21
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
22
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
22
23
|
from dstack._internal.core.backends.nebius import resources
|
|
23
24
|
from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
|
|
24
25
|
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
|
|
@@ -76,6 +77,7 @@ SUPPORTED_PLATFORMS = [
|
|
|
76
77
|
|
|
77
78
|
|
|
78
79
|
class NebiusCompute(
|
|
80
|
+
ComputeWithAllOffersCached,
|
|
79
81
|
ComputeWithCreateInstanceSupport,
|
|
80
82
|
ComputeWithMultinodeSupport,
|
|
81
83
|
ComputeWithPlacementGroupSupport,
|
|
@@ -106,15 +108,11 @@ class NebiusCompute(
|
|
|
106
108
|
).metadata.id
|
|
107
109
|
return self._subnet_id_cache[region]
|
|
108
110
|
|
|
109
|
-
def
|
|
110
|
-
self, requirements: Optional[Requirements] = None
|
|
111
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
111
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
112
112
|
offers = get_catalog_offers(
|
|
113
113
|
backend=BackendType.NEBIUS,
|
|
114
114
|
locations=list(self._region_to_project_id),
|
|
115
|
-
requirements=requirements,
|
|
116
115
|
extra_filter=_supported_instances,
|
|
117
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
118
116
|
)
|
|
119
117
|
return [
|
|
120
118
|
InstanceOfferWithAvailability(
|
|
@@ -124,6 +122,11 @@ class NebiusCompute(
|
|
|
124
122
|
for offer in offers
|
|
125
123
|
]
|
|
126
124
|
|
|
125
|
+
def get_offers_modifier(
|
|
126
|
+
self, requirements: Requirements
|
|
127
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
128
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
129
|
+
|
|
127
130
|
def create_instance(
|
|
128
131
|
self,
|
|
129
132
|
instance_offer: InstanceOfferWithAvailability,
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
from concurrent.futures import ThreadPoolExecutor
|
|
2
2
|
from functools import cached_property
|
|
3
|
-
from typing import List, Optional
|
|
3
|
+
from typing import Callable, List, Optional
|
|
4
4
|
|
|
5
5
|
import oci
|
|
6
6
|
|
|
7
7
|
from dstack._internal.core.backends.base.compute import (
|
|
8
8
|
Compute,
|
|
9
|
+
ComputeWithAllOffersCached,
|
|
9
10
|
ComputeWithCreateInstanceSupport,
|
|
10
11
|
ComputeWithMultinodeSupport,
|
|
11
12
|
generate_unique_instance_name,
|
|
12
13
|
get_user_data,
|
|
13
14
|
)
|
|
14
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
15
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
15
16
|
from dstack._internal.core.backends.oci import resources
|
|
16
17
|
from dstack._internal.core.backends.oci.models import OCIConfig
|
|
17
18
|
from dstack._internal.core.backends.oci.region import make_region_clients_map
|
|
@@ -47,6 +48,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
|
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
class OCICompute(
|
|
51
|
+
ComputeWithAllOffersCached,
|
|
50
52
|
ComputeWithCreateInstanceSupport,
|
|
51
53
|
ComputeWithMultinodeSupport,
|
|
52
54
|
Compute,
|
|
@@ -60,14 +62,10 @@ class OCICompute(
|
|
|
60
62
|
def shapes_quota(self) -> resources.ShapesQuota:
|
|
61
63
|
return resources.ShapesQuota.load(self.regions, self.config.compartment_id)
|
|
62
64
|
|
|
63
|
-
def
|
|
64
|
-
self, requirements: Optional[Requirements] = None
|
|
65
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
65
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
66
66
|
offers = get_catalog_offers(
|
|
67
67
|
backend=BackendType.OCI,
|
|
68
68
|
locations=self.config.regions,
|
|
69
|
-
requirements=requirements,
|
|
70
|
-
configurable_disk_size=CONFIGURABLE_DISK_SIZE,
|
|
71
69
|
extra_filter=_supported_instances,
|
|
72
70
|
)
|
|
73
71
|
|
|
@@ -96,6 +94,11 @@ class OCICompute(
|
|
|
96
94
|
|
|
97
95
|
return offers_with_availability
|
|
98
96
|
|
|
97
|
+
def get_offers_modifier(
|
|
98
|
+
self, requirements: Requirements
|
|
99
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
100
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
101
|
+
|
|
99
102
|
def terminate_instance(
|
|
100
103
|
self, instance_id: str, region: str, backend_data: Optional[str] = None
|
|
101
104
|
) -> None:
|
|
@@ -115,7 +118,11 @@ class OCICompute(
|
|
|
115
118
|
availability_domain = instance_offer.availability_zones[0]
|
|
116
119
|
|
|
117
120
|
listing, package = resources.get_marketplace_listing_and_package(
|
|
118
|
-
|
|
121
|
+
gpu_name=(
|
|
122
|
+
instance_offer.instance.resources.gpus[0].name
|
|
123
|
+
if len(instance_offer.instance.resources.gpus) > 0
|
|
124
|
+
else None
|
|
125
|
+
),
|
|
119
126
|
client=region.marketplace_client,
|
|
120
127
|
)
|
|
121
128
|
resources.accept_marketplace_listing_agreements(
|
|
@@ -23,7 +23,9 @@ import oci
|
|
|
23
23
|
from oci.object_storage.models import CreatePreauthenticatedRequestDetails
|
|
24
24
|
|
|
25
25
|
from dstack import version
|
|
26
|
+
from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
|
|
26
27
|
from dstack._internal.core.backends.oci.region import OCIRegionClient
|
|
28
|
+
from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
|
|
27
29
|
from dstack._internal.core.errors import BackendError
|
|
28
30
|
from dstack._internal.core.models.instances import InstanceOffer
|
|
29
31
|
from dstack._internal.utils.common import batched
|
|
@@ -352,11 +354,14 @@ def terminate_instance_if_exists(client: oci.core.ComputeClient, instance_id: st
|
|
|
352
354
|
|
|
353
355
|
|
|
354
356
|
def get_marketplace_listing_and_package(
|
|
355
|
-
|
|
357
|
+
gpu_name: Optional[str], client: oci.marketplace.MarketplaceClient
|
|
356
358
|
) -> Tuple[oci.marketplace.models.Listing, oci.marketplace.models.ImageListingPackage]:
|
|
357
359
|
listing_name = f"dstack-{version.base_image}"
|
|
358
|
-
if
|
|
359
|
-
|
|
360
|
+
if gpu_name is not None:
|
|
361
|
+
if not requires_nvidia_proprietary_kernel_modules(gpu_name):
|
|
362
|
+
listing_name = f"dstack-cuda-{version.base_image}"
|
|
363
|
+
else:
|
|
364
|
+
listing_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
|
|
360
365
|
|
|
361
366
|
listing_summaries = list_marketplace_listings(listing_name, client)
|
|
362
367
|
if len(listing_summaries) != 1:
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import uuid
|
|
3
3
|
from datetime import timedelta
|
|
4
|
-
from typing import List, Optional
|
|
4
|
+
from typing import Callable, List, Optional
|
|
5
5
|
|
|
6
6
|
from dstack._internal.core.backends.base.backend import Compute
|
|
7
7
|
from dstack._internal.core.backends.base.compute import (
|
|
8
|
+
ComputeWithAllOffersCached,
|
|
8
9
|
ComputeWithVolumeSupport,
|
|
9
10
|
generate_unique_instance_name,
|
|
10
11
|
generate_unique_volume_name,
|
|
11
12
|
get_docker_commands,
|
|
12
13
|
get_job_instance_name,
|
|
13
14
|
)
|
|
14
|
-
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
15
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
|
|
15
16
|
from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
|
|
16
17
|
from dstack._internal.core.backends.runpod.models import RunpodConfig
|
|
17
18
|
from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
|
|
@@ -27,6 +28,7 @@ from dstack._internal.core.models.instances import (
|
|
|
27
28
|
InstanceOfferWithAvailability,
|
|
28
29
|
SSHKey,
|
|
29
30
|
)
|
|
31
|
+
from dstack._internal.core.models.resources import Memory, Range
|
|
30
32
|
from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
|
|
31
33
|
from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
|
|
32
34
|
from dstack._internal.utils.common import get_current_datetime
|
|
@@ -39,8 +41,12 @@ MAX_RESOURCE_NAME_LEN = 60
|
|
|
39
41
|
|
|
40
42
|
CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour
|
|
41
43
|
|
|
44
|
+
# RunPod does not seem to have any limits on the disk size.
|
|
45
|
+
CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
|
|
46
|
+
|
|
42
47
|
|
|
43
48
|
class RunpodCompute(
|
|
49
|
+
ComputeWithAllOffersCached,
|
|
44
50
|
ComputeWithVolumeSupport,
|
|
45
51
|
Compute,
|
|
46
52
|
):
|
|
@@ -51,13 +57,11 @@ class RunpodCompute(
|
|
|
51
57
|
self.config = config
|
|
52
58
|
self.api_client = RunpodApiClient(config.creds.api_key)
|
|
53
59
|
|
|
54
|
-
def
|
|
55
|
-
self, requirements: Optional[Requirements] = None
|
|
56
|
-
) -> List[InstanceOfferWithAvailability]:
|
|
60
|
+
def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
|
|
57
61
|
offers = get_catalog_offers(
|
|
58
62
|
backend=BackendType.RUNPOD,
|
|
59
63
|
locations=self.config.regions or None,
|
|
60
|
-
requirements=
|
|
64
|
+
requirements=None,
|
|
61
65
|
extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
|
|
62
66
|
)
|
|
63
67
|
offers = [
|
|
@@ -68,6 +72,11 @@ class RunpodCompute(
|
|
|
68
72
|
]
|
|
69
73
|
return offers
|
|
70
74
|
|
|
75
|
+
def get_offers_modifier(
|
|
76
|
+
self, requirements: Requirements
|
|
77
|
+
) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
|
|
78
|
+
return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
|
|
79
|
+
|
|
71
80
|
def run_job(
|
|
72
81
|
self,
|
|
73
82
|
run: Run,
|
|
@@ -2,6 +2,7 @@ from typing import List, Optional
|
|
|
2
2
|
|
|
3
3
|
from dstack._internal.core.backends.base.backend import Compute
|
|
4
4
|
from dstack._internal.core.backends.base.compute import (
|
|
5
|
+
ComputeWithAllOffersCached,
|
|
5
6
|
ComputeWithCreateInstanceSupport,
|
|
6
7
|
ComputeWithGatewaySupport,
|
|
7
8
|
ComputeWithMultinodeSupport,
|
|
@@ -28,6 +29,7 @@ logger = get_logger(__name__)
|
|
|
28
29
|
|
|
29
30
|
class {{ backend_name }}Compute(
|
|
30
31
|
# TODO: Choose ComputeWith* classes to extend and implement
|
|
32
|
+
# ComputeWithAllOffersCached,
|
|
31
33
|
# ComputeWithCreateInstanceSupport,
|
|
32
34
|
# ComputeWithMultinodeSupport,
|
|
33
35
|
# ComputeWithReservationSupport,
|
|
@@ -42,7 +44,7 @@ class {{ backend_name }}Compute(
|
|
|
42
44
|
self.config = config
|
|
43
45
|
|
|
44
46
|
def get_offers(
|
|
45
|
-
self, requirements:
|
|
47
|
+
self, requirements: Requirements
|
|
46
48
|
) -> List[InstanceOfferWithAvailability]:
|
|
47
49
|
# If the provider is added to gpuhunt, you'd typically get offers
|
|
48
50
|
# using `get_catalog_offers()` and extend them with availability info.
|