dstack 0.19.28__py3-none-any.whl → 0.19.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/core/backends/aws/compute.py +51 -36
- dstack/_internal/core/backends/azure/compute.py +10 -7
- dstack/_internal/core/backends/base/compute.py +96 -14
- dstack/_internal/core/backends/base/offers.py +34 -4
- dstack/_internal/core/backends/cloudrift/compute.py +5 -7
- dstack/_internal/core/backends/cudo/compute.py +4 -2
- dstack/_internal/core/backends/datacrunch/compute.py +13 -11
- dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
- dstack/_internal/core/backends/gcp/compute.py +12 -7
- dstack/_internal/core/backends/hotaisle/compute.py +4 -7
- dstack/_internal/core/backends/kubernetes/compute.py +6 -4
- dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
- dstack/_internal/core/backends/local/compute.py +1 -3
- dstack/_internal/core/backends/nebius/compute.py +10 -7
- dstack/_internal/core/backends/oci/compute.py +10 -7
- dstack/_internal/core/backends/runpod/compute.py +15 -6
- dstack/_internal/core/backends/template/compute.py.jinja +3 -1
- dstack/_internal/core/backends/tensordock/compute.py +1 -3
- dstack/_internal/core/backends/tensordock/models.py +2 -0
- dstack/_internal/core/backends/vastai/compute.py +7 -3
- dstack/_internal/core/backends/vultr/compute.py +5 -5
- dstack/_internal/core/models/projects.py +8 -0
- dstack/_internal/server/background/tasks/process_instances.py +3 -2
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +65 -22
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/projects.py +11 -3
- dstack/_internal/server/services/runs.py +2 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
- dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.28.dist-info → dstack-0.19.29.dist-info}/METADATA +1 -1
- {dstack-0.19.28.dist-info → dstack-0.19.29.dist-info}/RECORD +37 -41
- dstack/_internal/core/backends/tensordock/__init__.py +0 -0
- dstack/_internal/core/backends/tensordock/api_client.py +0 -104
- dstack/_internal/core/backends/tensordock/backend.py +0 -16
- dstack/_internal/core/backends/tensordock/configurator.py +0 -74
- dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
- {dstack-0.19.28.dist-info → dstack-0.19.29.dist-info}/WHEEL +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.29.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.28.dist-info → dstack-0.19.29.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/hotaisle/compute.py

@@ -9,6 +9,7 @@ from gpuhunt.providers.hotaisle import HotAisleProvider
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     get_shim_commands,
 )
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -44,6 +45,7 @@ INSTANCE_TYPE_SPECS = {
 
 
 class HotAisleCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     Compute,
 ):
@@ -56,16 +58,12 @@ class HotAisleCompute(
             HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
         )
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.HOTAISLE,
             locations=self.config.regions or None,
-            requirements=requirements,
             catalog=self.catalog,
         )
-
         supported_offers = []
         for offer in offers:
             if offer.instance.name in INSTANCE_TYPE_SPECS:
@@ -78,7 +76,6 @@ class HotAisleCompute(
                 logger.warning(
                     f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
                 )
-
         return supported_offers
 
     def get_payload_from_offer(self, instance_type) -> dict:
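The new ComputeWithAllOffersCached base class splits offer fetching from offer filtering: backends whose offer list does not depend on the requirements (Hot Aisle here, and Lambda, Nebius, OCI, RunPod, and Vultr below) now implement get_all_offers_with_availability(), so the full list can be fetched once and filtered per request. A minimal sketch of the idea, using toy dict offers and a toy predicate in place of dstack's real types and filtering (the actual mixin internals are not part of this diff):

    from typing import List, Optional

    Offer = dict  # stand-in for InstanceOfferWithAvailability

    def offer_matches(offer: Offer, requirements: dict) -> bool:
        # Toy predicate: the real filtering compares CPU/GPU/memory/disk requirements.
        return offer.get("gpu_count", 0) >= requirements.get("min_gpu_count", 0)

    class ComputeWithAllOffersCached:
        _cache: Optional[List[Offer]] = None

        def get_all_offers_with_availability(self) -> List[Offer]:
            raise NotImplementedError  # e.g. HotAisleCompute queries the catalog here

        def get_offers(self, requirements: dict) -> List[Offer]:
            if self._cache is None:  # one catalog/API call, reused across requests
                self._cache = self.get_all_offers_with_availability()
            return [o for o in self._cache if offer_matches(o, requirements)]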
dstack/_internal/core/backends/kubernetes/compute.py

@@ -9,13 +9,14 @@ from kubernetes import client
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithFilteredOffersCached,
     ComputeWithGatewaySupport,
     generate_unique_gateway_instance_name,
     generate_unique_instance_name_for_job,
     get_docker_commands,
     get_dstack_gateway_commands,
 )
-from dstack._internal.core.backends.base.offers import match_requirements
+from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
 from dstack._internal.core.backends.kubernetes.models import (
     KubernetesConfig,
     KubernetesNetworkingConfig,
@@ -58,6 +59,7 @@ NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()
 
 
 class KubernetesCompute(
+    ComputeWithFilteredOffersCached,
     ComputeWithGatewaySupport,
     Compute,
 ):
@@ -70,8 +72,8 @@ class KubernetesCompute(
         self.networking_config = networking_config
         self.api = get_api_from_config_data(config.kubeconfig.data)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
+    def get_offers_by_requirements(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         nodes = self.api.list_node()
         instance_offers = []
@@ -99,7 +101,7 @@ class KubernetesCompute(
                 availability=InstanceAvailability.AVAILABLE,
                 instance_runtime=InstanceRuntime.RUNNER,
             )
-            instance_offers.extend(match_requirements([instance_offer], requirements))
+            instance_offers.extend(filter_offers_by_requirements([instance_offer], requirements))
         return instance_offers
 
     def run_job(
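KubernetesCompute (and VastAICompute further down) instead extend ComputeWithFilteredOffersCached: their offers are derived from the requirements themselves (cluster nodes, marketplace queries), so any caching has to be keyed per requirements rather than shared across requests. A rough sketch under the same toy types; the cache key and storage here are assumptions, not dstack's actual implementation:

    from typing import Dict, List

    Offer = dict  # stand-in for InstanceOfferWithAvailability

    class ComputeWithFilteredOffersCached:
        def __init__(self) -> None:
            self._cache: Dict[str, List[Offer]] = {}

        def get_offers_by_requirements(self, requirements: dict) -> List[Offer]:
            raise NotImplementedError  # e.g. KubernetesCompute lists cluster nodes

        def get_offers(self, requirements: dict) -> List[Offer]:
            key = repr(sorted(requirements.items()))  # toy requirements-derived cache key
            if key not in self._cache:
                self._cache[key] = self.get_offers_by_requirements(requirements)
            return self._cache[key]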
dstack/_internal/core/backends/lambdalabs/compute.py

@@ -7,6 +7,7 @@ from typing import Dict, List, Optional
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     generate_unique_instance_name,
     get_shim_commands,
@@ -22,12 +23,13 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 
 MAX_INSTANCE_NAME_LEN = 60
 
 
 class LambdaCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     Compute,
 ):
@@ -36,13 +38,10 @@ class LambdaCompute(
         self.config = config
         self.api_client = LambdaAPIClient(config.creds.api_key)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.LAMBDA,
             locations=self.config.regions or None,
-            requirements=requirements,
         )
         offers_with_availability = self._get_offers_with_availability(offers)
         return offers_with_availability
dstack/_internal/core/backends/local/compute.py

@@ -28,9 +28,7 @@ class LocalCompute(
     ComputeWithVolumeSupport,
     Compute,
 ):
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         return [
             InstanceOfferWithAvailability(
                 backend=BackendType.LOCAL,
dstack/_internal/core/backends/nebius/compute.py

@@ -3,7 +3,7 @@ import random
 import shlex
 import time
 from functools import cached_property
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 from nebius.aio.operation import Operation as SDKOperation
 from nebius.aio.service_error import RequestError, StatusCode
@@ -12,13 +12,14 @@ from nebius.sdk import SDK
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
 from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
 from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
@@ -76,6 +77,7 @@ SUPPORTED_PLATFORMS = [
 
 
 class NebiusCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithPlacementGroupSupport,
@@ -106,15 +108,11 @@ class NebiusCompute(
         ).metadata.id
         return self._subnet_id_cache[region]
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.NEBIUS,
             locations=list(self._region_to_project_id),
-            requirements=requirements,
             extra_filter=_supported_instances,
-            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
         )
         return [
             InstanceOfferWithAvailability(
@@ -124,6 +122,11 @@ class NebiusCompute(
             for offer in offers
         ]
 
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def create_instance(
         self,
         instance_offer: InstanceOfferWithAvailability,
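Since get_catalog_offers() no longer receives requirements or configurable_disk_size in these backends, the per-request disk adjustment moves into get_offers_modifier(), which returns a callable applied to each cached offer. A simplified, hypothetical illustration of what such a disk modifier can do; make_disk_modifier and the dict offers are stand-ins, the real get_offers_disk_modifier lives in base/offers.py:

    from typing import Callable, Optional

    def make_disk_modifier(
        min_gb: float, max_gb: Optional[float], requested_gb: float
    ) -> Callable[[dict], Optional[dict]]:
        """Set each offer's disk to the smallest size satisfying both the
        backend's configurable range and the request, or drop the offer."""
        def modify(offer: dict) -> Optional[dict]:
            size = max(min_gb, requested_gb)
            if max_gb is not None and size > max_gb:
                return None  # the backend cannot provision a disk this large
            return {**offer, "disk_gb": size}
        return modify

    # Example: a 50GB-32TB configurable range and a request needing 100GB.
    modifier = make_disk_modifier(min_gb=50.0, max_gb=32768.0, requested_gb=100.0)
    print(modifier({"instance": "some-shape"}))  # {'instance': ..., 'disk_gb': 100.0}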
dstack/_internal/core/backends/oci/compute.py

@@ -1,17 +1,18 @@
 from concurrent.futures import ThreadPoolExecutor
 from functools import cached_property
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 import oci
 
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     generate_unique_instance_name,
     get_user_data,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.oci import resources
 from dstack._internal.core.backends.oci.models import OCIConfig
 from dstack._internal.core.backends.oci.region import make_region_clients_map
@@ -47,6 +48,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("50GB"), max=Memory.pars
 
 
 class OCICompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     Compute,
@@ -60,14 +62,10 @@ class OCICompute(
     def shapes_quota(self) -> resources.ShapesQuota:
         return resources.ShapesQuota.load(self.regions, self.config.compartment_id)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.OCI,
             locations=self.config.regions,
-            requirements=requirements,
-            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
             extra_filter=_supported_instances,
         )
 
@@ -96,6 +94,11 @@ class OCICompute(
 
         return offers_with_availability
 
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
     ) -> None:
dstack/_internal/core/backends/runpod/compute.py

@@ -1,17 +1,18 @@
 import json
 import uuid
 from datetime import timedelta
-from typing import List, Optional
+from typing import Callable, List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithVolumeSupport,
     generate_unique_instance_name,
     generate_unique_volume_name,
     get_docker_commands,
     get_job_instance_name,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
@@ -27,6 +28,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     SSHKey,
 )
+from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
 from dstack._internal.utils.common import get_current_datetime
@@ -39,8 +41,12 @@ MAX_RESOURCE_NAME_LEN = 60
 
 CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 
+# RunPod does not seem to have any limits on the disk size.
+CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
+
 
 class RunpodCompute(
+    ComputeWithAllOffersCached,
     ComputeWithVolumeSupport,
     Compute,
 ):
@@ -51,13 +57,11 @@ class RunpodCompute(
         self.config = config
         self.api_client = RunpodApiClient(config.creds.api_key)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.RUNPOD,
             locations=self.config.regions or None,
-            requirements=requirements,
+            requirements=None,
            extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
         )
         offers = [
@@ -68,6 +72,11 @@ class RunpodCompute(
         ]
         return offers
 
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def run_job(
         self,
         run: Run,
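RunPod's new CONFIGURABLE_DISK_SIZE uses an open-ended Range[Memory] with max=None to encode "no upper limit". The containment check this implies, sketched with plain floats in place of dstack's Memory type:

    from typing import Optional

    def in_range(value_gb: float, min_gb: Optional[float], max_gb: Optional[float]) -> bool:
        # max=None means "no upper bound", mirroring Range[Memory](min="1GB", max=None)
        return (min_gb is None or value_gb >= min_gb) and (max_gb is None or value_gb <= max_gb)

    assert in_range(10_000.0, 1.0, None)  # any size from 1GB up is accepted
    assert not in_range(0.5, 1.0, None)   # below the 1GB minimum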
dstack/_internal/core/backends/template/compute.py.jinja

@@ -2,6 +2,7 @@ from typing import List, Optional
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithGatewaySupport,
     ComputeWithMultinodeSupport,
@@ -28,6 +29,7 @@ logger = get_logger(__name__)
 
 class {{ backend_name }}Compute(
     # TODO: Choose ComputeWith* classes to extend and implement
+    # ComputeWithAllOffersCached,
     # ComputeWithCreateInstanceSupport,
     # ComputeWithMultinodeSupport,
     # ComputeWithReservationSupport,
@@ -42,7 +44,7 @@ class {{ backend_name }}Compute(
         self.config = config
 
     def get_offers(
-        self, requirements: Optional[Requirements] = None
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         # If the provider is added to gpuhunt, you'd typically get offers
         # using `get_catalog_offers()` and extend them with availability info.
dstack/_internal/core/backends/tensordock/compute.py

@@ -39,9 +39,7 @@ class TensorDockCompute(
         self.config = config
         self.api_client = TensorDockAPIClient(config.creds.api_key, config.creds.api_token)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.TENSORDOCK,
             requirements=requirements,
dstack/_internal/core/backends/tensordock/models.py

@@ -4,6 +4,8 @@ from pydantic import Field
 
 from dstack._internal.core.models.common import CoreModel
 
+# TODO: TensorDock is deprecated and will be removed in the future
+
 
 class TensorDockAPIKeyCreds(CoreModel):
     type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
dstack/_internal/core/backends/vastai/compute.py

@@ -5,6 +5,7 @@ from gpuhunt.providers.vastai import VastAIProvider
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithFilteredOffersCached,
     generate_unique_instance_name_for_job,
     get_docker_commands,
 )
@@ -30,7 +31,10 @@ logger = get_logger(__name__)
 MAX_INSTANCE_NAME_LEN = 60
 
 
-class VastAICompute(Compute):
+class VastAICompute(
+    ComputeWithFilteredOffersCached,
+    Compute,
+):
     def __init__(self, config: VastAIConfig):
         super().__init__()
         self.config = config
@@ -49,8 +53,8 @@ class VastAICompute(Compute):
             )
         )
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
+    def get_offers_by_requirements(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VASTAI,
dstack/_internal/core/backends/vultr/compute.py

@@ -6,6 +6,7 @@ import requests
 
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     generate_unique_instance_name,
@@ -23,7 +24,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
 )
 from dstack._internal.core.models.placement import PlacementGroup
-from dstack._internal.core.models.runs import JobProvisioningData, Requirements
+from dstack._internal.core.models.runs import JobProvisioningData
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -32,6 +33,7 @@ MAX_INSTANCE_NAME_LEN = 64
 
 
 class VultrCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     Compute,
@@ -41,12 +43,10 @@ class VultrCompute(
         self.config = config
         self.api_client = VultrApiClient(config.creds.api_key)
 
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.VULTR,
-            requirements=requirements,
+            requirements=None,
             locations=self.config.regions or None,
             extra_filter=_supported_instances,
         )
dstack/_internal/core/models/projects.py

@@ -26,3 +26,11 @@ class Project(CoreModel):
     backends: List[BackendInfo]
     members: List[Member]
     is_public: bool = False
+
+
+class ProjectHookConfig(CoreModel):
+    """
+    This class can be inherited to extend the project creation configuration passed to the hooks.
+    """
+
+    pass
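As the docstring suggests, a server plugin could, hypothetically, subclass this model to thread extra settings into project-creation hooks (the field names below are illustrative, not part of dstack):

    from dstack._internal.core.models.projects import ProjectHookConfig

    class BillingHookConfig(ProjectHookConfig):  # hypothetical subclass
        billing_account: str = "default"
        cost_center: str = ""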
dstack/_internal/server/background/tasks/process_instances.py

@@ -578,7 +578,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         if placement_group_model is None:  # error occurred
             continue
         session.add(placement_group_model)
-        await session.flush()
         placement_group_models.append(placement_group_model)
         logger.debug(
             "Trying %s in %s/%s for $%0.4f per hour",
@@ -636,7 +635,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No
         },
     )
     if instance.fleet_id and _is_fleet_master_instance(instance):
-        # Clean up placement groups that did not end up being used
+        # Clean up placement groups that did not end up being used.
+        # Flush to update still uncommitted placement groups.
+        await session.flush()
         await schedule_fleet_placement_groups_deletion(
             session=session,
             fleet_id=instance.fleet_id,
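The flush moves from every loop iteration to a single call right before the placement-group cleanup. The SQLAlchemy behavior this relies on: session.add() only stages an object in the unit of work, and queries later in the same transaction only see the row once flush() emits the pending INSERT. An illustrative helper, not dstack code:

    from sqlalchemy import select
    from sqlalchemy.ext.asyncio import AsyncSession

    async def stage_and_query(session: AsyncSession, model_cls, instance) -> list:
        session.add(instance)  # staged in the unit of work; no SQL emitted yet
        await session.flush()  # emit the pending INSERT so later statements in
                               # this transaction see the row (still uncommitted)
        result = await session.execute(select(model_cls))
        return list(result.scalars())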
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -289,7 +289,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance_filters=instance_filters,
     )
     fleet_models = fleet_models_with_instances + fleet_models_without_instances
-    fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
+    fleet_model, fleet_instances_with_offers = await _find_optimal_fleet_with_offers(
+        project=project,
         fleet_models=fleet_models,
         run_model=run_model,
         run_spec=run.run_spec,
@@ -492,7 +493,8 @@ async def _refetch_fleet_models_with_instances(
     return fleet_models
 
 
-def _find_optimal_fleet_with_offers(
+async def _find_optimal_fleet_with_offers(
+    project: ProjectModel,
     fleet_models: list[FleetModel],
     run_model: RunModel,
     run_spec: RunSpec,
@@ -502,58 +504,99 @@ def _find_optimal_fleet_with_offers(
 ) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
     if run_model.fleet is not None:
         # Using the fleet that was already chosen by the master job
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=run_model.fleet,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        return run_model.fleet, fleet_instances_with_offers
+        return run_model.fleet, fleet_instances_with_pool_offers
 
     if len(fleet_models) == 0:
         return None, []
 
     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
-    # The current strategy is to first consider fleets that can accommodate
-    # the run without additional provisioning and choose the one with the cheapest offer.
-    #
+    # The current strategy is first to consider fleets that can accommodate
+    # the run without additional provisioning and choose the one with the cheapest pool offer.
+    # Then choose a fleet with the cheapest pool offer among all fleets with pool offers.
+    # If there are no fleets with pool offers, choose a fleet with a cheapest backend offer.
+    # Fallback to autocreated fleet if fleets have no pool or backend offers.
+    # TODO: Consider trying all backend offers and then choosing a fleet.
     candidate_fleets_with_offers: list[
         tuple[
             Optional[FleetModel],
             list[tuple[InstanceModel, InstanceOfferWithAvailability]],
             int,
-            tuple[int, float],
+            int,
+            tuple[int, float, float],
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
-        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+        fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
             job=job,
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_offers)
-        fleet_cheapest_offer = math.inf
-        if len(fleet_instances_with_offers) > 0:
-            fleet_cheapest_offer = fleet_instances_with_offers[0][1].price
-        fleet_priority = (
-            not fleet_has_available_capacity,
-            fleet_cheapest_offer,
-        )
+        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+        fleet_cheapest_pool_offer = math.inf
+        if len(fleet_instances_with_pool_offers) > 0:
+            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price
+
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
+        profile = combine_fleet_and_run_profiles(
+            candidate_fleet.spec.merged_profile, run_spec.merged_profile
+        )
+        fleet_requirements = get_fleet_requirements(candidate_fleet.spec)
+        requirements = combine_fleet_and_run_requirements(
+            fleet_requirements, job.job_spec.requirements
+        )
+        multinode = (
+            candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
+            or job.job_spec.jobs_per_replica > 1
+        )
+        fleet_backend_offers = []
+        if (
+            _check_can_create_new_instance_in_fleet(candidate_fleet)
+            and profile is not None
+            and requirements is not None
+        ):
+            fleet_backend_offers = await get_offers_by_requirements(
+                project=project,
+                profile=profile,
+                requirements=requirements,
+                exclude_not_available=True,
+                multinode=multinode,
+                master_job_provisioning_data=master_job_provisioning_data,
+                volumes=volumes,
+                privileged=job.job_spec.privileged,
+                instance_mounts=check_run_spec_requires_instance_mounts(run_spec),
+            )
+
+        fleet_cheapest_backend_offer = math.inf
+        if len(fleet_backend_offers) > 0:
+            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price
+
+        fleet_priority = (
+            not fleet_has_available_capacity,
+            fleet_cheapest_pool_offer,
+            fleet_cheapest_backend_offer,
+        )
         candidate_fleets_with_offers.append(
             (
                 candidate_fleet_model,
-                fleet_instances_with_offers,
-                len(fleet_instances_with_offers),
+                fleet_instances_with_pool_offers,
+                len(fleet_instances_with_pool_offers),
+                len(fleet_backend_offers),
                 fleet_priority,
             )
         )
     if run_spec.merged_profile.fleets is None and all(
-        t[2] == 0 for t in candidate_fleets_with_offers
+        t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available offers, create a new fleet.
+        # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
@@ -573,7 +616,7 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num
 
 
-def _get_fleet_instances_with_offers(
+def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
     job: Job,
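Fleet selection now ranks candidates by a three-part fleet_priority tuple compared lexicographically: pool capacity first (False sorts before True), then the cheapest pool offer, then the cheapest backend offer, with math.inf marking "no offer" so such fleets sort last. A toy illustration of the ordering with made-up prices:

    import math

    # Each candidate pairs a fleet name with its priority tuple:
    # (no_pool_capacity, cheapest_pool_offer, cheapest_backend_offer)
    candidates = [
        ("fleet-a", (True, math.inf, 1.20)),   # no pool capacity; backend offer at $1.20/h
        ("fleet-b", (False, 0.80, math.inf)),  # pool capacity; pool offer at $0.80/h
        ("fleet-c", (False, 0.50, 0.40)),      # pool capacity; pool offer at $0.50/h
    ]
    best = min(candidates, key=lambda c: c[1])
    assert best[0] == "fleet-c"  # capacity wins first, then the cheaper pool offer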
dstack/_internal/server/services/backends/__init__.py

@@ -345,7 +345,7 @@ async def get_instance_offers(
     Returns list of instances satisfying minimal resource requirements sorted by price
     """
     logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends])
-    tasks = [run_async(backend.compute().get_offers_cached, requirements) for backend in backends]
+    tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends]
     offers_by_backend = []
     for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)):
         if isinstance(result, BackendError):