dstack 0.19.23rc1__py3-none-any.whl → 0.19.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +14 -2
- dstack/_internal/cli/commands/init.py +47 -2
- dstack/_internal/cli/commands/offer.py +68 -60
- dstack/_internal/cli/services/configurators/run.py +35 -10
- dstack/_internal/cli/services/repos.py +6 -24
- dstack/_internal/cli/utils/common.py +7 -0
- dstack/_internal/cli/utils/gpu.py +210 -0
- dstack/_internal/cli/utils/run.py +33 -0
- dstack/_internal/core/backends/aws/compute.py +1 -4
- dstack/_internal/core/backends/base/compute.py +0 -4
- dstack/_internal/core/backends/gcp/compute.py +1 -4
- dstack/_internal/core/backends/nebius/compute.py +1 -4
- dstack/_internal/core/models/common.py +1 -1
- dstack/_internal/core/models/config.py +3 -1
- dstack/_internal/core/models/configurations.py +16 -14
- dstack/_internal/core/models/fleets.py +2 -2
- dstack/_internal/core/models/instances.py +4 -1
- dstack/_internal/core/models/profiles.py +2 -2
- dstack/_internal/core/models/repos/remote.py +2 -2
- dstack/_internal/core/models/resources.py +4 -4
- dstack/_internal/core/models/runs.py +13 -9
- dstack/_internal/core/services/configs/__init__.py +8 -7
- dstack/_internal/proxy/gateway/services/registry.py +2 -0
- dstack/_internal/server/app.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +10 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -46
- dstack/_internal/server/background/tasks/process_runs.py +16 -15
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/models.py +24 -13
- dstack/_internal/server/routers/gpus.py +29 -0
- dstack/_internal/server/schemas/gateways.py +1 -1
- dstack/_internal/server/schemas/gpus.py +66 -0
- dstack/_internal/server/services/docker.py +1 -1
- dstack/_internal/server/services/gpus.py +390 -0
- dstack/_internal/server/services/jobs/__init__.py +3 -1
- dstack/_internal/server/services/offers.py +48 -31
- dstack/_internal/server/services/probes.py +5 -1
- dstack/_internal/server/services/proxy/repo.py +1 -0
- dstack/_internal/server/services/repos.py +1 -1
- dstack/_internal/server/services/runs.py +15 -12
- dstack/_internal/server/services/secrets.py +1 -1
- dstack/_internal/server/services/services/__init__.py +60 -41
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
- dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js → main-d151b300fcac3933213d.js} +20 -23
- dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js.map → main-d151b300fcac3933213d.js.map} +1 -1
- dstack/_internal/server/testing/common.py +7 -2
- dstack/api/_public/repos.py +8 -7
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_gpus.py +22 -0
- dstack/version.py +1 -1
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/METADATA +1 -1
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/RECORD +60 -51
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/WHEEL +0 -0
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.23rc1.dist-info → dstack-0.19.25.dist-info}/licenses/LICENSE.md +0 -0
--- a/dstack/_internal/server/schemas/gateways.py
+++ b/dstack/_internal/server/schemas/gateways.py
@@ -14,7 +14,7 @@ class CreateGatewayRequest(CoreModel):
     backend_type: Annotated[Optional[BackendType], Field(exclude=True)] = None
     region: Annotated[Optional[str], Field(exclude=True)] = None
 
-    class Config:
+    class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]) -> None:
             del schema["properties"]["name"]
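The only change to CreateGatewayRequest is that its nested Config now subclasses CoreModel.Config instead of starting from a bare class. A standalone sketch of the plain-Python behavior behind this (illustrative names, no dstack or pydantic required): a redefined nested class does not inherit the attributes of the base class's nested class unless it subclasses it explicitly.

class Core:
    class Config:
        use_enum_values = True  # stand-in for a base-wide model option


class RequestBroken(Core):
    class Config:  # a fresh class: Core.Config's attributes are not inherited
        extra = "forbid"


class RequestFixed(Core):
    class Config(Core.Config):  # explicit subclassing keeps them
        extra = "forbid"


assert not hasattr(RequestBroken.Config, "use_enum_values")
assert RequestFixed.Config.use_enum_values is True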
--- /dev/null
+++ b/dstack/_internal/server/schemas/gpus.py
@@ -0,0 +1,66 @@
+from typing import List, Literal, Optional
+
+import gpuhunt
+from pydantic import Field
+
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.instances import InstanceAvailability
+from dstack._internal.core.models.resources import Range
+from dstack._internal.core.models.runs import RunSpec
+
+
+class BackendGpu(CoreModel):
+    """GPU specification from a backend offer."""
+
+    name: str
+    memory_mib: int
+    vendor: gpuhunt.AcceleratorVendor
+    availability: InstanceAvailability
+    spot: bool
+    count: int
+    price: float
+    region: str
+
+
+class BackendGpus(CoreModel):
+    """Backend GPU specifications."""
+
+    backend_type: BackendType
+    gpus: List[BackendGpu]
+    regions: List[str]
+
+
+class ListGpusRequest(CoreModel):
+    """Request for listing GPUs with optional grouping."""
+
+    run_spec: RunSpec
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
+        default=None,
+        description="List of fields to group by. Valid values: 'backend', 'region', 'count'. "
+        "Note: 'region' can only be used together with 'backend'.",
+    )
+
+
+class GpuGroup(CoreModel):
+    """GPU group that can handle all grouping scenarios."""
+
+    name: str
+    memory_mib: int
+    vendor: gpuhunt.AcceleratorVendor
+    availability: List[InstanceAvailability]
+    spot: List[Literal["spot", "on-demand"]]
+    count: Range[int]
+    price: Range[float]
+    backends: Optional[List[BackendType]] = None
+    backend: Optional[BackendType] = None
+    regions: Optional[List[str]] = None
+    region: Optional[str] = None
+
+
+class ListGpusResponse(CoreModel):
+    """Response containing GPU specifications."""
+
+    gpus: List[GpuGroup] = Field(
+        description="List of GPU specifications, grouped according to the group_by parameter"
+    )
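A rough usage sketch of these models (hypothetical values; assumes dstack 0.19.25 and gpuhunt are importable, and that CoreModel exposes the pydantic v1 .json() interface): a flat, ungrouped row carries backends, while the backend, region, and regions fields are only filled by the grouped variants.

import gpuhunt

from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.core.models.instances import InstanceAvailability
from dstack._internal.core.models.resources import Range
from dstack._internal.server.schemas.gpus import GpuGroup

group = GpuGroup(
    name="H100",
    memory_mib=81920,
    vendor=gpuhunt.AcceleratorVendor.NVIDIA,
    availability=[InstanceAvailability.AVAILABLE],
    spot=["on-demand"],
    count=Range[int](min=1, max=8),          # 1- to 8-GPU instance configurations
    price=Range[float](min=1.99, max=2.49),  # per-GPU price range
    backends=[BackendType.AWS],              # set for ungrouped rows only
)
print(group.json(exclude_none=True))  # .json() is an assumption about CoreModel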
--- /dev/null
+++ b/dstack/_internal/server/services/gpus.py
@@ -0,0 +1,390 @@
+from typing import Dict, List, Literal, Optional, Tuple
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.models.instances import InstanceOfferWithAvailability
+from dstack._internal.core.models.profiles import SpotPolicy
+from dstack._internal.core.models.resources import Range
+from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.schemas.gpus import (
+    BackendGpu,
+    BackendGpus,
+    GpuGroup,
+    ListGpusResponse,
+)
+from dstack._internal.server.services.offers import get_offers_by_requirements
+
+
+async def _get_gpu_offers(
+    session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
+    """Fetches all available instance offers that match the run spec's GPU requirements."""
+    profile = run_spec.merged_profile
+    requirements = Requirements(
+        resources=run_spec.configuration.resources,
+        max_price=profile.max_price,
+        spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
+        reservation=profile.reservation,
+    )
+
+    return await get_offers_by_requirements(
+        project=project,
+        profile=profile,
+        requirements=requirements,
+        exclude_not_available=False,
+        multinode=False,
+        volumes=None,
+        privileged=False,
+        instance_mounts=False,
+    )
+
+
+def _process_offers_into_backend_gpus(
+    offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+) -> List[BackendGpus]:
+    """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
+    backend_data: Dict[str, Dict] = {}
+
+    for backend, offer in offers:
+        backend_type = backend.TYPE
+        if backend_type not in backend_data:
+            backend_data[backend_type] = {"gpus": {}, "regions": set()}
+
+        backend_data[backend_type]["regions"].add(offer.region)
+
+        if not offer.instance.resources.gpus:
+            continue
+
+        gpu_types_in_offer = {}
+        for gpu in offer.instance.resources.gpus:
+            gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor)
+            if gpu_type_key not in gpu_types_in_offer:
+                gpu_types_in_offer[gpu_type_key] = 0
+            gpu_types_in_offer[gpu_type_key] += 1
+
+        for (
+            gpu_name,
+            gpu_memory_mib,
+            gpu_vendor,
+        ), gpu_count_in_offer in gpu_types_in_offer.items():
+            instance_config_key = (
+                gpu_name,
+                gpu_memory_mib,
+                gpu_vendor,
+                gpu_count_in_offer,
+                offer.instance.resources.spot,
+                offer.region,
+            )
+
+            if instance_config_key not in backend_data[backend_type]["gpus"]:
+                backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu(
+                    name=gpu_name,
+                    memory_mib=gpu_memory_mib,
+                    vendor=gpu_vendor,
+                    availability=offer.availability,
+                    spot=offer.instance.resources.spot,
+                    count=gpu_count_in_offer,
+                    price=offer.price,
+                    region=offer.region,
+                )
+
+    backend_gpus_list = []
+    for backend_type, data in backend_data.items():
+        gpus_list = sorted(
+            list(data["gpus"].values()),
+            key=lambda g: (
+                not g.availability.is_available(),
+                g.vendor.value,
+                g.name,
+                g.memory_mib,
+            ),
+        )
+        backend_gpus_list.append(
+            BackendGpus(
+                backend_type=backend_type,
+                gpus=gpus_list,
+                regions=sorted(list(data["regions"])),
+            )
+        )
+    return backend_gpus_list
+
+
+def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+    """Updates an existing GpuGroup with new data from another GPU offer."""
+    spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"
+
+    if gpu.availability not in row.availability:
+        row.availability.append(gpu.availability)
+    if spot_type not in row.spot:
+        row.spot.append(spot_type)
+    if row.backends and backend_type not in row.backends:
+        row.backends.append(backend_type)
+
+    row.count.min = min(row.count.min, gpu.count)
+    row.count.max = max(row.count.max, gpu.count)
+    per_gpu_price = gpu.price / gpu.count
+    row.price.min = min(row.price.min, per_gpu_price)
+    row.price.max = max(row.price.max, per_gpu_price)
+
+
+def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs into a flat list, without any grouping."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                price_range = Range[float](min=per_gpu_price, max=per_gpu_price)
+
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=price_range,
+                    backends=[backend.backend_type],
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    result = sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+    return result
+
+
+def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    regions=backend.regions.copy(),
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by both backend and region."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.region,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backends=[backend.backend_type],
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend and GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    regions=backend.regions.copy(),
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_region_and_count(
+    backend_gpus: List[BackendGpus],
+) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend, region, and GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (
+                gpu.name,
+                gpu.memory_mib,
+                gpu.vendor,
+                backend.backend_type,
+                gpu.region,
+                gpu.count,
+            )
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.region,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+async def list_gpus_grouped(
+    session: AsyncSession,
+    project: ProjectModel,
+    run_spec: RunSpec,
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+) -> ListGpusResponse:
+    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+    offers = await _get_gpu_offers(session, project, run_spec)
+    backend_gpus = _process_offers_into_backend_gpus(offers)
+
+    group_by_set = set(group_by) if group_by else set()
+
+    if "region" in group_by_set and "backend" not in group_by_set:
+        from dstack._internal.core.errors import ServerClientError
+
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+    # Determine grouping strategy based on combination
+    has_backend = "backend" in group_by_set
+    has_region = "region" in group_by_set
+    has_count = "count" in group_by_set
+    if has_backend and has_region and has_count:
+        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+    elif has_backend and has_count:
+        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+    elif has_backend and has_region:
+        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+    elif has_backend:
+        gpus = _get_gpus_grouped_by_backend(backend_gpus)
+    elif has_count:
+        gpus = _get_gpus_grouped_by_count(backend_gpus)
+    else:
+        gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+    return ListGpusResponse(gpus=gpus)
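All of the _get_gpus_grouped_by_* helpers above share one pattern: offers are bucketed by a grouping key, each bucket is seeded from its first offer, and later offers are folded in via _update_gpu_group, with prices compared per GPU rather than per instance. A standalone sketch of that fold on hypothetical data:

from typing import Dict, Tuple

# (gpu_name, memory_mib, gpu_count, total_instance_price)
offers = [
    ("L4", 24576, 1, 0.70),
    ("L4", 24576, 4, 2.60),
    ("L4", 24576, 1, 0.60),
]

rows: Dict[Tuple[str, int], dict] = {}
for name, mem, count, price in offers:
    per_gpu = price / count  # 2.60 for 4 GPUs folds in as 0.65 per GPU
    row = rows.setdefault((name, mem), {"count": (count, count), "price": (per_gpu, per_gpu)})
    row["count"] = (min(row["count"][0], count), max(row["count"][1], count))
    row["price"] = (min(row["price"][0], per_gpu), max(row["price"][1], per_gpu))

print(rows)  # {('L4', 24576): {'count': (1, 4), 'price': (0.6, 0.7)}}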
--- a/dstack/_internal/server/services/jobs/__init__.py
+++ b/dstack/_internal/server/services/jobs/__init__.py
@@ -152,7 +152,9 @@ def job_model_to_job_submission(
         inactivity_secs=job_model.inactivity_secs,
         status=job_model.status,
         status_message=status_message,
-        termination_reason=job_model.termination_reason,
+        termination_reason=job_model.termination_reason.value
+        if job_model.termination_reason
+        else None,
         termination_reason_message=job_model.termination_reason_message,
         exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
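With this release also switching the server to storing enums as strings (the 74a1f55209bd migration in the file list), the fix above presumably keeps the API payload a plain string: the enum attribute is unwrapped with .value while None passes through. A minimal standalone sketch of that pattern with an illustrative enum:

import enum
from typing import Optional


class TerminationReason(enum.Enum):
    DONE_BY_RUNNER = "done_by_runner"


def to_api(reason: Optional[TerminationReason]) -> Optional[str]:
    # unwrap the enum member, but let None through untouched
    return reason.value if reason else None


assert to_api(TerminationReason.DONE_BY_RUNNER) == "done_by_runner"
assert to_api(None) is None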
--- a/dstack/_internal/server/services/offers.py
+++ b/dstack/_internal/server/services/offers.py
@@ -49,6 +49,7 @@ async def get_offers_by_requirements(
     backend_types = profile.backends
     regions = profile.regions
     availability_zones = profile.availability_zones
+    instance_types = profile.instance_types
 
     if volumes:
         mount_point_volumes = volumes[0]
@@ -97,9 +98,43 @@ async def get_offers_by_requirements(
         exclude_not_available=exclude_not_available,
     )
 
-
-
-
+    offers = filter_offers(
+        offers=offers,
+        # Double filtering by backends if backend returns offers for other backend.
+        backend_types=backend_types,
+        regions=regions,
+        availability_zones=availability_zones,
+        instance_types=instance_types,
+        placement_group=placement_group,
+    )
+
+    if blocks == 1:
+        return offers
+
+    shareable_offers = []
+    for backend, offer in offers:
+        resources = offer.instance.resources
+        cpu_count = resources.cpus
+        gpu_count = len(resources.gpus)
+        if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
+            # TPUs cannot be shared
+            gpu_count = 1
+        divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
+        if not divisible:
+            continue
+        offer.total_blocks = _blocks
+        shareable_offers.append((backend, offer))
+    return shareable_offers
+
+
+def filter_offers(
+    offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+    backend_types: Optional[List[BackendType]] = None,
+    regions: Optional[List[str]] = None,
+    availability_zones: Optional[List[str]] = None,
+    instance_types: Optional[List[str]] = None,
+    placement_group: Optional[PlacementGroup] = None,
+) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     if backend_types is not None:
         offers = [(b, o) for b, o in offers if o.backend in backend_types]
 
@@ -119,39 +154,21 @@ async def get_offers_by_requirements(
             new_offers.append((b, new_offer))
         offers = new_offers
 
+    if instance_types is not None:
+        instance_types = [i.lower() for i in instance_types]
+        offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
+
     if placement_group is not None:
         new_offers = []
         for b, o in offers:
-
-
-
-
-
-            new_offers.append((b, o))
-            break
+            compute = b.compute()
+            if isinstance(
+                compute, ComputeWithPlacementGroupSupport
+            ) and compute.is_suitable_placement_group(placement_group, o):
+                new_offers.append((b, o))
         offers = new_offers
 
-
-        instance_types = [i.lower() for i in profile.instance_types]
-        offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
-
-    if blocks == 1:
-        return offers
-
-    shareable_offers = []
-    for backend, offer in offers:
-        resources = offer.instance.resources
-        cpu_count = resources.cpus
-        gpu_count = len(resources.gpus)
-        if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
-            # TPUs cannot be shared
-            gpu_count = 1
-        divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
-        if not divisible:
-            continue
-        offer.total_blocks = _blocks
-        shareable_offers.append((backend, offer))
-    return shareable_offers
+    return offers
 
 
 def is_divisible_into_blocks(
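The refactor above extracts the backend, region, availability-zone, instance-type, and placement-group checks from get_offers_by_requirements into a reusable filter_offers, and runs the instance-type match before the block-splitting logic instead of after it. That match lowercases both sides; a standalone sketch of just this filter, with a stand-in tuple type and hypothetical data:

from typing import List, Optional, Tuple


def filter_by_instance_types(
    offers: List[Tuple[str, str]],  # (backend_name, instance_name) stand-ins
    instance_types: Optional[List[str]],
) -> List[Tuple[str, str]]:
    if instance_types is None:
        return offers  # no filter configured in the profile
    wanted = [i.lower() for i in instance_types]
    return [(b, o) for b, o in offers if o.lower() in wanted]


offers = [("aws", "P4d.24xlarge"), ("aws", "t3.micro")]
print(filter_by_instance_types(offers, ["p4d.24xlarge"]))  # [('aws', 'P4d.24xlarge')]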
--- a/dstack/_internal/server/services/probes.py
+++ b/dstack/_internal/server/services/probes.py
@@ -1,6 +1,10 @@
-from dstack._internal.core.models.runs import Probe
+from dstack._internal.core.models.runs import Probe, ProbeSpec
 from dstack._internal.server.models import ProbeModel
 
 
 def probe_model_to_probe(probe_model: ProbeModel) -> Probe:
     return Probe(success_streak=probe_model.success_streak)
+
+
+def is_probe_ready(probe: ProbeModel, spec: ProbeSpec) -> bool:
+    return probe.success_streak >= spec.ready_after
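is_probe_ready encodes a simple threshold rule: a probe counts as ready once its consecutive-success streak reaches the spec's ready_after value. A standalone sketch with stand-in dataclasses (the real ProbeModel and ProbeSpec carry more fields than this diff shows):

from dataclasses import dataclass


@dataclass
class FakeProbeModel:
    success_streak: int


@dataclass
class FakeProbeSpec:
    ready_after: int


def is_probe_ready(probe: FakeProbeModel, spec: FakeProbeSpec) -> bool:
    return probe.success_streak >= spec.ready_after


assert not is_probe_ready(FakeProbeModel(success_streak=2), FakeProbeSpec(ready_after=3))
assert is_probe_ready(FakeProbeModel(success_streak=3), FakeProbeSpec(ready_after=3))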