dstack 0.19.24-py3-none-any.whl → 0.19.25rc1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic.
- dstack/_internal/cli/commands/apply.py +14 -2
- dstack/_internal/cli/commands/init.py +47 -2
- dstack/_internal/cli/commands/offer.py +68 -60
- dstack/_internal/cli/services/configurators/run.py +38 -10
- dstack/_internal/cli/services/repos.py +6 -24
- dstack/_internal/cli/utils/common.py +7 -0
- dstack/_internal/cli/utils/gpu.py +210 -0
- dstack/_internal/cli/utils/run.py +33 -0
- dstack/_internal/core/backends/aws/compute.py +1 -4
- dstack/_internal/core/backends/base/compute.py +0 -4
- dstack/_internal/core/backends/gcp/compute.py +1 -4
- dstack/_internal/core/backends/nebius/compute.py +1 -4
- dstack/_internal/core/models/common.py +1 -1
- dstack/_internal/core/models/config.py +3 -1
- dstack/_internal/core/models/configurations.py +16 -14
- dstack/_internal/core/models/fleets.py +2 -2
- dstack/_internal/core/models/instances.py +1 -1
- dstack/_internal/core/models/profiles.py +2 -2
- dstack/_internal/core/models/repos/remote.py +2 -2
- dstack/_internal/core/models/resources.py +4 -4
- dstack/_internal/core/models/runs.py +1 -1
- dstack/_internal/core/services/configs/__init__.py +4 -6
- dstack/_internal/proxy/gateway/services/registry.py +2 -0
- dstack/_internal/server/app.py +2 -0
- dstack/_internal/server/background/tasks/process_fleets.py +10 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +65 -44
- dstack/_internal/server/background/tasks/process_runs.py +15 -14
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
- dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
- dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
- dstack/_internal/server/models.py +13 -1
- dstack/_internal/server/routers/gpus.py +29 -0
- dstack/_internal/server/schemas/gateways.py +1 -1
- dstack/_internal/server/schemas/gpus.py +66 -0
- dstack/_internal/server/services/docker.py +1 -1
- dstack/_internal/server/services/gpus.py +390 -0
- dstack/_internal/server/services/offers.py +48 -31
- dstack/_internal/server/services/probes.py +5 -1
- dstack/_internal/server/services/proxy/repo.py +1 -0
- dstack/_internal/server/services/runs.py +12 -11
- dstack/_internal/server/services/services/__init__.py +60 -41
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/logo-notext.svg +116 -0
- dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
- dstack/_internal/server/statics/{main-16813e4e1d1c4119eda3.js → main-d151b300fcac3933213d.js} +19 -22
- dstack/_internal/server/statics/{main-16813e4e1d1c4119eda3.js.map → main-d151b300fcac3933213d.js.map} +1 -1
- dstack/_internal/server/testing/common.py +7 -2
- dstack/api/_public/repos.py +8 -7
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_gpus.py +22 -0
- dstack/version.py +1 -1
- {dstack-0.19.24.dist-info → dstack-0.19.25rc1.dist-info}/METADATA +1 -1
- {dstack-0.19.24.dist-info → dstack-0.19.25rc1.dist-info}/RECORD +56 -48
- {dstack-0.19.24.dist-info → dstack-0.19.25rc1.dist-info}/WHEEL +0 -0
- {dstack-0.19.24.dist-info → dstack-0.19.25rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.24.dist-info → dstack-0.19.25rc1.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/gpus.py
@@ -0,0 +1,390 @@
+from typing import Dict, List, Literal, Optional, Tuple
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.models.instances import InstanceOfferWithAvailability
+from dstack._internal.core.models.profiles import SpotPolicy
+from dstack._internal.core.models.resources import Range
+from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.schemas.gpus import (
+    BackendGpu,
+    BackendGpus,
+    GpuGroup,
+    ListGpusResponse,
+)
+from dstack._internal.server.services.offers import get_offers_by_requirements
+
+
+async def _get_gpu_offers(
+    session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
+    """Fetches all available instance offers that match the run spec's GPU requirements."""
+    profile = run_spec.merged_profile
+    requirements = Requirements(
+        resources=run_spec.configuration.resources,
+        max_price=profile.max_price,
+        spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
+        reservation=profile.reservation,
+    )
+
+    return await get_offers_by_requirements(
+        project=project,
+        profile=profile,
+        requirements=requirements,
+        exclude_not_available=False,
+        multinode=False,
+        volumes=None,
+        privileged=False,
+        instance_mounts=False,
+    )
+
+
+def _process_offers_into_backend_gpus(
+    offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+) -> List[BackendGpus]:
+    """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
+    backend_data: Dict[str, Dict] = {}
+
+    for backend, offer in offers:
+        backend_type = backend.TYPE
+        if backend_type not in backend_data:
+            backend_data[backend_type] = {"gpus": {}, "regions": set()}
+
+        backend_data[backend_type]["regions"].add(offer.region)
+
+        if not offer.instance.resources.gpus:
+            continue
+
+        gpu_types_in_offer = {}
+        for gpu in offer.instance.resources.gpus:
+            gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor)
+            if gpu_type_key not in gpu_types_in_offer:
+                gpu_types_in_offer[gpu_type_key] = 0
+            gpu_types_in_offer[gpu_type_key] += 1
+
+        for (
+            gpu_name,
+            gpu_memory_mib,
+            gpu_vendor,
+        ), gpu_count_in_offer in gpu_types_in_offer.items():
+            instance_config_key = (
+                gpu_name,
+                gpu_memory_mib,
+                gpu_vendor,
+                gpu_count_in_offer,
+                offer.instance.resources.spot,
+                offer.region,
+            )
+
+            if instance_config_key not in backend_data[backend_type]["gpus"]:
+                backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu(
+                    name=gpu_name,
+                    memory_mib=gpu_memory_mib,
+                    vendor=gpu_vendor,
+                    availability=offer.availability,
+                    spot=offer.instance.resources.spot,
+                    count=gpu_count_in_offer,
+                    price=offer.price,
+                    region=offer.region,
+                )
+
+    backend_gpus_list = []
+    for backend_type, data in backend_data.items():
+        gpus_list = sorted(
+            list(data["gpus"].values()),
+            key=lambda g: (
+                not g.availability.is_available(),
+                g.vendor.value,
+                g.name,
+                g.memory_mib,
+            ),
+        )
+        backend_gpus_list.append(
+            BackendGpus(
+                backend_type=backend_type,
+                gpus=gpus_list,
+                regions=sorted(list(data["regions"])),
+            )
+        )
+    return backend_gpus_list
+
+
+def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+    """Updates an existing GpuGroup with new data from another GPU offer."""
+    spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"
+
+    if gpu.availability not in row.availability:
+        row.availability.append(gpu.availability)
+    if spot_type not in row.spot:
+        row.spot.append(spot_type)
+    if row.backends and backend_type not in row.backends:
+        row.backends.append(backend_type)
+
+    row.count.min = min(row.count.min, gpu.count)
+    row.count.max = max(row.count.max, gpu.count)
+    per_gpu_price = gpu.price / gpu.count
+    row.price.min = min(row.price.min, per_gpu_price)
+    row.price.max = max(row.price.max, per_gpu_price)
+
+
+def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs into a flat list, without any grouping."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                price_range = Range[float](min=per_gpu_price, max=per_gpu_price)
+
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=price_range,
+                    backends=[backend.backend_type],
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    result = sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+    return result
+
+
+def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    regions=backend.regions.copy(),
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by both backend and region."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.region,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backends=[backend.backend_type],
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend and GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    regions=backend.regions.copy(),
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_region_and_count(
+    backend_gpus: List[BackendGpus],
+) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend, region, and GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (
+                gpu.name,
+                gpu.memory_mib,
+                gpu.vendor,
+                backend.backend_type,
+                gpu.region,
+                gpu.count,
+            )
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.region,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+async def list_gpus_grouped(
+    session: AsyncSession,
+    project: ProjectModel,
+    run_spec: RunSpec,
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+) -> ListGpusResponse:
+    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+    offers = await _get_gpu_offers(session, project, run_spec)
+    backend_gpus = _process_offers_into_backend_gpus(offers)
+
+    group_by_set = set(group_by) if group_by else set()
+
+    if "region" in group_by_set and "backend" not in group_by_set:
+        from dstack._internal.core.errors import ServerClientError
+
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+    # Determine grouping strategy based on combination
+    has_backend = "backend" in group_by_set
+    has_region = "region" in group_by_set
+    has_count = "count" in group_by_set
+    if has_backend and has_region and has_count:
+        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+    elif has_backend and has_count:
+        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+    elif has_backend and has_region:
+        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+    elif has_backend:
+        gpus = _get_gpus_grouped_by_backend(backend_gpus)
+    elif has_count:
+        gpus = _get_gpus_grouped_by_count(backend_gpus)
+    else:
+        gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+    return ListGpusResponse(gpus=gpus)
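The new gpus.py service above repeats one pattern: offers are folded into rows keyed by the requested grouping dimensions, and each row keeps running min/max ranges for GPU count and per-GPU price (offer price divided by GPU count). A minimal, self-contained sketch of that fold, using hypothetical Offer and Group stand-ins rather than dstack's actual pydantic models:

from dataclasses import dataclass
from typing import Dict, List, Tuple

@dataclass
class Offer:  # hypothetical stand-in for InstanceOfferWithAvailability
    gpu_name: str
    memory_mib: int
    count: int
    price: float  # total instance price per hour

@dataclass
class Group:  # hypothetical stand-in for GpuGroup
    name: str
    memory_mib: int
    count_min: int
    count_max: int
    price_min: float  # per-GPU price bounds
    price_max: float

def group_offers(offers: List[Offer]) -> List[Group]:
    rows: Dict[Tuple[str, int], Group] = {}
    for o in offers:
        key = (o.gpu_name, o.memory_mib)  # the grouping dimensions
        per_gpu = o.price / o.count       # normalize to per-GPU price
        row = rows.get(key)
        if row is None:
            rows[key] = Group(o.gpu_name, o.memory_mib, o.count, o.count, per_gpu, per_gpu)
        else:
            row.count_min = min(row.count_min, o.count)
            row.count_max = max(row.count_max, o.count)
            row.price_min = min(row.price_min, per_gpu)
            row.price_max = max(row.price_max, per_gpu)
    # cheapest groups first, mirroring the sort keys used in gpus.py
    return sorted(rows.values(), key=lambda g: (g.price_min, g.price_max, g.name))

print(group_offers([Offer("H100", 81920, 8, 32.0), Offer("H100", 81920, 1, 3.5)]))

The real helpers layer availability, spot, vendor, backend, and region on top of the same dict-keyed aggregation; list_gpus_grouped only picks which key function to apply.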
dstack/_internal/server/services/offers.py
@@ -49,6 +49,7 @@ async def get_offers_by_requirements(
     backend_types = profile.backends
     regions = profile.regions
     availability_zones = profile.availability_zones
+    instance_types = profile.instance_types

     if volumes:
         mount_point_volumes = volumes[0]
@@ -97,9 +98,43 @@
         exclude_not_available=exclude_not_available,
     )

-
-
-
+    offers = filter_offers(
+        offers=offers,
+        # Double filtering by backends if backend returns offers for other backend.
+        backend_types=backend_types,
+        regions=regions,
+        availability_zones=availability_zones,
+        instance_types=instance_types,
+        placement_group=placement_group,
+    )
+
+    if blocks == 1:
+        return offers
+
+    shareable_offers = []
+    for backend, offer in offers:
+        resources = offer.instance.resources
+        cpu_count = resources.cpus
+        gpu_count = len(resources.gpus)
+        if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
+            # TPUs cannot be shared
+            gpu_count = 1
+        divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
+        if not divisible:
+            continue
+        offer.total_blocks = _blocks
+        shareable_offers.append((backend, offer))
+    return shareable_offers
+
+
+def filter_offers(
+    offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+    backend_types: Optional[List[BackendType]] = None,
+    regions: Optional[List[str]] = None,
+    availability_zones: Optional[List[str]] = None,
+    instance_types: Optional[List[str]] = None,
+    placement_group: Optional[PlacementGroup] = None,
+) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     if backend_types is not None:
         offers = [(b, o) for b, o in offers if o.backend in backend_types]

@@ -119,39 +154,21 @@
                 new_offers.append((b, new_offer))
         offers = new_offers

+    if instance_types is not None:
+        instance_types = [i.lower() for i in instance_types]
+        offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
+
     if placement_group is not None:
         new_offers = []
         for b, o in offers:
-
-
-
-
-
-                new_offers.append((b, o))
-                break
+            compute = b.compute()
+            if isinstance(
+                compute, ComputeWithPlacementGroupSupport
+            ) and compute.is_suitable_placement_group(placement_group, o):
+                new_offers.append((b, o))
         offers = new_offers

-
-    instance_types = [i.lower() for i in profile.instance_types]
-    offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
-
-    if blocks == 1:
-        return offers
-
-    shareable_offers = []
-    for backend, offer in offers:
-        resources = offer.instance.resources
-        cpu_count = resources.cpus
-        gpu_count = len(resources.gpus)
-        if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
-            # TPUs cannot be shared
-            gpu_count = 1
-        divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
-        if not divisible:
-            continue
-        offer.total_blocks = _blocks
-        shareable_offers.append((backend, offer))
-    return shareable_offers
+    return offers


 def is_divisible_into_blocks(
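The offers.py change extracts the inline filtering from get_offers_by_requirements into a reusable filter_offers helper and moves instance-type matching ahead of the block-sharing logic, comparing names case-insensitively. A rough stand-alone sketch of that matching rule, using plain (backend, instance_name) pairs instead of dstack's Backend/offer tuples:

from typing import List, Optional, Tuple

def filter_by_instance_types(
    offers: List[Tuple[str, str]], instance_types: Optional[List[str]] = None
) -> List[Tuple[str, str]]:
    if instance_types is None:
        return offers  # no filter requested: keep everything
    wanted = {i.lower() for i in instance_types}
    return [(b, name) for b, name in offers if name.lower() in wanted]

offers = [("aws", "g5.xlarge"), ("gcp", "a2-highgpu-1g")]
print(filter_by_instance_types(offers, ["G5.XLARGE"]))  # -> [('aws', 'g5.xlarge')]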
dstack/_internal/server/services/probes.py
@@ -1,6 +1,10 @@
-from dstack._internal.core.models.runs import Probe
+from dstack._internal.core.models.runs import Probe, ProbeSpec
 from dstack._internal.server.models import ProbeModel


 def probe_model_to_probe(probe_model: ProbeModel) -> Probe:
     return Probe(success_streak=probe_model.success_streak)
+
+
+def is_probe_ready(probe: ProbeModel, spec: ProbeSpec) -> bool:
+    return probe.success_streak >= spec.ready_after
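The probes.py addition is a single readiness rule: a probe counts as ready once its success streak reaches the spec's ready_after threshold. A tiny sketch with hypothetical stand-ins for ProbeModel and ProbeSpec:

from dataclasses import dataclass

@dataclass
class FakeProbe:      # hypothetical stand-in for ProbeModel
    success_streak: int

@dataclass
class FakeProbeSpec:  # hypothetical stand-in for ProbeSpec
    ready_after: int

def is_probe_ready(probe: FakeProbe, spec: FakeProbeSpec) -> bool:
    # ready once the probe has succeeded ready_after times in a row
    return probe.success_streak >= spec.ready_after

assert is_probe_ready(FakeProbe(3), FakeProbeSpec(3))
assert not is_probe_ready(FakeProbe(2), FakeProbeSpec(3))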
dstack/_internal/server/services/runs.py
@@ -41,6 +41,7 @@ from dstack._internal.core.models.runs import (
     JobStatus,
     JobSubmission,
     JobTerminationReason,
+    ProbeSpec,
     Run,
     RunPlan,
     RunSpec,
@@ -58,6 +59,7 @@ from dstack._internal.server import settings
 from dstack._internal.server.db import get_db
 from dstack._internal.server.models import (
     JobModel,
+    ProbeModel,
     ProjectModel,
     RepoModel,
     RunModel,
@@ -86,6 +88,7 @@ from dstack._internal.server.services.locking import get_locker, string_to_lock_
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.plugins import apply_plugin_policies
+from dstack._internal.server.services.probes import is_probe_ready
 from dstack._internal.server.services.projects import list_user_project_models
 from dstack._internal.server.services.resources import set_resources_defaults
 from dstack._internal.server.services.secrets import get_project_secrets_mapping
@@ -1185,8 +1188,8 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica
     elif {JobStatus.PROVISIONING, JobStatus.PULLING} & statuses:
         # if there are any provisioning or pulling jobs, the replica is active and has the importance of 1
         active_replicas.append((1, is_out_of_date, replica_num, replica_jobs))
-    elif not
-        # all jobs are running, but
+    elif not is_replica_registered(replica_jobs):
+        # all jobs are running, but not receiving traffic, the replica is active and has the importance of 2
         active_replicas.append((2, is_out_of_date, replica_num, replica_jobs))
     else:
         # all jobs are running and ready, the replica is active and has the importance of 3
@@ -1273,15 +1276,13 @@ async def retry_run_replica_jobs(
     session.add(new_job_model)


-def
-
-
-
-
-
-
-        return False
-    return True
+def is_job_ready(probes: Iterable[ProbeModel], probe_specs: Iterable[ProbeSpec]) -> bool:
+    return all(is_probe_ready(probe, probe_spec) for probe, probe_spec in zip(probes, probe_specs))
+
+
+def is_replica_registered(jobs: list[JobModel]) -> bool:
+    # Only job_num=0 is supposed to receive service requests
+    return jobs[0].registered


 def _remove_job_spec_sensitive_info(spec: JobSpec):
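The runs.py hunk above replaces a predicate whose old body was not captured by this diff view with two small helpers: is_job_ready folds is_probe_ready over zipped probes and specs, and is_replica_registered consults only the first job, since only job_num=0 receives service traffic. A self-contained sketch of both, with hypothetical stand-ins for the models; note that zip() stops at the shorter sequence, so callers are expected to pass equal-length iterables:

from dataclasses import dataclass
from typing import Iterable, List

@dataclass
class StubProbe:      # hypothetical stand-in for ProbeModel
    success_streak: int

@dataclass
class StubProbeSpec:  # hypothetical stand-in for ProbeSpec
    ready_after: int

@dataclass
class StubJob:        # hypothetical stand-in for JobModel
    registered: bool

def is_job_ready(probes: Iterable[StubProbe], specs: Iterable[StubProbeSpec]) -> bool:
    # all() over zip(): vacuously True when a job has no probes configured
    return all(p.success_streak >= s.ready_after for p, s in zip(probes, specs))

def is_replica_registered(jobs: List[StubJob]) -> bool:
    return jobs[0].registered  # only job_num=0 serves requests

assert is_job_ready([StubProbe(5)], [StubProbeSpec(3)])
assert is_replica_registered([StubJob(True), StubJob(False)])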