dstack 0.19.24__py3-none-any.whl → 0.19.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.
Files changed (56)
  1. dstack/_internal/cli/commands/apply.py +14 -2
  2. dstack/_internal/cli/commands/init.py +47 -2
  3. dstack/_internal/cli/commands/offer.py +68 -60
  4. dstack/_internal/cli/services/configurators/run.py +35 -10
  5. dstack/_internal/cli/services/repos.py +6 -24
  6. dstack/_internal/cli/utils/common.py +7 -0
  7. dstack/_internal/cli/utils/gpu.py +210 -0
  8. dstack/_internal/cli/utils/run.py +33 -0
  9. dstack/_internal/core/backends/aws/compute.py +1 -4
  10. dstack/_internal/core/backends/base/compute.py +0 -4
  11. dstack/_internal/core/backends/gcp/compute.py +1 -4
  12. dstack/_internal/core/backends/nebius/compute.py +1 -4
  13. dstack/_internal/core/models/common.py +1 -1
  14. dstack/_internal/core/models/config.py +3 -1
  15. dstack/_internal/core/models/configurations.py +16 -14
  16. dstack/_internal/core/models/fleets.py +2 -2
  17. dstack/_internal/core/models/instances.py +1 -1
  18. dstack/_internal/core/models/profiles.py +2 -2
  19. dstack/_internal/core/models/repos/remote.py +2 -2
  20. dstack/_internal/core/models/resources.py +4 -4
  21. dstack/_internal/core/models/runs.py +1 -1
  22. dstack/_internal/core/services/configs/__init__.py +8 -7
  23. dstack/_internal/proxy/gateway/services/registry.py +2 -0
  24. dstack/_internal/server/app.py +2 -0
  25. dstack/_internal/server/background/tasks/process_fleets.py +10 -2
  26. dstack/_internal/server/background/tasks/process_running_jobs.py +65 -44
  27. dstack/_internal/server/background/tasks/process_runs.py +15 -14
  28. dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
  29. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  30. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  31. dstack/_internal/server/models.py +13 -1
  32. dstack/_internal/server/routers/gpus.py +29 -0
  33. dstack/_internal/server/schemas/gateways.py +1 -1
  34. dstack/_internal/server/schemas/gpus.py +66 -0
  35. dstack/_internal/server/services/docker.py +1 -1
  36. dstack/_internal/server/services/gpus.py +390 -0
  37. dstack/_internal/server/services/offers.py +48 -31
  38. dstack/_internal/server/services/probes.py +5 -1
  39. dstack/_internal/server/services/proxy/repo.py +1 -0
  40. dstack/_internal/server/services/runs.py +12 -11
  41. dstack/_internal/server/services/services/__init__.py +60 -41
  42. dstack/_internal/server/statics/index.html +1 -1
  43. dstack/_internal/server/statics/logo-notext.svg +116 -0
  44. dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
  45. dstack/_internal/server/statics/{main-16813e4e1d1c4119eda3.js → main-d151b300fcac3933213d.js} +19 -22
  46. dstack/_internal/server/statics/{main-16813e4e1d1c4119eda3.js.map → main-d151b300fcac3933213d.js.map} +1 -1
  47. dstack/_internal/server/testing/common.py +7 -2
  48. dstack/api/_public/repos.py +8 -7
  49. dstack/api/server/__init__.py +6 -0
  50. dstack/api/server/_gpus.py +22 -0
  51. dstack/version.py +1 -1
  52. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/METADATA +1 -1
  53. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/RECORD +56 -48
  54. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/WHEEL +0 -0
  55. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/entry_points.txt +0 -0
  56. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/gpus.py (new file)
@@ -0,0 +1,390 @@
+ from typing import Dict, List, Literal, Optional, Tuple
+
+ from sqlalchemy.ext.asyncio import AsyncSession
+
+ from dstack._internal.core.backends.base.backend import Backend
+ from dstack._internal.core.models.instances import InstanceOfferWithAvailability
+ from dstack._internal.core.models.profiles import SpotPolicy
+ from dstack._internal.core.models.resources import Range
+ from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map
+ from dstack._internal.server.models import ProjectModel
+ from dstack._internal.server.schemas.gpus import (
+     BackendGpu,
+     BackendGpus,
+     GpuGroup,
+     ListGpusResponse,
+ )
+ from dstack._internal.server.services.offers import get_offers_by_requirements
+
+
+ async def _get_gpu_offers(
+     session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+ ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
+     """Fetches all available instance offers that match the run spec's GPU requirements."""
+     profile = run_spec.merged_profile
+     requirements = Requirements(
+         resources=run_spec.configuration.resources,
+         max_price=profile.max_price,
+         spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
+         reservation=profile.reservation,
+     )
+
+     return await get_offers_by_requirements(
+         project=project,
+         profile=profile,
+         requirements=requirements,
+         exclude_not_available=False,
+         multinode=False,
+         volumes=None,
+         privileged=False,
+         instance_mounts=False,
+     )
+
+
+ def _process_offers_into_backend_gpus(
+     offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+ ) -> List[BackendGpus]:
+     """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
+     backend_data: Dict[str, Dict] = {}
+
+     for backend, offer in offers:
+         backend_type = backend.TYPE
+         if backend_type not in backend_data:
+             backend_data[backend_type] = {"gpus": {}, "regions": set()}
+
+         backend_data[backend_type]["regions"].add(offer.region)
+
+         if not offer.instance.resources.gpus:
+             continue
+
+         gpu_types_in_offer = {}
+         for gpu in offer.instance.resources.gpus:
+             gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor)
+             if gpu_type_key not in gpu_types_in_offer:
+                 gpu_types_in_offer[gpu_type_key] = 0
+             gpu_types_in_offer[gpu_type_key] += 1
+
+         for (
+             gpu_name,
+             gpu_memory_mib,
+             gpu_vendor,
+         ), gpu_count_in_offer in gpu_types_in_offer.items():
+             instance_config_key = (
+                 gpu_name,
+                 gpu_memory_mib,
+                 gpu_vendor,
+                 gpu_count_in_offer,
+                 offer.instance.resources.spot,
+                 offer.region,
+             )
+
+             if instance_config_key not in backend_data[backend_type]["gpus"]:
+                 backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu(
+                     name=gpu_name,
+                     memory_mib=gpu_memory_mib,
+                     vendor=gpu_vendor,
+                     availability=offer.availability,
+                     spot=offer.instance.resources.spot,
+                     count=gpu_count_in_offer,
+                     price=offer.price,
+                     region=offer.region,
+                 )
+
+     backend_gpus_list = []
+     for backend_type, data in backend_data.items():
+         gpus_list = sorted(
+             list(data["gpus"].values()),
+             key=lambda g: (
+                 not g.availability.is_available(),
+                 g.vendor.value,
+                 g.name,
+                 g.memory_mib,
+             ),
+         )
+         backend_gpus_list.append(
+             BackendGpus(
+                 backend_type=backend_type,
+                 gpus=gpus_list,
+                 regions=sorted(list(data["regions"])),
+             )
+         )
+     return backend_gpus_list
+
+
+ def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+     """Updates an existing GpuGroup with new data from another GPU offer."""
+     spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"
+
+     if gpu.availability not in row.availability:
+         row.availability.append(gpu.availability)
+     if spot_type not in row.spot:
+         row.spot.append(spot_type)
+     if row.backends and backend_type not in row.backends:
+         row.backends.append(backend_type)
+
+     row.count.min = min(row.count.min, gpu.count)
+     row.count.max = max(row.count.max, gpu.count)
+     per_gpu_price = gpu.price / gpu.count
+     row.price.min = min(row.price.min, per_gpu_price)
+     row.price.max = max(row.price.max, per_gpu_price)
+
+
+ def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+     """Aggregates GPU specs into a flat list, without any grouping."""
+     gpu_rows: Dict[Tuple, GpuGroup] = {}
+     for backend in backend_gpus:
+         for gpu in backend.gpus:
+             key = (gpu.name, gpu.memory_mib, gpu.vendor)
+             if key not in gpu_rows:
+                 per_gpu_price = gpu.price / gpu.count
+                 price_range = Range[float](min=per_gpu_price, max=per_gpu_price)
+
+                 gpu_rows[key] = GpuGroup(
+                     name=gpu.name,
+                     memory_mib=gpu.memory_mib,
+                     vendor=gpu.vendor,
+                     availability=[gpu.availability],
+                     spot=["spot" if gpu.spot else "on-demand"],
+                     count=Range[int](min=gpu.count, max=gpu.count),
+                     price=price_range,
+                     backends=[backend.backend_type],
+                 )
+             else:
+                 _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+     result = sorted(
+         list(gpu_rows.values()),
+         key=lambda g: (
+             not any(av.is_available() for av in g.availability),
+             g.price.min,
+             g.price.max,
+             g.name,
+             g.memory_mib,
+         ),
+     )
+
+     return result
+
+
+ def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+     """Aggregates GPU specs, grouping them by backend."""
+     gpu_rows: Dict[Tuple, GpuGroup] = {}
+     for backend in backend_gpus:
+         for gpu in backend.gpus:
+             key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type)
+             if key not in gpu_rows:
+                 per_gpu_price = gpu.price / gpu.count
+                 gpu_rows[key] = GpuGroup(
+                     name=gpu.name,
+                     memory_mib=gpu.memory_mib,
+                     vendor=gpu.vendor,
+                     availability=[gpu.availability],
+                     spot=["spot" if gpu.spot else "on-demand"],
+                     count=Range[int](min=gpu.count, max=gpu.count),
+                     price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                     backend=backend.backend_type,
+                     regions=backend.regions.copy(),
+                 )
+             else:
+                 _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+     return sorted(
+         list(gpu_rows.values()),
+         key=lambda g: (
+             not any(av.is_available() for av in g.availability),
+             g.price.min,
+             g.price.max,
+             g.backend.value,
+             g.name,
+             g.memory_mib,
+         ),
+     )
+
+
+ def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+     """Aggregates GPU specs, grouping them by both backend and region."""
+     gpu_rows: Dict[Tuple, GpuGroup] = {}
+     for backend in backend_gpus:
+         for gpu in backend.gpus:
+             key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region)
+             if key not in gpu_rows:
+                 per_gpu_price = gpu.price / gpu.count
+                 gpu_rows[key] = GpuGroup(
+                     name=gpu.name,
+                     memory_mib=gpu.memory_mib,
+                     vendor=gpu.vendor,
+                     availability=[gpu.availability],
+                     spot=["spot" if gpu.spot else "on-demand"],
+                     count=Range[int](min=gpu.count, max=gpu.count),
+                     price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                     backend=backend.backend_type,
+                     region=gpu.region,
+                 )
+             else:
+                 _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+     return sorted(
+         list(gpu_rows.values()),
+         key=lambda g: (
+             not any(av.is_available() for av in g.availability),
+             g.price.min,
+             g.price.max,
+             g.backend.value,
+             g.region,
+             g.name,
+             g.memory_mib,
+         ),
+     )
+
+
+ def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+     """Aggregates GPU specs, grouping them by GPU count."""
+     gpu_rows: Dict[Tuple, GpuGroup] = {}
+     for backend in backend_gpus:
+         for gpu in backend.gpus:
+             key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count)
+             if key not in gpu_rows:
+                 per_gpu_price = gpu.price / gpu.count
+                 gpu_rows[key] = GpuGroup(
+                     name=gpu.name,
+                     memory_mib=gpu.memory_mib,
+                     vendor=gpu.vendor,
+                     availability=[gpu.availability],
+                     spot=["spot" if gpu.spot else "on-demand"],
+                     count=Range[int](min=gpu.count, max=gpu.count),
+                     price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                     backends=[backend.backend_type],
+                 )
+             else:
+                 _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+     return sorted(
+         list(gpu_rows.values()),
+         key=lambda g: (
+             not any(av.is_available() for av in g.availability),
+             g.price.min,
+             g.price.max,
+             g.count.min,
+             g.name,
+             g.memory_mib,
+         ),
+     )
+
+
+ def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+     """Aggregates GPU specs, grouping them by backend and GPU count."""
+     gpu_rows: Dict[Tuple, GpuGroup] = {}
+     for backend in backend_gpus:
+         for gpu in backend.gpus:
+             key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count)
+             if key not in gpu_rows:
+                 per_gpu_price = gpu.price / gpu.count
+                 gpu_rows[key] = GpuGroup(
+                     name=gpu.name,
+                     memory_mib=gpu.memory_mib,
+                     vendor=gpu.vendor,
+                     availability=[gpu.availability],
+                     spot=["spot" if gpu.spot else "on-demand"],
+                     count=Range[int](min=gpu.count, max=gpu.count),
+                     price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                     backend=backend.backend_type,
+                     regions=backend.regions.copy(),
+                 )
+             else:
+                 _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+     return sorted(
+         list(gpu_rows.values()),
+         key=lambda g: (
+             not any(av.is_available() for av in g.availability),
+             g.price.min,
+             g.price.max,
+             g.backend.value,
+             g.count.min,
+             g.name,
+             g.memory_mib,
+         ),
+     )
+
+
+ def _get_gpus_grouped_by_backend_region_and_count(
+     backend_gpus: List[BackendGpus],
+ ) -> List[GpuGroup]:
+     """Aggregates GPU specs, grouping them by backend, region, and GPU count."""
+     gpu_rows: Dict[Tuple, GpuGroup] = {}
+     for backend in backend_gpus:
+         for gpu in backend.gpus:
+             key = (
+                 gpu.name,
+                 gpu.memory_mib,
+                 gpu.vendor,
+                 backend.backend_type,
+                 gpu.region,
+                 gpu.count,
+             )
+             if key not in gpu_rows:
+                 per_gpu_price = gpu.price / gpu.count
+                 gpu_rows[key] = GpuGroup(
+                     name=gpu.name,
+                     memory_mib=gpu.memory_mib,
+                     vendor=gpu.vendor,
+                     availability=[gpu.availability],
+                     spot=["spot" if gpu.spot else "on-demand"],
+                     count=Range[int](min=gpu.count, max=gpu.count),
+                     price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                     backend=backend.backend_type,
+                     region=gpu.region,
+                 )
+             else:
+                 _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+     return sorted(
+         list(gpu_rows.values()),
+         key=lambda g: (
+             not any(av.is_available() for av in g.availability),
+             g.price.min,
+             g.price.max,
+             g.backend.value,
+             g.region,
+             g.count.min,
+             g.name,
+             g.memory_mib,
+         ),
+     )
+
+
+ async def list_gpus_grouped(
+     session: AsyncSession,
+     project: ProjectModel,
+     run_spec: RunSpec,
+     group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+ ) -> ListGpusResponse:
+     """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+     offers = await _get_gpu_offers(session, project, run_spec)
+     backend_gpus = _process_offers_into_backend_gpus(offers)
+
+     group_by_set = set(group_by) if group_by else set()
+
+     if "region" in group_by_set and "backend" not in group_by_set:
+         from dstack._internal.core.errors import ServerClientError
+
+         raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+     # Determine grouping strategy based on combination
+     has_backend = "backend" in group_by_set
+     has_region = "region" in group_by_set
+     has_count = "count" in group_by_set
+     if has_backend and has_region and has_count:
+         gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+     elif has_backend and has_count:
+         gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+     elif has_backend and has_region:
+         gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+     elif has_backend:
+         gpus = _get_gpus_grouped_by_backend(backend_gpus)
+     elif has_count:
+         gpus = _get_gpus_grouped_by_count(backend_gpus)
+     else:
+         gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+     return ListGpusResponse(gpus=gpus)
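
Note on the aggregation above: _update_gpu_group folds each additional offer into an existing row by widening the count and per-GPU price ranges. A minimal self-contained sketch of that folding logic; the dataclasses below are simplified stand-ins for dstack's Range and offer models, not the real schemas:

from dataclasses import dataclass

@dataclass
class Range:
    min: float
    max: float

@dataclass
class Offer:
    count: int    # GPUs per instance
    price: float  # total instance price per hour

def aggregate(offers):
    # Track per-GPU price and count ranges across offers,
    # mirroring the min/max updates in _update_gpu_group.
    count = Range(offers[0].count, offers[0].count)
    per_gpu = offers[0].price / offers[0].count
    price = Range(per_gpu, per_gpu)
    for o in offers[1:]:
        per_gpu = o.price / o.count
        count = Range(min(count.min, o.count), max(count.max, o.count))
        price = Range(min(price.min, per_gpu), max(price.max, per_gpu))
    return count, price

# Example: two offers of the same GPU, 1x at $2.5/h and 8x at $16/h
count, price = aggregate([Offer(1, 2.5), Offer(8, 16.0)])
print(count)  # Range(min=1, max=8)
print(price)  # Range(min=2.0, max=2.5) -- per-GPU price, not instance price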
dstack/_internal/server/services/offers.py
@@ -49,6 +49,7 @@ async def get_offers_by_requirements(
      backend_types = profile.backends
      regions = profile.regions
      availability_zones = profile.availability_zones
+     instance_types = profile.instance_types

      if volumes:
          mount_point_volumes = volumes[0]
@@ -97,9 +98,43 @@
          exclude_not_available=exclude_not_available,
      )

-     # Filter offers again for backends since a backend
-     # can return offers of different backend types (e.g. BackendType.DSTACK).
-     # The first filter should remain as an optimization.
+     offers = filter_offers(
+         offers=offers,
+         # Double filtering by backends if backend returns offers for other backend.
+         backend_types=backend_types,
+         regions=regions,
+         availability_zones=availability_zones,
+         instance_types=instance_types,
+         placement_group=placement_group,
+     )
+
+     if blocks == 1:
+         return offers
+
+     shareable_offers = []
+     for backend, offer in offers:
+         resources = offer.instance.resources
+         cpu_count = resources.cpus
+         gpu_count = len(resources.gpus)
+         if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
+             # TPUs cannot be shared
+             gpu_count = 1
+         divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
+         if not divisible:
+             continue
+         offer.total_blocks = _blocks
+         shareable_offers.append((backend, offer))
+     return shareable_offers
+
+
+ def filter_offers(
+     offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+     backend_types: Optional[List[BackendType]] = None,
+     regions: Optional[List[str]] = None,
+     availability_zones: Optional[List[str]] = None,
+     instance_types: Optional[List[str]] = None,
+     placement_group: Optional[PlacementGroup] = None,
+ ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
      if backend_types is not None:
          offers = [(b, o) for b, o in offers if o.backend in backend_types]

@@ -119,39 +154,21 @@
              new_offers.append((b, new_offer))
          offers = new_offers

+     if instance_types is not None:
+         instance_types = [i.lower() for i in instance_types]
+         offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
+
      if placement_group is not None:
          new_offers = []
          for b, o in offers:
-             for backend in backends:
-                 compute = backend.compute()
-                 if isinstance(
-                     compute, ComputeWithPlacementGroupSupport
-                 ) and compute.is_suitable_placement_group(placement_group, o):
-                     new_offers.append((b, o))
-                     break
+             compute = b.compute()
+             if isinstance(
+                 compute, ComputeWithPlacementGroupSupport
+             ) and compute.is_suitable_placement_group(placement_group, o):
+                 new_offers.append((b, o))
          offers = new_offers

-     if profile.instance_types is not None:
-         instance_types = [i.lower() for i in profile.instance_types]
-         offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
-
-     if blocks == 1:
-         return offers
-
-     shareable_offers = []
-     for backend, offer in offers:
-         resources = offer.instance.resources
-         cpu_count = resources.cpus
-         gpu_count = len(resources.gpus)
-         if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
-             # TPUs cannot be shared
-             gpu_count = 1
-         divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
-         if not divisible:
-             continue
-         offer.total_blocks = _blocks
-         shareable_offers.append((backend, offer))
-     return shareable_offers
+     return offers


  def is_divisible_into_blocks(
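
The instance-type filter that moved into filter_offers matches names case-insensitively. A standalone sketch of the same comparison, with plain tuples standing in for the (Backend, InstanceOfferWithAvailability) pairs used by the real function:

def filter_by_instance_types(offers, instance_types=None):
    # Mirrors the case-insensitive name match in filter_offers;
    # offers are (backend_name, instance_name, price) tuples here.
    if instance_types is None:
        return offers
    wanted = [i.lower() for i in instance_types]
    return [o for o in offers if o[1].lower() in wanted]

offers = [("aws", "g5.xlarge", 1.0), ("gcp", "A2-highgpu-1g", 3.7)]
print(filter_by_instance_types(offers, ["a2-highgpu-1g"]))
# [('gcp', 'A2-highgpu-1g', 3.7)]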
dstack/_internal/server/services/probes.py
@@ -1,6 +1,10 @@
- from dstack._internal.core.models.runs import Probe
+ from dstack._internal.core.models.runs import Probe, ProbeSpec
  from dstack._internal.server.models import ProbeModel


  def probe_model_to_probe(probe_model: ProbeModel) -> Probe:
      return Probe(success_streak=probe_model.success_streak)
+
+
+ def is_probe_ready(probe: ProbeModel, spec: ProbeSpec) -> bool:
+     return probe.success_streak >= spec.ready_after
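
The new is_probe_ready helper reduces readiness to a single comparison: a probe is ready once its consecutive-success streak reaches the spec's ready_after threshold. A minimal illustration, with SimpleNamespace stand-ins for the real ProbeModel and ProbeSpec:

from types import SimpleNamespace

def is_probe_ready(probe, spec) -> bool:
    # Same comparison as the new helper: ready once the
    # consecutive-success count reaches the configured threshold.
    return probe.success_streak >= spec.ready_after

probe = SimpleNamespace(success_streak=3)
spec = SimpleNamespace(ready_after=3)
print(is_probe_ready(probe, spec))  # True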
dstack/_internal/server/services/proxy/repo.py
@@ -54,6 +54,7 @@ class ServerProxyRepo(BaseProxyRepo):
              RunModel.gateway_id.is_(None),
              JobModel.run_name == run_name,
              JobModel.status == JobStatus.RUNNING,
+             JobModel.registered == True,
              JobModel.job_num == 0,
          )
          .options(
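
With the added condition, the proxy only resolves a service to replicas whose job 0 has been registered, not merely to any running job. A toy SQLAlchemy sketch of the same kind of boolean-column filter; the Job model below is a stand-in, not dstack's JobModel:

from sqlalchemy import Boolean, Column, Integer, String, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Job(Base):  # toy stand-in for JobModel
    __tablename__ = "jobs"
    id = Column(Integer, primary_key=True)
    run_name = Column(String)
    registered = Column(Boolean, default=False)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as s:
    s.add_all([Job(run_name="svc", registered=True), Job(run_name="svc")])
    s.commit()
    # registered == True keeps only jobs that completed registration,
    # analogous to the updated ServerProxyRepo query.
    rows = s.execute(
        select(Job).where(Job.run_name == "svc", Job.registered == True)
    ).scalars().all()
    print(len(rows))  # 1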
dstack/_internal/server/services/runs.py
@@ -41,6 +41,7 @@ from dstack._internal.core.models.runs import (
      JobStatus,
      JobSubmission,
      JobTerminationReason,
+     ProbeSpec,
      Run,
      RunPlan,
      RunSpec,
@@ -58,6 +59,7 @@ from dstack._internal.server import settings
  from dstack._internal.server.db import get_db
  from dstack._internal.server.models import (
      JobModel,
+     ProbeModel,
      ProjectModel,
      RepoModel,
      RunModel,
@@ -86,6 +88,7 @@ from dstack._internal.server.services.locking import get_locker, string_to_lock_
  from dstack._internal.server.services.logging import fmt
  from dstack._internal.server.services.offers import get_offers_by_requirements
  from dstack._internal.server.services.plugins import apply_plugin_policies
+ from dstack._internal.server.services.probes import is_probe_ready
  from dstack._internal.server.services.projects import list_user_project_models
  from dstack._internal.server.services.resources import set_resources_defaults
  from dstack._internal.server.services.secrets import get_project_secrets_mapping
@@ -1185,8 +1188,8 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica
          elif {JobStatus.PROVISIONING, JobStatus.PULLING} & statuses:
              # if there are any provisioning or pulling jobs, the replica is active and has the importance of 1
              active_replicas.append((1, is_out_of_date, replica_num, replica_jobs))
-         elif not is_replica_ready(replica_jobs):
-             # all jobs are running, but probes are failing, the replica is active and has the importance of 2
+         elif not is_replica_registered(replica_jobs):
+             # all jobs are running, but not receiving traffic, the replica is active and has the importance of 2
              active_replicas.append((2, is_out_of_date, replica_num, replica_jobs))
          else:
              # all jobs are running and ready, the replica is active and has the importance of 3
@@ -1273,15 +1276,13 @@ async def retry_run_replica_jobs(
          session.add(new_job_model)


- def is_replica_ready(jobs: Iterable[JobModel]) -> bool:
-     if not all(job.status == JobStatus.RUNNING for job in jobs):
-         return False
-     for job in jobs:
-         job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
-         for probe_spec, probe in zip(job_spec.probes, job.probes):
-             if probe.success_streak < probe_spec.ready_after:
-                 return False
-     return True
+ def is_job_ready(probes: Iterable[ProbeModel], probe_specs: Iterable[ProbeSpec]) -> bool:
+     return all(is_probe_ready(probe, probe_spec) for probe, probe_spec in zip(probes, probe_specs))
+
+
+ def is_replica_registered(jobs: list[JobModel]) -> bool:
+     # Only job_num=0 is supposed to receive service requests
+     return jobs[0].registered


  def _remove_job_spec_sensitive_info(spec: JobSpec):
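
The scaling logic now distinguishes two predicates: is_job_ready (all probe streaks have reached their thresholds) and is_replica_registered (traffic eligibility, decided by job 0's registered flag). A minimal sketch of both, with SimpleNamespace stand-ins for the ORM models:

from types import SimpleNamespace

def is_job_ready(probes, probe_specs) -> bool:
    # All probes must have reached their ready_after streaks.
    return all(p.success_streak >= s.ready_after for p, s in zip(probes, probe_specs))

def is_replica_registered(jobs) -> bool:
    # Only job_num=0 receives service traffic, so its flag decides.
    return jobs[0].registered

jobs = [SimpleNamespace(registered=True), SimpleNamespace(registered=False)]
print(is_replica_registered(jobs))  # True -- job 0 is registered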