dstack 0.19.23rc1__py3-none-any.whl → 0.19.25rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (60)
  1. dstack/_internal/cli/commands/apply.py +14 -2
  2. dstack/_internal/cli/commands/init.py +47 -2
  3. dstack/_internal/cli/commands/offer.py +68 -60
  4. dstack/_internal/cli/services/configurators/run.py +38 -10
  5. dstack/_internal/cli/services/repos.py +6 -24
  6. dstack/_internal/cli/utils/common.py +7 -0
  7. dstack/_internal/cli/utils/gpu.py +210 -0
  8. dstack/_internal/cli/utils/run.py +33 -0
  9. dstack/_internal/core/backends/aws/compute.py +1 -4
  10. dstack/_internal/core/backends/base/compute.py +0 -4
  11. dstack/_internal/core/backends/gcp/compute.py +1 -4
  12. dstack/_internal/core/backends/nebius/compute.py +1 -4
  13. dstack/_internal/core/models/common.py +1 -1
  14. dstack/_internal/core/models/config.py +3 -1
  15. dstack/_internal/core/models/configurations.py +16 -14
  16. dstack/_internal/core/models/fleets.py +2 -2
  17. dstack/_internal/core/models/instances.py +4 -1
  18. dstack/_internal/core/models/profiles.py +2 -2
  19. dstack/_internal/core/models/repos/remote.py +2 -2
  20. dstack/_internal/core/models/resources.py +4 -4
  21. dstack/_internal/core/models/runs.py +13 -9
  22. dstack/_internal/core/services/configs/__init__.py +4 -6
  23. dstack/_internal/proxy/gateway/services/registry.py +2 -0
  24. dstack/_internal/server/app.py +2 -0
  25. dstack/_internal/server/background/tasks/process_fleets.py +10 -2
  26. dstack/_internal/server/background/tasks/process_running_jobs.py +66 -46
  27. dstack/_internal/server/background/tasks/process_runs.py +16 -15
  28. dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
  29. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  30. dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py +484 -0
  31. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  32. dstack/_internal/server/models.py +24 -13
  33. dstack/_internal/server/routers/gpus.py +29 -0
  34. dstack/_internal/server/schemas/gateways.py +1 -1
  35. dstack/_internal/server/schemas/gpus.py +66 -0
  36. dstack/_internal/server/services/docker.py +1 -1
  37. dstack/_internal/server/services/gpus.py +390 -0
  38. dstack/_internal/server/services/jobs/__init__.py +3 -1
  39. dstack/_internal/server/services/offers.py +48 -31
  40. dstack/_internal/server/services/probes.py +5 -1
  41. dstack/_internal/server/services/proxy/repo.py +1 -0
  42. dstack/_internal/server/services/repos.py +1 -1
  43. dstack/_internal/server/services/runs.py +15 -12
  44. dstack/_internal/server/services/secrets.py +1 -1
  45. dstack/_internal/server/services/services/__init__.py +60 -41
  46. dstack/_internal/server/statics/index.html +1 -1
  47. dstack/_internal/server/statics/logo-notext.svg +116 -0
  48. dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
  49. dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js → main-d151b300fcac3933213d.js} +20 -23
  50. dstack/_internal/server/statics/{main-cc067b7fd1a8f33f97da.js.map → main-d151b300fcac3933213d.js.map} +1 -1
  51. dstack/_internal/server/testing/common.py +7 -2
  52. dstack/api/_public/repos.py +8 -7
  53. dstack/api/server/__init__.py +6 -0
  54. dstack/api/server/_gpus.py +22 -0
  55. dstack/version.py +1 -1
  56. {dstack-0.19.23rc1.dist-info → dstack-0.19.25rc1.dist-info}/METADATA +1 -1
  57. {dstack-0.19.23rc1.dist-info → dstack-0.19.25rc1.dist-info}/RECORD +60 -51
  58. {dstack-0.19.23rc1.dist-info → dstack-0.19.25rc1.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.23rc1.dist-info → dstack-0.19.25rc1.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.23rc1.dist-info → dstack-0.19.25rc1.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/schemas/gateways.py
@@ -14,7 +14,7 @@ class CreateGatewayRequest(CoreModel):
     backend_type: Annotated[Optional[BackendType], Field(exclude=True)] = None
     region: Annotated[Optional[str], Field(exclude=True)] = None
 
-    class Config:
+    class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]) -> None:
             del schema["properties"]["name"]
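
The `class Config(CoreModel.Config)` change above (the same fix is applied to `DockerImage` in `services/docker.py` below) makes the nested pydantic config subclass the base model's config rather than shadow it. pydantic v1 merges a model's `Config` with its parent model's internally, but code that reads settings directly off the nested class only sees them when the inheritance is explicit. A minimal sketch of the plain-Python behavior involved, with illustrative attribute names (not dstack's actual settings):

# Stand-in for a base-wide setting such as one of CoreModel.Config's options.
class BaseConfig:
    allow_population_by_field_name = True

class Shadowing:  # like `class Config:` — the base setting is not visible here
    frozen = True

class Inheriting(BaseConfig):  # like `class Config(CoreModel.Config):`
    frozen = True

assert not hasattr(Shadowing, "allow_population_by_field_name")
assert Inheriting.allow_population_by_field_name is True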
dstack/_internal/server/schemas/gpus.py (new file)
@@ -0,0 +1,66 @@
+from typing import List, Literal, Optional
+
+import gpuhunt
+from pydantic import Field
+
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.instances import InstanceAvailability
+from dstack._internal.core.models.resources import Range
+from dstack._internal.core.models.runs import RunSpec
+
+
+class BackendGpu(CoreModel):
+    """GPU specification from a backend offer."""
+
+    name: str
+    memory_mib: int
+    vendor: gpuhunt.AcceleratorVendor
+    availability: InstanceAvailability
+    spot: bool
+    count: int
+    price: float
+    region: str
+
+
+class BackendGpus(CoreModel):
+    """Backend GPU specifications."""
+
+    backend_type: BackendType
+    gpus: List[BackendGpu]
+    regions: List[str]
+
+
+class ListGpusRequest(CoreModel):
+    """Request for listing GPUs with optional grouping."""
+
+    run_spec: RunSpec
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
+        default=None,
+        description="List of fields to group by. Valid values: 'backend', 'region', 'count'. "
+        "Note: 'region' can only be used together with 'backend'.",
+    )
+
+
+class GpuGroup(CoreModel):
+    """GPU group that can handle all grouping scenarios."""
+
+    name: str
+    memory_mib: int
+    vendor: gpuhunt.AcceleratorVendor
+    availability: List[InstanceAvailability]
+    spot: List[Literal["spot", "on-demand"]]
+    count: Range[int]
+    price: Range[float]
+    backends: Optional[List[BackendType]] = None
+    backend: Optional[BackendType] = None
+    regions: Optional[List[str]] = None
+    region: Optional[str] = None
+
+
+class ListGpusResponse(CoreModel):
+    """Response containing GPU specifications."""
+
+    gpus: List[GpuGroup] = Field(
+        description="List of GPU specifications, grouped according to the group_by parameter"
+    )
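
For illustration (the values below are made up, not real offers): with no `group_by`, each `GpuGroup` aggregates one GPU model across backends, so `backends` is populated while `backend` and `region` stay `None`; with `group_by=["backend", "region"]`, each group pins a single `backend` and `region` instead. Serialized, the two shapes look roughly like:

# Hypothetical serialized GpuGroup rows (illustrative values only).

# group_by=None — one row per GPU model, aggregated across backends:
{"name": "H100", "memory_mib": 81920, "vendor": "nvidia",
 "availability": ["available"], "spot": ["spot", "on-demand"],
 "count": {"min": 1, "max": 8}, "price": {"min": 1.99, "max": 3.5},
 "backends": ["aws", "gcp"], "backend": None, "regions": None, "region": None}

# group_by=["backend", "region"] — one row per (GPU model, backend, region):
{"name": "H100", "memory_mib": 81920, "vendor": "nvidia",
 "availability": ["available"], "spot": ["on-demand"],
 "count": {"min": 1, "max": 8}, "price": {"min": 3.1, "max": 3.5},
 "backends": None, "backend": "aws", "regions": None, "region": "us-east-1"}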
dstack/_internal/server/services/docker.py
@@ -32,7 +32,7 @@ class DXFAuthAdapter:
 
 
 class DockerImage(CoreModel):
-    class Config:
+    class Config(CoreModel.Config):
         frozen = True
 
     image: str
dstack/_internal/server/services/gpus.py (new file)
@@ -0,0 +1,390 @@
+from typing import Dict, List, Literal, Optional, Tuple
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from dstack._internal.core.backends.base.backend import Backend
+from dstack._internal.core.models.instances import InstanceOfferWithAvailability
+from dstack._internal.core.models.profiles import SpotPolicy
+from dstack._internal.core.models.resources import Range
+from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.schemas.gpus import (
+    BackendGpu,
+    BackendGpus,
+    GpuGroup,
+    ListGpusResponse,
+)
+from dstack._internal.server.services.offers import get_offers_by_requirements
+
+
+async def _get_gpu_offers(
+    session: AsyncSession, project: ProjectModel, run_spec: RunSpec
+) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
+    """Fetches all available instance offers that match the run spec's GPU requirements."""
+    profile = run_spec.merged_profile
+    requirements = Requirements(
+        resources=run_spec.configuration.resources,
+        max_price=profile.max_price,
+        spot=get_policy_map(profile.spot_policy, default=SpotPolicy.AUTO),
+        reservation=profile.reservation,
+    )
+
+    return await get_offers_by_requirements(
+        project=project,
+        profile=profile,
+        requirements=requirements,
+        exclude_not_available=False,
+        multinode=False,
+        volumes=None,
+        privileged=False,
+        instance_mounts=False,
+    )
+
+
+def _process_offers_into_backend_gpus(
+    offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+) -> List[BackendGpus]:
+    """Transforms raw offers into a structured list of BackendGpus, aggregating GPU info."""
+    backend_data: Dict[str, Dict] = {}
+
+    for backend, offer in offers:
+        backend_type = backend.TYPE
+        if backend_type not in backend_data:
+            backend_data[backend_type] = {"gpus": {}, "regions": set()}
+
+        backend_data[backend_type]["regions"].add(offer.region)
+
+        if not offer.instance.resources.gpus:
+            continue
+
+        gpu_types_in_offer = {}
+        for gpu in offer.instance.resources.gpus:
+            gpu_type_key = (gpu.name, gpu.memory_mib, gpu.vendor)
+            if gpu_type_key not in gpu_types_in_offer:
+                gpu_types_in_offer[gpu_type_key] = 0
+            gpu_types_in_offer[gpu_type_key] += 1
+
+        for (
+            gpu_name,
+            gpu_memory_mib,
+            gpu_vendor,
+        ), gpu_count_in_offer in gpu_types_in_offer.items():
+            instance_config_key = (
+                gpu_name,
+                gpu_memory_mib,
+                gpu_vendor,
+                gpu_count_in_offer,
+                offer.instance.resources.spot,
+                offer.region,
+            )
+
+            if instance_config_key not in backend_data[backend_type]["gpus"]:
+                backend_data[backend_type]["gpus"][instance_config_key] = BackendGpu(
+                    name=gpu_name,
+                    memory_mib=gpu_memory_mib,
+                    vendor=gpu_vendor,
+                    availability=offer.availability,
+                    spot=offer.instance.resources.spot,
+                    count=gpu_count_in_offer,
+                    price=offer.price,
+                    region=offer.region,
+                )
+
+    backend_gpus_list = []
+    for backend_type, data in backend_data.items():
+        gpus_list = sorted(
+            list(data["gpus"].values()),
+            key=lambda g: (
+                not g.availability.is_available(),
+                g.vendor.value,
+                g.name,
+                g.memory_mib,
+            ),
+        )
+        backend_gpus_list.append(
+            BackendGpus(
+                backend_type=backend_type,
+                gpus=gpus_list,
+                regions=sorted(list(data["regions"])),
+            )
+        )
+    return backend_gpus_list
+
+
+def _update_gpu_group(row: GpuGroup, gpu: BackendGpu, backend_type: str):
+    """Updates an existing GpuGroup with new data from another GPU offer."""
+    spot_type: Literal["spot", "on-demand"] = "spot" if gpu.spot else "on-demand"
+
+    if gpu.availability not in row.availability:
+        row.availability.append(gpu.availability)
+    if spot_type not in row.spot:
+        row.spot.append(spot_type)
+    if row.backends and backend_type not in row.backends:
+        row.backends.append(backend_type)
+
+    row.count.min = min(row.count.min, gpu.count)
+    row.count.max = max(row.count.max, gpu.count)
+    per_gpu_price = gpu.price / gpu.count
+    row.price.min = min(row.price.min, per_gpu_price)
+    row.price.max = max(row.price.max, per_gpu_price)
+
+
+def _get_gpus_with_no_grouping(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs into a flat list, without any grouping."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                price_range = Range[float](min=per_gpu_price, max=per_gpu_price)
+
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=price_range,
+                    backends=[backend.backend_type],
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    result = sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+    return result
+
+
+def _get_gpus_grouped_by_backend(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    regions=backend.regions.copy(),
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_and_region(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by both backend and region."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.region)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.region,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, gpu.count)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backends=[backend.backend_type],
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_and_count(backend_gpus: List[BackendGpus]) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend and GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (gpu.name, gpu.memory_mib, gpu.vendor, backend.backend_type, gpu.count)
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    regions=backend.regions.copy(),
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+def _get_gpus_grouped_by_backend_region_and_count(
+    backend_gpus: List[BackendGpus],
+) -> List[GpuGroup]:
+    """Aggregates GPU specs, grouping them by backend, region, and GPU count."""
+    gpu_rows: Dict[Tuple, GpuGroup] = {}
+    for backend in backend_gpus:
+        for gpu in backend.gpus:
+            key = (
+                gpu.name,
+                gpu.memory_mib,
+                gpu.vendor,
+                backend.backend_type,
+                gpu.region,
+                gpu.count,
+            )
+            if key not in gpu_rows:
+                per_gpu_price = gpu.price / gpu.count
+                gpu_rows[key] = GpuGroup(
+                    name=gpu.name,
+                    memory_mib=gpu.memory_mib,
+                    vendor=gpu.vendor,
+                    availability=[gpu.availability],
+                    spot=["spot" if gpu.spot else "on-demand"],
+                    count=Range[int](min=gpu.count, max=gpu.count),
+                    price=Range[float](min=per_gpu_price, max=per_gpu_price),
+                    backend=backend.backend_type,
+                    region=gpu.region,
+                )
+            else:
+                _update_gpu_group(gpu_rows[key], gpu, backend.backend_type)
+
+    return sorted(
+        list(gpu_rows.values()),
+        key=lambda g: (
+            not any(av.is_available() for av in g.availability),
+            g.price.min,
+            g.price.max,
+            g.backend.value,
+            g.region,
+            g.count.min,
+            g.name,
+            g.memory_mib,
+        ),
+    )
+
+
+async def list_gpus_grouped(
+    session: AsyncSession,
+    project: ProjectModel,
+    run_spec: RunSpec,
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = None,
+) -> ListGpusResponse:
+    """Retrieves available GPU specifications based on a run spec, with optional grouping."""
+    offers = await _get_gpu_offers(session, project, run_spec)
+    backend_gpus = _process_offers_into_backend_gpus(offers)
+
+    group_by_set = set(group_by) if group_by else set()
+
+    if "region" in group_by_set and "backend" not in group_by_set:
+        from dstack._internal.core.errors import ServerClientError
+
+        raise ServerClientError("Cannot group by 'region' without also grouping by 'backend'")
+
+    # Determine grouping strategy based on combination
+    has_backend = "backend" in group_by_set
+    has_region = "region" in group_by_set
+    has_count = "count" in group_by_set
+    if has_backend and has_region and has_count:
+        gpus = _get_gpus_grouped_by_backend_region_and_count(backend_gpus)
+    elif has_backend and has_count:
+        gpus = _get_gpus_grouped_by_backend_and_count(backend_gpus)
+    elif has_backend and has_region:
+        gpus = _get_gpus_grouped_by_backend_and_region(backend_gpus)
+    elif has_backend:
+        gpus = _get_gpus_grouped_by_backend(backend_gpus)
+    elif has_count:
+        gpus = _get_gpus_grouped_by_count(backend_gpus)
+    else:
+        gpus = _get_gpus_with_no_grouping(backend_gpus)
+
+    return ListGpusResponse(gpus=gpus)
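
A detail worth noting in the aggregation above: offer prices are whole-instance prices, so `_update_gpu_group` and the `_get_gpus_grouped_by_*` builders normalize to a per-GPU price (`gpu.price / gpu.count`) before widening a group's ranges. A self-contained sketch of that folding, with made-up numbers:

# Sketch of the range-widening done by _update_gpu_group,
# using plain tuples instead of dstack's models.
offers = [
    # (gpu_count, offer_price_per_hour)
    (1, 2.0),
    (4, 10.0),
    (8, 16.0),
]

count_min = count_max = offers[0][0]
price_min = price_max = offers[0][1] / offers[0][0]
for count, price in offers[1:]:
    per_gpu = price / count  # normalize to a per-GPU price before comparing
    count_min, count_max = min(count_min, count), max(count_max, count)
    price_min, price_max = min(price_min, per_gpu), max(price_max, per_gpu)

print(count_min, count_max)  # 1 8
print(price_min, price_max)  # 2.0 2.5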
dstack/_internal/server/services/jobs/__init__.py
@@ -152,7 +152,9 @@ def job_model_to_job_submission(
         inactivity_secs=job_model.inactivity_secs,
         status=job_model.status,
         status_message=status_message,
-        termination_reason=job_model.termination_reason,
+        termination_reason=job_model.termination_reason.value
+        if job_model.termination_reason
+        else None,
         termination_reason_message=job_model.termination_reason_message,
         exit_status=job_model.exit_status,
         job_provisioning_data=job_provisioning_data,
dstack/_internal/server/services/offers.py
@@ -49,6 +49,7 @@ async def get_offers_by_requirements(
     backend_types = profile.backends
     regions = profile.regions
     availability_zones = profile.availability_zones
+    instance_types = profile.instance_types
 
     if volumes:
         mount_point_volumes = volumes[0]
@@ -97,9 +98,43 @@ async def get_offers_by_requirements(
         exclude_not_available=exclude_not_available,
     )
 
-    # Filter offers again for backends since a backend
-    # can return offers of different backend types (e.g. BackendType.DSTACK).
-    # The first filter should remain as an optimization.
+    offers = filter_offers(
+        offers=offers,
+        # Double filtering by backends if backend returns offers for other backend.
+        backend_types=backend_types,
+        regions=regions,
+        availability_zones=availability_zones,
+        instance_types=instance_types,
+        placement_group=placement_group,
+    )
+
+    if blocks == 1:
+        return offers
+
+    shareable_offers = []
+    for backend, offer in offers:
+        resources = offer.instance.resources
+        cpu_count = resources.cpus
+        gpu_count = len(resources.gpus)
+        if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
+            # TPUs cannot be shared
+            gpu_count = 1
+        divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
+        if not divisible:
+            continue
+        offer.total_blocks = _blocks
+        shareable_offers.append((backend, offer))
+    return shareable_offers
+
+
+def filter_offers(
+    offers: List[Tuple[Backend, InstanceOfferWithAvailability]],
+    backend_types: Optional[List[BackendType]] = None,
+    regions: Optional[List[str]] = None,
+    availability_zones: Optional[List[str]] = None,
+    instance_types: Optional[List[str]] = None,
+    placement_group: Optional[PlacementGroup] = None,
+) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     if backend_types is not None:
         offers = [(b, o) for b, o in offers if o.backend in backend_types]
 
@@ -119,39 +154,21 @@
             new_offers.append((b, new_offer))
         offers = new_offers
 
+    if instance_types is not None:
+        instance_types = [i.lower() for i in instance_types]
+        offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
+
     if placement_group is not None:
         new_offers = []
         for b, o in offers:
-            for backend in backends:
-                compute = backend.compute()
-                if isinstance(
-                    compute, ComputeWithPlacementGroupSupport
-                ) and compute.is_suitable_placement_group(placement_group, o):
-                    new_offers.append((b, o))
-                    break
+            compute = b.compute()
+            if isinstance(
+                compute, ComputeWithPlacementGroupSupport
+            ) and compute.is_suitable_placement_group(placement_group, o):
+                new_offers.append((b, o))
         offers = new_offers
 
-    if profile.instance_types is not None:
-        instance_types = [i.lower() for i in profile.instance_types]
-        offers = [(b, o) for b, o in offers if o.instance.name.lower() in instance_types]
-
-    if blocks == 1:
-        return offers
-
-    shareable_offers = []
-    for backend, offer in offers:
-        resources = offer.instance.resources
-        cpu_count = resources.cpus
-        gpu_count = len(resources.gpus)
-        if gpu_count > 0 and resources.gpus[0].vendor == gpuhunt.AcceleratorVendor.GOOGLE:
-            # TPUs cannot be shared
-            gpu_count = 1
-        divisible, _blocks = is_divisible_into_blocks(cpu_count, gpu_count, blocks)
-        if not divisible:
-            continue
-        offer.total_blocks = _blocks
-        shareable_offers.append((backend, offer))
-    return shareable_offers
+    return offers
 
 
 def is_divisible_into_blocks(
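
The block-sharing pass relocated above is behavior-preserving: it keeps only offers whose CPUs and GPUs divide evenly into the requested number of `blocks` (with TPU offers pinned to a single shareable unit). The body of `is_divisible_into_blocks` is outside this diff, so the following is only a rough sketch of the divisibility idea, not the actual implementation:

# Rough sketch; the real is_divisible_into_blocks lives in this file
# but its body is not shown in the diff, so details may differ.
def divisible_into_blocks_sketch(cpu_count: int, gpu_count: int, blocks: int):
    if blocks <= 0:
        return False, 0
    if cpu_count % blocks != 0 or (gpu_count > 0 and gpu_count % blocks != 0):
        return False, 0
    return True, blocks

print(divisible_into_blocks_sketch(cpu_count=16, gpu_count=8, blocks=4))  # (True, 4)
print(divisible_into_blocks_sketch(cpu_count=16, gpu_count=6, blocks=4))  # (False, 0)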
dstack/_internal/server/services/probes.py
@@ -1,6 +1,10 @@
-from dstack._internal.core.models.runs import Probe
+from dstack._internal.core.models.runs import Probe, ProbeSpec
 from dstack._internal.server.models import ProbeModel
 
 
 def probe_model_to_probe(probe_model: ProbeModel) -> Probe:
     return Probe(success_streak=probe_model.success_streak)
+
+
+def is_probe_ready(probe: ProbeModel, spec: ProbeSpec) -> bool:
+    return probe.success_streak >= spec.ready_after
dstack/_internal/server/services/proxy/repo.py
@@ -54,6 +54,7 @@ class ServerProxyRepo(BaseProxyRepo):
                 RunModel.gateway_id.is_(None),
                 JobModel.run_name == run_name,
                 JobModel.status == JobStatus.RUNNING,
+                JobModel.registered == True,
                 JobModel.job_num == 0,
             )
             .options(
dstack/_internal/server/services/repos.py
@@ -129,7 +129,7 @@ async def create_repo(
     repo = RepoModel(
         project_id=project.id,
         name=repo_id,
-        type=repo_info.repo_type,
+        type=RepoType(repo_info.repo_type),
         info=repo_info.json(),
     )
     try: