dstack 0.19.24__py3-none-any.whl → 0.19.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (56):
  1. dstack/_internal/cli/commands/apply.py +14 -2
  2. dstack/_internal/cli/commands/init.py +47 -2
  3. dstack/_internal/cli/commands/offer.py +68 -60
  4. dstack/_internal/cli/services/configurators/run.py +35 -10
  5. dstack/_internal/cli/services/repos.py +6 -24
  6. dstack/_internal/cli/utils/common.py +7 -0
  7. dstack/_internal/cli/utils/gpu.py +210 -0
  8. dstack/_internal/cli/utils/run.py +33 -0
  9. dstack/_internal/core/backends/aws/compute.py +1 -4
  10. dstack/_internal/core/backends/base/compute.py +0 -4
  11. dstack/_internal/core/backends/gcp/compute.py +1 -4
  12. dstack/_internal/core/backends/nebius/compute.py +1 -4
  13. dstack/_internal/core/models/common.py +1 -1
  14. dstack/_internal/core/models/config.py +3 -1
  15. dstack/_internal/core/models/configurations.py +16 -14
  16. dstack/_internal/core/models/fleets.py +2 -2
  17. dstack/_internal/core/models/instances.py +1 -1
  18. dstack/_internal/core/models/profiles.py +2 -2
  19. dstack/_internal/core/models/repos/remote.py +2 -2
  20. dstack/_internal/core/models/resources.py +4 -4
  21. dstack/_internal/core/models/runs.py +1 -1
  22. dstack/_internal/core/services/configs/__init__.py +8 -7
  23. dstack/_internal/proxy/gateway/services/registry.py +2 -0
  24. dstack/_internal/server/app.py +2 -0
  25. dstack/_internal/server/background/tasks/process_fleets.py +10 -2
  26. dstack/_internal/server/background/tasks/process_running_jobs.py +65 -44
  27. dstack/_internal/server/background/tasks/process_runs.py +15 -14
  28. dstack/_internal/server/background/tasks/process_submitted_jobs.py +251 -52
  29. dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py +28 -0
  30. dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py +41 -0
  31. dstack/_internal/server/models.py +13 -1
  32. dstack/_internal/server/routers/gpus.py +29 -0
  33. dstack/_internal/server/schemas/gateways.py +1 -1
  34. dstack/_internal/server/schemas/gpus.py +66 -0
  35. dstack/_internal/server/services/docker.py +1 -1
  36. dstack/_internal/server/services/gpus.py +390 -0
  37. dstack/_internal/server/services/offers.py +48 -31
  38. dstack/_internal/server/services/probes.py +5 -1
  39. dstack/_internal/server/services/proxy/repo.py +1 -0
  40. dstack/_internal/server/services/runs.py +12 -11
  41. dstack/_internal/server/services/services/__init__.py +60 -41
  42. dstack/_internal/server/statics/index.html +1 -1
  43. dstack/_internal/server/statics/logo-notext.svg +116 -0
  44. dstack/_internal/server/statics/{main-03e818b110e1d5705378.css → main-aec4762350e34d6fbff9.css} +1 -1
  45. dstack/_internal/server/statics/{main-16813e4e1d1c4119eda3.js → main-d151b300fcac3933213d.js} +19 -22
  46. dstack/_internal/server/statics/{main-16813e4e1d1c4119eda3.js.map → main-d151b300fcac3933213d.js.map} +1 -1
  47. dstack/_internal/server/testing/common.py +7 -2
  48. dstack/api/_public/repos.py +8 -7
  49. dstack/api/server/__init__.py +6 -0
  50. dstack/api/server/_gpus.py +22 -0
  51. dstack/version.py +1 -1
  52. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/METADATA +1 -1
  53. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/RECORD +56 -48
  54. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/WHEEL +0 -0
  55. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/entry_points.txt +0 -0
  56. {dstack-0.19.24.dist-info → dstack-0.19.25.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -1,17 +1,20 @@
 import asyncio
+import itertools
+import math
 import uuid
 from datetime import datetime, timedelta
 from typing import List, Optional, Tuple
 
-from sqlalchemy import select
+from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, load_only, selectinload
+from sqlalchemy.orm import contains_eager, joinedload, load_only, selectinload
 
 from dstack._internal.core.backends.base.backend import Backend
 from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
 from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.common import NetworkMode
 from dstack._internal.core.models.fleets import (
+    Fleet,
     FleetConfiguration,
     FleetSpec,
     FleetStatus,
@@ -23,7 +26,7 @@ from dstack._internal.core.models.profiles import (
     CreationPolicy,
     TerminationPolicy,
 )
-from dstack._internal.core.models.resources import Memory
+from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import (
     Job,
     JobProvisioningData,
@@ -157,7 +160,10 @@ async def _process_next_submitted_job():
 async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     # Refetch to load related attributes.
     res = await session.execute(
-        select(JobModel).where(JobModel.id == job_model.id).options(joinedload(JobModel.instance))
+        select(JobModel)
+        .where(JobModel.id == job_model.id)
+        .options(joinedload(JobModel.instance))
+        .options(joinedload(JobModel.fleet).joinedload(FleetModel.instances))
     )
     job_model = res.unique().scalar_one()
     res = await session.execute(
@@ -176,6 +182,12 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
 
+    # Master job chooses fleet for the run.
+    # Due to two-step processing, it's saved to job_model.fleet.
+    # Other jobs just inherit fleet from run_model.fleet.
+    # If master job chooses no fleet, the new fleet will be created.
+    fleet_model = run_model.fleet or job_model.fleet
+
     master_job = find_job(run.jobs, job_model.replica_num, 0)
     master_job_provisioning_data = None
     if job.job_spec.job_num != 0:
@@ -223,54 +235,80 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     # Then, the job runs on the assigned instance or a new instance is provisioned.
     # This is needed to avoid holding instances lock for a long time.
     if not job_model.instance_assigned:
-        # Try assigning an existing instance
-        res = await session.execute(
-            select(InstanceModel)
-            .where(
-                InstanceModel.project_id == project.id,
-                InstanceModel.deleted == False,
-                InstanceModel.total_blocks > InstanceModel.busy_blocks,
+        # If another job freed the instance but is still trying to detach volumes,
+        # do not provision on it to prevent attaching volumes that are currently detaching.
+        detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
+
+        fleet_filters = [
+            FleetModel.project_id == project.id,
+            FleetModel.deleted == False,
+        ]
+        if run_model.fleet is not None:
+            fleet_filters.append(FleetModel.id == run_model.fleet_id)
+        if run_spec.configuration.fleets is not None:
+            fleet_filters.append(FleetModel.name.in_(run_spec.configuration.fleets))
+
+        instance_filters = [
+            InstanceModel.deleted == False,
+            InstanceModel.total_blocks > InstanceModel.busy_blocks,
+            InstanceModel.id.not_in(detaching_instances_ids),
+        ]
+
+        fleet_models_with_instances, fleet_models_without_instances = await _select_fleet_models(
+            session=session,
+            fleet_filters=fleet_filters,
+            instance_filters=instance_filters,
+        )
+        instances_ids = sorted(
+            itertools.chain.from_iterable(
+                [i.id for i in f.instances] for f in fleet_models_with_instances
             )
-            .order_by(InstanceModel.id)  # take locks in order
-            .with_for_update(key_share=True)
         )
-        pool_instances = list(res.unique().scalars().all())
-        instances_ids = sorted([i.id for i in pool_instances])
+        fleet_models = fleet_models_with_instances + fleet_models_without_instances
+        fleets_ids = [f.id for f in fleet_models]
+
         if get_db().dialect_name == "sqlite":
             # Start new transaction to see committed changes after lock
             await session.commit()
+
         async with get_locker(get_db().dialect_name).lock_ctx(
             InstanceModel.__tablename__, instances_ids
         ):
-            # If another job freed the instance but is still trying to detach volumes,
-            # do not provision on it to prevent attaching volumes that are currently detaching.
-            detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
-            # Refetch after lock
-            res = await session.execute(
-                select(InstanceModel)
-                .where(
-                    InstanceModel.id.not_in(detaching_instances_ids),
-                    InstanceModel.id.in_(instances_ids),
-                    InstanceModel.deleted == False,
-                    InstanceModel.total_blocks > InstanceModel.busy_blocks,
+            if get_db().dialect_name == "sqlite":
+                fleet_models = await _refetch_fleet_models(
+                    session=session,
+                    fleets_ids=fleets_ids,
+                    instances_ids=instances_ids,
+                    fleet_filters=fleet_filters,
+                    instance_filters=instance_filters,
                 )
-                .options(joinedload(InstanceModel.fleet))
-                .execution_options(populate_existing=True)
-            )
-            pool_instances = list(res.unique().scalars().all())
-            instance = await _assign_job_to_pool_instance(
-                session=session,
-                pool_instances=pool_instances,
-                run_spec=run_spec,
-                job_model=job_model,
+            fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
+                fleet_models=fleet_models,
+                run_model=run_model,
+                run_spec=run.run_spec,
                 job=job,
-                fleet_model=run_model.fleet,
                 master_job_provisioning_data=master_job_provisioning_data,
                 volumes=volumes,
             )
+            if fleet_model is None and run_spec.configuration.fleets is not None:
+                # Run cannot create new fleets when fleets are specified
+                logger.debug("%s: failed to use specified fleets", fmt(job_model))
+                job_model.status = JobStatus.TERMINATING
+                job_model.termination_reason = (
+                    JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+                )
+                job_model.last_processed_at = common_utils.get_current_datetime()
+                await session.commit()
+                return
+            instance = await _assign_job_to_fleet_instance(
+                session=session,
+                instances_with_offers=fleet_instances_with_offers,
+                job_model=job_model,
+            )
+            job_model.fleet = fleet_model
             job_model.instance_assigned = True
            job_model.last_processed_at = common_utils.get_current_datetime()
-            if len(pool_instances) > 0:
+            if len(instances_ids) > 0:
                 await session.commit()
                 return
     # If no instances were locked, we can proceed in the same transaction.
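
Both the removed and the added queries rely on the deadlock-avoidance idea flagged by the `# take locks in order` comment: every worker sorts the instance IDs before acquiring row locks, so concurrent submitted-job processors always lock in one global order. A minimal, self-contained sketch of the idea (plain asyncio locks rather than database row locks; the worker names and instance IDs are invented):

    import asyncio
    from contextlib import AsyncExitStack

    # Stand-ins for InstanceModel.id values.
    locks = {iid: asyncio.Lock() for iid in ("i-a", "i-b", "i-c")}

    async def assign(worker: str, wanted: set) -> None:
        async with AsyncExitStack() as stack:
            # Sorting gives every worker the same acquisition order,
            # so overlapping lock sets cannot deadlock.
            for iid in sorted(wanted):
                await stack.enter_async_context(locks[iid])
            print(worker, "holds", sorted(wanted))

    async def main() -> None:
        await asyncio.gather(
            assign("job-1", {"i-b", "i-a"}),
            assign("job-2", {"i-c", "i-b"}),
        )

    asyncio.run(main())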
@@ -297,7 +335,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         # Create a new cloud instance
         run_job_result = await _run_job_on_new_instance(
             project=project,
-            fleet_model=run_model.fleet,
+            fleet_model=fleet_model,
             job_model=job_model,
             run=run,
             job=job,
@@ -318,11 +356,11 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         job_provisioning_data, offer = run_job_result
         job_model.job_provisioning_data = job_provisioning_data.json()
         job_model.status = JobStatus.PROVISIONING
-        fleet_model = _get_or_create_fleet_model_for_job(
-            project=project,
-            run_model=run_model,
-            run=run,
-        )
+        if fleet_model is None:
+            fleet_model = _create_fleet_model_for_job(
+                project=project,
+                run=run,
+            )
         instance_num = await _get_next_instance_num(
             session=session,
             fleet_model=fleet_model,
@@ -376,16 +414,156 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     await session.commit()
 
 
-async def _assign_job_to_pool_instance(
+async def _select_fleet_models(
+    session: AsyncSession, fleet_filters: list, instance_filters: list
+) -> tuple[list[FleetModel], list[FleetModel]]:
+    # Selecting fleets in two queries since Postgres does not allow
+    # locking nullable side of an outer join. So, first lock instances with inner join.
+    # Then select left out fleets without instances.
+    res = await session.execute(
+        select(FleetModel)
+        .join(FleetModel.instances)
+        .where(*fleet_filters)
+        .where(*instance_filters)
+        .options(contains_eager(FleetModel.instances))
+        .order_by(InstanceModel.id)  # take locks in order
+        .with_for_update(key_share=True, of=InstanceModel)
+    )
+    fleet_models_with_instances = list(res.unique().scalars().all())
+    fleet_models_with_instances_ids = [f.id for f in fleet_models_with_instances]
+    res = await session.execute(
+        select(FleetModel)
+        .outerjoin(FleetModel.instances)
+        .where(
+            *fleet_filters,
+            FleetModel.id.not_in(fleet_models_with_instances_ids),
+        )
+        .where(InstanceModel.id.is_(None))
+        .options(contains_eager(FleetModel.instances))  # loading empty relation
+    )
+    fleet_models_without_instances = list(res.unique().scalars().all())
+    return fleet_models_with_instances, fleet_models_without_instances
+
+
+async def _refetch_fleet_models(
     session: AsyncSession,
-    pool_instances: List[InstanceModel],
+    fleets_ids: list[uuid.UUID],
+    instances_ids: list[uuid.UUID],
+    fleet_filters: list,
+    instance_filters: list,
+) -> list[FleetModel]:
+    res = await session.execute(
+        select(FleetModel)
+        .outerjoin(FleetModel.instances)
+        .where(
+            FleetModel.id.in_(fleets_ids),
+            *fleet_filters,
+        )
+        .where(
+            or_(
+                InstanceModel.id.is_(None),
+                and_(
+                    InstanceModel.id.in_(instances_ids),
+                    *instance_filters,
+                ),
+            )
+        )
+        .options(contains_eager(FleetModel.instances))
+        .execution_options(populate_existing=True)
+    )
+    fleet_models = list(res.unique().scalars().all())
+    return fleet_models
+
+
+def _find_optimal_fleet_with_offers(
+    fleet_models: list[FleetModel],
+    run_model: RunModel,
+    run_spec: RunSpec,
+    job: Job,
+    master_job_provisioning_data: Optional[JobProvisioningData],
+    volumes: Optional[list[list[Volume]]],
+) -> tuple[Optional[FleetModel], list[tuple[InstanceModel, InstanceOfferWithAvailability]]]:
+    if run_model.fleet is not None:
+        # Using the fleet that was already chosen by the master job
+        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+            fleet_model=run_model.fleet,
+            run_spec=run_spec,
+            job=job,
+            master_job_provisioning_data=master_job_provisioning_data,
+            volumes=volumes,
+        )
+        return run_model.fleet, fleet_instances_with_offers
+
+    if len(fleet_models) == 0:
+        return None, []
+
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    # The current strategy is to first consider fleets that can accommodate
+    # the run without additional provisioning and choose the one with the cheapest offer.
+    # Fallback to fleet with the cheapest offer among all fleets with offers.
+    candidate_fleets_with_offers: list[
+        tuple[
+            Optional[FleetModel],
+            list[tuple[InstanceModel, InstanceOfferWithAvailability]],
+            int,
+            tuple[int, float],
+        ]
+    ] = []
+    for candidate_fleet_model in fleet_models:
+        fleet_instances_with_offers = _get_fleet_instances_with_offers(
+            fleet_model=candidate_fleet_model,
+            run_spec=run_spec,
+            job=job,
+            master_job_provisioning_data=master_job_provisioning_data,
+            volumes=volumes,
+        )
+        fleet_available_offers = [
+            o for _, o in fleet_instances_with_offers if o.availability.is_available()
+        ]
+        fleet_has_available_capacity = nodes_required_num <= len(fleet_available_offers)
+        fleet_cheapest_offer = math.inf
+        if len(fleet_available_offers) > 0:
+            fleet_cheapest_offer = fleet_available_offers[0].price
+        fleet_priority = (not fleet_has_available_capacity, fleet_cheapest_offer)
+        candidate_fleets_with_offers.append(
+            (
+                candidate_fleet_model,
+                fleet_instances_with_offers,
+                len(fleet_available_offers),
+                fleet_priority,
+            )
+        )
+    if run_spec.configuration.fleets is None and all(
+        t[2] == 0 for t in candidate_fleets_with_offers
+    ):
+        # If fleets are not specified and no fleets have available offers, create a new fleet.
+        # This is for compatibility with non-fleet-first UX when runs created new fleets
+        # if there are no instances to reuse.
+        return None, []
+    candidate_fleets_with_offers.sort(key=lambda t: t[-1])
+    return candidate_fleets_with_offers[0][:2]
+
+
+def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
+    nodes_required_num = 1
+    if run_spec.configuration.type == "task":
+        nodes_required_num = run_spec.configuration.nodes
+    elif (
+        run_spec.configuration.type == "service"
+        and run_spec.configuration.replicas.min is not None
+    ):
+        nodes_required_num = run_spec.configuration.replicas.min
+    return nodes_required_num
+
+
+def _get_fleet_instances_with_offers(
+    fleet_model: FleetModel,
     run_spec: RunSpec,
-    job_model: JobModel,
     job: Job,
-    fleet_model: Optional[FleetModel],
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
-) -> Optional[InstanceModel]:
+) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
+    pool_instances = fleet_model.instances
     instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]]
     profile = run_spec.merged_profile
     multinode = job.job_spec.jobs_per_replica > 1
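
The ranking in `_find_optimal_fleet_with_offers` leans on Python's lexicographic tuple ordering: since `False < True`, the key `(not fleet_has_available_capacity, fleet_cheapest_offer)` puts fleets that can already accommodate the whole run first and breaks ties by the cheapest available offer. A toy sketch with invented fleet names and prices:

    import math

    # (name, has_capacity, cheapest_offer_price)
    candidates = [
        ("fleet-a", False, 1.20),
        ("fleet-b", True, 2.50),
        ("fleet-c", True, 0.80),
        ("fleet-d", False, math.inf),  # no available offers
    ]
    candidates.sort(key=lambda c: (not c[1], c[2]))
    print([name for name, _, _ in candidates])
    # ['fleet-c', 'fleet-b', 'fleet-a', 'fleet-d']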
@@ -414,7 +592,15 @@ async def _assign_job_to_pool_instance(
         volumes=volumes,
     )
     instances_with_offers.extend(shared_instances_with_offers)
+    instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
+    return instances_with_offers
 
+
+async def _assign_job_to_fleet_instance(
+    session: AsyncSession,
+    instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]],
+    job_model: JobModel,
+) -> Optional[InstanceModel]:
     if len(instances_with_offers) == 0:
         return None
 
@@ -463,6 +649,11 @@ async def _run_job_on_new_instance(
     fleet = None
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
+        if not _check_can_create_new_instance_in_fleet(fleet):
+            logger.debug(
+                "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
+            )
+            return None
     multinode = job.job_spec.jobs_per_replica > 1 or (
         fleet is not None and fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     )
@@ -522,13 +713,20 @@ async def _run_job_on_new_instance(
         return None
 
 
-def _get_or_create_fleet_model_for_job(
+def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
+    if fleet.spec.configuration.ssh_config is not None:
+        return False
+    # TODO: Respect nodes.max
+    # Ensure concurrent provisioning does not violate nodes.max
+    # E.g. lock fleet and split instance model creation
+    # and instance provisioning into separate transactions.
+    return True
+
+
+def _create_fleet_model_for_job(
     project: ProjectModel,
-    run_model: RunModel,
     run: Run,
 ) -> FleetModel:
-    if run_model.fleet is not None:
-        return run_model.fleet
     placement = InstanceGroupPlacement.ANY
     if run.run_spec.configuration.type == "task" and run.run_spec.configuration.nodes > 1:
         placement = InstanceGroupPlacement.CLUSTER
@@ -537,6 +735,7 @@ def _get_or_create_fleet_model_for_job(
             name=run.run_spec.run_name,
             placement=placement,
             reservation=run.run_spec.configuration.reservation,
+            nodes=Range(min=_get_nodes_required_num_for_run(run.run_spec), max=None),
         ),
         profile=run.run_spec.merged_profile,
         autocreated=True,
dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py (new file)

@@ -0,0 +1,28 @@
+"""Add JobModel.registered
+
+Revision ID: 3d7f6c2ec000
+Revises: 74a1f55209bd
+Create Date: 2025-08-11 13:23:39.530103
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "3d7f6c2ec000"
+down_revision = "74a1f55209bd"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("registered", sa.Boolean(), server_default=sa.false(), nullable=False)
+        )
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("registered")
dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py (new file)

@@ -0,0 +1,41 @@
+"""Add JobModel.fleet
+
+Revision ID: e2d08cd1b8d9
+Revises: 3d7f6c2ec000
+Create Date: 2025-08-15 11:26:05.670591
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "e2d08cd1b8d9"
+down_revision = "3d7f6c2ec000"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True
+            )
+        )
+        batch_op.create_foreign_key(
+            batch_op.f("fk_jobs_fleet_id_fleets"), "fleets", ["fleet_id"], ["id"]
+        )
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_constraint(batch_op.f("fk_jobs_fleet_id_fleets"), type_="foreignkey")
+        batch_op.drop_column("fleet_id")
+
+    # ### end Alembic commands ###
dstack/_internal/server/models.py

@@ -84,7 +84,7 @@ class DecryptedString(CoreModel):
     decrypted: bool = True
     exc: Optional[Exception] = None
 
-    class Config:
+    class Config(CoreModel.Config):
         arbitrary_types_allowed = True
 
     def get_plaintext_or_error(self) -> str:
@@ -390,10 +390,18 @@ class JobModel(BaseModel):
     id: Mapped[uuid.UUID] = mapped_column(
         UUIDType(binary=False), primary_key=True, default=uuid.uuid4
     )
+
     project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE"))
     project: Mapped["ProjectModel"] = relationship()
+
     run_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("runs.id", ondelete="CASCADE"))
     run: Mapped["RunModel"] = relationship()
+
+    # Jobs need to reference fleets because we may choose an optimal fleet for a master job
+    # but not yet create an instance for it.
+    fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
+    fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="jobs")
+
     run_name: Mapped[str] = mapped_column(String(100))
     job_num: Mapped[int] = mapped_column(Integer)
     job_name: Mapped[str] = mapped_column(String(100))
@@ -430,6 +438,9 @@ class JobModel(BaseModel):
     probes: Mapped[list["ProbeModel"]] = relationship(
         back_populates="job", order_by="ProbeModel.probe_num"
     )
+    # Whether the replica is registered to receive service requests.
+    # Always `False` for non-service runs.
+    registered: Mapped[bool] = mapped_column(Boolean, server_default=false())
 
 
 class GatewayModel(BaseModel):
@@ -537,6 +548,7 @@ class FleetModel(BaseModel):
     spec: Mapped[str] = mapped_column(Text)
 
     runs: Mapped[List["RunModel"]] = relationship(back_populates="fleet")
+    jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet")
     instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet")
 
 
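Taken together, the new column and relationships give the scheduler a navigable JobModel.fleet / FleetModel.jobs link. A hypothetical read path using it (assumes an AsyncSession as elsewhere in the server code; the helper name is made up):

    from sqlalchemy import select
    from sqlalchemy.orm import selectinload

    from dstack._internal.server.models import FleetModel

    async def get_fleet_with_jobs(session, fleet_id):
        # Load a fleet together with the jobs that chose it.
        res = await session.execute(
            select(FleetModel)
            .where(FleetModel.id == fleet_id)
            .options(selectinload(FleetModel.jobs))
        )
        return res.unique().scalar_one()
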
dstack/_internal/server/routers/gpus.py (new file)

@@ -0,0 +1,29 @@
+from typing import Tuple
+
+from fastapi import APIRouter, Depends
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from dstack._internal.server.db import get_session
+from dstack._internal.server.models import ProjectModel, UserModel
+from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse
+from dstack._internal.server.security.permissions import ProjectMember
+from dstack._internal.server.services.gpus import list_gpus_grouped
+from dstack._internal.server.utils.routers import get_base_api_additional_responses
+
+project_router = APIRouter(
+    prefix="/api/project/{project_name}/gpus",
+    tags=["gpus"],
+    responses=get_base_api_additional_responses(),
+)
+
+
+@project_router.post("/list", response_model=ListGpusResponse, response_model_exclude_none=True)
+async def list_gpus(
+    body: ListGpusRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+) -> ListGpusResponse:
+    _, project = user_project
+    return await list_gpus_grouped(
+        session=session, project=project, run_spec=body.run_spec, group_by=body.group_by
+    )
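
A minimal sketch of calling the new endpoint over HTTP, assuming a locally running dstack server on its default port and a project named main; run_spec_payload is only a placeholder, since the body requires a fully serialized RunSpec:

    import requests

    run_spec_payload: dict = {}  # placeholder; a real request needs a full RunSpec
    resp = requests.post(
        "http://localhost:3000/api/project/main/gpus/list",
        headers={"Authorization": "Bearer <your-dstack-token>"},
        json={"run_spec": run_spec_payload, "group_by": ["backend", "region"]},
    )
    resp.raise_for_status()
    print(resp.json()["gpus"])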
dstack/_internal/server/schemas/gateways.py

@@ -14,7 +14,7 @@ class CreateGatewayRequest(CoreModel):
     backend_type: Annotated[Optional[BackendType], Field(exclude=True)] = None
     region: Annotated[Optional[str], Field(exclude=True)] = None
 
-    class Config:
+    class Config(CoreModel.Config):
         @staticmethod
         def schema_extra(schema: Dict[str, Any]) -> None:
             del schema["properties"]["name"]
dstack/_internal/server/schemas/gpus.py (new file)

@@ -0,0 +1,66 @@
+from typing import List, Literal, Optional
+
+import gpuhunt
+from pydantic import Field
+
+from dstack._internal.core.models.backends.base import BackendType
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.instances import InstanceAvailability
+from dstack._internal.core.models.resources import Range
+from dstack._internal.core.models.runs import RunSpec
+
+
+class BackendGpu(CoreModel):
+    """GPU specification from a backend offer."""
+
+    name: str
+    memory_mib: int
+    vendor: gpuhunt.AcceleratorVendor
+    availability: InstanceAvailability
+    spot: bool
+    count: int
+    price: float
+    region: str
+
+
+class BackendGpus(CoreModel):
+    """Backend GPU specifications."""
+
+    backend_type: BackendType
+    gpus: List[BackendGpu]
+    regions: List[str]
+
+
+class ListGpusRequest(CoreModel):
+    """Request for listing GPUs with optional grouping."""
+
+    run_spec: RunSpec
+    group_by: Optional[List[Literal["backend", "region", "count"]]] = Field(
+        default=None,
+        description="List of fields to group by. Valid values: 'backend', 'region', 'count'. "
+        "Note: 'region' can only be used together with 'backend'.",
+    )
+
+
+class GpuGroup(CoreModel):
+    """GPU group that can handle all grouping scenarios."""
+
+    name: str
+    memory_mib: int
+    vendor: gpuhunt.AcceleratorVendor
+    availability: List[InstanceAvailability]
+    spot: List[Literal["spot", "on-demand"]]
+    count: Range[int]
+    price: Range[float]
+    backends: Optional[List[BackendType]] = None
+    backend: Optional[BackendType] = None
+    regions: Optional[List[str]] = None
+    region: Optional[str] = None
+
+
+class ListGpusResponse(CoreModel):
+    """Response containing GPU specifications."""
+
+    gpus: List[GpuGroup] = Field(
+        description="List of GPU specifications, grouped according to the group_by parameter"
+    )
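
For illustration, one GpuGroup entry as it might be serialized when grouping by backend and region; every value below is invented, and since the router responds with response_model_exclude_none=True, fields that don't apply to the chosen grouping (here backends and regions) are simply omitted:

    gpu_group_example = {
        "name": "H100",
        "memory_mib": 81920,
        "vendor": "nvidia",
        "availability": ["available"],
        "spot": ["on-demand"],
        "count": {"min": 1, "max": 8},
        "price": {"min": 2.1, "max": 16.8},
        "backend": "aws",
        "region": "us-east-1",
    }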
dstack/_internal/server/services/docker.py

@@ -32,7 +32,7 @@ class DXFAuthAdapter:
 
 
 class DockerImage(CoreModel):
-    class Config:
+    class Config(CoreModel.Config):
         frozen = True
 
     image: str