dstack 0.19.25rc1__py3-none-any.whl → 0.19.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic; see the package registry's advisory page for more details.

Files changed (161):
  1. dstack/_internal/cli/commands/__init__.py +2 -2
  2. dstack/_internal/cli/commands/apply.py +3 -61
  3. dstack/_internal/cli/commands/attach.py +1 -1
  4. dstack/_internal/cli/commands/completion.py +1 -1
  5. dstack/_internal/cli/commands/delete.py +2 -2
  6. dstack/_internal/cli/commands/fleet.py +1 -1
  7. dstack/_internal/cli/commands/gateway.py +2 -2
  8. dstack/_internal/cli/commands/init.py +56 -24
  9. dstack/_internal/cli/commands/logs.py +1 -1
  10. dstack/_internal/cli/commands/metrics.py +1 -1
  11. dstack/_internal/cli/commands/offer.py +45 -7
  12. dstack/_internal/cli/commands/project.py +2 -2
  13. dstack/_internal/cli/commands/secrets.py +2 -2
  14. dstack/_internal/cli/commands/server.py +1 -1
  15. dstack/_internal/cli/commands/stop.py +1 -1
  16. dstack/_internal/cli/commands/volume.py +1 -1
  17. dstack/_internal/cli/main.py +2 -2
  18. dstack/_internal/cli/services/completion.py +2 -2
  19. dstack/_internal/cli/services/configurators/__init__.py +6 -2
  20. dstack/_internal/cli/services/configurators/base.py +6 -7
  21. dstack/_internal/cli/services/configurators/fleet.py +1 -3
  22. dstack/_internal/cli/services/configurators/gateway.py +2 -4
  23. dstack/_internal/cli/services/configurators/run.py +293 -58
  24. dstack/_internal/cli/services/configurators/volume.py +2 -4
  25. dstack/_internal/cli/services/profile.py +1 -1
  26. dstack/_internal/cli/services/repos.py +35 -48
  27. dstack/_internal/core/backends/amddevcloud/__init__.py +1 -0
  28. dstack/_internal/core/backends/amddevcloud/backend.py +16 -0
  29. dstack/_internal/core/backends/amddevcloud/compute.py +5 -0
  30. dstack/_internal/core/backends/amddevcloud/configurator.py +29 -0
  31. dstack/_internal/core/backends/aws/compute.py +6 -1
  32. dstack/_internal/core/backends/aws/configurator.py +11 -7
  33. dstack/_internal/core/backends/azure/configurator.py +11 -7
  34. dstack/_internal/core/backends/base/compute.py +33 -5
  35. dstack/_internal/core/backends/base/configurator.py +25 -13
  36. dstack/_internal/core/backends/base/offers.py +2 -0
  37. dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
  38. dstack/_internal/core/backends/configurators.py +15 -0
  39. dstack/_internal/core/backends/cudo/configurator.py +11 -7
  40. dstack/_internal/core/backends/datacrunch/compute.py +5 -1
  41. dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
  42. dstack/_internal/core/backends/digitalocean/__init__.py +1 -0
  43. dstack/_internal/core/backends/digitalocean/backend.py +16 -0
  44. dstack/_internal/core/backends/digitalocean/compute.py +5 -0
  45. dstack/_internal/core/backends/digitalocean/configurator.py +31 -0
  46. dstack/_internal/core/backends/digitalocean_base/__init__.py +1 -0
  47. dstack/_internal/core/backends/digitalocean_base/api_client.py +104 -0
  48. dstack/_internal/core/backends/digitalocean_base/backend.py +5 -0
  49. dstack/_internal/core/backends/digitalocean_base/compute.py +173 -0
  50. dstack/_internal/core/backends/digitalocean_base/configurator.py +57 -0
  51. dstack/_internal/core/backends/digitalocean_base/models.py +43 -0
  52. dstack/_internal/core/backends/gcp/compute.py +32 -8
  53. dstack/_internal/core/backends/gcp/configurator.py +11 -7
  54. dstack/_internal/core/backends/hotaisle/api_client.py +25 -33
  55. dstack/_internal/core/backends/hotaisle/compute.py +1 -6
  56. dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
  57. dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
  58. dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
  59. dstack/_internal/core/backends/models.py +7 -0
  60. dstack/_internal/core/backends/nebius/compute.py +1 -8
  61. dstack/_internal/core/backends/nebius/configurator.py +11 -7
  62. dstack/_internal/core/backends/nebius/resources.py +21 -11
  63. dstack/_internal/core/backends/oci/compute.py +4 -5
  64. dstack/_internal/core/backends/oci/configurator.py +11 -7
  65. dstack/_internal/core/backends/runpod/configurator.py +11 -7
  66. dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
  67. dstack/_internal/core/backends/tensordock/configurator.py +13 -7
  68. dstack/_internal/core/backends/vastai/configurator.py +11 -7
  69. dstack/_internal/core/backends/vultr/compute.py +1 -5
  70. dstack/_internal/core/backends/vultr/configurator.py +11 -4
  71. dstack/_internal/core/compatibility/fleets.py +5 -0
  72. dstack/_internal/core/compatibility/gpus.py +13 -0
  73. dstack/_internal/core/compatibility/runs.py +9 -1
  74. dstack/_internal/core/models/backends/base.py +5 -1
  75. dstack/_internal/core/models/common.py +3 -3
  76. dstack/_internal/core/models/configurations.py +191 -32
  77. dstack/_internal/core/models/files.py +1 -1
  78. dstack/_internal/core/models/fleets.py +80 -3
  79. dstack/_internal/core/models/profiles.py +41 -11
  80. dstack/_internal/core/models/resources.py +46 -42
  81. dstack/_internal/core/models/runs.py +28 -5
  82. dstack/_internal/core/services/configs/__init__.py +6 -3
  83. dstack/_internal/core/services/profiles.py +2 -2
  84. dstack/_internal/core/services/repos.py +86 -79
  85. dstack/_internal/core/services/ssh/ports.py +1 -1
  86. dstack/_internal/proxy/lib/deps.py +6 -2
  87. dstack/_internal/server/app.py +22 -17
  88. dstack/_internal/server/background/tasks/process_fleets.py +109 -13
  89. dstack/_internal/server/background/tasks/process_gateways.py +4 -1
  90. dstack/_internal/server/background/tasks/process_instances.py +22 -73
  91. dstack/_internal/server/background/tasks/process_probes.py +1 -1
  92. dstack/_internal/server/background/tasks/process_running_jobs.py +12 -4
  93. dstack/_internal/server/background/tasks/process_runs.py +3 -1
  94. dstack/_internal/server/background/tasks/process_submitted_jobs.py +67 -44
  95. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  96. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  97. dstack/_internal/server/db.py +8 -4
  98. dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py +44 -0
  99. dstack/_internal/server/models.py +6 -2
  100. dstack/_internal/server/routers/gpus.py +1 -6
  101. dstack/_internal/server/schemas/runner.py +11 -0
  102. dstack/_internal/server/services/backends/__init__.py +14 -8
  103. dstack/_internal/server/services/backends/handlers.py +6 -1
  104. dstack/_internal/server/services/docker.py +5 -5
  105. dstack/_internal/server/services/fleets.py +37 -38
  106. dstack/_internal/server/services/gateways/__init__.py +2 -0
  107. dstack/_internal/server/services/gateways/client.py +5 -2
  108. dstack/_internal/server/services/gateways/connection.py +1 -1
  109. dstack/_internal/server/services/gpus.py +50 -49
  110. dstack/_internal/server/services/instances.py +44 -4
  111. dstack/_internal/server/services/jobs/__init__.py +15 -4
  112. dstack/_internal/server/services/jobs/configurators/base.py +53 -17
  113. dstack/_internal/server/services/jobs/configurators/dev.py +9 -4
  114. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +6 -8
  115. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +7 -9
  116. dstack/_internal/server/services/jobs/configurators/service.py +1 -3
  117. dstack/_internal/server/services/jobs/configurators/task.py +3 -3
  118. dstack/_internal/server/services/locking.py +5 -5
  119. dstack/_internal/server/services/logging.py +10 -2
  120. dstack/_internal/server/services/logs/__init__.py +8 -6
  121. dstack/_internal/server/services/logs/aws.py +330 -327
  122. dstack/_internal/server/services/logs/filelog.py +7 -6
  123. dstack/_internal/server/services/logs/gcp.py +141 -139
  124. dstack/_internal/server/services/plugins.py +1 -1
  125. dstack/_internal/server/services/projects.py +2 -5
  126. dstack/_internal/server/services/proxy/repo.py +5 -1
  127. dstack/_internal/server/services/requirements/__init__.py +0 -0
  128. dstack/_internal/server/services/requirements/combine.py +259 -0
  129. dstack/_internal/server/services/runner/client.py +7 -0
  130. dstack/_internal/server/services/runs.py +17 -1
  131. dstack/_internal/server/services/services/__init__.py +8 -2
  132. dstack/_internal/server/services/services/autoscalers.py +2 -0
  133. dstack/_internal/server/services/ssh.py +2 -1
  134. dstack/_internal/server/services/storage/__init__.py +5 -6
  135. dstack/_internal/server/services/storage/gcs.py +49 -49
  136. dstack/_internal/server/services/storage/s3.py +52 -52
  137. dstack/_internal/server/statics/index.html +1 -1
  138. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js → main-4eecc75fbe64067eb1bc.js} +1146 -899
  139. dstack/_internal/server/statics/{main-d151b300fcac3933213d.js.map → main-4eecc75fbe64067eb1bc.js.map} +1 -1
  140. dstack/_internal/server/statics/{main-aec4762350e34d6fbff9.css → main-56191c63d516fd0041c4.css} +1 -1
  141. dstack/_internal/server/testing/common.py +7 -4
  142. dstack/_internal/server/utils/logging.py +3 -3
  143. dstack/_internal/server/utils/provisioning.py +3 -3
  144. dstack/_internal/utils/json_schema.py +3 -1
  145. dstack/_internal/utils/path.py +8 -1
  146. dstack/_internal/utils/ssh.py +7 -0
  147. dstack/_internal/utils/typing.py +14 -0
  148. dstack/api/_public/repos.py +62 -8
  149. dstack/api/_public/runs.py +19 -8
  150. dstack/api/server/__init__.py +17 -19
  151. dstack/api/server/_gpus.py +2 -1
  152. dstack/api/server/_group.py +4 -3
  153. dstack/api/server/_repos.py +20 -3
  154. dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
  155. dstack/version.py +1 -1
  156. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/METADATA +2 -2
  157. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/RECORD +160 -142
  158. dstack/api/huggingface/__init__.py +0 -73
  159. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/WHEEL +0 -0
  160. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/entry_points.txt +0 -0
  161. {dstack-0.19.25rc1.dist-info → dstack-0.19.27.dist-info}/licenses/LICENSE.md +0 -0
@@ -5,9 +5,9 @@ import uuid
5
5
  from datetime import datetime, timedelta
6
6
  from typing import List, Optional, Tuple
7
7
 
8
- from sqlalchemy import and_, or_, select
8
+ from sqlalchemy import and_, not_, or_, select
9
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
- from sqlalchemy.orm import contains_eager, joinedload, load_only, selectinload
10
+ from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload
11
11
 
12
12
  from dstack._internal.core.backends.base.backend import Backend
13
13
  from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
@@ -16,6 +16,7 @@ from dstack._internal.core.models.common import NetworkMode
16
16
  from dstack._internal.core.models.fleets import (
17
17
  Fleet,
18
18
  FleetConfiguration,
19
+ FleetNodesSpec,
19
20
  FleetSpec,
20
21
  FleetStatus,
21
22
  InstanceGroupPlacement,
@@ -26,7 +27,7 @@ from dstack._internal.core.models.profiles import (
26
27
  CreationPolicy,
27
28
  TerminationPolicy,
28
29
  )
29
- from dstack._internal.core.models.resources import Memory, Range
30
+ from dstack._internal.core.models.resources import Memory
30
31
  from dstack._internal.core.models.runs import (
31
32
  Job,
32
33
  JobProvisioningData,
@@ -53,6 +54,8 @@ from dstack._internal.server.models import (
53
54
  from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
54
55
  from dstack._internal.server.services.fleets import (
55
56
  fleet_model_to_fleet,
57
+ get_fleet_requirements,
58
+ get_next_instance_num,
56
59
  )
57
60
  from dstack._internal.server.services.instances import (
58
61
  filter_pool_instances,
@@ -71,6 +74,10 @@ from dstack._internal.server.services.jobs import (
71
74
  from dstack._internal.server.services.locking import get_locker
72
75
  from dstack._internal.server.services.logging import fmt
73
76
  from dstack._internal.server.services.offers import get_offers_by_requirements
77
+ from dstack._internal.server.services.requirements.combine import (
78
+ combine_fleet_and_run_profiles,
79
+ combine_fleet_and_run_requirements,
80
+ )
74
81
  from dstack._internal.server.services.runs import (
75
82
  check_run_spec_requires_instance_mounts,
76
83
  run_model_to_run,
@@ -148,8 +155,8 @@ async def _process_next_submitted_job():
148
155
  if job_model is None:
149
156
  return
150
157
  lockset.add(job_model.id)
158
+ job_model_id = job_model.id
151
159
  try:
152
- job_model_id = job_model.id
153
160
  await _process_submitted_job(session=session, job_model=job_model)
154
161
  finally:
155
162
  lockset.difference_update([job_model_id])
@@ -245,8 +252,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
245
252
  ]
246
253
  if run_model.fleet is not None:
247
254
  fleet_filters.append(FleetModel.id == run_model.fleet_id)
248
- if run_spec.configuration.fleets is not None:
249
- fleet_filters.append(FleetModel.name.in_(run_spec.configuration.fleets))
255
+ if run_spec.merged_profile.fleets is not None:
256
+ fleet_filters.append(FleetModel.name.in_(run_spec.merged_profile.fleets))
250
257
 
251
258
  instance_filters = [
252
259
  InstanceModel.deleted == False,
@@ -264,9 +271,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
264
271
  [i.id for i in f.instances] for f in fleet_models_with_instances
265
272
  )
266
273
  )
267
- fleet_models = fleet_models_with_instances + fleet_models_without_instances
268
- fleets_ids = [f.id for f in fleet_models]
269
-
270
274
  if get_db().dialect_name == "sqlite":
271
275
  # Start new transaction to see committed changes after lock
272
276
  await session.commit()
@@ -275,13 +279,15 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
275
279
  InstanceModel.__tablename__, instances_ids
276
280
  ):
277
281
  if get_db().dialect_name == "sqlite":
278
- fleet_models = await _refetch_fleet_models(
282
+ fleets_with_instances_ids = [f.id for f in fleet_models_with_instances]
283
+ fleet_models_with_instances = await _refetch_fleet_models_with_instances(
279
284
  session=session,
280
- fleets_ids=fleets_ids,
285
+ fleets_ids=fleets_with_instances_ids,
281
286
  instances_ids=instances_ids,
282
287
  fleet_filters=fleet_filters,
283
288
  instance_filters=instance_filters,
284
289
  )
290
+ fleet_models = fleet_models_with_instances + fleet_models_without_instances
285
291
  fleet_model, fleet_instances_with_offers = _find_optimal_fleet_with_offers(
286
292
  fleet_models=fleet_models,
287
293
  run_model=run_model,
@@ -290,7 +296,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
290
296
  master_job_provisioning_data=master_job_provisioning_data,
291
297
  volumes=volumes,
292
298
  )
293
- if fleet_model is None and run_spec.configuration.fleets is not None:
299
+ if fleet_model is None and run_spec.merged_profile.fleets is not None:
294
300
  # Run cannot create new fleets when fleets are specified
295
301
  logger.debug("%s: failed to use specified fleets", fmt(job_model))
296
302
  job_model.status = JobStatus.TERMINATING
@@ -361,6 +367,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
361
367
  project=project,
362
368
  run=run,
363
369
  )
370
+ # FIXME: Fleet is not locked which may lead to duplicate instance_num.
371
+ # This is currently hard to fix without locking the fleet for entire provisioning duration.
372
+ # Processing should be done in multiple steps so that
373
+ # InstanceModel is created before provisioning.
364
374
  instance_num = await _get_next_instance_num(
365
375
  session=session,
366
376
  fleet_model=fleet_model,
@@ -376,6 +386,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
376
386
  instance_num=instance_num,
377
387
  )
378
388
  job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
389
+ # Both this task and process_fleets can add instances to fleets.
390
+ # TODO: Ensure this does not violate nodes.max when it's enforced.
379
391
  instance.fleet_id = fleet_model.id
380
392
  logger.info(
381
393
  "The job %s created the new instance %s",
@@ -438,14 +450,21 @@ async def _select_fleet_models(
438
450
  *fleet_filters,
439
451
  FleetModel.id.not_in(fleet_models_with_instances_ids),
440
452
  )
441
- .where(InstanceModel.id.is_(None))
442
- .options(contains_eager(FleetModel.instances)) # loading empty relation
453
+ .where(
454
+ or_(
455
+ InstanceModel.id.is_(None),
456
+ not_(and_(*instance_filters)),
457
+ )
458
+ )
459
+ # Load empty list of instances so that downstream code
460
+ # knows this fleet has no instances eligible for offers.
461
+ .options(noload(FleetModel.instances))
443
462
  )
444
463
  fleet_models_without_instances = list(res.unique().scalars().all())
445
464
  return fleet_models_with_instances, fleet_models_without_instances
446
465
 
447
466
 
448
- async def _refetch_fleet_models(
467
+ async def _refetch_fleet_models_with_instances(
449
468
  session: AsyncSession,
450
469
  fleets_ids: list[uuid.UUID],
451
470
  instances_ids: list[uuid.UUID],
@@ -460,13 +479,8 @@ async def _refetch_fleet_models(
460
479
  *fleet_filters,
461
480
  )
462
481
  .where(
463
- or_(
464
- InstanceModel.id.is_(None),
465
- and_(
466
- InstanceModel.id.in_(instances_ids),
467
- *instance_filters,
468
- ),
469
- )
482
+ InstanceModel.id.in_(instances_ids),
483
+ *instance_filters,
470
484
  )
471
485
  .options(contains_eager(FleetModel.instances))
472
486
  .execution_options(populate_existing=True)
@@ -533,7 +547,7 @@ def _find_optimal_fleet_with_offers(
533
547
  fleet_priority,
534
548
  )
535
549
  )
536
- if run_spec.configuration.fleets is None and all(
550
+ if run_spec.merged_profile.fleets is None and all(
537
551
  t[2] == 0 for t in candidate_fleets_with_offers
538
552
  ):
539
553
  # If fleets are not specified and no fleets have available offers, create a new fleet.
@@ -646,6 +660,8 @@ async def _run_job_on_new_instance(
646
660
  ) -> Optional[Tuple[JobProvisioningData, InstanceOfferWithAvailability]]:
647
661
  if volumes is None:
648
662
  volumes = []
663
+ profile = run.run_spec.merged_profile
664
+ requirements = job.job_spec.requirements
649
665
  fleet = None
650
666
  if fleet_model is not None:
651
667
  fleet = fleet_model_to_fleet(fleet_model)
@@ -654,13 +670,26 @@ async def _run_job_on_new_instance(
654
670
  "%s: cannot fit new instance into fleet %s", fmt(job_model), fleet_model.name
655
671
  )
656
672
  return None
673
+ profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, profile)
674
+ if profile is None:
675
+ logger.debug("%s: cannot combine fleet %s profile", fmt(job_model), fleet_model.name)
676
+ return None
677
+ fleet_requirements = get_fleet_requirements(fleet.spec)
678
+ requirements = combine_fleet_and_run_requirements(fleet_requirements, requirements)
679
+ if requirements is None:
680
+ logger.debug(
681
+ "%s: cannot combine fleet %s requirements", fmt(job_model), fleet_model.name
682
+ )
683
+ return None
684
+ # TODO: Respect fleet provisioning properties such as tags
685
+
657
686
  multinode = job.job_spec.jobs_per_replica > 1 or (
658
687
  fleet is not None and fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
659
688
  )
660
689
  offers = await get_offers_by_requirements(
661
690
  project=project,
662
- profile=run.run_spec.merged_profile,
663
- requirements=job.job_spec.requirements,
691
+ profile=profile,
692
+ requirements=requirements,
664
693
  exclude_not_available=True,
665
694
  multinode=multinode,
666
695
  master_job_provisioning_data=master_job_provisioning_data,
@@ -730,12 +759,17 @@ def _create_fleet_model_for_job(
730
759
  placement = InstanceGroupPlacement.ANY
731
760
  if run.run_spec.configuration.type == "task" and run.run_spec.configuration.nodes > 1:
732
761
  placement = InstanceGroupPlacement.CLUSTER
762
+ nodes = _get_nodes_required_num_for_run(run.run_spec)
733
763
  spec = FleetSpec(
734
764
  configuration=FleetConfiguration(
735
765
  name=run.run_spec.run_name,
736
766
  placement=placement,
737
767
  reservation=run.run_spec.configuration.reservation,
738
- nodes=Range(min=_get_nodes_required_num_for_run(run.run_spec), max=None),
768
+ nodes=FleetNodesSpec(
769
+ min=nodes,
770
+ target=nodes,
771
+ max=None,
772
+ ),
739
773
  ),
740
774
  profile=run.run_spec.merged_profile,
741
775
  autocreated=True,
@@ -752,25 +786,14 @@ def _create_fleet_model_for_job(
752
786
 
753
787
 
754
788
  async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel) -> int:
755
- if len(fleet_model.instances) == 0:
756
- # No instances means the fleet is not in the db yet, so don't lock.
757
- return 0
758
- async with get_locker(get_db().dialect_name).lock_ctx(
759
- FleetModel.__tablename__, [fleet_model.id]
760
- ):
761
- fleet_model = (
762
- (
763
- await session.execute(
764
- select(FleetModel)
765
- .where(FleetModel.id == fleet_model.id)
766
- .options(joinedload(FleetModel.instances))
767
- .execution_options(populate_existing=True)
768
- )
769
- )
770
- .unique()
771
- .scalar_one()
789
+ res = await session.execute(
790
+ select(InstanceModel.instance_num).where(
791
+ InstanceModel.fleet_id == fleet_model.id,
792
+ InstanceModel.deleted.is_(False),
772
793
  )
773
- return len(fleet_model.instances)
794
+ )
795
+ taken_instance_nums = set(res.scalars().all())
796
+ return get_next_instance_num(taken_instance_nums)
774
797
 
775
798
 
776
799
  def _create_instance_model_for_job(
@@ -75,9 +75,9 @@ async def _process_next_terminating_job():
75
75
  return
76
76
  instance_lockset.add(instance_model.id)
77
77
  job_lockset.add(job_model.id)
78
+ job_model_id = job_model.id
79
+ instance_model_id = job_model.used_instance_id
78
80
  try:
79
- job_model_id = job_model.id
80
- instance_model_id = job_model.used_instance_id
81
81
  await _process_job(
82
82
  session=session,
83
83
  job_model=job_model,
@@ -42,8 +42,8 @@ async def process_submitted_volumes():
42
42
  if volume_model is None:
43
43
  return
44
44
  lockset.add(volume_model.id)
45
+ volume_model_id = volume_model.id
45
46
  try:
46
- volume_model_id = volume_model.id
47
47
  await _process_submitted_volume(session=session, volume_model=volume_model)
48
48
  finally:
49
49
  lockset.difference_update([volume_model_id])
@@ -4,8 +4,12 @@ from typing import Optional
4
4
  from alembic import command, config
5
5
  from sqlalchemy import AsyncAdaptedQueuePool, event
6
6
  from sqlalchemy.engine.interfaces import DBAPIConnection
7
- from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, create_async_engine
8
- from sqlalchemy.orm import sessionmaker
7
+ from sqlalchemy.ext.asyncio import (
8
+ AsyncEngine,
9
+ AsyncSession,
10
+ async_sessionmaker,
11
+ create_async_engine,
12
+ )
9
13
  from sqlalchemy.pool import ConnectionPoolEntry
10
14
 
11
15
  from dstack._internal.server import settings
@@ -26,8 +30,8 @@ class Database:
26
30
  pool_size=settings.DB_POOL_SIZE,
27
31
  max_overflow=settings.DB_MAX_OVERFLOW,
28
32
  )
29
- self.session_maker = sessionmaker(
30
- bind=self.engine,
33
+ self.session_maker = async_sessionmaker(
34
+ bind=self.engine, # type: ignore[assignment]
31
35
  expire_on_commit=False,
32
36
  class_=AsyncSession,
33
37
  )
@@ -0,0 +1,44 @@
1
+ """Add FleetModel.consolidation_attempt and FleetModel.last_consolidated_at
2
+
3
+ Revision ID: 2498ab323443
4
+ Revises: e2d08cd1b8d9
5
+ Create Date: 2025-08-29 16:08:48.686595
6
+
7
+ """
8
+
9
+ import sqlalchemy as sa
10
+ from alembic import op
11
+
12
+ import dstack._internal.server.models
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision = "2498ab323443"
16
+ down_revision = "e2d08cd1b8d9"
17
+ branch_labels = None
18
+ depends_on = None
19
+
20
+
21
+ def upgrade() -> None:
22
+ # ### commands auto generated by Alembic - please adjust! ###
23
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
24
+ batch_op.add_column(
25
+ sa.Column("consolidation_attempt", sa.Integer(), server_default="0", nullable=False)
26
+ )
27
+ batch_op.add_column(
28
+ sa.Column(
29
+ "last_consolidated_at",
30
+ dstack._internal.server.models.NaiveDateTime(),
31
+ nullable=True,
32
+ )
33
+ )
34
+
35
+ # ### end Alembic commands ###
36
+
37
+
38
+ def downgrade() -> None:
39
+ # ### commands auto generated by Alembic - please adjust! ###
40
+ with op.batch_alter_table("fleets", schema=None) as batch_op:
41
+ batch_op.drop_column("last_consolidated_at")
42
+ batch_op.drop_column("consolidation_attempt")
43
+
44
+ # ### end Alembic commands ###
@@ -551,6 +551,9 @@ class FleetModel(BaseModel):
551
551
  jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet")
552
552
  instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet")
553
553
 
554
+ consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0")
555
+ last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
556
+
554
557
 
555
558
  class InstanceModel(BaseModel):
556
559
  __tablename__ = "instances"
@@ -605,8 +608,8 @@ class InstanceModel(BaseModel):
605
608
  Integer, default=DEFAULT_FLEET_TERMINATION_IDLE_TIME
606
609
  )
607
610
 
608
- # retry policy
609
- last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
611
+ # Deprecated
612
+ last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime, deferred=True)
610
613
 
611
614
  # instance termination handling
612
615
  termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
@@ -622,6 +625,7 @@ class InstanceModel(BaseModel):
622
625
  backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100))
623
626
  backend_data: Mapped[Optional[str]] = mapped_column(Text)
624
627
 
628
+ # Not set for cloud fleets that haven't been provisioning
625
629
  offer: Mapped[Optional[str]] = mapped_column(Text)
626
630
  region: Mapped[Optional[str]] = mapped_column(String(2000))
627
631
  price: Mapped[Optional[float]] = mapped_column(Float)
@@ -1,9 +1,7 @@
1
1
  from typing import Tuple
2
2
 
3
3
  from fastapi import APIRouter, Depends
4
- from sqlalchemy.ext.asyncio import AsyncSession
5
4
 
6
- from dstack._internal.server.db import get_session
7
5
  from dstack._internal.server.models import ProjectModel, UserModel
8
6
  from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse
9
7
  from dstack._internal.server.security.permissions import ProjectMember
@@ -20,10 +18,7 @@ project_router = APIRouter(
20
18
  @project_router.post("/list", response_model=ListGpusResponse, response_model_exclude_none=True)
21
19
  async def list_gpus(
22
20
  body: ListGpusRequest,
23
- session: AsyncSession = Depends(get_session),
24
21
  user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
25
22
  ) -> ListGpusResponse:
26
23
  _, project = user_project
27
- return await list_gpus_grouped(
28
- session=session, project=project, run_spec=body.run_spec, group_by=body.group_by
29
- )
24
+ return await list_gpus_grouped(project=project, run_spec=body.run_spec, group_by=body.group_by)
@@ -78,6 +78,7 @@ class SubmitBody(CoreModel):
78
78
  "max_duration",
79
79
  "ssh_key",
80
80
  "working_dir",
81
+ "repo_dir",
81
82
  "repo_data",
82
83
  "file_archives",
83
84
  }
@@ -159,6 +160,16 @@ class GPUDevice(CoreModel):
159
160
  path_in_container: str
160
161
 
161
162
 
163
+ class TaskListItem(CoreModel):
164
+ id: str
165
+ status: TaskStatus
166
+
167
+
168
+ class TaskListResponse(CoreModel):
169
+ ids: Optional[list[str]] = None # returned by pre-0.19.26 shim
170
+ tasks: Optional[list[TaskListItem]] = None # returned by 0.19.26+ shim
171
+
172
+
162
173
  class TaskInfoResponse(CoreModel):
163
174
  id: str
164
175
  status: TaskStatus
@@ -17,8 +17,8 @@ from dstack._internal.core.backends.configurators import (
17
17
  )
18
18
  from dstack._internal.core.backends.local.backend import LocalBackend
19
19
  from dstack._internal.core.backends.models import (
20
- AnyBackendConfig,
21
20
  AnyBackendConfigWithCreds,
21
+ AnyBackendConfigWithoutCreds,
22
22
  )
23
23
  from dstack._internal.core.errors import (
24
24
  BackendError,
@@ -126,19 +126,25 @@ async def get_backend_config(
126
126
  )
127
127
  continue
128
128
  if backend_model.type == backend_type:
129
- return get_backend_config_from_backend_model(
130
- configurator, backend_model, include_creds=True
131
- )
129
+ return get_backend_config_with_creds_from_backend_model(configurator, backend_model)
132
130
  return None
133
131
 
134
132
 
135
- def get_backend_config_from_backend_model(
133
+ def get_backend_config_with_creds_from_backend_model(
134
+ configurator: Configurator,
135
+ backend_model: BackendModel,
136
+ ) -> AnyBackendConfigWithCreds:
137
+ backend_record = get_stored_backend_record(backend_model)
138
+ backend_config = configurator.get_backend_config_with_creds(backend_record)
139
+ return backend_config
140
+
141
+
142
+ def get_backend_config_without_creds_from_backend_model(
136
143
  configurator: Configurator,
137
144
  backend_model: BackendModel,
138
- include_creds: bool,
139
- ) -> AnyBackendConfig:
145
+ ) -> AnyBackendConfigWithoutCreds:
140
146
  backend_record = get_stored_backend_record(backend_model)
141
- backend_config = configurator.get_backend_config(backend_record, include_creds=include_creds)
147
+ backend_config = configurator.get_backend_config_without_creds(backend_record)
142
148
  return backend_config
143
149
 
144
150
 
@@ -55,7 +55,11 @@ async def _check_active_instances(
55
55
  )
56
56
  for fleet_model in fleet_models:
57
57
  for instance in fleet_model.instances:
58
- if instance.status.is_active() and instance.backend in backends_types:
58
+ if (
59
+ instance.status.is_active()
60
+ and instance.backend is not None
61
+ and instance.backend in backends_types
62
+ ):
59
63
  if error:
60
64
  msg = (
61
65
  f"Backend {instance.backend.value} has active instances."
@@ -83,6 +87,7 @@ async def _check_active_volumes(
83
87
  if (
84
88
  volume_model.status.is_active()
85
89
  and volume_model.provisioning_data is not None
90
+ and volume_model.provisioning_data.backend is not None
86
91
  and volume_model.provisioning_data.backend in backends_types
87
92
  ):
88
93
  if error:
@@ -32,15 +32,15 @@ class DXFAuthAdapter:
32
32
 
33
33
 
34
34
  class DockerImage(CoreModel):
35
- class Config(CoreModel.Config):
36
- frozen = True
37
-
38
35
  image: str
39
36
  registry: Optional[str]
40
37
  repo: str
41
38
  tag: str
42
39
  digest: Optional[str]
43
40
 
41
+ class Config(CoreModel.Config):
42
+ frozen = True
43
+
44
44
 
45
45
  class ImageConfig(CoreModel):
46
46
  user: Annotated[Optional[str], Field(alias="User")] = None
@@ -77,7 +77,7 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) ->
77
77
  registry_client = PatchedDXF(
78
78
  host=image.registry or DEFAULT_REGISTRY,
79
79
  repo=image.repo,
80
- auth=DXFAuthAdapter(registry_auth),
80
+ auth=DXFAuthAdapter(registry_auth), # type: ignore[assignment]
81
81
  timeout=REGISTRY_REQUEST_TIMEOUT,
82
82
  )
83
83
 
@@ -88,7 +88,7 @@ def get_image_config(image_name: str, registry_auth: Optional[RegistryAuth]) ->
88
88
  )
89
89
  manifest = ImageManifest.__response__.parse_raw(manifest_resp)
90
90
  config_stream = registry_client.pull_blob(manifest.config.digest)
91
- config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE)
91
+ config_resp = join_byte_stream_checked(config_stream, MAX_CONFIG_OBJECT_SIZE) # type: ignore[arg-type]
92
92
  if config_resp is None:
93
93
  raise DockerRegistryError(
94
94
  f"Image config object exceeds the size limit of {MAX_CONFIG_OBJECT_SIZE} bytes"
@@ -279,7 +279,7 @@ async def get_plan(
279
279
  offers_with_backends = await get_create_instance_offers(
280
280
  project=project,
281
281
  profile=effective_spec.merged_profile,
282
- requirements=_get_fleet_requirements(effective_spec),
282
+ requirements=get_fleet_requirements(effective_spec),
283
283
  fleet_spec=effective_spec,
284
284
  blocks=effective_spec.configuration.blocks,
285
285
  )
@@ -449,25 +449,24 @@ async def create_fleet(
449
449
  return await _create_fleet(session=session, project=project, user=user, spec=spec)
450
450
 
451
451
 
452
- async def create_fleet_instance_model(
452
+ def create_fleet_instance_model(
453
453
  session: AsyncSession,
454
454
  project: ProjectModel,
455
- user: UserModel,
455
+ username: str,
456
456
  spec: FleetSpec,
457
- reservation: Optional[str],
458
457
  instance_num: int,
459
458
  ) -> InstanceModel:
460
459
  profile = spec.merged_profile
461
- requirements = _get_fleet_requirements(spec)
462
- instance_model = await instances_services.create_instance_model(
460
+ requirements = get_fleet_requirements(spec)
461
+ instance_model = instances_services.create_instance_model(
463
462
  session=session,
464
463
  project=project,
465
- user=user,
464
+ username=username,
466
465
  profile=profile,
467
466
  requirements=requirements,
468
467
  instance_name=f"{spec.configuration.name}-{instance_num}",
469
468
  instance_num=instance_num,
470
- reservation=reservation,
469
+ reservation=spec.merged_profile.reservation,
471
470
  blocks=spec.configuration.blocks,
472
471
  tags=spec.configuration.tags,
473
472
  )
@@ -504,6 +503,7 @@ async def create_fleet_ssh_instance_model(
504
503
  raise ServerClientError("ssh key or user not specified")
505
504
 
506
505
  if proxy_jump is not None:
506
+ assert proxy_jump.ssh_key is not None
507
507
  ssh_proxy = SSHConnectionParams(
508
508
  hostname=proxy_jump.hostname,
509
509
  port=proxy_jump.port or 22,
@@ -643,6 +643,30 @@ def is_fleet_empty(fleet_model: FleetModel) -> bool:
643
643
  return len(active_instances) == 0
644
644
 
645
645
 
646
+ def get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
647
+ profile = fleet_spec.merged_profile
648
+ requirements = Requirements(
649
+ resources=fleet_spec.configuration.resources or ResourcesSpec(),
650
+ max_price=profile.max_price,
651
+ spot=get_policy_map(profile.spot_policy, default=SpotPolicy.ONDEMAND),
652
+ reservation=fleet_spec.configuration.reservation,
653
+ )
654
+ return requirements
655
+
656
+
657
+ def get_next_instance_num(taken_instance_nums: set[int]) -> int:
658
+ if not taken_instance_nums:
659
+ return 0
660
+ min_instance_num = min(taken_instance_nums)
661
+ if min_instance_num > 0:
662
+ return 0
663
+ instance_num = min_instance_num + 1
664
+ while True:
665
+ if instance_num not in taken_instance_nums:
666
+ return instance_num
667
+ instance_num += 1
668
+
669
+
646
670
  async def _create_fleet(
647
671
  session: AsyncSession,
648
672
  project: ProjectModel,
@@ -693,12 +717,11 @@ async def _create_fleet(
693
717
  fleet_model.instances.append(instances_model)
694
718
  else:
695
719
  for i in range(_get_fleet_nodes_to_provision(spec)):
696
- instance_model = await create_fleet_instance_model(
720
+ instance_model = create_fleet_instance_model(
697
721
  session=session,
698
722
  project=project,
699
- user=user,
723
+ username=user.name,
700
724
  spec=spec,
701
- reservation=spec.configuration.reservation,
702
725
  instance_num=i,
703
726
  )
704
727
  fleet_model.instances.append(instance_model)
@@ -766,7 +789,7 @@ async def _update_fleet(
766
789
  if added_hosts:
767
790
  await _check_ssh_hosts_not_yet_added(session, spec, fleet.id)
768
791
  for host in added_hosts.values():
769
- instance_num = _get_next_instance_num(active_instance_nums)
792
+ instance_num = get_next_instance_num(active_instance_nums)
770
793
  instance_model = await create_fleet_ssh_instance_model(
771
794
  project=project,
772
795
  spec=spec,
@@ -982,9 +1005,9 @@ def _validate_internal_ips(ssh_config: SSHParams):
982
1005
 
983
1006
 
984
1007
  def _get_fleet_nodes_to_provision(spec: FleetSpec) -> int:
985
- if spec.configuration.nodes is None or spec.configuration.nodes.min is None:
1008
+ if spec.configuration.nodes is None:
986
1009
  return 0
987
- return spec.configuration.nodes.min
1010
+ return spec.configuration.nodes.target
988
1011
 
989
1012
 
990
1013
  def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[List[int]]):
@@ -1001,27 +1024,3 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[
1001
1024
  instance.deleted = True
1002
1025
  else:
1003
1026
  instance.status = InstanceStatus.TERMINATING
1004
-
1005
-
1006
- def _get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements:
1007
- profile = fleet_spec.merged_profile
1008
- requirements = Requirements(
1009
- resources=fleet_spec.configuration.resources or ResourcesSpec(),
1010
- max_price=profile.max_price,
1011
- spot=get_policy_map(profile.spot_policy, default=SpotPolicy.ONDEMAND),
1012
- reservation=fleet_spec.configuration.reservation,
1013
- )
1014
- return requirements
1015
-
1016
-
1017
- def _get_next_instance_num(instance_nums: set[int]) -> int:
1018
- if not instance_nums:
1019
- return 0
1020
- min_instance_num = min(instance_nums)
1021
- if min_instance_num > 0:
1022
- return 0
1023
- instance_num = min_instance_num + 1
1024
- while True:
1025
- if instance_num not in instance_nums:
1026
- return instance_num
1027
- instance_num += 1