dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (41)
  1. dstack/_internal/cli/services/configurators/run.py +1 -1
  2. dstack/_internal/core/backends/base/compute.py +20 -1
  3. dstack/_internal/core/backends/base/models.py +10 -0
  4. dstack/_internal/core/backends/base/offers.py +1 -0
  5. dstack/_internal/core/backends/features.py +5 -0
  6. dstack/_internal/core/backends/nebius/compute.py +28 -16
  7. dstack/_internal/core/backends/nebius/configurator.py +1 -1
  8. dstack/_internal/core/backends/nebius/models.py +4 -0
  9. dstack/_internal/core/backends/nebius/resources.py +41 -20
  10. dstack/_internal/core/backends/runpod/api_client.py +245 -59
  11. dstack/_internal/core/backends/runpod/compute.py +157 -13
  12. dstack/_internal/core/models/compute_groups.py +39 -0
  13. dstack/_internal/core/models/fleets.py +6 -1
  14. dstack/_internal/core/models/profiles.py +3 -1
  15. dstack/_internal/core/models/runs.py +3 -0
  16. dstack/_internal/server/app.py +14 -2
  17. dstack/_internal/server/background/__init__.py +7 -0
  18. dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
  19. dstack/_internal/server/background/tasks/process_instances.py +81 -49
  20. dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
  21. dstack/_internal/server/migrations/env.py +20 -2
  22. dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
  23. dstack/_internal/server/models.py +39 -0
  24. dstack/_internal/server/routers/runs.py +15 -6
  25. dstack/_internal/server/services/compute_groups.py +22 -0
  26. dstack/_internal/server/services/fleets.py +1 -0
  27. dstack/_internal/server/services/jobs/__init__.py +13 -0
  28. dstack/_internal/server/services/jobs/configurators/base.py +3 -2
  29. dstack/_internal/server/services/requirements/combine.py +1 -0
  30. dstack/_internal/server/services/runs.py +17 -3
  31. dstack/_internal/server/testing/common.py +51 -0
  32. dstack/_internal/server/utils/routers.py +18 -20
  33. dstack/_internal/settings.py +4 -1
  34. dstack/_internal/utils/version.py +22 -0
  35. dstack/version.py +1 -1
  36. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
  37. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
  38. dstack/_internal/core/backends/nebius/fabrics.py +0 -49
  39. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
  40. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
  41. {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -3,16 +3,22 @@ import itertools
  import math
  import uuid
  from datetime import datetime, timedelta
- from typing import List, Optional
+ from typing import List, Optional, Union

  from sqlalchemy import and_, func, not_, or_, select
  from sqlalchemy.ext.asyncio import AsyncSession
  from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload

  from dstack._internal.core.backends.base.backend import Backend
- from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
+ from dstack._internal.core.backends.base.compute import (
+ ComputeWithGroupProvisioningSupport,
+ ComputeWithVolumeSupport,
+ )
+ from dstack._internal.core.backends.base.models import JobConfiguration
+ from dstack._internal.core.backends.features import BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT
  from dstack._internal.core.errors import BackendError, ServerClientError
  from dstack._internal.core.models.common import NetworkMode
+ from dstack._internal.core.models.compute_groups import ComputeGroupProvisioningData
  from dstack._internal.core.models.fleets import (
  Fleet,
  FleetConfiguration,
@@ -42,8 +48,10 @@ from dstack._internal.core.models.runs import (
  from dstack._internal.core.models.volumes import Volume
  from dstack._internal.core.services.profiles import get_termination
  from dstack._internal.server import settings
+ from dstack._internal.server.background.tasks.process_compute_groups import ComputeGroupStatus
  from dstack._internal.server.db import get_db, get_session_ctx
  from dstack._internal.server.models import (
+ ComputeGroupModel,
  FleetModel,
  InstanceModel,
  JobModel,
@@ -69,6 +77,7 @@ from dstack._internal.server.services.instances import (
  from dstack._internal.server.services.jobs import (
  check_can_attach_job_volumes,
  find_job,
+ find_jobs,
  get_instances_ids_with_detaching_volumes,
  get_job_configured_volume_models,
  get_job_configured_volumes,
@@ -132,6 +141,7 @@ async def _process_next_submitted_job():
  .join(JobModel.run)
  .where(
  JobModel.status == JobStatus.SUBMITTED,
+ JobModel.waiting_master_job.is_not(True),
  JobModel.id.not_in(lockset),
  )
  .options(load_only(JobModel.id))
@@ -190,6 +200,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
  run_spec = run.run_spec
  run_profile = run_spec.merged_profile
  job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
+ replica_jobs = find_jobs(run.jobs, replica_num=job_model.replica_num)
+ replica_job_models = _get_job_models_for_jobs(run_model.jobs, replica_jobs)
  multinode = job.job_spec.jobs_per_replica > 1

  # Master job chooses fleet for the run.
@@ -323,6 +335,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
  return
  # If no instances were locked, we can proceed in the same transaction.

+ # TODO: Volume attachment for compute groups is not yet supported since
+ # currently supported compute groups (e.g. Runpod) don't need explicit volume attachment.
+ need_volume_attachment = True
+
  if job_model.instance is not None:
  res = await session.execute(
  select(InstanceModel)
@@ -333,7 +349,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
  instance = res.unique().scalar_one()
  job_model.status = JobStatus.PROVISIONING
  else:
- # Assigned no instance, create a new one
  if run_profile.creation_policy == CreationPolicy.REUSE:
  logger.debug("%s: reuse instance failed", fmt(job_model))
  job_model.status = JobStatus.TERMINATING
@@ -342,13 +357,23 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
  await session.commit()
  return

- # Create a new cloud instance
- run_job_result = await _run_job_on_new_instance(
+ jobs_to_provision = [job]
+ if (
+ multinode
+ and job.job_spec.job_num == 0
+ # job_model.waiting_master_job is not set for legacy jobs.
+ # In this case compute group provisioning not supported
+ # and jobs always provision one-by-one.
+ and job_model.waiting_master_job is not None
+ ):
+ jobs_to_provision = replica_jobs
+
+ run_job_result = await _run_jobs_on_new_instances(
  project=project,
  fleet_model=fleet_model,
  job_model=job_model,
  run=run,
- job=job,
+ jobs=jobs_to_provision,
  project_ssh_public_key=project.ssh_public_key,
  project_ssh_private_key=project.ssh_private_key,
  master_job_provisioning_data=master_job_provisioning_data,
@@ -362,72 +387,102 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
  await session.commit()
  return

- logger.info("%s: now is provisioning a new instance", fmt(job_model))
- job_provisioning_data, offer, effective_profile, _ = run_job_result
- job_model.job_provisioning_data = job_provisioning_data.json()
- job_model.status = JobStatus.PROVISIONING
  if fleet_model is None:
  fleet_model = await _create_fleet_model_for_job(
  session=session,
  project=project,
  run=run,
  )
- # FIXME: Fleet is not locked which may lead to duplicate instance_num.
- # This is currently hard to fix without locking the fleet for entire provisioning duration.
- # Processing should be done in multiple steps so that
- # InstanceModel is created before provisioning.
- instance_num = await _get_next_instance_num(
- session=session,
- fleet_model=fleet_model,
- )
- instance = _create_instance_model_for_job(
- project=project,
- fleet_model=fleet_model,
- job_model=job_model,
- job_provisioning_data=job_provisioning_data,
- offer=offer,
- instance_num=instance_num,
- profile=effective_profile,
- )
- job_model.job_runtime_data = _prepare_job_runtime_data(offer, multinode).json()
- # Both this task and process_fleets can add instances to fleets.
- # TODO: Ensure this does not violate nodes.max when it's enforced.
- instance.fleet_id = fleet_model.id
- logger.info(
- "The job %s created the new instance %s",
- job_model.job_name,
- instance.name,
- extra={
- "instance_name": instance.name,
- "instance_status": InstanceStatus.PROVISIONING.value,
- },
- )
- session.add(instance)
- session.add(fleet_model)
- job_model.used_instance_id = instance.id
-
- volumes_ids = sorted([v.id for vs in volume_models for v in vs])
- # TODO: lock instances for attaching volumes?
- # Take lock to prevent attaching volumes that are to be deleted.
- # If the volume was deleted before the lock, the volume will fail to attach and the job will fail.
- await session.execute(
- select(VolumeModel)
- .where(VolumeModel.id.in_(volumes_ids))
- .options(joinedload(VolumeModel.user).load_only(UserModel.name))
- .order_by(VolumeModel.id) # take locks in order
- .with_for_update(key_share=True, of=VolumeModel)
- )
- async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
- if len(volume_models) > 0:
- await _attach_volumes(
+ session.add(fleet_model)
+
+ provisioning_data, offer, effective_profile, _ = run_job_result
+ compute_group_model = None
+ if isinstance(provisioning_data, ComputeGroupProvisioningData):
+ need_volume_attachment = False
+ provisioned_jobs = jobs_to_provision
+ jpds = provisioning_data.job_provisioning_datas
+ compute_group_model = ComputeGroupModel(
+ id=uuid.uuid4(),
+ project=project,
+ fleet=fleet_model,
+ status=ComputeGroupStatus.RUNNING,
+ provisioning_data=provisioning_data.json(),
+ )
+ session.add(compute_group_model)
+ else:
+ provisioned_jobs = [job]
+ jpds = [provisioning_data]
+ if len(jobs_to_provision) > 1:
+ # Tried provisioning multiple jobs but provisioned only one.
+ # Allow other jobs to provision one-by-one.
+ for replica_job_model in replica_job_models:
+ replica_job_model.waiting_master_job = False
+
+ logger.info("%s: provisioned %s new instance(s)", fmt(job_model), len(provisioned_jobs))
+ provisioned_job_models = _get_job_models_for_jobs(run_model.jobs, provisioned_jobs)
+ instance = None # Instance for attaching volumes in case of single job provisioned
+ for provisioned_job_model, jpd in zip(provisioned_job_models, jpds):
+ provisioned_job_model.job_provisioning_data = jpd.json()
+ provisioned_job_model.status = JobStatus.PROVISIONING
+ # FIXME: Fleet is not locked which may lead to duplicate instance_num.
+ # This is currently hard to fix without locking the fleet for entire provisioning duration.
+ # Processing should be done in multiple steps so that
+ # InstanceModel is created before provisioning.
+ instance_num = await _get_next_instance_num(
  session=session,
+ fleet_model=fleet_model,
+ )
+ instance = _create_instance_model_for_job(
  project=project,
- job_model=job_model,
- instance=instance,
- volume_models=volume_models,
+ fleet_model=fleet_model,
+ compute_group_model=compute_group_model,
+ job_model=provisioned_job_model,
+ job_provisioning_data=jpd,
+ offer=offer,
+ instance_num=instance_num,
+ profile=effective_profile,
  )
- job_model.last_processed_at = common_utils.get_current_datetime()
- await session.commit()
+ provisioned_job_model.job_runtime_data = _prepare_job_runtime_data(
+ offer, multinode
+ ).json()
+ logger.info(
+ "Created a new instance %s for job %s",
+ instance.name,
+ provisioned_job_model.job_name,
+ extra={
+ "instance_name": instance.name,
+ "instance_status": InstanceStatus.PROVISIONING.value,
+ },
+ )
+ session.add(instance)
+ provisioned_job_model.used_instance_id = instance.id
+ provisioned_job_model.last_processed_at = common_utils.get_current_datetime()
+
+ volumes_ids = sorted([v.id for vs in volume_models for v in vs])
+ if need_volume_attachment:
+ # TODO: Lock instances for attaching volumes?
+ # Take lock to prevent attaching volumes that are to be deleted.
+ # If the volume was deleted before the lock, the volume will fail to attach and the job will fail.
+ await session.execute(
+ select(VolumeModel)
+ .where(VolumeModel.id.in_(volumes_ids))
+ .options(joinedload(VolumeModel.user).load_only(UserModel.name))
+ .order_by(VolumeModel.id) # take locks in order
+ .with_for_update(key_share=True, of=VolumeModel)
+ )
+ async with get_locker(get_db().dialect_name).lock_ctx(
+ VolumeModel.__tablename__, volumes_ids
+ ):
+ if len(volume_models) > 0:
+ assert instance is not None
+ await _attach_volumes(
+ session=session,
+ project=project,
+ job_model=job_model,
+ instance=instance,
+ volume_models=volume_models,
+ )
+ await session.commit()


  async def _select_fleet_models(
@@ -553,10 +608,9 @@ async def _find_optimal_fleet_with_offers(
  except ValueError:
  fleet_backend_offers = []
  else:
- multinode = (
- candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
- or job.job_spec.jobs_per_replica > 1
- )
+ # Handle multinode for old jobs that don't have requirements.multinode set.
+ # TODO: Drop multinode param.
+ multinode = requirements.multinode or job.job_spec.jobs_per_replica > 1
  fleet_backend_offers = await get_offers_by_requirements(
  project=project,
  profile=profile,
@@ -728,19 +782,33 @@ async def _assign_job_to_fleet_instance(
  return instance


- async def _run_job_on_new_instance(
+ async def _run_jobs_on_new_instances(
  project: ProjectModel,
  job_model: JobModel,
  run: Run,
- job: Job,
+ jobs: list[Job],
  project_ssh_public_key: str,
  project_ssh_private_key: str,
  master_job_provisioning_data: Optional[JobProvisioningData] = None,
- volumes: Optional[List[List[Volume]]] = None,
+ volumes: Optional[list[list[Volume]]] = None,
  fleet_model: Optional[FleetModel] = None,
- ) -> Optional[tuple[JobProvisioningData, InstanceOfferWithAvailability, Profile, Requirements]]:
+ ) -> Optional[
+ tuple[
+ Union[JobProvisioningData, ComputeGroupProvisioningData],
+ InstanceOfferWithAvailability,
+ Profile,
+ Requirements,
+ ]
+ ]:
+ """
+ Provisions an instance for a job or a compute group for multiple jobs and runs the jobs.
+ Even when multiple jobs are passes, it may still provision only one instance
+ and run only the master job in case there are no offers supporting cluster groups.
+ Other jobs should be provisioned one-by-one later.
+ """
  if volumes is None:
  volumes = []
+ job = jobs[0]
  profile = run.run_spec.merged_profile
  requirements = job.job_spec.requirements
  fleet = None
@@ -758,9 +826,7 @@
  return None
  # TODO: Respect fleet provisioning properties such as tags

- multinode = job.job_spec.jobs_per_replica > 1 or (
- fleet is not None and fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
- )
+ multinode = requirements.multinode or job.job_spec.jobs_per_replica > 1
  offers = await get_offers_by_requirements(
  project=project,
  profile=profile,
@@ -784,17 +850,31 @@
  offer.price,
  )
  offer_volumes = _get_offer_volumes(volumes, offer)
+ job_configurations = [JobConfiguration(job=j, volumes=offer_volumes) for j in jobs]
+ compute = backend.compute()
  try:
- job_provisioning_data = await common_utils.run_async(
- backend.compute().run_job,
- run,
- job,
- offer,
- project_ssh_public_key,
- project_ssh_private_key,
- offer_volumes,
- )
- return job_provisioning_data, offer, profile, requirements
+ if len(jobs) > 1 and offer.backend in BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT:
+ assert isinstance(compute, ComputeWithGroupProvisioningSupport)
+ cgpd = await common_utils.run_async(
+ compute.run_jobs,
+ run,
+ job_configurations,
+ offer,
+ project_ssh_public_key,
+ project_ssh_private_key,
+ )
+ return cgpd, offer, profile, requirements
+ else:
+ jpd = await common_utils.run_async(
+ compute.run_job,
+ run,
+ job,
+ offer,
+ project_ssh_public_key,
+ project_ssh_private_key,
+ offer_volumes,
+ )
+ return jpd, offer, profile, requirements
  except BackendError as e:
  logger.warning(
  "%s: %s launch in %s/%s failed: %s",
@@ -912,6 +992,7 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
  def _create_instance_model_for_job(
  project: ProjectModel,
  fleet_model: FleetModel,
+ compute_group_model: Optional[ComputeGroupModel],
  job_model: JobModel,
  job_provisioning_data: JobProvisioningData,
  offer: InstanceOfferWithAvailability,
@@ -931,6 +1012,8 @@ def _create_instance_model_for_job(
  name=f"{fleet_model.name}-{instance_num}",
  instance_num=instance_num,
  project=project,
+ fleet=fleet_model,
+ compute_group=compute_group_model,
  created_at=common_utils.get_current_datetime(),
  started_at=common_utils.get_current_datetime(),
  status=InstanceStatus.PROVISIONING,
@@ -1081,3 +1164,15 @@ async def _attach_volume(
  instance.volume_attachments.append(volume_attachment_model)

  volume_model.last_job_processed_at = common_utils.get_current_datetime()
+
+
+ def _get_job_models_for_jobs(
+ job_models: list[JobModel],
+ jobs: list[Job],
+ ) -> list[JobModel]:
+ """
+ Returns job models of latest submissions for a list of jobs.
+ Preserves jobs order.
+ """
+ id_to_job_model_map = {jm.id: jm for jm in job_models}
+ return [id_to_job_model_map[j.job_submissions[-1].id] for j in jobs]
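
Note: `_run_jobs_on_new_instances` only calls `compute.run_jobs` when the offer's backend is listed in `BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT`; otherwise it falls back to the single-job `run_job` path. The interface it relies on is defined in dstack/_internal/core/backends/base/compute.py (+20 -1 in this release), which is not included in this diff, so the sketch below is assembled from the calls above: the parameter names, the `Protocol` base, and the `InstanceOfferWithAvailability` import path are assumptions, not the actual definition. Per the file list (features.py +5, runpod/compute.py +157 -13), RunPod appears to be the first backend wired into this path.

# Illustrative sketch only -- not the actual dstack definition.
from typing import List, Protocol

from dstack._internal.core.backends.base.models import JobConfiguration
from dstack._internal.core.models.compute_groups import ComputeGroupProvisioningData
from dstack._internal.core.models.instances import InstanceOfferWithAvailability  # assumed path
from dstack._internal.core.models.runs import Run


class ComputeWithGroupProvisioningSupport(Protocol):
    """What the scheduler expects from a backend compute that can provision a
    whole replica (e.g. a multi-node cluster) as one compute group."""

    def run_jobs(
        self,
        run: Run,
        job_configurations: List[JobConfiguration],
        instance_offer: InstanceOfferWithAvailability,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ) -> ComputeGroupProvisioningData:
        # Must return one ComputeGroupProvisioningData whose
        # job_provisioning_datas align one-to-one with job_configurations,
        # since _process_submitted_job zips them with the replica's job models.
        ...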
dstack/_internal/server/migrations/env.py

@@ -6,7 +6,7 @@ from alembic import context
  from sqlalchemy import Connection, MetaData, text

  from dstack._internal.server.db import get_db
- from dstack._internal.server.models import BaseModel
+ from dstack._internal.server.models import BaseModel, EnumAsString

  config = context.config

@@ -21,6 +21,14 @@ def set_target_metadata(metadata: MetaData):
  target_metadata = metadata


+ def render_item(type_, obj, autogen_context):
+ """Apply custom rendering for selected items."""
+ if type_ == "type" and isinstance(obj, EnumAsString):
+ return f"sa.String(length={obj.length})"
+ # default rendering for other objects
+ return False
+
+
  def run_migrations_offline():
  """Run migrations in 'offline' mode.
  This configures the context with just a URL
@@ -35,8 +43,8 @@ def run_migrations_offline():
  target_metadata=target_metadata,
  literal_binds=True,
  dialect_opts={"paramstyle": "named"},
+ render_item=render_item,
  )
-
  with context.begin_transaction():
  context.run_migrations()

@@ -61,12 +69,22 @@ def run_migrations(connection: Connection):
  # https://alembic.sqlalchemy.org/en/latest/batch.html#dealing-with-referencing-foreign-keys
  if connection.dialect.name == "sqlite":
  connection.execute(text("PRAGMA foreign_keys=OFF;"))
+ elif connection.dialect.name == "postgresql":
+ # lock_timeout is needed so that migrations that acquire locks
+ # do not wait for locks forever, blocking live queries.
+ # Better to fail and retry a deployment.
+ connection.execute(text("SET lock_timeout='10s';"))
  connection.commit()
  context.configure(
  connection=connection,
  target_metadata=target_metadata,
  compare_type=True,
  render_as_batch=True,
+ render_item=render_item,
+ # Running each migration in a separate transaction.
+ # Running all migrations in one transaction may lead to deadlocks in HA deployments
+ # because lock ordering is not respected across all migrations.
+ transaction_per_migration=True,
  )
  with context.begin_transaction():
  context.run_migrations()
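
Note: the `render_item` hook changes what `alembic revision --autogenerate` writes for columns declared with dstack's custom `EnumAsString` type: instead of rendering the custom type (which would pull a server-models import into every generated migration), the column is emitted as a plain `sa.String`. A rough sketch; the "without the hook" behaviour is an assumption about Alembic's default type rendering, while the "with the hook" form matches the generated 7d1ec2b920ac migration below.

# Illustrative only: a column as it appears in an autogenerated migration
# once render_item is active (compare the "status" column in 7d1ec2b920ac below).
import sqlalchemy as sa

status_column = sa.Column("status", sa.String(length=100), nullable=False)

# Without the hook, autogenerate would render the custom EnumAsString type instead,
# forcing every migration to import dstack._internal.server.models
# (assumed default behaviour of Alembic's type rendering).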
dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py

@@ -0,0 +1,93 @@
+ """Add ComputeGroupModel
+
+ Revision ID: 7d1ec2b920ac
+ Revises: ff1d94f65b08
+ Create Date: 2025-10-21 16:01:23.739395
+
+ """
+
+ import sqlalchemy as sa
+ import sqlalchemy_utils
+ from alembic import op
+
+ import dstack._internal.server.models
+
+ # revision identifiers, used by Alembic.
+ revision = "7d1ec2b920ac"
+ down_revision = "ff1d94f65b08"
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.create_table(
+ "compute_groups",
+ sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+ sa.Column(
+ "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False
+ ),
+ sa.Column("fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+ sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+ sa.Column("status", sa.String(length=100), nullable=False),
+ sa.Column(
+ "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=False
+ ),
+ sa.Column("deleted", sa.Boolean(), nullable=False),
+ sa.Column("deleted_at", dstack._internal.server.models.NaiveDateTime(), nullable=True),
+ sa.Column("provisioning_data", sa.Text(), nullable=False),
+ sa.Column(
+ "first_termination_retry_at",
+ dstack._internal.server.models.NaiveDateTime(),
+ nullable=True,
+ ),
+ sa.Column(
+ "last_termination_retry_at",
+ dstack._internal.server.models.NaiveDateTime(),
+ nullable=True,
+ ),
+ sa.ForeignKeyConstraint(
+ ["fleet_id"], ["fleets.id"], name=op.f("fk_compute_groups_fleet_id_fleets")
+ ),
+ sa.ForeignKeyConstraint(
+ ["project_id"],
+ ["projects.id"],
+ name=op.f("fk_compute_groups_project_id_projects"),
+ ondelete="CASCADE",
+ ),
+ sa.PrimaryKeyConstraint("id", name=op.f("pk_compute_groups")),
+ )
+ with op.batch_alter_table("instances", schema=None) as batch_op:
+ batch_op.add_column(
+ sa.Column(
+ "compute_group_id",
+ sqlalchemy_utils.types.uuid.UUIDType(binary=False),
+ nullable=True,
+ )
+ )
+ batch_op.create_foreign_key(
+ batch_op.f("fk_instances_compute_group_id_compute_groups"),
+ "compute_groups",
+ ["compute_group_id"],
+ ["id"],
+ )
+
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
+ batch_op.add_column(sa.Column("waiting_master_job", sa.Boolean(), nullable=True))
+
+ # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+ # ### commands auto generated by Alembic - please adjust! ###
+ with op.batch_alter_table("jobs", schema=None) as batch_op:
+ batch_op.drop_column("waiting_master_job")
+
+ with op.batch_alter_table("instances", schema=None) as batch_op:
+ batch_op.drop_constraint(
+ batch_op.f("fk_instances_compute_group_id_compute_groups"), type_="foreignkey"
+ )
+ batch_op.drop_column("compute_group_id")
+
+ op.drop_table("compute_groups")
+ # ### end Alembic commands ###
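
Note: the migration adds `waiting_master_job` as a nullable Boolean on purpose. The scheduler filter shown earlier, `JobModel.waiting_master_job.is_not(True)`, compiles to SQL `waiting_master_job IS NOT TRUE`, which matches both `FALSE` and `NULL`; that keeps legacy jobs (created before this column existed, so `NULL`) eligible for the old one-by-one provisioning path while only `True` rows wait for the master job. A minimal standalone sketch of the predicate (only the table and column names are taken from the diff):

import sqlalchemy as sa

# Lightweight stand-in for the real JobModel table.
jobs = sa.table("jobs", sa.column("waiting_master_job", sa.Boolean()))

stmt = sa.select(jobs).where(jobs.c.waiting_master_job.is_not(True))
print(stmt)
# SELECT jobs.waiting_master_job
# FROM jobs
# WHERE jobs.waiting_master_job IS NOT true
# -> NULL (legacy jobs) and FALSE both pass; only TRUE rows are excluded.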
dstack/_internal/server/models.py

@@ -25,6 +25,7 @@ from sqlalchemy_utils import UUIDType
  from dstack._internal.core.errors import DstackError
  from dstack._internal.core.models.backends.base import BackendType
  from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model
+ from dstack._internal.core.models.compute_groups import ComputeGroupStatus
  from dstack._internal.core.models.fleets import FleetStatus
  from dstack._internal.core.models.gateways import GatewayStatus
  from dstack._internal.core.models.health import HealthStatus
@@ -448,6 +449,12 @@ class JobModel(BaseModel):
  # Whether the replica is registered to receive service requests.
  # Always `False` for non-service runs.
  registered: Mapped[bool] = mapped_column(Boolean, server_default=false())
+ # `waiting_master_job` is `True` for non-master jobs that have to wait
+ # for master processing before they can be processed.
+ # This allows updating all replica jobs even when only master is locked,
+ # e.g. to provision instances for all jobs when processing master.
+ # If not set, all jobs should be processed only one-by-one.
+ waiting_master_job: Mapped[Optional[bool]] = mapped_column(Boolean)


  class GatewayModel(BaseModel):
@@ -592,6 +599,9 @@ class InstanceModel(BaseModel):
  fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
  fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")

+ compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id"))
+ compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances")
+
  status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True)
  unreachable: Mapped[bool] = mapped_column(Boolean)

@@ -743,6 +753,35 @@ class PlacementGroupModel(BaseModel):
  provisioning_data: Mapped[Optional[str]] = mapped_column(Text)


+ class ComputeGroupModel(BaseModel):
+ __tablename__ = "compute_groups"
+
+ id: Mapped[uuid.UUID] = mapped_column(
+ UUIDType(binary=False), primary_key=True, default=uuid.uuid4
+ )
+
+ project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE"))
+ project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id])
+
+ fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id"))
+ fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id])
+
+ created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
+ status: Mapped[ComputeGroupStatus] = mapped_column(EnumAsString(ComputeGroupStatus, 100))
+ last_processed_at: Mapped[datetime] = mapped_column(
+ NaiveDateTime, default=get_current_datetime
+ )
+ deleted: Mapped[bool] = mapped_column(Boolean, default=False)
+ deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+
+ provisioning_data: Mapped[str] = mapped_column(Text)
+
+ first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+ last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+
+ instances: Mapped[List["InstanceModel"]] = relationship(back_populates="compute_group")
+
+
  class JobMetricsPoint(BaseModel):
  __tablename__ = "job_metrics_points"

dstack/_internal/server/routers/runs.py

@@ -1,6 +1,6 @@
- from typing import List, Tuple
+ from typing import Annotated, List, Optional, Tuple, cast

- from fastapi import APIRouter, Depends
+ from fastapi import APIRouter, Depends, Request
  from sqlalchemy.ext.asyncio import AsyncSession

  from dstack._internal.core.errors import ResourceNotExistsError
@@ -35,6 +35,11 @@ project_router = APIRouter(
  )


+ def use_legacy_default_working_dir(request: Request) -> bool:
+ client_release = cast(Optional[tuple[int, ...]], request.state.client_release)
+ return client_release is not None and client_release < (0, 19, 27)
+
+
  @root_router.post(
  "/list",
  response_model=List[Run],
@@ -103,8 +108,9 @@ async def get_run(
  )
  async def get_plan(
  body: GetRunPlanRequest,
- session: AsyncSession = Depends(get_session),
- user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+ session: Annotated[AsyncSession, Depends(get_session)],
+ user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())],
+ legacy_default_working_dir: Annotated[bool, Depends(use_legacy_default_working_dir)],
  ):
  """
  Returns a run plan for the given run spec.
@@ -119,6 +125,7 @@ async def get_plan(
  user=user,
  run_spec=body.run_spec,
  max_offers=body.max_offers,
+ legacy_default_working_dir=legacy_default_working_dir,
  )
  return CustomORJSONResponse(run_plan)

@@ -129,8 +136,9 @@ async def get_plan(
  )
  async def apply_plan(
  body: ApplyRunPlanRequest,
- session: AsyncSession = Depends(get_session),
- user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+ session: Annotated[AsyncSession, Depends(get_session)],
+ user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())],
+ legacy_default_working_dir: Annotated[bool, Depends(use_legacy_default_working_dir)],
  ):
  """
  Creates a new run or updates an existing run.
@@ -148,6 +156,7 @@ async def apply_plan(
  project=project,
  plan=body.plan,
  force=body.force,
+ legacy_default_working_dir=legacy_default_working_dir,
  )
  )
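
Note: `use_legacy_default_working_dir` gates the new default `working_dir` behaviour on the client's release: anything older than 0.19.27 keeps the legacy default. The dependency reads an already-parsed version tuple from `request.state.client_release`; per the file list, the parsing lives in the new dstack/_internal/utils/version.py (+22) and is wired up in app.py (+14 -2), neither of which is shown here. The sketch below is therefore an assumption about how such a tuple might be produced; only the `(0, 19, 27)` comparison is taken from the code above.

# Illustrative sketch: the real helper lives in dstack/_internal/utils/version.py
# and is not shown in this diff. The parsing below is an assumption; only the
# tuple comparison against (0, 19, 27) comes from use_legacy_default_working_dir.
from typing import Optional


def parse_client_release(version: Optional[str]) -> Optional[tuple[int, ...]]:
    """Parse '0.19.26' -> (0, 19, 26); return None for dev/unknown versions."""
    if not version:
        return None
    try:
        return tuple(int(p) for p in version.split(".")[:3])
    except ValueError:
        return None


def use_legacy_default_working_dir(client_release: Optional[tuple[int, ...]]) -> bool:
    # Clients older than 0.19.27 still expect the legacy default working_dir.
    return client_release is not None and client_release < (0, 19, 27)


assert use_legacy_default_working_dir(parse_client_release("0.19.26")) is True
assert use_legacy_default_working_dir(parse_client_release("0.19.27")) is False
assert use_legacy_default_working_dir(parse_client_release(None)) is False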