dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of dstack might be problematic.
- dstack/_internal/cli/services/configurators/run.py +1 -1
- dstack/_internal/core/backends/base/compute.py +20 -1
- dstack/_internal/core/backends/base/models.py +10 -0
- dstack/_internal/core/backends/base/offers.py +1 -0
- dstack/_internal/core/backends/features.py +5 -0
- dstack/_internal/core/backends/nebius/compute.py +28 -16
- dstack/_internal/core/backends/nebius/configurator.py +1 -1
- dstack/_internal/core/backends/nebius/models.py +4 -0
- dstack/_internal/core/backends/nebius/resources.py +41 -20
- dstack/_internal/core/backends/runpod/api_client.py +245 -59
- dstack/_internal/core/backends/runpod/compute.py +157 -13
- dstack/_internal/core/models/compute_groups.py +39 -0
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/profiles.py +3 -1
- dstack/_internal/core/models/runs.py +3 -0
- dstack/_internal/server/app.py +14 -2
- dstack/_internal/server/background/__init__.py +7 -0
- dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
- dstack/_internal/server/background/tasks/process_instances.py +81 -49
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
- dstack/_internal/server/migrations/env.py +20 -2
- dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
- dstack/_internal/server/models.py +39 -0
- dstack/_internal/server/routers/runs.py +15 -6
- dstack/_internal/server/services/compute_groups.py +22 -0
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +13 -0
- dstack/_internal/server/services/jobs/configurators/base.py +3 -2
- dstack/_internal/server/services/requirements/combine.py +1 -0
- dstack/_internal/server/services/runs.py +17 -3
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/server/utils/routers.py +18 -20
- dstack/_internal/settings.py +4 -1
- dstack/_internal/utils/version.py +22 -0
- dstack/version.py +1 -1
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
- dstack/_internal/core/backends/nebius/fabrics.py +0 -49
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -3,16 +3,22 @@ import itertools
 import math
 import uuid
 from datetime import datetime, timedelta
-from typing import List, Optional
+from typing import List, Optional, Union

 from sqlalchemy import and_, func, not_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload

 from dstack._internal.core.backends.base.backend import Backend
-from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport
+from dstack._internal.core.backends.base.compute import (
+    ComputeWithGroupProvisioningSupport,
+    ComputeWithVolumeSupport,
+)
+from dstack._internal.core.backends.base.models import JobConfiguration
+from dstack._internal.core.backends.features import BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT
 from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.common import NetworkMode
+from dstack._internal.core.models.compute_groups import ComputeGroupProvisioningData
 from dstack._internal.core.models.fleets import (
     Fleet,
     FleetConfiguration,

@@ -42,8 +48,10 @@ from dstack._internal.core.models.runs import (
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
 from dstack._internal.server import settings
+from dstack._internal.server.background.tasks.process_compute_groups import ComputeGroupStatus
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
+    ComputeGroupModel,
     FleetModel,
     InstanceModel,
     JobModel,

@@ -69,6 +77,7 @@ from dstack._internal.server.services.instances import (
 from dstack._internal.server.services.jobs import (
     check_can_attach_job_volumes,
     find_job,
+    find_jobs,
     get_instances_ids_with_detaching_volumes,
     get_job_configured_volume_models,
     get_job_configured_volumes,

@@ -132,6 +141,7 @@ async def _process_next_submitted_job():
         .join(JobModel.run)
         .where(
             JobModel.status == JobStatus.SUBMITTED,
+            JobModel.waiting_master_job.is_not(True),
             JobModel.id.not_in(lockset),
         )
         .options(load_only(JobModel.id))
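Note on the filter added above: waiting_master_job.is_not(True) renders on the default dialect as "waiting_master_job IS NOT true", so jobs where the new column is NULL (every job created before this release, and legacy jobs that never set the flag) as well as jobs where it is false are still picked up; only jobs explicitly marked as waiting for their master job are skipped. A minimal standalone check of that SQL semantics (illustration only, not dstack code):

    from sqlalchemy import Boolean, column, select

    # Same operator as in the query above, applied to a free-standing column.
    expr = column("waiting_master_job", Boolean).is_not(True)
    print(select(column("id")).where(expr))
    # Prints roughly: SELECT id WHERE waiting_master_job IS NOT true
    # NULL and false rows satisfy the predicate; only rows where the flag is true are excluded.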
@@ -190,6 +200,8 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     run_spec = run.run_spec
     run_profile = run_spec.merged_profile
     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
+    replica_jobs = find_jobs(run.jobs, replica_num=job_model.replica_num)
+    replica_job_models = _get_job_models_for_jobs(run_model.jobs, replica_jobs)
     multinode = job.job_spec.jobs_per_replica > 1

     # Master job chooses fleet for the run.

@@ -323,6 +335,10 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         return
     # If no instances were locked, we can proceed in the same transaction.

+    # TODO: Volume attachment for compute groups is not yet supported since
+    # currently supported compute groups (e.g. Runpod) don't need explicit volume attachment.
+    need_volume_attachment = True
+
     if job_model.instance is not None:
         res = await session.execute(
             select(InstanceModel)

@@ -333,7 +349,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         instance = res.unique().scalar_one()
         job_model.status = JobStatus.PROVISIONING
     else:
-        # Assigned no instance, create a new one
         if run_profile.creation_policy == CreationPolicy.REUSE:
             logger.debug("%s: reuse instance failed", fmt(job_model))
             job_model.status = JobStatus.TERMINATING

@@ -342,13 +357,23 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
            await session.commit()
            return

-[2 lines not captured in this diff view]
+        jobs_to_provision = [job]
+        if (
+            multinode
+            and job.job_spec.job_num == 0
+            # job_model.waiting_master_job is not set for legacy jobs.
+            # In this case compute group provisioning not supported
+            # and jobs always provision one-by-one.
+            and job_model.waiting_master_job is not None
+        ):
+            jobs_to_provision = replica_jobs
+
+        run_job_result = await _run_jobs_on_new_instances(
             project=project,
             fleet_model=fleet_model,
             job_model=job_model,
             run=run,
-[1 line not captured in this diff view]
+            jobs=jobs_to_provision,
             project_ssh_public_key=project.ssh_public_key,
             project_ssh_private_key=project.ssh_private_key,
             master_job_provisioning_data=master_job_provisioning_data,

@@ -362,72 +387,102 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
            await session.commit()
            return

-        logger.info("%s: now is provisioning a new instance", fmt(job_model))
-        job_provisioning_data, offer, effective_profile, _ = run_job_result
-        job_model.job_provisioning_data = job_provisioning_data.json()
-        job_model.status = JobStatus.PROVISIONING
         if fleet_model is None:
             fleet_model = await _create_fleet_model_for_job(
                 session=session,
                 project=project,
                 run=run,
             )
-[36 lines not captured in this diff view]
-    # Take lock to prevent attaching volumes that are to be deleted.
-    # If the volume was deleted before the lock, the volume will fail to attach and the job will fail.
-    await session.execute(
-        select(VolumeModel)
-        .where(VolumeModel.id.in_(volumes_ids))
-        .options(joinedload(VolumeModel.user).load_only(UserModel.name))
-        .order_by(VolumeModel.id)  # take locks in order
-        .with_for_update(key_share=True, of=VolumeModel)
-    )
-    async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids):
-        if len(volume_models) > 0:
-            await _attach_volumes(
+            session.add(fleet_model)
+
+        provisioning_data, offer, effective_profile, _ = run_job_result
+        compute_group_model = None
+        if isinstance(provisioning_data, ComputeGroupProvisioningData):
+            need_volume_attachment = False
+            provisioned_jobs = jobs_to_provision
+            jpds = provisioning_data.job_provisioning_datas
+            compute_group_model = ComputeGroupModel(
+                id=uuid.uuid4(),
+                project=project,
+                fleet=fleet_model,
+                status=ComputeGroupStatus.RUNNING,
+                provisioning_data=provisioning_data.json(),
+            )
+            session.add(compute_group_model)
+        else:
+            provisioned_jobs = [job]
+            jpds = [provisioning_data]
+            if len(jobs_to_provision) > 1:
+                # Tried provisioning multiple jobs but provisioned only one.
+                # Allow other jobs to provision one-by-one.
+                for replica_job_model in replica_job_models:
+                    replica_job_model.waiting_master_job = False
+
+        logger.info("%s: provisioned %s new instance(s)", fmt(job_model), len(provisioned_jobs))
+        provisioned_job_models = _get_job_models_for_jobs(run_model.jobs, provisioned_jobs)
+        instance = None  # Instance for attaching volumes in case of single job provisioned
+        for provisioned_job_model, jpd in zip(provisioned_job_models, jpds):
+            provisioned_job_model.job_provisioning_data = jpd.json()
+            provisioned_job_model.status = JobStatus.PROVISIONING
+            # FIXME: Fleet is not locked which may lead to duplicate instance_num.
+            # This is currently hard to fix without locking the fleet for entire provisioning duration.
+            # Processing should be done in multiple steps so that
+            # InstanceModel is created before provisioning.
+            instance_num = await _get_next_instance_num(
                 session=session,
+                fleet_model=fleet_model,
+            )
+            instance = _create_instance_model_for_job(
                 project=project,
-[3 lines not captured in this diff view]
+                fleet_model=fleet_model,
+                compute_group_model=compute_group_model,
+                job_model=provisioned_job_model,
+                job_provisioning_data=jpd,
+                offer=offer,
+                instance_num=instance_num,
+                profile=effective_profile,
             )
-[2 lines not captured in this diff view]
+            provisioned_job_model.job_runtime_data = _prepare_job_runtime_data(
+                offer, multinode
+            ).json()
+            logger.info(
+                "Created a new instance %s for job %s",
+                instance.name,
+                provisioned_job_model.job_name,
+                extra={
+                    "instance_name": instance.name,
+                    "instance_status": InstanceStatus.PROVISIONING.value,
+                },
+            )
+            session.add(instance)
+            provisioned_job_model.used_instance_id = instance.id
+            provisioned_job_model.last_processed_at = common_utils.get_current_datetime()
+
+    volumes_ids = sorted([v.id for vs in volume_models for v in vs])
+    if need_volume_attachment:
+        # TODO: Lock instances for attaching volumes?
+        # Take lock to prevent attaching volumes that are to be deleted.
+        # If the volume was deleted before the lock, the volume will fail to attach and the job will fail.
+        await session.execute(
+            select(VolumeModel)
+            .where(VolumeModel.id.in_(volumes_ids))
+            .options(joinedload(VolumeModel.user).load_only(UserModel.name))
+            .order_by(VolumeModel.id)  # take locks in order
+            .with_for_update(key_share=True, of=VolumeModel)
+        )
+        async with get_locker(get_db().dialect_name).lock_ctx(
+            VolumeModel.__tablename__, volumes_ids
+        ):
+            if len(volume_models) > 0:
+                assert instance is not None
+                await _attach_volumes(
+                    session=session,
+                    project=project,
+                    job_model=job_model,
+                    instance=instance,
+                    volume_models=volume_models,
+                )
+    await session.commit()


 async def _select_fleet_models(

@@ -553,10 +608,9 @@ async def _find_optimal_fleet_with_offers(
         except ValueError:
             fleet_backend_offers = []
         else:
-            multinode
-[2 lines not captured in this diff view]
-            )
+            # Handle multinode for old jobs that don't have requirements.multinode set.
+            # TODO: Drop multinode param.
+            multinode = requirements.multinode or job.job_spec.jobs_per_replica > 1
             fleet_backend_offers = await get_offers_by_requirements(
                 project=project,
                 profile=profile,

@@ -728,19 +782,33 @@ async def _assign_job_to_fleet_instance(
     return instance


-async def _run_job_on_new_instance(
+async def _run_jobs_on_new_instances(
     project: ProjectModel,
     job_model: JobModel,
     run: Run,
-[1 line not captured in this diff view]
+    jobs: list[Job],
     project_ssh_public_key: str,
     project_ssh_private_key: str,
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
-    volumes: Optional[
+    volumes: Optional[list[list[Volume]]] = None,
     fleet_model: Optional[FleetModel] = None,
-) -> Optional[
+) -> Optional[
+    tuple[
+        Union[JobProvisioningData, ComputeGroupProvisioningData],
+        InstanceOfferWithAvailability,
+        Profile,
+        Requirements,
+    ]
+]:
+    """
+    Provisions an instance for a job or a compute group for multiple jobs and runs the jobs.
+    Even when multiple jobs are passes, it may still provision only one instance
+    and run only the master job in case there are no offers supporting cluster groups.
+    Other jobs should be provisioned one-by-one later.
+    """
     if volumes is None:
         volumes = []
+    job = jobs[0]
     profile = run.run_spec.merged_profile
     requirements = job.job_spec.requirements
     fleet = None

@@ -758,9 +826,7 @@ async def _run_job_on_new_instance(
         return None
     # TODO: Respect fleet provisioning properties such as tags

-    multinode = job.job_spec.jobs_per_replica > 1
-        fleet is not None and fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
-    )
+    multinode = requirements.multinode or job.job_spec.jobs_per_replica > 1
     offers = await get_offers_by_requirements(
         project=project,
         profile=profile,

@@ -784,17 +850,31 @@ async def _run_job_on_new_instance(
             offer.price,
         )
         offer_volumes = _get_offer_volumes(volumes, offer)
+        job_configurations = [JobConfiguration(job=j, volumes=offer_volumes) for j in jobs]
+        compute = backend.compute()
         try:
-[10 lines not captured in this diff view]
+            if len(jobs) > 1 and offer.backend in BACKENDS_WITH_GROUP_PROVISIONING_SUPPORT:
+                assert isinstance(compute, ComputeWithGroupProvisioningSupport)
+                cgpd = await common_utils.run_async(
+                    compute.run_jobs,
+                    run,
+                    job_configurations,
+                    offer,
+                    project_ssh_public_key,
+                    project_ssh_private_key,
+                )
+                return cgpd, offer, profile, requirements
+            else:
+                jpd = await common_utils.run_async(
+                    compute.run_job,
+                    run,
+                    job,
+                    offer,
+                    project_ssh_public_key,
+                    project_ssh_private_key,
+                    offer_volumes,
+                )
+                return jpd, offer, profile, requirements
         except BackendError as e:
             logger.warning(
                 "%s: %s launch in %s/%s failed: %s",

@@ -912,6 +992,7 @@ async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel)
 def _create_instance_model_for_job(
     project: ProjectModel,
     fleet_model: FleetModel,
+    compute_group_model: Optional[ComputeGroupModel],
     job_model: JobModel,
     job_provisioning_data: JobProvisioningData,
     offer: InstanceOfferWithAvailability,

@@ -931,6 +1012,8 @@ def _create_instance_model_for_job(
         name=f"{fleet_model.name}-{instance_num}",
         instance_num=instance_num,
         project=project,
+        fleet=fleet_model,
+        compute_group=compute_group_model,
         created_at=common_utils.get_current_datetime(),
         started_at=common_utils.get_current_datetime(),
         status=InstanceStatus.PROVISIONING,

@@ -1081,3 +1164,15 @@ async def _attach_volume(
     instance.volume_attachments.append(volume_attachment_model)

     volume_model.last_job_processed_at = common_utils.get_current_datetime()
+
+
+def _get_job_models_for_jobs(
+    job_models: list[JobModel],
+    jobs: list[Job],
+) -> list[JobModel]:
+    """
+    Returns job models of latest submissions for a list of jobs.
+    Preserves jobs order.
+    """
+    id_to_job_model_map = {jm.id: jm for jm in job_models}
+    return [id_to_job_model_map[j.job_submissions[-1].id] for j in jobs]
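Note: the group-provisioning branch above calls compute.run_jobs(...) after asserting isinstance(compute, ComputeWithGroupProvisioningSupport). The class itself is defined in dstack/_internal/core/backends/base/compute.py (changed +20 -1 in this release) and is not shown in this diff, so the following is only a rough sketch inferred from the call sites above; the parameter names, annotations, and docstring are assumptions, not the actual definition:

    from abc import ABC, abstractmethod

    class ComputeWithGroupProvisioningSupport(ABC):
        """Computes that can provision a whole replica as one compute group (e.g. RunPod)."""

        @abstractmethod
        def run_jobs(
            self,
            run,                  # Run (assumed name)
            job_configurations,   # list of JobConfiguration(job=..., volumes=...), one per job
            instance_offer,       # InstanceOfferWithAvailability (assumed name)
            project_ssh_public_key: str,
            project_ssh_private_key: str,
        ) -> "ComputeGroupProvisioningData":
            # Per the caller, expected to return provisioning data for the whole group:
            # its .job_provisioning_datas list is zipped with the provisioned jobs in
            # _process_submitted_job(), and its .json() is persisted as
            # ComputeGroupModel.provisioning_data.
            ...

The positional argument order matches the common_utils.run_async(...) call in the hunk above; everything else is an inference.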
dstack/_internal/server/migrations/env.py

@@ -6,7 +6,7 @@ from alembic import context
 from sqlalchemy import Connection, MetaData, text

 from dstack._internal.server.db import get_db
-from dstack._internal.server.models import BaseModel
+from dstack._internal.server.models import BaseModel, EnumAsString

 config = context.config


@@ -21,6 +21,14 @@ def set_target_metadata(metadata: MetaData):
     target_metadata = metadata


+def render_item(type_, obj, autogen_context):
+    """Apply custom rendering for selected items."""
+    if type_ == "type" and isinstance(obj, EnumAsString):
+        return f"sa.String(length={obj.length})"
+    # default rendering for other objects
+    return False
+
+
 def run_migrations_offline():
     """Run migrations in 'offline' mode.
     This configures the context with just a URL

@@ -35,8 +43,8 @@ def run_migrations_offline():
         target_metadata=target_metadata,
         literal_binds=True,
         dialect_opts={"paramstyle": "named"},
+        render_item=render_item,
     )
-
     with context.begin_transaction():
         context.run_migrations()


@@ -61,12 +69,22 @@ def run_migrations(connection: Connection):
     # https://alembic.sqlalchemy.org/en/latest/batch.html#dealing-with-referencing-foreign-keys
     if connection.dialect.name == "sqlite":
         connection.execute(text("PRAGMA foreign_keys=OFF;"))
+    elif connection.dialect.name == "postgresql":
+        # lock_timeout is needed so that migrations that acquire locks
+        # do not wait for locks forever, blocking live queries.
+        # Better to fail and retry a deployment.
+        connection.execute(text("SET lock_timeout='10s';"))
     connection.commit()
     context.configure(
         connection=connection,
         target_metadata=target_metadata,
         compare_type=True,
         render_as_batch=True,
+        render_item=render_item,
+        # Running each migration in a separate transaction.
+        # Running all migrations in one transaction may lead to deadlocks in HA deployments
+        # because lock ordering is not respected across all migrations.
+        transaction_per_migration=True,
     )
     with context.begin_transaction():
         context.run_migrations()
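Note: the render_item hook changes how EnumAsString columns are rendered when Alembic autogenerates migrations: instead of referencing the server-side enum wrapper type, the column is emitted as a plain string of the declared length, so generated migration files do not need to import EnumAsString or the enum classes. The effect is visible in the new migration below, where the status column of the compute_groups table (declared in models.py as EnumAsString(ComputeGroupStatus, 100)) comes out as:

    sa.Column("status", sa.String(length=100), nullable=False)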
dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py (new file)

@@ -0,0 +1,93 @@
+"""Add ComputeGroupModel
+
+Revision ID: 7d1ec2b920ac
+Revises: ff1d94f65b08
+Create Date: 2025-10-21 16:01:23.739395
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "7d1ec2b920ac"
+down_revision = "ff1d94f65b08"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "compute_groups",
+        sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column(
+            "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False
+        ),
+        sa.Column("fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("status", sa.String(length=100), nullable=False),
+        sa.Column(
+            "last_processed_at", dstack._internal.server.models.NaiveDateTime(), nullable=False
+        ),
+        sa.Column("deleted", sa.Boolean(), nullable=False),
+        sa.Column("deleted_at", dstack._internal.server.models.NaiveDateTime(), nullable=True),
+        sa.Column("provisioning_data", sa.Text(), nullable=False),
+        sa.Column(
+            "first_termination_retry_at",
+            dstack._internal.server.models.NaiveDateTime(),
+            nullable=True,
+        ),
+        sa.Column(
+            "last_termination_retry_at",
+            dstack._internal.server.models.NaiveDateTime(),
+            nullable=True,
+        ),
+        sa.ForeignKeyConstraint(
+            ["fleet_id"], ["fleets.id"], name=op.f("fk_compute_groups_fleet_id_fleets")
+        ),
+        sa.ForeignKeyConstraint(
+            ["project_id"],
+            ["projects.id"],
+            name=op.f("fk_compute_groups_project_id_projects"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_compute_groups")),
+    )
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "compute_group_id",
+                sqlalchemy_utils.types.uuid.UUIDType(binary=False),
+                nullable=True,
+            )
+        )
+        batch_op.create_foreign_key(
+            batch_op.f("fk_instances_compute_group_id_compute_groups"),
+            "compute_groups",
+            ["compute_group_id"],
+            ["id"],
+        )
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("waiting_master_job", sa.Boolean(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("waiting_master_job")
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_constraint(
+            batch_op.f("fk_instances_compute_group_id_compute_groups"), type_="foreignkey"
+        )
+        batch_op.drop_column("compute_group_id")
+
+    op.drop_table("compute_groups")
+    # ### end Alembic commands ###
dstack/_internal/server/models.py

@@ -25,6 +25,7 @@ from sqlalchemy_utils import UUIDType
 from dstack._internal.core.errors import DstackError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreConfig, generate_dual_core_model
+from dstack._internal.core.models.compute_groups import ComputeGroupStatus
 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.core.models.gateways import GatewayStatus
 from dstack._internal.core.models.health import HealthStatus

@@ -448,6 +449,12 @@ class JobModel(BaseModel):
     # Whether the replica is registered to receive service requests.
     # Always `False` for non-service runs.
     registered: Mapped[bool] = mapped_column(Boolean, server_default=false())
+    # `waiting_master_job` is `True` for non-master jobs that have to wait
+    # for master processing before they can be processed.
+    # This allows updating all replica jobs even when only master is locked,
+    # e.g. to provision instances for all jobs when processing master.
+    # If not set, all jobs should be processed only one-by-one.
+    waiting_master_job: Mapped[Optional[bool]] = mapped_column(Boolean)


 class GatewayModel(BaseModel):

@@ -592,6 +599,9 @@ class InstanceModel(BaseModel):
     fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"))
     fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances")

+    compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id"))
+    compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances")
+
     status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True)
     unreachable: Mapped[bool] = mapped_column(Boolean)


@@ -743,6 +753,35 @@ class PlacementGroupModel(BaseModel):
     provisioning_data: Mapped[Optional[str]] = mapped_column(Text)


+class ComputeGroupModel(BaseModel):
+    __tablename__ = "compute_groups"
+
+    id: Mapped[uuid.UUID] = mapped_column(
+        UUIDType(binary=False), primary_key=True, default=uuid.uuid4
+    )
+
+    project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE"))
+    project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id])
+
+    fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id"))
+    fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id])
+
+    created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
+    status: Mapped[ComputeGroupStatus] = mapped_column(EnumAsString(ComputeGroupStatus, 100))
+    last_processed_at: Mapped[datetime] = mapped_column(
+        NaiveDateTime, default=get_current_datetime
+    )
+    deleted: Mapped[bool] = mapped_column(Boolean, default=False)
+    deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+
+    provisioning_data: Mapped[str] = mapped_column(Text)
+
+    first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+    last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
+
+    instances: Mapped[List["InstanceModel"]] = relationship(back_populates="compute_group")
+
+
 class JobMetricsPoint(BaseModel):
     __tablename__ = "job_metrics_points"

dstack/_internal/server/routers/runs.py

@@ -1,6 +1,6 @@
-from typing import List, Tuple
+from typing import Annotated, List, Optional, Tuple, cast

-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, Request
 from sqlalchemy.ext.asyncio import AsyncSession

 from dstack._internal.core.errors import ResourceNotExistsError

@@ -35,6 +35,11 @@ project_router = APIRouter(
 )


+def use_legacy_default_working_dir(request: Request) -> bool:
+    client_release = cast(Optional[tuple[int, ...]], request.state.client_release)
+    return client_release is not None and client_release < (0, 19, 27)
+
+
 @root_router.post(
     "/list",
     response_model=List[Run],
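Note: use_legacy_default_working_dir keys the default working-directory behavior off the client version. The sketch below only illustrates the comparison; it assumes request.state.client_release is a parsed client version tuple (or None) populated by server middleware (the app.py and utils/version.py changes listed in this release are presumably related, but that code is not shown here):

    from typing import Optional

    def legacy_for(client_release: Optional[tuple]) -> bool:
        # Same comparison as the dependency above, extracted for illustration.
        return client_release is not None and client_release < (0, 19, 27)

    assert legacy_for((0, 19, 26)) is True   # older clients keep the legacy default working dir
    assert legacy_for((0, 19, 27)) is False  # 0.19.27+ clients get the new default
    assert legacy_for(None) is False         # unknown or missing client version treated as current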
@@ -103,8 +108,9 @@ async def get_run(
 )
 async def get_plan(
     body: GetRunPlanRequest,
-    session: AsyncSession
-    user_project:
+    session: Annotated[AsyncSession, Depends(get_session)],
+    user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())],
+    legacy_default_working_dir: Annotated[bool, Depends(use_legacy_default_working_dir)],
 ):
     """
     Returns a run plan for the given run spec.

@@ -119,6 +125,7 @@ async def get_plan(
         user=user,
         run_spec=body.run_spec,
         max_offers=body.max_offers,
+        legacy_default_working_dir=legacy_default_working_dir,
     )
     return CustomORJSONResponse(run_plan)


@@ -129,8 +136,9 @@ async def get_plan(
 )
 async def apply_plan(
     body: ApplyRunPlanRequest,
-    session: AsyncSession
-    user_project:
+    session: Annotated[AsyncSession, Depends(get_session)],
+    user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())],
+    legacy_default_working_dir: Annotated[bool, Depends(use_legacy_default_working_dir)],
 ):
     """
     Creates a new run or updates an existing run.

@@ -148,6 +156,7 @@ async def apply_plan(
             project=project,
             plan=body.plan,
             force=body.force,
+            legacy_default_working_dir=legacy_default_working_dir,
         )
     )
