dstack 0.18.41__py3-none-any.whl → 0.18.42__py3-none-any.whl
- dstack/_internal/cli/utils/volume.py +9 -0
- dstack/_internal/core/backends/aws/compute.py +2 -1
- dstack/_internal/core/backends/gcp/compute.py +2 -1
- dstack/_internal/core/models/runs.py +3 -3
- dstack/_internal/core/models/volumes.py +23 -0
- dstack/_internal/server/background/tasks/process_instances.py +2 -3
- dstack/_internal/server/background/tasks/process_running_jobs.py +4 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +13 -2
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +17 -19
- dstack/_internal/server/services/fleets.py +5 -1
- dstack/_internal/server/services/jobs/__init__.py +4 -4
- dstack/_internal/server/services/offers.py +7 -7
- dstack/_internal/server/services/pools.py +3 -3
- dstack/_internal/server/services/runner/client.py +8 -5
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/testing/common.py +13 -9
- dstack/version.py +1 -1
- {dstack-0.18.41.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
- {dstack-0.18.41.dist-info → dstack-0.18.42.dist-info}/RECORD +33 -31
- tests/_internal/server/background/tasks/test_process_running_jobs.py +1 -0
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +5 -3
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +11 -6
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +105 -1
- {dstack-0.18.41.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0
dstack/_internal/cli/utils/volume.py

@@ -22,6 +22,8 @@ def get_volumes_table(
     if verbose:
         table.add_column("REGION")
     table.add_column("STATUS")
+    if verbose:
+        table.add_column("ATTACHED")
     table.add_column("CREATED")
     if verbose:
         table.add_column("ERROR")
@@ -37,11 +39,18 @@ get_volumes_table(
             and volume.provisioning_data.availability_zone is not None
         ):
             region += f" ({volume.provisioning_data.availability_zone})"
+        attached = "-"
+        if volume.attachments is not None:
+            attached = ", ".join(
+                {va.instance.fleet_name for va in volume.attachments if va.instance.fleet_name}
+            )
+            attached = attached or "-"
         row = {
             "NAME": volume.name,
             "BACKEND": backend,
             "REGION": region,
             "STATUS": volume.status,
+            "ATTACHED": attached,
             "CREATED": format_date(volume.created_at),
             "ERROR": volume.status_message,
         }
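The ATTACHED cell collects fleet names across all attachments with a set comprehension, so several instances from the same fleet collapse into a single entry. A minimal, runnable sketch of that dedup behavior, using hypothetical stand-in classes rather than dstack's real models:

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeInstance:  # stand-in for dstack's VolumeInstance
    fleet_name: Optional[str]


@dataclass
class FakeAttachment:  # stand-in for dstack's VolumeAttachment
    instance: FakeInstance


attachments = [
    FakeAttachment(FakeInstance("my-fleet")),
    FakeAttachment(FakeInstance("my-fleet")),  # same fleet twice, deduplicated by the set
    FakeAttachment(FakeInstance(None)),        # no fleet name, filtered out
]
# Same expression as in get_volumes_table: a set comprehension dedups fleet names.
attached = ", ".join({a.instance.fleet_name for a in attachments if a.instance.fleet_name})
print(attached or "-")  # -> my-fleet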
dstack/_internal/core/backends/aws/compute.py

@@ -635,11 +635,12 @@ class AWSCompute(Compute):
         ec2_client = self.session.client("ec2", region_name=volume.configuration.region)

         logger.debug("Detaching EBS volume %s from instance %s", volume.volume_id, instance_id)
+        attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id))
         try:
             ec2_client.detach_volume(
                 VolumeId=volume.volume_id,
                 InstanceId=instance_id,
-                Device=get_or_error(volume.attachment_data).device_name,
+                Device=attachment_data.device_name,
                 Force=force,
             )
         except botocore.exceptions.ClientError as e:
dstack/_internal/core/backends/gcp/compute.py

@@ -666,6 +666,7 @@ class GCPCompute(Compute):
             instance_id,
         )
         zone = get_or_error(volume.provisioning_data).availability_zone
+        attachment_data = get_or_error(volume.get_attachment_data_for_instance(instance_id))
        # This method has no information if the instance is a TPU or a VM,
        # so we first try to see if there is a TPU with such name
        try:

@@ -694,7 +695,7 @@ class GCPCompute(Compute):
                 project=self.config.project_id,
                 zone=get_or_error(volume.provisioning_data).availability_zone,
                 instance=instance_id,
-                device_name=get_or_error(volume.attachment_data).device_name,
+                device_name=attachment_data.device_name,
             )
             gcp_resources.wait_for_extended_operation(operation, "persistent disk detachment")
             logger.debug(
dstack/_internal/core/models/runs.py

@@ -150,9 +150,9 @@ class JobTerminationReason(str, Enum):
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
     resources: ResourcesSpec
-    max_price: Optional[float]
-    spot: Optional[bool]
-    reservation: Optional[str]
+    max_price: Optional[float] = None
+    spot: Optional[bool] = None
+    reservation: Optional[str] = None

     def pretty_format(self, resources_only: bool = False):
         res = self.resources.pretty_format()
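The explicit = None defaults matter because Optional[...] alone does not make a pydantic field optional under pydantic v2 (v1 auto-defaulted such fields to None). A minimal sketch of the difference, assuming CoreModel behaves like pydantic's BaseModel:

from typing import Optional

from pydantic import BaseModel, ValidationError


class Before(BaseModel):
    # Pydantic v1 implicitly defaults Optional fields to None;
    # pydantic v2 treats this field as required (its value may still be None).
    max_price: Optional[float]


class After(BaseModel):
    max_price: Optional[float] = None  # optional under both v1 and v2


print(After())  # max_price=None
try:
    Before()  # raises under pydantic v2: field required
except ValidationError as exc:
    print(exc)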
dstack/_internal/core/models/volumes.py

@@ -71,6 +71,18 @@ class VolumeAttachmentData(CoreModel):
     device_name: Optional[str] = None


+class VolumeInstance(CoreModel):
+    name: str
+    fleet_name: Optional[str] = None
+    instance_num: int
+    instance_id: Optional[str] = None
+
+
+class VolumeAttachment(CoreModel):
+    instance: VolumeInstance
+    attachment_data: Optional[VolumeAttachmentData] = None
+
+
 class Volume(CoreModel):
     id: uuid.UUID
     name: str
@@ -86,8 +98,19 @@ class Volume(CoreModel):
     deleted: bool
     volume_id: Optional[str] = None  # id of the volume in the cloud
     provisioning_data: Optional[VolumeProvisioningData] = None
+    attachments: Optional[List[VolumeAttachment]] = None
+    # attachment_data is deprecated in favor of attachments.
+    # It's only set for volumes that were attached before attachments were introduced.
     attachment_data: Optional[VolumeAttachmentData] = None

+    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]:
+        if self.attachments is not None:
+            for attachment in self.attachments:
+                if attachment.instance.instance_id == instance_id:
+                    return attachment.attachment_data
+        # volume was attached before attachments were introduced
+        return self.attachment_data
+

 class VolumePlan(CoreModel):
     project_name: str
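get_attachment_data_for_instance first looks for a per-instance attachment and only falls back to the deprecated volume-level field for volumes attached before the upgrade. A small runnable sketch of both paths, using hypothetical plain dataclasses in place of dstack's pydantic models:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class AttachmentData:
    device_name: Optional[str] = None


@dataclass
class AttachedInstance:
    instance_id: Optional[str] = None


@dataclass
class Attachment:
    instance: AttachedInstance
    attachment_data: Optional[AttachmentData] = None


@dataclass
class VolumeSketch:
    attachments: Optional[List[Attachment]] = None
    attachment_data: Optional[AttachmentData] = None  # legacy, volume-level

    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[AttachmentData]:
        if self.attachments is not None:
            for attachment in self.attachments:
                if attachment.instance.instance_id == instance_id:
                    return attachment.attachment_data
        # legacy volume attached before per-instance attachments existed
        return self.attachment_data


new_style = VolumeSketch(
    attachments=[Attachment(AttachedInstance("i-123"), AttachmentData("/dev/sdf"))]
)
legacy = VolumeSketch(attachment_data=AttachmentData("/dev/sdg"))
print(new_style.get_attachment_data_for_instance("i-123").device_name)  # /dev/sdf
print(legacy.get_attachment_data_for_instance("i-123").device_name)     # /dev/sdg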
dstack/_internal/server/background/tasks/process_instances.py

@@ -507,9 +507,9 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
         project=instance.project,
         profile=profile,
         requirements=requirements,
-        exclude_not_available=True,
         fleet_model=instance.fleet,
         blocks="auto" if instance.total_blocks is None else instance.total_blocks,
+        exclude_not_available=True,
     )

     if not offers and should_retry:
@@ -915,9 +915,8 @@ def _get_instance_offer_for_instance(
         instance_offer.availability_zones = [
             z
             for z in instance_offer.availability_zones
-            if
+            if z == master_job_provisioning_data.availability_zone
         ]
-
         return instance_offer
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -205,6 +205,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         None,
         run,
         job_model,
+        job_provisioning_data,
         volumes,
         secrets,
         job.job_spec.registry_auth,

@@ -376,6 +377,7 @@ def _process_provisioning_with_shim(
     ports: Dict[int, int],
     run: Run,
     job_model: JobModel,
+    job_provisioning_data: JobProvisioningData,
     volumes: List[Volume],
     secrets: Dict[str, str],
     registry_auth: Optional[RegistryAuth],

@@ -459,6 +461,7 @@ def _process_provisioning_with_shim(
             host_ssh_user=ssh_user,
             host_ssh_keys=[ssh_key] if ssh_key else [],
             container_ssh_keys=public_keys,
+            instance_id=job_provisioning_data.instance_id,
         )
     else:
         submitted = shim_client.submit(

@@ -475,6 +478,7 @@ def _process_provisioning_with_shim(
             mounts=volume_mounts,
             volumes=volumes,
             instance_mounts=instance_mounts,
+            instance_id=job_provisioning_data.instance_id,
         )
         if not submitted:
             # This can happen when we lost connection to the runner (e.g., network issues), marked
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
     PoolModel,
     ProjectModel,
     RunModel,
+    VolumeAttachmentModel,
     VolumeModel,
 )
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error

@@ -236,7 +237,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == job_model.instance.id)
-        .options(selectinload(InstanceModel.volumes))
+        .options(selectinload(InstanceModel.volume_attachments))
         .execution_options(populate_existing=True)
     )
     instance = res.unique().scalar_one()

@@ -390,11 +391,11 @@ async def _assign_job_to_pool_instance(

     instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
     instance, offer = instances_with_offers[0]
-    # Reload InstanceModel with volumes
+    # Reload InstanceModel with volume attachments
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == instance.id)
-        .options(joinedload(InstanceModel.volumes))
+        .options(joinedload(InstanceModel.volume_attachments))
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY

@@ -580,7 +581,7 @@ def _create_instance_model_for_job(
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
-        volumes=[],
+        volume_attachments=[],
         total_blocks=1,
         busy_blocks=1,
     )

@@ -696,14 +697,18 @@ async def _attach_volume(
     instance: InstanceModel,
     instance_id: str,
 ):
+    volume = volume_model_to_volume(volume_model)
+    # Refresh only to check if the volume wasn't deleted before the lock
     await session.refresh(volume_model)
     if volume_model.deleted:
         raise ServerClientError("Cannot attach a deleted volume")
-    volume = volume_model_to_volume(volume_model)
     attachment_data = await common_utils.run_async(
         backend.compute().attach_volume,
         volume=volume,
         instance_id=instance_id,
     )
-    volume_model.volume_attachment_data = attachment_data.json()
-    instance.volumes.append(volume_model)
+    volume_attachment_model = VolumeAttachmentModel(
+        volume=volume_model,
+        attachment_data=attachment_data.json(),
+    )
+    instance.volume_attachments.append(volume_attachment_model)
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -6,7 +6,13 @@ from sqlalchemy.orm import joinedload, lazyload

 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel, VolumeModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
     process_volumes_detaching,

@@ -80,7 +86,12 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
         .where(InstanceModel.id == job_model.used_instance_id)
         .options(
             joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-            joinedload(InstanceModel.volumes).joinedload(VolumeModel.user),
+            joinedload(InstanceModel.volume_attachments)
+            .joinedload(VolumeAttachmentModel.volume)
+            .joinedload(VolumeModel.user),
+            joinedload(InstanceModel.volume_attachments)
+            .joinedload(VolumeAttachmentModel.volume)
+            .joinedload(VolumeModel.attachments),
         )
     )
     instance_model = res.unique().scalar()
dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,12 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import ProjectModel, VolumeModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker

@@ -49,6 +54,11 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeModel):
         .where(VolumeModel.id == volume_model.id)
         .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
         .options(joinedload(VolumeModel.user))
+        .options(
+            joinedload(VolumeModel.attachments)
+            .joinedload(VolumeAttachmentModel.instance)
+            .joinedload(InstanceModel.fleet)
+        )
         .execution_options(populate_existing=True)
     )
     volume_model = res.unique().scalar_one()
dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py

@@ -0,0 +1,34 @@
+"""Move attachment_data to volumes_attachments
+
+Revision ID: a751ef183f27
+Revises: 1e76fb0dde87
+Create Date: 2025-02-12 13:19:57.569591
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "a751ef183f27"
+down_revision = "1e76fb0dde87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.alter_column("instace_id", new_column_name="instance_id")
+        batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.drop_column("attachment_data")
+        batch_op.alter_column("instance_id", new_column_name="instace_id")
+
+    # ### end Alembic commands ###
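Note what the migration does and does not do: it fixes the long-standing instace_id column-name typo and adds a nullable attachment_data column, but it performs no data backfill. Attachment data recorded before the upgrade stays in the deprecated volumes.volume_attachment_data column, and Volume.get_attachment_data_for_instance falls back to it at runtime, so pre-upgrade attachments keep working without a data-migration step.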
dstack/_internal/server/models.py

@@ -5,7 +5,6 @@ from typing import Callable, List, Optional, Union
 from sqlalchemy import (
     BigInteger,
     Boolean,
-    Column,
     DateTime,
     Enum,
     Float,

@@ -15,7 +14,6 @@ from sqlalchemy import (
     LargeBinary,
     MetaData,
     String,
-    Table,
     Text,
     TypeDecorator,
     UniqueConstraint,

@@ -554,10 +552,12 @@ class InstanceModel(BaseModel):
     jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
     last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)

-    # volumes attached to the instance
-    volumes: Mapped[List["VolumeModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="instances",
+    volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
+        back_populates="instance",
+        # Add delete-orphan option so that removing entries from volume_attachments
+        # automatically marks them for deletion.
+        # SQLalchemy requires delete when using delete-orphan.
+        cascade="save-update, merge, delete-orphan, delete",
     )

@@ -587,23 +587,21 @@ class VolumeModel(BaseModel):

     configuration: Mapped[str] = mapped_column(Text)
     volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
-
-
+
+    attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume")
+
+    # Deprecated in favor of VolumeAttachmentModel.attachment_data
     volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text)

-    # instances the volume is attached to
-    instances: Mapped[List["InstanceModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="volumes",
-    )

+class VolumeAttachmentModel(BaseModel):
+    __tablename__ = "volumes_attachments"

-volumes_attachments = Table(
-    "volumes_attachments",
-    BaseModel.metadata,
-    Column("volume_id", ForeignKey("volumes.id"), primary_key=True),
-    Column("instace_id", ForeignKey("instances.id"), primary_key=True),
-)
+    volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
+    volume: Mapped[VolumeModel] = relationship(back_populates="attachments")
+    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True)
+    instance: Mapped[InstanceModel] = relationship(back_populates="volume_attachments")
+    attachment_data: Mapped[Optional[str]] = mapped_column(Text)


 class PlacementGroupModel(BaseModel):
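Replacing the plain secondary table with a mapped VolumeAttachmentModel is SQLAlchemy's association-object pattern: the many-to-many link row can now carry its own payload (attachment_data), and the delete-orphan cascade means dropping an entry from instance.volume_attachments deletes the row itself. A self-contained sketch of the pattern with simplified stand-in models (not dstack's actual classes):

from typing import List, Optional

from sqlalchemy import ForeignKey, Text, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Instance(Base):
    __tablename__ = "instances"
    id: Mapped[int] = mapped_column(primary_key=True)
    # delete-orphan: removing an entry from this list deletes the link row itself
    volume_attachments: Mapped[List["VolumeAttachment"]] = relationship(
        back_populates="instance",
        cascade="save-update, merge, delete-orphan, delete",
    )


class Volume(Base):
    __tablename__ = "volumes"
    id: Mapped[int] = mapped_column(primary_key=True)
    attachments: Mapped[List["VolumeAttachment"]] = relationship(back_populates="volume")


class VolumeAttachment(Base):
    # Association object: the many-to-many link row carries its own payload.
    __tablename__ = "volumes_attachments"
    volume_id: Mapped[int] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
    instance_id: Mapped[int] = mapped_column(ForeignKey("instances.id"), primary_key=True)
    attachment_data: Mapped[Optional[str]] = mapped_column(Text)
    volume: Mapped[Volume] = relationship(back_populates="attachments")
    instance: Mapped[Instance] = relationship(back_populates="volume_attachments")


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    vol, inst = Volume(), Instance()
    inst.volume_attachments.append(
        VolumeAttachment(volume=vol, attachment_data='{"device_name": "/dev/sdf"}')
    )
    session.add_all([vol, inst])
    session.commit()
    inst.volume_attachments.clear()  # delete-orphan marks the link row for deletion
    session.commit()

The old secondary-table route could not store per-attachment data, which is exactly what attaching one volume to multiple instances requires.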
dstack/_internal/server/services/fleets.py

@@ -257,6 +257,7 @@ async def get_plan(
         project=project,
         profile=spec.merged_profile,
         requirements=_get_fleet_requirements(spec),
+        fleet_spec=spec,
         blocks=spec.configuration.blocks,
     )
     offers = [offer for _, offer in offers_with_backends]

@@ -277,12 +278,15 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
-    exclude_not_available: bool = False,
+    fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
+    exclude_not_available: bool = False,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     multinode = False
     master_job_provisioning_data = None
+    if fleet_spec is not None:
+        multinode = fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         multinode = fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
dstack/_internal/server/services/jobs/__init__.py

@@ -242,7 +242,7 @@ async def process_terminating_job(
             session=session, project=instance_model.project, names=jrd.volume_names
         )
     else:
-        volume_models = instance_model.volumes
+        volume_models = [va.volume for va in instance_model.volume_attachments]
     if len(volume_models) > 0:
         logger.info("Detaching volumes: %s", [v.name for v in volume_models])
         all_volumes_detached = await _detach_volumes_from_job_instance(

@@ -306,7 +306,7 @@ async def process_volumes_detaching(
             session=session, project=instance_model.project, names=jrd.volume_names
         )
     else:
-        volume_models = instance_model.volumes
+        volume_models = [va.volume for va in instance_model.volume_attachments]
     logger.info("Detaching volumes: %s", [v.name for v in volume_models])
     all_volumes_detached = await _detach_volumes_from_job_instance(
         project=instance_model.project,

@@ -439,8 +439,8 @@ async def _detach_volumes_from_job_instance(
     if job_model.volumes_detached_at is None:
         job_model.volumes_detached_at = common.get_current_datetime()
     detached_volumes_ids = {v.id for v in detached_volumes}
-    instance_model.volumes = [
-        v for v in instance_model.volumes if v.id not in detached_volumes_ids
+    instance_model.volume_attachments = [
+        va for va in instance_model.volume_attachments if va.volume_id not in detached_volumes_ids
     ]
     return all_detached
dstack/_internal/server/services/offers.py

@@ -50,35 +50,35 @@ async def get_offers_by_requirements(
     if volumes:
         mount_point_volumes = volumes[0]
         volumes_backend_types = [v.configuration.backend for v in mount_point_volumes]
-        if
+        if backend_types is None:
             backend_types = volumes_backend_types
         backend_types = [b for b in backend_types if b in volumes_backend_types]
         volumes_regions = [v.configuration.region for v in mount_point_volumes]
-        if
+        if regions is None:
             regions = volumes_regions
         regions = [r for r in regions if r in volumes_regions]

     if multinode:
-        if
+        if backend_types is None:
             backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]

     if privileged or instance_mounts:
-        if
+        if backend_types is None:
             backend_types = BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT]

     if profile.reservation is not None:
-        if
+        if backend_types is None:
             backend_types = BACKENDS_WITH_RESERVATION_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_RESERVATION_SUPPORT]

     # For multi-node, restrict backend and region.
     # The default behavior is to provision all nodes in the same backend and region.
     if master_job_provisioning_data is not None:
-        if
+        if backend_types is None:
             backend_types = [master_job_provisioning_data.get_base_backend()]
-        if
+        if regions is None:
             regions = [master_job_provisioning_data.region]
         backend_types = [
             b for b in backend_types if b == master_job_provisioning_data.get_base_backend()
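Each of these blocks follows the same narrowing pattern: when the user supplied no filter (None), seed the filter with the feature's allow-list, then intersect it with that allow-list. A minimal runnable sketch of the pattern, with a hypothetical allow-list (the values here are illustrative, not dstack's real constant):

from typing import List, Optional

BACKENDS_WITH_MULTINODE_SUPPORT = ["aws", "gcp", "oci"]  # illustrative values


def narrow_backends(backend_types: Optional[List[str]], multinode: bool) -> Optional[List[str]]:
    # Same shape as the blocks above: None means "no user filter",
    # so the allow-list becomes the starting point before intersecting.
    if multinode:
        if backend_types is None:
            backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
        backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]
    return backend_types


print(narrow_backends(None, multinode=True))              # ['aws', 'gcp', 'oci']
print(narrow_backends(["gcp", "azure"], multinode=True))  # ['gcp']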
dstack/_internal/server/services/pools.py

@@ -462,19 +462,19 @@ def filter_pool_instances(
         zones = [z for z in zones if z in volume_zones]

     if multinode:
-        if
+        if backend_types is None:
             backend_types = BACKENDS_WITH_MULTINODE_SUPPORT
         backend_types = [b for b in backend_types if b in BACKENDS_WITH_MULTINODE_SUPPORT]

     # For multi-node, restrict backend and region.
     # The default behavior is to provision all nodes in the same backend and region.
     if master_job_provisioning_data is not None:
-        if
+        if backend_types is None:
             backend_types = [master_job_provisioning_data.get_base_backend()]
         backend_types = [
             b for b in backend_types if b == master_job_provisioning_data.get_base_backend()
         ]
-        if
+        if regions is None:
             regions = [master_job_provisioning_data.region]
         regions = [r for r in regions if r == master_job_provisioning_data.region]
dstack/_internal/server/services/runner/client.py

@@ -239,6 +239,7 @@ class ShimClient:
         host_ssh_user: str,
         host_ssh_keys: list[str],
         container_ssh_keys: list[str],
+        instance_id: str,
     ) -> None:
         if not self.is_api_v2_supported():
             raise ShimAPIVersionError()

@@ -255,7 +256,7 @@ class ShimClient:
             memory=_memory_to_bytes(memory),  # None = 0 = "all available"
             shm_size=_memory_to_bytes(shm_size),  # None = 0 = "use default value"
             network_mode=network_mode,
-            volumes=[_volume_to_shim_volume_info(v) for v in volumes],
+            volumes=[_volume_to_shim_volume_info(v, instance_id) for v in volumes],
             volume_mounts=volume_mounts,
             instance_mounts=instance_mounts,
             host_ssh_user=host_ssh_user,

@@ -303,6 +304,7 @@ class ShimClient:
         mounts: List[VolumeMountPoint],
         volumes: List[Volume],
         instance_mounts: List[InstanceMountPoint],
+        instance_id: str,
     ) -> bool:
         """
         Returns `True` if submitted and `False` if the shim already has a job (`409 Conflict`).

@@ -320,7 +322,7 @@ class ShimClient:
             ssh_user=ssh_user,
             ssh_key=ssh_key,
             mounts=mounts,
-            volumes=[_volume_to_shim_volume_info(v) for v in volumes],
+            volumes=[_volume_to_shim_volume_info(v, instance_id) for v in volumes],
             instance_mounts=instance_mounts,
         )
         resp = self._request("POST", "/api/submit", body)

@@ -398,10 +400,11 @@ def health_response_to_health_status(data: HealthcheckResponse) -> HealthStatus:
     )


-def _volume_to_shim_volume_info(volume: Volume) -> ShimVolumeInfo:
+def _volume_to_shim_volume_info(volume: Volume, instance_id: str) -> ShimVolumeInfo:
     device_name = None
-    if volume.attachment_data is not None:
-        device_name = volume.attachment_data.device_name
+    attachment_data = volume.get_attachment_data_for_instance(instance_id)
+    if attachment_data is not None:
+        device_name = attachment_data.device_name
     return ShimVolumeInfo(
         backend=volume.configuration.backend.value,
         name=volume.name,