dstack 0.18.41__py3-none-any.whl → 0.18.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +10 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/volume.py +9 -0
- dstack/_internal/core/backends/aws/compute.py +24 -11
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +14 -8
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +27 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +11 -4
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/configurations.py +0 -1
- dstack/_internal/core/models/runs.py +3 -3
- dstack/_internal/core/models/volumes.py +23 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +13 -21
- dstack/_internal/server/background/tasks/process_running_jobs.py +13 -16
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +7 -2
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +17 -19
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/config.py +11 -1
- dstack/_internal/server/services/fleets.py +5 -1
- dstack/_internal/server/services/jobs/__init__.py +14 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/offers.py +7 -7
- dstack/_internal/server/services/pools.py +19 -20
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +8 -5
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
- dstack/_internal/server/testing/common.py +46 -17
- dstack/api/_public/runs.py +1 -1
- dstack/version.py +2 -2
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/RECORD +97 -86
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +2 -1
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +5 -3
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +11 -6
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_runs.py +2 -2
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +105 -1
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
     PoolModel,
     ProjectModel,
     RunModel,
+    VolumeAttachmentModel,
     VolumeModel,
 )
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
@@ -236,7 +237,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         res = await session.execute(
             select(InstanceModel)
             .where(InstanceModel.id == job_model.instance.id)
-            .options(selectinload(InstanceModel.volumes))
+            .options(selectinload(InstanceModel.volume_attachments))
             .execution_options(populate_existing=True)
         )
         instance = res.unique().scalar_one()
@@ -390,11 +391,11 @@ async def _assign_job_to_pool_instance(

     instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
     instance, offer = instances_with_offers[0]
-    # Reload InstanceModel with volumes
+    # Reload InstanceModel with volume attachments
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == instance.id)
-        .options(joinedload(InstanceModel.volumes))
+        .options(joinedload(InstanceModel.volume_attachments))
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY
@@ -580,7 +581,7 @@ def _create_instance_model_for_job(
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
-        volumes=[],
+        volume_attachments=[],
         total_blocks=1,
         busy_blocks=1,
     )
@@ -696,14 +697,18 @@ async def _attach_volume(
     instance: InstanceModel,
     instance_id: str,
 ):
+    volume = volume_model_to_volume(volume_model)
+    # Refresh only to check if the volume wasn't deleted before the lock
     await session.refresh(volume_model)
     if volume_model.deleted:
         raise ServerClientError("Cannot attach a deleted volume")
-    volume = volume_model_to_volume(volume_model)
     attachment_data = await common_utils.run_async(
         backend.compute().attach_volume,
         volume=volume,
         instance_id=instance_id,
     )
-    volume_model.volume_attachment_data = attachment_data.json()
-    instance.volumes.append(volume_model)
+    volume_attachment_model = VolumeAttachmentModel(
+        volume=volume_model,
+        attachment_data=attachment_data.json(),
+    )
+    instance.volume_attachments.append(volume_attachment_model)
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -6,7 +6,12 @@ from sqlalchemy.orm import joinedload, lazyload

 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+)
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
     process_volumes_detaching,
@@ -80,7 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
             .where(InstanceModel.id == job_model.used_instance_id)
             .options(
                 joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-                joinedload(InstanceModel.volumes),
+                joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
             )
         )
         instance_model = res.unique().scalar()
dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,12 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import
+from dstack._internal.server.models import (
+    InstanceModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker
@@ -49,6 +54,11 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
             .where(VolumeModel.id == volume_model.id)
             .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
             .options(joinedload(VolumeModel.user))
+            .options(
+                joinedload(VolumeModel.attachments)
+                .joinedload(VolumeAttachmentModel.instance)
+                .joinedload(InstanceModel.fleet)
+            )
             .execution_options(populate_existing=True)
         )
         volume_model = res.unique().scalar_one()
dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py

@@ -0,0 +1,34 @@
+"""Move attachment_data to volumes_attachments
+
+Revision ID: a751ef183f27
+Revises: 1e76fb0dde87
+Create Date: 2025-02-12 13:19:57.569591
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "a751ef183f27"
+down_revision = "1e76fb0dde87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.alter_column("instace_id", new_column_name="instance_id")
+        batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.drop_column("attachment_data")
+        batch_op.alter_column("instance_id", new_column_name="instace_id")
+
+    # ### end Alembic commands ###
dstack/_internal/server/models.py

@@ -5,7 +5,6 @@ from typing import Callable, List, Optional, Union
 from sqlalchemy import (
     BigInteger,
     Boolean,
-    Column,
     DateTime,
     Enum,
     Float,
@@ -15,7 +14,6 @@ from sqlalchemy import (
     LargeBinary,
     MetaData,
     String,
-    Table,
     Text,
     TypeDecorator,
     UniqueConstraint,
@@ -554,10 +552,12 @@ class InstanceModel(BaseModel):
     jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
     last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)

-    # volumes attached to the instance
-    volumes: Mapped[List["VolumeModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="instances",
+    volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
+        back_populates="instance",
+        # Add delete-orphan option so that removing entries from volume_attachments
+        # automatically marks them for deletion.
+        # SQLalchemy requires delete when using delete-orphan.
+        cascade="save-update, merge, delete-orphan, delete",
     )


@@ -587,23 +587,21 @@ class VolumeModel(BaseModel):

     configuration: Mapped[str] = mapped_column(Text)
     volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
-
-
+
+    attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume")
+
+    # Deprecated in favor of VolumeAttachmentModel.attachment_data
     volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text)

-    # instances the volume is attached to
-    instances: Mapped[List["InstanceModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="volumes",
-    )

+class VolumeAttachmentModel(BaseModel):
+    __tablename__ = "volumes_attachments"

-volumes_attachments = Table(
-    "volumes_attachments",
-    BaseModel.metadata,
-    Column("volume_id", ForeignKey("volumes.id"), primary_key=True),
-    Column("instace_id", ForeignKey("instances.id"), primary_key=True),
-)
+    volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
+    volume: Mapped[VolumeModel] = relationship(back_populates="attachments")
+    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True)
+    instance: Mapped[InstanceModel] = relationship(back_populates="volume_attachments")
+    attachment_data: Mapped[Optional[str]] = mapped_column(Text)


 class PlacementGroupModel(BaseModel):
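Note: the models.py change above replaces the implicit `volumes_attachments` secondary table with an explicit association object, so each attachment row can carry its own `attachment_data`. The following is a minimal, self-contained sketch of that pattern with simplified names and an in-memory SQLite engine; it is illustrative only, not dstack's actual models.

```python
# Minimal sketch of the association-object pattern used above (simplified, not dstack's real models).
import uuid
from typing import List, Optional

from sqlalchemy import ForeignKey, Text, Uuid, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Instance(Base):
    __tablename__ = "instances"
    id: Mapped[uuid.UUID] = mapped_column(Uuid, primary_key=True, default=uuid.uuid4)
    volume_attachments: Mapped[List["VolumeAttachment"]] = relationship(
        back_populates="instance",
        # delete-orphan lets callers rebuild the list and have dropped rows deleted.
        cascade="save-update, merge, delete-orphan, delete",
    )


class Volume(Base):
    __tablename__ = "volumes"
    id: Mapped[uuid.UUID] = mapped_column(Uuid, primary_key=True, default=uuid.uuid4)
    attachments: Mapped[List["VolumeAttachment"]] = relationship(back_populates="volume")


class VolumeAttachment(Base):
    __tablename__ = "volumes_attachments"
    volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True)
    # Extra per-attachment payload -- the reason a plain secondary table is no longer enough.
    attachment_data: Mapped[Optional[str]] = mapped_column(Text)
    volume: Mapped[Volume] = relationship(back_populates="attachments")
    instance: Mapped[Instance] = relationship(back_populates="volume_attachments")


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    instance, volume = Instance(), Volume()
    # Attaching now means creating an attachment row that carries its own data.
    instance.volume_attachments.append(
        VolumeAttachment(volume=volume, attachment_data='{"device": "/dev/sdf"}')
    )
    session.add(instance)
    session.commit()
    # Traversal goes through the association object: attachment -> volume.
    print([va.volume.id for va in instance.volume_attachments])
```

The delete-orphan cascade is what allows `_detach_volumes_from_job_instance` above to simply rebuild `instance_model.volume_attachments` and have the removed attachment rows deleted.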
dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)
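Note: the millisecond limitation mentioned in the new comment is easy to illustrate: two runner events emitted within the same millisecond collapse to one cloud-side timestamp, so a poll that resumes strictly after the last seen timestamp can drop one of them. The values below are hypothetical and only demonstrate the arithmetic.

```python
# Illustrative only: why millisecond-resolution log storage caps safe throughput at ~1k logs/sec.
events_ns = [1_700_000_000_123_400_000, 1_700_000_000_123_900_000]  # two events in the same millisecond
events_ms = [ts // 1_000_000 for ts in events_ns]
assert events_ms[0] == events_ms[1]  # both stored with the same millisecond timestamp

last_seen_ms = events_ms[0]
# Resuming the poll strictly after the last seen timestamp drops the second event.
remaining = [ts for ts in events_ms if ts > last_seen_ms]
assert remaining == []
```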
dstack/_internal/server/services/backends/configurators/aws.py

@@ -2,6 +2,7 @@ import concurrent.futures
 import json
 from typing import List

+import botocore.exceptions
 from boto3.session import Session

 from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
     Configurator,
     raise_invalid_credentials_error,
 )
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)

 REGIONS = [
     ("US East, N. Virginia", "us-east-1"),
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):

     def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         self._check_tags_config(config)
-        self._check_vpc_config(session, config)
+        self._check_iam_instance_profile_config(session, config)
+        self._check_vpc_config(session, config)

     def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
         if not config.tags:
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
         except BackendError as e:
             raise ServerClientError(e.args[0])

+    def _check_iam_instance_profile_config(
+        self, session: Session, config: AWSConfigInfoWithCredsPartial
+    ):
+        if config.iam_instance_profile is None:
+            return
+        try:
+            iam_client = session.client("iam")
+            iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "NoSuchEntity":
+                raise ServerClientError(
+                    f"IAM instance profile {config.iam_instance_profile} not found"
+                )
+            logger.exception(
+                "Got botocore.exceptions.ClientError when checking iam_instance_profile"
+            )
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+        except Exception:
+            logger.exception("Got exception when checking iam_instance_profile")
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+
     def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         allocate_public_ip = config.public_ips if config.public_ips is not None else True
         use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
dstack/_internal/server/services/backends/configurators/gcp.py

@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
             _, project_id = auth.authenticate(GCPDefaultCreds())
         except BackendAuthError:
             return []
-
-        if project_id is None:
-            return []
-
         return [
             GCPConfigInfoWithCreds(
                 project_id=project_id,
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
         ):
             raise_invalid_credentials_error(fields=[["creds"]])
         try:
-            credentials, project_id = auth.authenticate(creds=config.creds)
-        except BackendAuthError:
+            credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
+        except BackendAuthError as e:
+            details = None
+            if len(e.args) > 0:
+                details = e.args[0]
             if is_core_model_instance(config.creds, GCPServiceAccountCreds):
-                raise_invalid_credentials_error(fields=[["creds", "data"]])
+                raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
             else:
-                raise_invalid_credentials_error(fields=[["creds"]])
-        if (
-            project_id is not None
-            and config.project_id is not None
-            and config.project_id != project_id
-        ):
-            raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
-        config_values.project_id = self._get_project_id_element(selected=project_id)
+                raise_invalid_credentials_error(fields=[["creds"]], details=details)
         config_values.regions = self._get_regions_element(
             selected=config.regions or DEFAULT_REGIONS
         )
         if config.project_id is None:
             return config_values
+        config_values.project_id = self._get_project_id_element(selected=config.project_id)
         self._check_config(config=config, credentials=credentials)
         return config_values

dstack/_internal/server/services/config.py

@@ -107,6 +107,16 @@ class AWSConfig(CoreModel):
             )
         ),
     ] = None
+    iam_instance_profile: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The name of the IAM instance profile to associate with EC2 instances."
+                " You can also specify the IAM role name for roles created via the AWS console."
+                " AWS automatically creates an instance profile and gives it the same name as the role"
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(description="The tags that will be assigned to resources created by `dstack`"),
@@ -251,7 +261,7 @@ class GCPConfig(CoreModel):
         ),
     ] = None
     vm_service_account: Annotated[
-        Optional[str], Field(description="The service account
+        Optional[str], Field(description="The service account to associate with provisioned VMs")
     ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
dstack/_internal/server/services/fleets.py

@@ -257,6 +257,7 @@ async def get_plan(
         project=project,
         profile=spec.merged_profile,
         requirements=_get_fleet_requirements(spec),
+        fleet_spec=spec,
         blocks=spec.configuration.blocks,
     )
     offers = [offer for _, offer in offers_with_backends]
@@ -277,12 +278,15 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
-
+    fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
+    exclude_not_available: bool = False,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     multinode = False
     master_job_provisioning_data = None
+    if fleet_spec is not None:
+        multinode = fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         multinode = fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
dstack/_internal/server/services/jobs/__init__.py

@@ -236,13 +236,14 @@ async def process_terminating_job(
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
         await stop_container(job_model, jpd, ssh_private_keys)
-    volume_models: list[VolumeModel]
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = instance_model.volumes
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     if len(volume_models) > 0:
         logger.info("Detaching volumes: %s", [v.name for v in volume_models])
         all_volumes_detached = await _detach_volumes_from_job_instance(
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
     jpd = get_or_error(get_job_provisioning_data(job_model))
     jrd = get_job_runtime_data(job_model)
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = instance_model.volumes
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     logger.info("Detaching volumes: %s", [v.name for v in volume_models])
     all_volumes_detached = await _detach_volumes_from_job_instance(
         project=instance_model.project,
@@ -439,8 +442,8 @@ async def _detach_volumes_from_job_instance(
     if job_model.volumes_detached_at is None:
        job_model.volumes_detached_at = common.get_current_datetime()
     detached_volumes_ids = {v.id for v in detached_volumes}
-    instance_model.volumes = [
-        v for v in instance_model.volumes if v.id not in detached_volumes_ids
+    instance_model.volume_attachments = [
+        va for va in instance_model.volume_attachments if va.volume_id not in detached_volumes_ids
     ]
     return all_detached

dstack/_internal/server/services/jobs/configurators/dev.py

@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
 from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop

-DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
-
 INSTALL_IPYKERNEL = (
     "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
     'echo "no pip, ipykernel was not installed"'
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         return False

     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None

     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
dstack/_internal/server/services/jobs/configurators/task.py

@@ -5,8 +5,6 @@ from dstack._internal.core.models.profiles import SpotPolicy
 from dstack._internal.core.models.runs import JobSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator

-DEFAULT_MAX_DURATION_SECONDS = 72 * 3600
-

 class TaskJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.TASK
@@ -29,7 +27,7 @@ class TaskJobConfigurator(JobConfigurator):
         return True

     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None

     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
dstack/_internal/server/services/logs/__init__.py

@@ -0,0 +1,78 @@
+import atexit
+from typing import List, Optional
+from uuid import UUID
+
+from dstack._internal.core.models.logs import JobSubmissionLogs
+from dstack._internal.server import settings
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.schemas.logs import PollLogsRequest
+from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
+from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
+from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
+from dstack._internal.server.services.logs.filelog import FileLogStorage
+from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
+from dstack._internal.utils.common import run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+_log_storage: Optional[LogStorage] = None
+
+
+def get_log_storage() -> LogStorage:
+    global _log_storage
+    if _log_storage is not None:
+        return _log_storage
+    if settings.SERVER_CLOUDWATCH_LOG_GROUP:
+        if BOTO_AVAILABLE:
+            try:
+                _log_storage = CloudWatchLogStorage(
+                    group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
+                    region=settings.SERVER_CLOUDWATCH_LOG_REGION,
+                )
+            except LogStorageError as e:
+                logger.error("Failed to initialize CloudWatch Logs storage: %s", e)
+            except Exception:
+                logger.exception("Got exception when initializing CloudWatch Logs storage")
+            else:
+                logger.debug("Using CloudWatch Logs storage")
+        else:
+            logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
+    elif settings.SERVER_GCP_LOGGING_PROJECT:
+        if GCP_LOGGING_AVAILABLE:
+            try:
+                _log_storage = GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
+            except LogStorageError as e:
+                logger.error("Failed to initialize GCP Logs storage: %s", e)
+            except Exception:
+                logger.exception("Got exception when initializing GCP Logs storage")
+            else:
+                logger.debug("Using GCP Logs storage")
+        else:
+            logger.error("Cannot use GCP Logs storage: GCP deps are not installed")
+    if _log_storage is None:
+        _log_storage = FileLogStorage()
+        logger.debug("Using file-based storage")
+    atexit.register(_log_storage.close)
+    return _log_storage
+
+
+def write_logs(
+    project: ProjectModel,
+    run_name: str,
+    job_submission_id: UUID,
+    runner_logs: List[RunnerLogEvent],
+    job_logs: List[RunnerLogEvent],
+) -> None:
+    return get_log_storage().write_logs(
+        project=project,
+        run_name=run_name,
+        job_submission_id=job_submission_id,
+        runner_logs=runner_logs,
+        job_logs=job_logs,
+    )
+
+
+async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+    return await run_async(get_log_storage().poll_logs, project=project, request=request)