dstack 0.18.41__py3-none-any.whl → 0.18.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +10 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/volume.py +9 -0
- dstack/_internal/core/backends/aws/compute.py +24 -11
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +14 -8
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +27 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +11 -4
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/configurations.py +0 -1
- dstack/_internal/core/models/runs.py +3 -3
- dstack/_internal/core/models/volumes.py +23 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +13 -21
- dstack/_internal/server/background/tasks/process_running_jobs.py +13 -16
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +7 -2
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +17 -19
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/config.py +11 -1
- dstack/_internal/server/services/fleets.py +5 -1
- dstack/_internal/server/services/jobs/__init__.py +14 -11
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/offers.py +7 -7
- dstack/_internal/server/services/pools.py +19 -20
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +8 -5
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-7510e71dfa9749a4e70e.css} +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-fe8fd9db55df8d10e648.js} +66 -66
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-fe8fd9db55df8d10e648.js.map} +1 -1
- dstack/_internal/server/testing/common.py +46 -17
- dstack/api/_public/runs.py +1 -1
- dstack/version.py +2 -2
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/METADATA +4 -3
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/RECORD +97 -86
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +2 -1
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +5 -3
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +11 -6
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_runs.py +2 -2
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +105 -1
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/WHEEL +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.41.dist-info → dstack-0.18.43.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -43,6 +43,7 @@ from dstack._internal.server.models import (
     PoolModel,
     ProjectModel,
     RunModel,
+    VolumeAttachmentModel,
     VolumeModel,
 )
 from dstack._internal.server.services.backends import get_project_backend_by_type_or_error
@@ -236,7 +237,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         res = await session.execute(
             select(InstanceModel)
             .where(InstanceModel.id == job_model.instance.id)
-            .options(selectinload(InstanceModel.volumes))
+            .options(selectinload(InstanceModel.volume_attachments))
             .execution_options(populate_existing=True)
         )
         instance = res.unique().scalar_one()
@@ -390,11 +391,11 @@ async def _assign_job_to_pool_instance(

     instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
     instance, offer = instances_with_offers[0]
-    # Reload InstanceModel with volumes
+    # Reload InstanceModel with volume attachments
     res = await session.execute(
         select(InstanceModel)
         .where(InstanceModel.id == instance.id)
-        .options(joinedload(InstanceModel.volumes))
+        .options(joinedload(InstanceModel.volume_attachments))
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY
@@ -580,7 +581,7 @@ def _create_instance_model_for_job(
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
-        volumes=[],
+        volume_attachments=[],
         total_blocks=1,
         busy_blocks=1,
     )
@@ -696,14 +697,18 @@ async def _attach_volume(
     instance: InstanceModel,
     instance_id: str,
 ):
+    volume = volume_model_to_volume(volume_model)
+    # Refresh only to check if the volume wasn't deleted before the lock
     await session.refresh(volume_model)
     if volume_model.deleted:
         raise ServerClientError("Cannot attach a deleted volume")
-    volume = volume_model_to_volume(volume_model)
     attachment_data = await common_utils.run_async(
         backend.compute().attach_volume,
         volume=volume,
         instance_id=instance_id,
     )
-    volume_model.volume_attachment_data = attachment_data.json()
-    instance.volumes.append(volume_model)
+    volume_attachment_model = VolumeAttachmentModel(
+        volume=volume_model,
+        attachment_data=attachment_data.json(),
+    )
+    instance.volume_attachments.append(volume_attachment_model)
dstack/_internal/server/background/tasks/process_terminating_jobs.py

@@ -6,7 +6,12 @@ from sqlalchemy.orm import joinedload, lazyload

 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import InstanceModel, JobModel, ProjectModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+)
 from dstack._internal.server.services.jobs import (
     process_terminating_job,
     process_volumes_detaching,
@@ -80,7 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
             .where(InstanceModel.id == job_model.used_instance_id)
             .options(
                 joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
-                joinedload(InstanceModel.volumes),
+                joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
             )
         )
         instance_model = res.unique().scalar()
dstack/_internal/server/background/tasks/process_volumes.py

@@ -5,7 +5,12 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.errors import BackendError, BackendNotAvailable
 from dstack._internal.core.models.volumes import VolumeStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import
+from dstack._internal.server.models import (
+    InstanceModel,
+    ProjectModel,
+    VolumeAttachmentModel,
+    VolumeModel,
+)
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services import volumes as volumes_services
 from dstack._internal.server.services.locking import get_locker
@@ -49,6 +54,11 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM
             .where(VolumeModel.id == volume_model.id)
             .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends))
             .options(joinedload(VolumeModel.user))
+            .options(
+                joinedload(VolumeModel.attachments)
+                .joinedload(VolumeAttachmentModel.instance)
+                .joinedload(InstanceModel.fleet)
+            )
             .execution_options(populate_existing=True)
         )
         volume_model = res.unique().scalar_one()
dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py

@@ -0,0 +1,34 @@
+"""Move attachment_data to volumes_attachments
+
+Revision ID: a751ef183f27
+Revises: 1e76fb0dde87
+Create Date: 2025-02-12 13:19:57.569591
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "a751ef183f27"
+down_revision = "1e76fb0dde87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.alter_column("instace_id", new_column_name="instance_id")
+        batch_op.add_column(sa.Column("attachment_data", sa.Text(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("volumes_attachments", schema=None) as batch_op:
+        batch_op.drop_column("attachment_data")
+        batch_op.alter_column("instance_id", new_column_name="instace_id")
+
+    # ### end Alembic commands ###
dstack/_internal/server/models.py

@@ -5,7 +5,6 @@ from typing import Callable, List, Optional, Union
 from sqlalchemy import (
     BigInteger,
     Boolean,
-    Column,
     DateTime,
     Enum,
     Float,
@@ -15,7 +14,6 @@ from sqlalchemy import (
     LargeBinary,
     MetaData,
     String,
-    Table,
     Text,
     TypeDecorator,
     UniqueConstraint,
@@ -554,10 +552,12 @@ class InstanceModel(BaseModel):
     jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
     last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)

-    # volumes attached to the instance
-    volumes: Mapped[List["VolumeModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="instances",
+    volume_attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(
+        back_populates="instance",
+        # Add delete-orphan option so that removing entries from volume_attachments
+        # automatically marks them for deletion.
+        # SQLalchemy requires delete when using delete-orphan.
+        cascade="save-update, merge, delete-orphan, delete",
     )


@@ -587,23 +587,21 @@ class VolumeModel(BaseModel):

     configuration: Mapped[str] = mapped_column(Text)
     volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
-
-
+
+    attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume")
+
+    # Deprecated in favor of VolumeAttachmentModel.attachment_data
     volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text)

-    # instances the volume is attached to
-    instances: Mapped[List["InstanceModel"]] = relationship(
-        secondary="volumes_attachments",
-        back_populates="volumes",
-    )

+class VolumeAttachmentModel(BaseModel):
+    __tablename__ = "volumes_attachments"

-volumes_attachments = Table(
-    "volumes_attachments",
-    BaseModel.metadata,
-    Column("volume_id", ForeignKey("volumes.id"), primary_key=True),
-    Column("instace_id", ForeignKey("instances.id"), primary_key=True),
-)
+    volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
+    volume: Mapped[VolumeModel] = relationship(back_populates="attachments")
+    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True)
+    instance: Mapped[InstanceModel] = relationship(back_populates="volume_attachments")
+    attachment_data: Mapped[Optional[str]] = mapped_column(Text)


 class PlacementGroupModel(BaseModel):
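Note: the models.py change above replaces the implicit `volumes_attachments` secondary table with an explicit association object, so each attachment row can carry its own `attachment_data`. The following is a minimal, self-contained sketch of that pattern with simplified names and an in-memory SQLite engine; it is illustrative only, not dstack's actual models.

```python
# Minimal sketch of the association-object pattern used above (simplified, not dstack's real models).
import uuid
from typing import List, Optional

from sqlalchemy import ForeignKey, Text, Uuid, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Instance(Base):
    __tablename__ = "instances"
    id: Mapped[uuid.UUID] = mapped_column(Uuid, primary_key=True, default=uuid.uuid4)
    volume_attachments: Mapped[List["VolumeAttachment"]] = relationship(
        back_populates="instance",
        # delete-orphan lets callers rebuild the list and have dropped rows deleted.
        cascade="save-update, merge, delete-orphan, delete",
    )


class Volume(Base):
    __tablename__ = "volumes"
    id: Mapped[uuid.UUID] = mapped_column(Uuid, primary_key=True, default=uuid.uuid4)
    attachments: Mapped[List["VolumeAttachment"]] = relationship(back_populates="volume")


class VolumeAttachment(Base):
    __tablename__ = "volumes_attachments"
    volume_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("volumes.id"), primary_key=True)
    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"), primary_key=True)
    # Extra per-attachment payload -- the reason a plain secondary table is no longer enough.
    attachment_data: Mapped[Optional[str]] = mapped_column(Text)
    volume: Mapped[Volume] = relationship(back_populates="attachments")
    instance: Mapped[Instance] = relationship(back_populates="volume_attachments")


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    instance, volume = Instance(), Volume()
    # Attaching now means creating an attachment row that carries its own data.
    instance.volume_attachments.append(
        VolumeAttachment(volume=volume, attachment_data='{"device": "/dev/sdf"}')
    )
    session.add(instance)
    session.commit()
    # Traversal goes through the association object: attachment -> volume.
    print([va.volume.id for va in instance.volume_attachments])
```

The delete-orphan cascade is what allows `_detach_volumes_from_job_instance` above to simply rebuild `instance_model.volume_attachments` and have the removed attachment rows deleted.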
dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)
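Note: the millisecond limitation mentioned in the new comment is easy to illustrate: two runner events emitted within the same millisecond collapse to one cloud-side timestamp, so a poll that resumes strictly after the last seen timestamp can drop one of them. The values below are hypothetical and only demonstrate the arithmetic.

```python
# Illustrative only: why millisecond-resolution log storage caps safe throughput at ~1k logs/sec.
events_ns = [1_700_000_000_123_400_000, 1_700_000_000_123_900_000]  # two events in the same millisecond
events_ms = [ts // 1_000_000 for ts in events_ns]
assert events_ms[0] == events_ms[1]  # both stored with the same millisecond timestamp

last_seen_ms = events_ms[0]
# Resuming the poll strictly after the last seen timestamp drops the second event.
remaining = [ts for ts in events_ms if ts > last_seen_ms]
assert remaining == []
```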
dstack/_internal/server/services/backends/configurators/aws.py

@@ -2,6 +2,7 @@ import concurrent.futures
 import json
 from typing import List

+import botocore.exceptions
 from boto3.session import Session

 from dstack._internal.core.backends.aws import AWSBackend, auth, compute, resources
@@ -35,6 +36,9 @@ from dstack._internal.server.services.backends.configurators.base import (
     Configurator,
     raise_invalid_credentials_error,
 )
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)

 REGIONS = [
     ("US East, N. Virginia", "us-east-1"),
@@ -137,7 +141,8 @@ class AWSConfigurator(Configurator):

     def _check_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         self._check_tags_config(config)
-        self._check_vpc_config(session, config)
+        self._check_iam_instance_profile_config(session, config)
+        self._check_vpc_config(session, config)

     def _check_tags_config(self, config: AWSConfigInfoWithCredsPartial):
         if not config.tags:
@@ -151,6 +156,31 @@ class AWSConfigurator(Configurator):
         except BackendError as e:
             raise ServerClientError(e.args[0])

+    def _check_iam_instance_profile_config(
+        self, session: Session, config: AWSConfigInfoWithCredsPartial
+    ):
+        if config.iam_instance_profile is None:
+            return
+        try:
+            iam_client = session.client("iam")
+            iam_client.get_instance_profile(InstanceProfileName=config.iam_instance_profile)
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "NoSuchEntity":
+                raise ServerClientError(
+                    f"IAM instance profile {config.iam_instance_profile} not found"
+                )
+            logger.exception(
+                "Got botocore.exceptions.ClientError when checking iam_instance_profile"
+            )
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+        except Exception:
+            logger.exception("Got exception when checking iam_instance_profile")
+            raise ServerClientError(
+                f"Failed to check IAM instance profile {config.iam_instance_profile}"
+            )
+
     def _check_vpc_config(self, session: Session, config: AWSConfigInfoWithCredsPartial):
         allocate_public_ip = config.public_ips if config.public_ips is not None else True
         use_default_vpcs = config.default_vpcs if config.default_vpcs is not None else True
dstack/_internal/server/services/backends/configurators/gcp.py

@@ -127,10 +127,6 @@ class GCPConfigurator(Configurator):
             _, project_id = auth.authenticate(GCPDefaultCreds())
         except BackendAuthError:
             return []
-
-        if project_id is None:
-            return []
-
         return [
             GCPConfigInfoWithCreds(
                 project_id=project_id,
@@ -152,24 +148,21 @@ class GCPConfigurator(Configurator):
         ):
             raise_invalid_credentials_error(fields=[["creds"]])
         try:
-            credentials, project_id = auth.authenticate(creds=config.creds)
-        except BackendAuthError:
+            credentials, _ = auth.authenticate(creds=config.creds, project_id=config.project_id)
+        except BackendAuthError as e:
+            details = None
+            if len(e.args) > 0:
+                details = e.args[0]
             if is_core_model_instance(config.creds, GCPServiceAccountCreds):
-                raise_invalid_credentials_error(fields=[["creds", "data"]])
+                raise_invalid_credentials_error(fields=[["creds", "data"]], details=details)
             else:
-                raise_invalid_credentials_error(fields=[["creds"]])
-        if (
-            project_id is not None
-            and config.project_id is not None
-            and config.project_id != project_id
-        ):
-            raise ServerClientError(msg="Wrong project_id", fields=[["project_id"]])
-        config_values.project_id = self._get_project_id_element(selected=project_id)
+                raise_invalid_credentials_error(fields=[["creds"]], details=details)
         config_values.regions = self._get_regions_element(
             selected=config.regions or DEFAULT_REGIONS
         )
         if config.project_id is None:
             return config_values
+        config_values.project_id = self._get_project_id_element(selected=config.project_id)
         self._check_config(config=config, credentials=credentials)
         return config_values

dstack/_internal/server/services/config.py

@@ -107,6 +107,16 @@ class AWSConfig(CoreModel):
             )
         ),
     ] = None
+    iam_instance_profile: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The name of the IAM instance profile to associate with EC2 instances."
+                " You can also specify the IAM role name for roles created via the AWS console."
+                " AWS automatically creates an instance profile and gives it the same name as the role"
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(description="The tags that will be assigned to resources created by `dstack`"),
@@ -251,7 +261,7 @@ class GCPConfig(CoreModel):
         ),
     ] = None
     vm_service_account: Annotated[
-        Optional[str], Field(description="The service account
+        Optional[str], Field(description="The service account to associate with provisioned VMs")
     ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
dstack/_internal/server/services/fleets.py

@@ -257,6 +257,7 @@ async def get_plan(
         project=project,
         profile=spec.merged_profile,
         requirements=_get_fleet_requirements(spec),
+        fleet_spec=spec,
         blocks=spec.configuration.blocks,
     )
     offers = [offer for _, offer in offers_with_backends]
@@ -277,12 +278,15 @@ async def get_create_instance_offers(
     project: ProjectModel,
     profile: Profile,
     requirements: Requirements,
-
+    fleet_spec: Optional[FleetSpec] = None,
     fleet_model: Optional[FleetModel] = None,
     blocks: Union[int, Literal["auto"]] = 1,
+    exclude_not_available: bool = False,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     multinode = False
     master_job_provisioning_data = None
+    if fleet_spec is not None:
+        multinode = fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         multinode = fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
dstack/_internal/server/services/jobs/__init__.py

@@ -236,13 +236,14 @@ async def process_terminating_job(
         logger.debug("%s: stopping container", fmt(job_model))
         ssh_private_keys = get_instance_ssh_private_keys(instance_model)
         await stop_container(job_model, jpd, ssh_private_keys)
-    volume_models: list[VolumeModel]
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = instance_model.volumes
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     if len(volume_models) > 0:
         logger.info("Detaching volumes: %s", [v.name for v in volume_models])
         all_volumes_detached = await _detach_volumes_from_job_instance(
@@ -302,11 +303,13 @@ async def process_volumes_detaching(
     jpd = get_or_error(get_job_provisioning_data(job_model))
     jrd = get_job_runtime_data(job_model)
     if jrd is not None and jrd.volume_names is not None:
-        volume_models = await list_project_volume_models(
-            session=session, project=instance_model.project, names=jrd.volume_names
-        )
+        volume_names = jrd.volume_names
     else:
-        volume_models = instance_model.volumes
+        # Legacy jobs before job_runtime_data/blocks were introduced
+        volume_names = [va.volume.name for va in instance_model.volume_attachments]
+    volume_models = await list_project_volume_models(
+        session=session, project=instance_model.project, names=volume_names
+    )
     logger.info("Detaching volumes: %s", [v.name for v in volume_models])
     all_volumes_detached = await _detach_volumes_from_job_instance(
         project=instance_model.project,
@@ -439,8 +442,8 @@ async def _detach_volumes_from_job_instance(
     if job_model.volumes_detached_at is None:
        job_model.volumes_detached_at = common.get_current_datetime()
     detached_volumes_ids = {v.id for v in detached_volumes}
-    instance_model.volumes = [
-        v for v in instance_model.volumes if v.id not in detached_volumes_ids
+    instance_model.volume_attachments = [
+        va for va in instance_model.volume_attachments if va.volume_id not in detached_volumes_ids
     ]
     return all_detached

dstack/_internal/server/services/jobs/configurators/dev.py

@@ -6,8 +6,6 @@ from dstack._internal.core.models.runs import RunSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator
 from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop

-DEFAULT_MAX_DURATION_SECONDS = 6 * 3600
-
 INSTALL_IPYKERNEL = (
     "(echo pip install ipykernel... && pip install -q --no-cache-dir ipykernel 2> /dev/null) || "
     'echo "no pip, ipykernel was not installed"'
@@ -44,7 +42,7 @@ class DevEnvironmentJobConfigurator(JobConfigurator):
         return False

     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None

     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
dstack/_internal/server/services/jobs/configurators/task.py

@@ -5,8 +5,6 @@ from dstack._internal.core.models.profiles import SpotPolicy
 from dstack._internal.core.models.runs import JobSpec
 from dstack._internal.server.services.jobs.configurators.base import JobConfigurator

-DEFAULT_MAX_DURATION_SECONDS = 72 * 3600
-

 class TaskJobConfigurator(JobConfigurator):
     TYPE: RunConfigurationType = RunConfigurationType.TASK
@@ -29,7 +27,7 @@ class TaskJobConfigurator(JobConfigurator):
         return True

     def _default_max_duration(self) -> Optional[int]:
-        return DEFAULT_MAX_DURATION_SECONDS
+        return None

     def _spot_policy(self) -> SpotPolicy:
         return self.run_spec.merged_profile.spot_policy or SpotPolicy.ONDEMAND
dstack/_internal/server/services/logs/__init__.py

@@ -0,0 +1,78 @@
+import atexit
+from typing import List, Optional
+from uuid import UUID
+
+from dstack._internal.core.models.logs import JobSubmissionLogs
+from dstack._internal.server import settings
+from dstack._internal.server.models import ProjectModel
+from dstack._internal.server.schemas.logs import PollLogsRequest
+from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent
+from dstack._internal.server.services.logs.aws import BOTO_AVAILABLE, CloudWatchLogStorage
+from dstack._internal.server.services.logs.base import LogStorage, LogStorageError
+from dstack._internal.server.services.logs.filelog import FileLogStorage
+from dstack._internal.server.services.logs.gcp import GCP_LOGGING_AVAILABLE, GCPLogStorage
+from dstack._internal.utils.common import run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+_log_storage: Optional[LogStorage] = None
+
+
+def get_log_storage() -> LogStorage:
+    global _log_storage
+    if _log_storage is not None:
+        return _log_storage
+    if settings.SERVER_CLOUDWATCH_LOG_GROUP:
+        if BOTO_AVAILABLE:
+            try:
+                _log_storage = CloudWatchLogStorage(
+                    group=settings.SERVER_CLOUDWATCH_LOG_GROUP,
+                    region=settings.SERVER_CLOUDWATCH_LOG_REGION,
+                )
+            except LogStorageError as e:
+                logger.error("Failed to initialize CloudWatch Logs storage: %s", e)
+            except Exception:
+                logger.exception("Got exception when initializing CloudWatch Logs storage")
+            else:
+                logger.debug("Using CloudWatch Logs storage")
+        else:
+            logger.error("Cannot use CloudWatch Logs storage: boto3 is not installed")
+    elif settings.SERVER_GCP_LOGGING_PROJECT:
+        if GCP_LOGGING_AVAILABLE:
+            try:
+                _log_storage = GCPLogStorage(project_id=settings.SERVER_GCP_LOGGING_PROJECT)
+            except LogStorageError as e:
+                logger.error("Failed to initialize GCP Logs storage: %s", e)
+            except Exception:
+                logger.exception("Got exception when initializing GCP Logs storage")
+            else:
+                logger.debug("Using GCP Logs storage")
+        else:
+            logger.error("Cannot use GCP Logs storage: GCP deps are not installed")
+    if _log_storage is None:
+        _log_storage = FileLogStorage()
+        logger.debug("Using file-based storage")
+    atexit.register(_log_storage.close)
+    return _log_storage
+
+
+def write_logs(
+    project: ProjectModel,
+    run_name: str,
+    job_submission_id: UUID,
+    runner_logs: List[RunnerLogEvent],
+    job_logs: List[RunnerLogEvent],
+) -> None:
+    return get_log_storage().write_logs(
+        project=project,
+        run_name=run_name,
+        job_submission_id=job_submission_id,
+        runner_logs=runner_logs,
+        job_logs=job_logs,
+    )
+
+
+async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+    return await run_async(get_log_storage().poll_logs, project=project, request=request)