PyPI - dstack - Versions diffs - 0.18.40rc1__py3-none-any.whl → 0.18.41__py3-none-any.whl - Mend

dstack 0.18.40rc1__py3-none-any.whl → 0.18.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""Add JobModel.inactivity_secs
+Revision ID: 1e76fb0dde87
+Revises: 63c3f19cb184
+Create Date: 2025-02-11 23:37:58.823710
+"""
+import sqlalchemy as sa
+from alembic import op
+# revision identifiers, used by Alembic.
+revision = "1e76fb0dde87"
+down_revision = "63c3f19cb184"
+branch_labels = None
+depends_on = None
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("inactivity_secs", sa.Integer(), nullable=True))
+    # ### end Alembic commands ###
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("inactivity_secs")
+    # ### end Alembic commands ###

dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""Add InstanceModel blocks fields
+Revision ID: 51d45659d574
+Revises: da574e93fee0
+Create Date: 2025-02-04 11:10:41.626273
+"""
+import sqlalchemy as sa
+from alembic import op
+# revision identifiers, used by Alembic.
+revision = "51d45659d574"
+down_revision = "da574e93fee0"
+branch_labels = None
+depends_on = None
+def upgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("total_blocks", sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column("busy_blocks", sa.Integer(), nullable=True))
+    op.execute("""
+        UPDATE instances
+        SET total_blocks = 1
+    """)
+    op.execute("""
+        UPDATE instances
+        SET busy_blocks = CASE
+            WHEN job_id IS NOT NULL THEN 1
+            ELSE 0
+        END
+    """)
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column("busy_blocks", existing_type=sa.INTEGER(), nullable=False)
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_column("busy_blocks")
+        batch_op.drop_column("total_blocks")

dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Add JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+Revision ID: 63c3f19cb184
+Revises: 1338b788b612
+Create Date: 2025-02-11 22:30:47.289393
+"""
+from alembic import op
+from alembic_postgresql_enum import TableReference
+# revision identifiers, used by Alembic.
+revision = "63c3f19cb184"
+down_revision = "1338b788b612"
+branch_labels = None
+depends_on = None
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+    # ### end Alembic commands ###

dstack/_internal/server/models.py CHANGED Viewed

@@ -351,13 +351,17 @@ class JobModel(BaseModel):
     job_spec_data: Mapped[str] = mapped_column(Text)
     job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text)
     runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger)
+    inactivity_secs: Mapped[Optional[int]] = mapped_column(Integer)  # 0 - active, None - N/A
     # `removed` is used to ensure that the instance is killed after the job is finished
     remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     # `instance_assigned` means instance assignment was done.
     # if `instance_assigned` is True and `instance` is None, no instance was assiged.
     instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False)
-    instance: Mapped[Optional["InstanceModel"]] = relationship(back_populates="job")
+    instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(
+        ForeignKey("instances.id", ondelete="CASCADE")
+    )
+    instance: Mapped[Optional["InstanceModel"]] = relationship(back_populates="jobs")
     used_instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(UUIDType(binary=False))
     replica_num: Mapped[int] = mapped_column(Integer)
     job_runtime_data: Mapped[Optional[str]] = mapped_column(Text)
@@ -543,9 +547,11 @@ class InstanceModel(BaseModel):
     remote_connection_info: Mapped[Optional[str]] = mapped_column(Text)
-    # current job
-    job_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("jobs.id"))
-    job: Mapped[Optional["JobModel"]] = relationship(back_populates="instance", lazy="joined")
+    # NULL means `auto` (only during provisioning, when ready it's not NULL)
+    total_blocks: Mapped[Optional[int]] = mapped_column(Integer)
+    busy_blocks: Mapped[int] = mapped_column(Integer, default=0)
+    jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance", lazy="joined")
     last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     # volumes attached to the instance

dstack/_internal/server/routers/runs.py CHANGED Viewed

@@ -47,6 +47,7 @@ async def list_runs(
     """
     Returns all runs visible to user sorted by descending `submitted_at`.
     `project_name`, `repo_id`, `username`, and `only_active` can be specified as filters.
+    Setting `only_active` to `true` excludes finished runs and deleted runs.
     Specifying `repo_id` without `project_name` returns no runs.
     The results are paginated. To get the next page, pass `submitted_at` and `id` of

dstack/_internal/server/schemas/runner.py CHANGED Viewed

@@ -34,6 +34,7 @@ class PullResponse(CoreModel):
     job_logs: List[LogEvent]
     runner_logs: List[LogEvent]
     last_updated: int
+    no_connections_secs: Optional[int] = None  # Optional for compatibility with old runners
 class SubmitBody(CoreModel):

dstack/_internal/server/services/backends/configurators/azure.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Optional, Tuple
+import azure.core.exceptions
 from azure.core.credentials import TokenCredential
 from azure.mgmt import network as network_mgmt
 from azure.mgmt import resource as resource_mgmt
@@ -154,16 +155,17 @@ class AzureConfigurator(Configurator):
         if is_core_model_instance(config.creds, AzureClientCreds):
             self._set_client_creds_tenant_id(config.creds, config.tenant_id)
         credential, _ = auth.authenticate(config.creds)
-        resource_group = self._create_resource_group(
-            credential=credential,
-            subscription_id=config.subscription_id,
-            location=MAIN_LOCATION,
-            project_name=project.name,
-        )
+        if config.resource_group is None:
+            config.resource_group = self._create_resource_group(
+                credential=credential,
+                subscription_id=config.subscription_id,
+                location=MAIN_LOCATION,
+                project_name=project.name,
+            )
         self._create_network_resources(
             credential=credential,
             subscription_id=config.subscription_id,
-            resource_group=resource_group,
+            resource_group=config.resource_group,
             locations=config.locations,
             create_default_network=config.vpc_ids is None,
         )
@@ -172,7 +174,6 @@ class AzureConfigurator(Configurator):
             type=self.TYPE.value,
             config=AzureStoredConfig(
                 **AzureConfigInfo.__response__.parse_obj(config).dict(),
-                resource_group=resource_group,
             ).json(),
             auth=DecryptedString(plaintext=AzureCreds.parse_obj(config.creds).__root__.json()),
         )
@@ -322,6 +323,7 @@ class AzureConfigurator(Configurator):
         self, config: AzureConfigInfoWithCredsPartial, credential: auth.AzureCredential
     ):
         self._check_tags_config(config)
+        self._check_resource_group(config=config, credential=credential)
         self._check_vpc_config(config=config, credential=credential)
     def _check_tags_config(self, config: AzureConfigInfoWithCredsPartial):
@@ -336,6 +338,18 @@ class AzureConfigurator(Configurator):
         except BackendError as e:
             raise ServerClientError(e.args[0])
+    def _check_resource_group(
+        self, config: AzureConfigInfoWithCredsPartial, credential: auth.AzureCredential
+    ):
+        if config.resource_group is None:
+            return
+        resource_manager = ResourceManager(
+            credential=credential,
+            subscription_id=config.subscription_id,
+        )
+        if not resource_manager.resource_group_exists(config.resource_group):
+            raise ServerClientError(f"Resource group {config.resource_group} not found")
     def _check_vpc_config(
         self, config: AzureConfigInfoWithCredsPartial, credential: auth.AzureCredential
     ):
@@ -406,6 +420,18 @@ class ResourceManager:
         )
         return resource_group.name
+    def resource_group_exists(
+        self,
+        name: str,
+    ) -> bool:
+        try:
+            self.resource_client.resource_groups.get(
+                resource_group_name=name,
+            )
+        except azure.core.exceptions.ResourceNotFoundError:
+            return False
+        return True
 class NetworkManager:
     def __init__(self, credential: TokenCredential, subscription_id: str):

dstack/_internal/server/services/config.py CHANGED Viewed

@@ -124,6 +124,15 @@ class AzureConfig(CoreModel):
     type: Annotated[Literal["azure"], Field(description="The type of the backend")] = "azure"
     tenant_id: Annotated[str, Field(description="The tenant ID")]
     subscription_id: Annotated[str, Field(description="The subscription ID")]
+    resource_group: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "The resource group for resources created by `dstack`."
+                " If not specified, `dstack` will create a new resource group"
+            )
+        ),
+    ] = None
     regions: Annotated[
         Optional[List[str]],
         Field(description="The list of Azure regions (locations). Omit to use all regions"),

dstack/_internal/server/services/fleets.py CHANGED Viewed

@@ -2,7 +2,7 @@ import random
 import string
 import uuid
 from datetime import datetime, timezone
-from typing import List, Optional, Tuple, Union, cast
+from typing import List, Literal, Optional, Tuple, Union, cast
 from sqlalchemy import and_, func, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -31,6 +31,7 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceStatus,
     RemoteConnectionInfo,
+    SSHConnectionParams,
     SSHKey,
 )
 from dstack._internal.core.models.pools import Instance
@@ -256,6 +257,7 @@ async def get_plan(
             project=project,
             profile=spec.merged_profile,
             requirements=_get_fleet_requirements(spec),
+            blocks=spec.configuration.blocks,
         )
         offers = [offer for _, offer in offers_with_backends]
     _remove_fleet_spec_sensitive_info(spec)
@@ -277,6 +279,7 @@ async def get_create_instance_offers(
     requirements: Requirements,
     exclude_not_available=False,
     fleet_model: Optional[FleetModel] = None,
+    blocks: Union[int, Literal["auto"]] = 1,
 ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]:
     multinode = False
     master_job_provisioning_data = None
@@ -296,6 +299,7 @@ async def get_create_instance_offers(
         exclude_not_available=exclude_not_available,
         multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
+        blocks=blocks,
     )
     offers = [
         (backend, offer)
@@ -406,6 +410,7 @@ async def create_fleet_instance_model(
         instance_num=instance_num,
         placement_group_name=placement_group_name,
         reservation=reservation,
+        blocks=spec.configuration.blocks,
     )
     return instance_model
@@ -424,18 +429,33 @@ async def create_fleet_ssh_instance_model(
         ssh_user = ssh_params.user
         ssh_key = ssh_params.ssh_key
         port = ssh_params.port
+        proxy_jump = ssh_params.proxy_jump
         internal_ip = None
+        blocks = 1
     else:
         hostname = host.hostname
         ssh_user = host.user or ssh_params.user
         ssh_key = host.ssh_key or ssh_params.ssh_key
         port = host.port or ssh_params.port
+        proxy_jump = host.proxy_jump or ssh_params.proxy_jump
         internal_ip = host.internal_ip
+        blocks = host.blocks
     if ssh_user is None or ssh_key is None:
         # This should not be reachable but checked by fleet spec validation
         raise ServerClientError("ssh key or user not specified")
+    if proxy_jump is not None:
+        ssh_proxy = SSHConnectionParams(
+            hostname=proxy_jump.hostname,
+            port=proxy_jump.port or 22,
+            username=proxy_jump.user,
+        )
+        ssh_proxy_keys = [proxy_jump.ssh_key]
+    else:
+        ssh_proxy = None
+        ssh_proxy_keys = None
     instance_model = await pools_services.create_ssh_instance_model(
         project=project,
         pool=pool,
@@ -445,10 +465,13 @@ async def create_fleet_ssh_instance_model(
         host=hostname,
         ssh_user=ssh_user,
         ssh_keys=[ssh_key],
+        ssh_proxy=ssh_proxy,
+        ssh_proxy_keys=ssh_proxy_keys,
         env=env,
         internal_ip=internal_ip,
         instance_network=ssh_params.network,
         port=port or 22,
+        blocks=blocks,
     )
     return instance_model
@@ -544,7 +567,7 @@ async def generate_fleet_name(session: AsyncSession, project: ProjectModel) -> s
 def is_fleet_in_use(fleet_model: FleetModel, instance_nums: Optional[List[int]] = None) -> bool:
-    instances_in_use = [i for i in fleet_model.instances if i.job_id is not None and not i.deleted]
+    instances_in_use = [i for i in fleet_model.instances if i.jobs and not i.deleted]
     selected_instance_in_use = instances_in_use
     if instance_nums is not None:
         selected_instance_in_use = [i for i in instances_in_use if i.instance_num in instance_nums]
@@ -606,6 +629,8 @@ async def create_instance(
         instance_configuration=None,
         termination_policy=termination_policy,
         termination_idle_time=termination_idle_time,
+        total_blocks=1,
+        busy_blocks=0,
     )
     logger.info(
         "Added a new instance %s",

dstack/_internal/server/services/gateways/client.py CHANGED Viewed

@@ -74,10 +74,18 @@ class GatewayClient:
         resp.raise_for_status()
         self.is_server_ready = True
-    async def register_replica(self, run: Run, job_submission: JobSubmission):
+    async def register_replica(
+        self,
+        run: Run,
+        job_submission: JobSubmission,
+        ssh_head_proxy: Optional[SSHConnectionParams],
+        ssh_head_proxy_private_key: Optional[str],
+    ):
         payload = {
             "job_id": job_submission.id.hex,
             "app_port": run.run_spec.configuration.port.container_port,
+            "ssh_head_proxy": ssh_head_proxy.dict() if ssh_head_proxy is not None else None,
+            "ssh_head_proxy_private_key": ssh_head_proxy_private_key,
         }
         jpd = job_submission.job_provisioning_data
         if not jpd.dockerized:

dstack 0.18.40rc1__py3-none-any.whl → 0.18.41__py3-none-any.whl

dstack 0.18.40rc1__py3-none-any.whl → 0.18.41__py3-none-any.whl