virtool-workflow 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virtool_workflow/__init__.py +13 -0
- virtool_workflow/analysis/__init__.py +1 -0
- virtool_workflow/analysis/fastqc.py +467 -0
- virtool_workflow/analysis/skewer.py +265 -0
- virtool_workflow/analysis/trimming.py +56 -0
- virtool_workflow/analysis/utils.py +27 -0
- virtool_workflow/api/__init__.py +0 -0
- virtool_workflow/api/acquire.py +66 -0
- virtool_workflow/api/client.py +132 -0
- virtool_workflow/api/utils.py +109 -0
- virtool_workflow/cli.py +66 -0
- virtool_workflow/data/__init__.py +22 -0
- virtool_workflow/data/analyses.py +106 -0
- virtool_workflow/data/hmms.py +109 -0
- virtool_workflow/data/indexes.py +319 -0
- virtool_workflow/data/jobs.py +62 -0
- virtool_workflow/data/ml.py +82 -0
- virtool_workflow/data/samples.py +190 -0
- virtool_workflow/data/subtractions.py +244 -0
- virtool_workflow/data/uploads.py +35 -0
- virtool_workflow/decorators.py +47 -0
- virtool_workflow/errors.py +62 -0
- virtool_workflow/files.py +40 -0
- virtool_workflow/hooks.py +140 -0
- virtool_workflow/pytest_plugin/__init__.py +35 -0
- virtool_workflow/pytest_plugin/data.py +197 -0
- virtool_workflow/pytest_plugin/utils.py +9 -0
- virtool_workflow/runtime/__init__.py +0 -0
- virtool_workflow/runtime/config.py +21 -0
- virtool_workflow/runtime/discover.py +95 -0
- virtool_workflow/runtime/events.py +7 -0
- virtool_workflow/runtime/hook.py +129 -0
- virtool_workflow/runtime/path.py +19 -0
- virtool_workflow/runtime/ping.py +54 -0
- virtool_workflow/runtime/redis.py +65 -0
- virtool_workflow/runtime/run.py +276 -0
- virtool_workflow/runtime/run_subprocess.py +168 -0
- virtool_workflow/runtime/sentry.py +28 -0
- virtool_workflow/utils.py +90 -0
- virtool_workflow/workflow.py +90 -0
- virtool_workflow-0.0.0.dist-info/LICENSE +21 -0
- virtool_workflow-0.0.0.dist-info/METADATA +71 -0
- virtool_workflow-0.0.0.dist-info/RECORD +45 -0
- virtool_workflow-0.0.0.dist-info/WHEEL +4 -0
- virtool_workflow-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
"""A fixture and dataclass for working with machine learning models in workflows."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import shutil
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
from pyfixtures import fixture
|
9
|
+
from structlog import get_logger
|
10
|
+
from virtool.ml.models import MLModelRelease
|
11
|
+
|
12
|
+
from virtool_workflow.api.client import APIClient
|
13
|
+
from virtool_workflow.data.analyses import WFAnalysis
|
14
|
+
from virtool_workflow.utils import make_directory, move_all_model_files, untar
|
15
|
+
|
16
|
+
logger = get_logger("api")
|
17
|
+
|
18
|
+
|
19
|
+
@dataclass
class WFMLModelRelease:
    """The release of a machine learning model selected for use in the workflow."""

    # The unique ID for the model release.
    id: int

    # The name of the model release.
    name: str

    # The path to the directory containing the model files.
    path: Path

    @property
    def file_path(self) -> Path:
        """The path to the model data file."""
        return self.path / "model.tar.gz"
|
40
|
+
|
41
|
+
|
42
|
+
@fixture
async def ml(
    _api: APIClient,
    analysis: WFAnalysis,
    work_path: Path,
) -> WFMLModelRelease | None:
    """The machine learning model release for the current analysis.

    Downloads and unpacks the model data into ``work_path``.

    Returns ``None`` when the analysis does not use a machine learning model.
    """
    if analysis.ml is None:
        return None

    model_id = analysis.ml.model.id
    model_release_id = analysis.ml.id

    # Bug fix: ``model_id`` was previously bound to ``analysis.ml.id`` (the
    # release ID), which made both log fields carry the same value.
    log = logger.bind(model_id=model_id, model_release_id=model_release_id)

    model_release_json = await _api.get_json(
        f"/ml/{model_id}/releases/{model_release_id}",
    )
    model_release = MLModelRelease(**model_release_json)

    log.info("fetched ml model release json")

    release = WFMLModelRelease(
        id=model_release.id,
        name=model_release.name,
        path=work_path / "ml" / str(model_release.model.id) / str(model_release_id),
    )

    await make_directory(release.path)

    await _api.get_file(
        f"/ml/{model_id}/releases/{model_release_id}/model.tar.gz",
        release.file_path,
    )

    # Extract the archive, flatten the extracted ``model`` directory into the
    # release directory, then remove the now-empty ``model`` directory.
    await asyncio.to_thread(untar, release.file_path, release.path)
    await asyncio.to_thread(move_all_model_files, release.path / "model", release.path)
    await asyncio.to_thread(shutil.rmtree, release.path / "model")

    log.info("downloaded ml model release file")

    return release
|
@@ -0,0 +1,190 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Callable, Coroutine
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from pathlib import Path
|
5
|
+
from typing import Any
|
6
|
+
|
7
|
+
from pyfixtures import fixture
|
8
|
+
from structlog import get_logger
|
9
|
+
from virtool.jobs.models import Job
|
10
|
+
from virtool.models.enums import LibraryType
|
11
|
+
from virtool.samples.models import Quality, Sample
|
12
|
+
|
13
|
+
from virtool_workflow.analysis.utils import ReadPaths
|
14
|
+
from virtool_workflow.api.client import APIClient
|
15
|
+
from virtool_workflow.data.uploads import WFUploads
|
16
|
+
from virtool_workflow.errors import JobsAPINotFoundError
|
17
|
+
from virtool_workflow.files import VirtoolFileFormat
|
18
|
+
|
19
|
+
logger = get_logger("api")
|
20
|
+
|
21
|
+
|
22
|
+
@dataclass
class WFSample:
    """A sample whose data is being used in a workflow."""

    # The unique ID of the sample.
    id: str

    # The library type of the sample.
    library_type: LibraryType

    # The sample's name.
    name: str

    # Whether the sample consists of paired reads.
    paired: bool

    # The quality data for the sample.
    quality: Quality

    # The paths to the raw sample reads.
    read_paths: ReadPaths

    @property
    def min_length(self) -> int | None:
        """The minimum observed read length in the sample sequencing data.

        ``None`` when the sample is still being created and no quality data is
        available.
        """
        if self.quality:
            return self.quality.length[0]
        return None

    @property
    def max_length(self) -> int | None:
        """The maximum observed read length in the sample sequencing data.

        ``None`` when the sample is still being created and no quality data is
        available.
        """
        if self.quality:
            return self.quality.length[1]
        return None
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass
class WFNewSampleUpload:
    """A file uploaded as part of a sample that is being created."""

    # The unique ID of the upload.
    id: int
    # The name of the uploaded file.
    name: str
    # The reported size of the uploaded file.
    size: int
    # The local path the upload is downloaded to.
    path: Path
|
69
|
+
|
70
|
+
|
71
|
+
@dataclass
class WFNewSample:
    """A sample that is being created in the workflow."""

    # The unique ID of the sample.
    id: str
    # The sample's name.
    name: str
    # Whether the sample consists of paired reads.
    paired: bool
    # The uploads (one or two) the sample is being built from.
    uploads: tuple[WFNewSampleUpload] | tuple[WFNewSampleUpload, WFNewSampleUpload]

    # Coroutine function that deletes the sample.
    delete: Callable[[], Coroutine[None, None, None]]
    # Coroutine function that finalizes the sample with its quality data.
    finalize: Callable[[dict[str, Any]], Coroutine[None, None, None]]
    # Coroutine function that uploads a file for the sample.
    upload: Callable[[Path, VirtoolFileFormat], Coroutine[None, None, None]]
|
83
|
+
|
84
|
+
|
85
|
+
@fixture
async def sample(
    _api: APIClient,
    job: Job,
    uploads: WFUploads,
    work_path: Path,
) -> WFSample:
    """The sample associated with the current job.

    Downloads the sample's read files into ``work_path / "reads"``.
    """
    # NOTE(review): ``uploads`` is not used in this fixture body; presumably it
    # is kept as a fixture dependency — confirm before removing.
    id_ = job.args["sample_id"]

    base_url_path = f"/samples/{id_}"

    try:
        sample_json = await _api.get_json(base_url_path)
    except JobsAPINotFoundError as err:
        # Chain the original error so the underlying API failure context is
        # preserved instead of producing a confusing "during handling of the
        # above exception" traceback.
        raise JobsAPINotFoundError("Sample not found") from err

    sample = Sample(**sample_json)

    reads_path = work_path / "reads"
    await asyncio.to_thread(reads_path.mkdir, exist_ok=True, parents=True)

    await _api.get_file(
        f"{base_url_path}/reads/reads_1.fq.gz",
        reads_path / "reads_1.fq.gz",
    )

    if sample.paired:
        read_paths = (
            reads_path / "reads_1.fq.gz",
            reads_path / "reads_2.fq.gz",
        )
        await _api.get_file(
            f"{base_url_path}/reads/reads_2.fq.gz",
            reads_path / "reads_2.fq.gz",
        )
    else:
        read_paths = (reads_path / "reads_1.fq.gz",)

    return WFSample(
        id=sample.id,
        library_type=sample.library_type,
        name=sample.name,
        paired=sample.paired,
        quality=sample.quality,
        read_paths=read_paths,
    )
|
132
|
+
|
133
|
+
|
134
|
+
@fixture
async def new_sample(
    _api: APIClient,
    job: Job,
    uploads: WFUploads,
    work_path: Path,
) -> WFNewSample:
    """The new sample being created in the current job.

    Downloads the uploaded source files into ``work_path / "uploads"`` and
    provides callables for uploading reads, finalizing, and deleting the
    sample.
    """
    id_ = job.args["sample_id"]

    log = logger.bind(resource="sample", id=id_)
    log.info("loading sample for sample creation")

    base_url_path = f"/samples/{id_}"

    sample_dict = await _api.get_json(base_url_path)
    sample = Sample(**sample_dict)

    log.info("got sample json")

    uploads_path = work_path / "uploads"
    await asyncio.to_thread(uploads_path.mkdir, exist_ok=True, parents=True)

    log.info("created uploads directory")

    files = tuple(
        WFNewSampleUpload(
            id=f["id"],
            name=f["name"],
            # ``uploads_path`` is already a Path; the redundant ``Path(...)``
            # wrap was dropped.
            path=uploads_path / f["name"],
            size=f["size"],
        )
        for f in job.args["files"]
    )

    # Download all source files concurrently.
    await asyncio.gather(*[uploads.download(f.id, f.path) for f in files])

    log.info("downloaded sample files")

    async def finalize(quality: dict[str, Any]):
        """Finalize the sample by setting its quality data."""
        await _api.patch_json(base_url_path, data={"quality": quality})

    async def delete():
        """Delete the sample."""
        await _api.delete(base_url_path)

    async def upload(path: Path, fmt: VirtoolFileFormat = "fastq"):
        """Upload a read file for the sample.

        Bug fix: ``fmt`` was previously ignored and ``"fastq"`` was always
        sent to the server regardless of the caller's argument.
        """
        await _api.put_file(f"{base_url_path}/reads/{path.name}", path, fmt)

    return WFNewSample(
        id=sample.id,
        delete=delete,
        finalize=finalize,
        name=sample.name,
        paired=sample.paired,
        upload=upload,
        uploads=files,
    )
|
@@ -0,0 +1,244 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Callable, Coroutine
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
from pyfixtures import fixture
|
7
|
+
from structlog import get_logger
|
8
|
+
from virtool.jobs.models import Job
|
9
|
+
from virtool.subtractions.models import (
|
10
|
+
NucleotideComposition,
|
11
|
+
Subtraction,
|
12
|
+
SubtractionFile,
|
13
|
+
)
|
14
|
+
|
15
|
+
from virtool_workflow.api.client import APIClient
|
16
|
+
from virtool_workflow.data.analyses import WFAnalysis
|
17
|
+
from virtool_workflow.data.uploads import WFUploads
|
18
|
+
from virtool_workflow.errors import MissingJobArgumentError
|
19
|
+
|
20
|
+
logger = get_logger("api")
|
21
|
+
|
22
|
+
|
23
|
+
@dataclass
class WFSubtraction:
    """A Virtool subtraction that has been loaded into the workflow environment.

    The subtraction files are downloaded to the workflow's local work path so
    they can be used for analysis.
    """

    # The unique ID for the subtraction.
    id: str

    # The files associated with the subtraction.
    files: list[SubtractionFile]

    # The nucleotide composition of the subtraction.
    gc: NucleotideComposition

    # The nickname for the subtraction.
    nickname: str

    # The display name for the subtraction.
    name: str

    # The path to the subtraction directory, which contains the FASTA and
    # Bowtie2 files for the subtraction.
    path: Path

    @property
    def fasta_path(self) -> Path:
        """The path to the gzipped FASTA file for the subtraction."""
        return self.path / "subtraction.fa.gz"

    @property
    def bowtie2_index_path(self) -> Path:
        """The path to the Bowtie2 prefix in the running workflow's work_path.

        For example, ``/work/subtractions/<id>/subtraction`` refers to the
        Bowtie2 index comprising the files:

        - ``/work/subtractions/<id>/subtraction.1.bt2``
        - ``/work/subtractions/<id>/subtraction.2.bt2``
        - ``/work/subtractions/<id>/subtraction.3.bt2``
        - ``/work/subtractions/<id>/subtraction.4.bt2``
        - ``/work/subtractions/<id>/subtraction.rev.1.bt2``
        - ``/work/subtractions/<id>/subtraction.rev.2.bt2``
        """
        return self.path / "subtraction"
|
74
|
+
|
75
|
+
|
76
|
+
@dataclass
|
77
|
+
class WFNewSubtraction:
|
78
|
+
id: str
|
79
|
+
"""The unique ID for the subtraction."""
|
80
|
+
|
81
|
+
delete: Callable[[], Coroutine[None, None, None]]
|
82
|
+
"""
|
83
|
+
A callable that deletes the subtraction from Virtool.
|
84
|
+
|
85
|
+
This should be called if the subtraction creation fails before the subtraction is
|
86
|
+
finalized.
|
87
|
+
"""
|
88
|
+
|
89
|
+
finalize: Callable[[dict[str, int | float], int], Coroutine[None, None, None]]
|
90
|
+
"""
|
91
|
+
A callable that finalizes the subtraction in Virtool.
|
92
|
+
|
93
|
+
This makes it impossible to further alter the files and ready state of the
|
94
|
+
subtraction. This must be called before the workflow ends to make the subtraction
|
95
|
+
usable.
|
96
|
+
"""
|
97
|
+
|
98
|
+
name: str
|
99
|
+
"""The display name for the subtraction."""
|
100
|
+
|
101
|
+
nickname: str
|
102
|
+
"""An optional nickname for the subtraction."""
|
103
|
+
|
104
|
+
path: Path
|
105
|
+
"""
|
106
|
+
The path to the subtraction directory.
|
107
|
+
|
108
|
+
The data files for the subtraction should be created here.
|
109
|
+
"""
|
110
|
+
|
111
|
+
upload: Callable[[Path], Coroutine[None, None, None]]
|
112
|
+
|
113
|
+
@property
|
114
|
+
def fasta_path(self) -> Path:
|
115
|
+
"""The path to the FASTA file that should be used to create the subtraction."""
|
116
|
+
return self.path / "subtraction.fa.gz"
|
117
|
+
|
118
|
+
|
119
|
+
@fixture
async def subtractions(
    _api: APIClient,
    analysis: WFAnalysis,
    work_path: Path,
) -> list[WFSubtraction]:
    """The subtractions to be used for the current analysis job.

    Fetches the metadata for every subtraction attached to the analysis, then
    downloads the subtraction data files into ``work_path / "subtractions"``.
    """
    subtraction_work_path = work_path / "subtractions"

    # ``parents=True, exist_ok=True`` matches the directory creation in the
    # sibling fixtures and keeps this safe if the directory already exists.
    await asyncio.to_thread(subtraction_work_path.mkdir, parents=True, exist_ok=True)

    subtractions_ = []

    for subtraction_id in [s.id for s in analysis.subtractions]:
        subtraction_json = await _api.get_json(f"/subtractions/{subtraction_id}")
        api_subtraction = Subtraction(**subtraction_json)

        # Use a distinct name instead of rebinding ``subtraction`` from the
        # API model to the workflow dataclass.
        wf_subtraction = WFSubtraction(
            id=api_subtraction.id,
            files=api_subtraction.files,
            gc=api_subtraction.gc,
            nickname=api_subtraction.nickname,
            name=api_subtraction.name,
            path=subtraction_work_path / api_subtraction.id,
        )

        await asyncio.to_thread(wf_subtraction.path.mkdir, parents=True, exist_ok=True)

        subtractions_.append(wf_subtraction)

    # Do this in a separate loop in case fetching the JSON fails. This prevents
    # expensive and unnecessary file downloads.
    for subtraction in subtractions_:
        logger.info("downloading subtraction files", id=subtraction.id)

        for subtraction_file in subtraction.files:
            await _api.get_file(
                f"/subtractions/{subtraction.id}/files/{subtraction_file.name}",
                subtraction.path / subtraction_file.name,
            )

    return subtractions_
|
159
|
+
|
160
|
+
|
161
|
+
@fixture
async def new_subtraction(
    _api: APIClient,
    job: Job,
    uploads: WFUploads,
    work_path: Path,
) -> WFNewSubtraction:
    """A new subtraction that will be created during the current job.

    Currently only used for the `create-subtraction` workflow.
    """
    try:
        id_ = job.args["subtraction_id"]
    except KeyError as err:
        raise MissingJobArgumentError("subtraction_id") from err

    try:
        upload_id = job.args["files"][0]["id"]
    except (KeyError, IndexError) as err:
        # ``IndexError`` covers an empty ``files`` list, which is equally a
        # missing-argument condition.
        raise MissingJobArgumentError("files") from err

    subtraction_json = await _api.get_json(f"/subtractions/{id_}")
    subtraction_ = Subtraction(**subtraction_json)

    subtraction_work_path = work_path / "subtractions" / subtraction_.id
    await asyncio.to_thread(subtraction_work_path.mkdir, parents=True, exist_ok=True)

    await uploads.download(upload_id, subtraction_work_path / "subtraction.fa.gz")

    url_path = f"/subtractions/{subtraction_.id}"

    async def delete():
        """Delete the subtraction if the job fails."""
        await _api.delete(f"/subtractions/{subtraction_.id}")

    async def finalize(gc: dict[str, int | float], count: int):
        """Finalize the subtraction by setting the gc.

        :param gc: the nucleotide composition of the subtraction
        :param count: the number of sequences in the FASTA file
        :return: the updated subtraction.
        """
        # Default ``n`` to 0.0 when the caller's composition omits it.
        gc_ = NucleotideComposition(**{"n": 0.0, **gc})

        await _api.patch_json(url_path, {"gc": gc_.dict(), "count": count})

    async def upload(path: Path):
        """Upload a file relating to this subtraction.

        Filenames must be one of:
            - subtraction.fa.gz
            - subtraction.1.bt2
            - subtraction.2.bt2
            - subtraction.3.bt2
            - subtraction.4.bt2
            - subtraction.rev.1.bt2
            - subtraction.rev.2.bt2

        :param path: The path to the file

        """
        filename = path.name

        log = logger.bind(id=id_, filename=filename)

        log.info("Uploading subtraction file")

        # Bug fix: ``filename`` was computed but never interpolated into the
        # URL, so every file was PUT to the same bogus endpoint.
        await _api.put_file(
            f"/subtractions/{subtraction_.id}/files/{filename}",
            path,
            "unknown",
        )

        log.info("Finished uploading subtraction file")

    return WFNewSubtraction(
        id=subtraction_.id,
        name=subtraction_.name,
        nickname=subtraction_.nickname,
        path=subtraction_work_path,
        delete=delete,
        finalize=finalize,
        upload=upload,
    )
|
@@ -0,0 +1,35 @@
|
|
1
|
+
from dataclasses import dataclass
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
from pyfixtures import fixture
|
5
|
+
|
6
|
+
from virtool_workflow.api.client import APIClient
|
7
|
+
|
8
|
+
|
9
|
+
# NOTE(review): the ``@dataclass`` decorator was removed. This class defines
# its own ``__init__`` and has no annotated fields, so the decorator only
# generated a zero-field ``__eq__``/``__repr__`` that made *all* WFUploads
# instances compare equal — misleading and never useful here.
class WFUploads:
    """Provides download access to files uploaded to the Virtool instance."""

    def __init__(self, api: APIClient):
        """:param api: the API client used to fetch upload files."""
        self._api = api

    async def download(self, upload_id: int, path: Path):
        """Download the upload with the given ID to the given path.

        :param upload_id: the unique ID of the upload
        :param path: the local path to write the downloaded file to
        """
        await self._api.get_file(f"/uploads/{upload_id}", path)
|
17
|
+
|
18
|
+
|
19
|
+
@fixture
async def uploads(_api: APIClient) -> WFUploads:
    """Provides access to files that have been uploaded to the Virtool instance.

    Files can be downloaded into the workflow environment by calling
    :meth:`.WFUploads.download`.

    Example:
    -------
    .. code-block:: python

        @step
        async def step_one(uploads: WFUploads, work_path: Path):
            await uploads.download(1, work_path / "file.txt")

    """
    return WFUploads(api=_api)
|
@@ -0,0 +1,47 @@
|
|
1
|
+
"""Create Workflows by decorating module scope functions."""
|
2
|
+
|
3
|
+
from collections.abc import Callable
|
4
|
+
from types import ModuleType
|
5
|
+
|
6
|
+
from virtool_workflow.errors import WorkflowStepsError
|
7
|
+
from virtool_workflow.workflow import Workflow
|
8
|
+
|
9
|
+
|
10
|
+
def step(func: Callable | None = None, *, name: str | None = None) -> Callable:
|
11
|
+
"""Mark a function as a workflow step function.
|
12
|
+
|
13
|
+
:param func: the workflow step function
|
14
|
+
:param name: the display name of the workflow step. A name
|
15
|
+
will be generated based on the function name if not provided.
|
16
|
+
"""
|
17
|
+
if func is None:
|
18
|
+
return lambda _f: step(_f, name=name)
|
19
|
+
|
20
|
+
func.__workflow_marker__ = "step"
|
21
|
+
func.__workflow_step_props__ = {"name": name}
|
22
|
+
|
23
|
+
return func
|
24
|
+
|
25
|
+
|
26
|
+
def collect(module: ModuleType) -> Workflow:
    """Build a :class:`.Workflow` object from a workflow module.

    Scans the module for functions marked by :func:`step` and registers them
    as workflow steps.

    :param module: A workflow module
    :return: A workflow object
    :raises WorkflowStepsError: when the module contains no marked steps
    """
    workflow = Workflow()

    for attr in vars(module).values():
        if getattr(attr, "__workflow_marker__", None) == "step":
            workflow.step(attr, **attr.__workflow_step_props__)

    if not workflow.steps:
        raise WorkflowStepsError(str(module))

    return workflow
|
@@ -0,0 +1,62 @@
|
|
1
|
+
"""Custom exceptions for ``virtool_workflow``."""
|
2
|
+
|
3
|
+
from subprocess import SubprocessError
|
4
|
+
|
5
|
+
|
6
|
+
class JobAlreadyAcquiredError(Exception):
    """Raised when an attempt is made to reacquire a job."""

    def __init__(self, job_id: str) -> None:
        """Initialize the exception with a message containing the job ID.

        :param job_id: the ID of the job that was already acquired
        """
        # Fixed grammar in the message: was "is has already been acquired".
        super().__init__(
            f"Job {job_id} has already been acquired.",
        )
|
14
|
+
|
15
|
+
|
16
|
+
class JobsAPIError(Exception):
    """Base exception for HTTP error responses received from the jobs API."""


class JobsAPIBadRequestError(JobsAPIError):
    """Raised for a ``400 Bad Request`` response from the jobs API."""

    status = 400


class JobsAPIForbiddenError(JobsAPIError):
    """Raised for a ``403 Forbidden`` response from the jobs API."""

    status = 403


class JobsAPINotFoundError(JobsAPIError):
    """Raised for a ``404 Not Found`` response from the jobs API."""

    status = 404


class JobsAPIConflictError(JobsAPIError):
    """Raised for a ``409 Conflict`` response from the jobs API."""

    status = 409


class JobsAPIServerError(JobsAPIError):
    """Raised for a ``500 Internal Server Error`` response from the jobs API."""

    status = 500
|
48
|
+
|
49
|
+
|
50
|
+
class MissingJobArgumentError(ValueError):
    """The ``job.args`` dict is missing a required key for some functionality."""
|
52
|
+
|
53
|
+
|
54
|
+
class WorkflowStepsError(Exception):
    """Raised when no workflow steps are found in a module."""

    def __init__(self, module: str) -> None:
        """:param module: the name of the module that contained no steps."""
        message = f"No workflow steps could be found in {module}"
        super().__init__(message)
|
59
|
+
|
60
|
+
|
61
|
+
class SubprocessFailedError(SubprocessError):
    """Raised when a subprocess exits with a non-zero status during a workflow."""
|
@@ -0,0 +1,40 @@
|
|
1
|
+
"""Dataclasses for describing files uploaded to the Virtool server."""
|
2
|
+
|
3
|
+
import datetime
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import Literal
|
6
|
+
|
7
|
+
VirtoolFileFormat = Literal[
|
8
|
+
"sam",
|
9
|
+
"bam",
|
10
|
+
"fasta",
|
11
|
+
"fastq",
|
12
|
+
"csv",
|
13
|
+
"tsv",
|
14
|
+
"json",
|
15
|
+
"unknown",
|
16
|
+
]
|
17
|
+
"""A literal type hint for the format of a :class:`.VirtoolFile`."""
|
18
|
+
|
19
|
+
|
20
|
+
@dataclass
|
21
|
+
class VirtoolFile:
|
22
|
+
"""A description of a file uploaded to the Virtool server."""
|
23
|
+
|
24
|
+
id: int
|
25
|
+
"""The unique ID for the file."""
|
26
|
+
|
27
|
+
name: str
|
28
|
+
"""The name of the file."""
|
29
|
+
|
30
|
+
size: int
|
31
|
+
"""The size of the file in bytes."""
|
32
|
+
|
33
|
+
format: VirtoolFileFormat
|
34
|
+
"""The format of the file."""
|
35
|
+
|
36
|
+
name_on_disk: str | None = None
|
37
|
+
"""The actual name of the file on disk."""
|
38
|
+
|
39
|
+
uploaded_at: datetime.datetime | None = None
|
40
|
+
"""When the file was uploaded."""
|