virtool-workflow 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virtool_workflow/__init__.py +13 -0
- virtool_workflow/analysis/__init__.py +1 -0
- virtool_workflow/analysis/fastqc.py +467 -0
- virtool_workflow/analysis/skewer.py +265 -0
- virtool_workflow/analysis/trimming.py +56 -0
- virtool_workflow/analysis/utils.py +27 -0
- virtool_workflow/api/__init__.py +0 -0
- virtool_workflow/api/acquire.py +66 -0
- virtool_workflow/api/client.py +132 -0
- virtool_workflow/api/utils.py +109 -0
- virtool_workflow/cli.py +66 -0
- virtool_workflow/data/__init__.py +22 -0
- virtool_workflow/data/analyses.py +106 -0
- virtool_workflow/data/hmms.py +109 -0
- virtool_workflow/data/indexes.py +319 -0
- virtool_workflow/data/jobs.py +62 -0
- virtool_workflow/data/ml.py +82 -0
- virtool_workflow/data/samples.py +190 -0
- virtool_workflow/data/subtractions.py +244 -0
- virtool_workflow/data/uploads.py +35 -0
- virtool_workflow/decorators.py +47 -0
- virtool_workflow/errors.py +62 -0
- virtool_workflow/files.py +40 -0
- virtool_workflow/hooks.py +140 -0
- virtool_workflow/pytest_plugin/__init__.py +35 -0
- virtool_workflow/pytest_plugin/data.py +197 -0
- virtool_workflow/pytest_plugin/utils.py +9 -0
- virtool_workflow/runtime/__init__.py +0 -0
- virtool_workflow/runtime/config.py +21 -0
- virtool_workflow/runtime/discover.py +95 -0
- virtool_workflow/runtime/events.py +7 -0
- virtool_workflow/runtime/hook.py +129 -0
- virtool_workflow/runtime/path.py +19 -0
- virtool_workflow/runtime/ping.py +54 -0
- virtool_workflow/runtime/redis.py +65 -0
- virtool_workflow/runtime/run.py +276 -0
- virtool_workflow/runtime/run_subprocess.py +168 -0
- virtool_workflow/runtime/sentry.py +28 -0
- virtool_workflow/utils.py +90 -0
- virtool_workflow/workflow.py +90 -0
- virtool_workflow-0.0.0.dist-info/LICENSE +21 -0
- virtool_workflow-0.0.0.dist-info/METADATA +71 -0
- virtool_workflow-0.0.0.dist-info/RECORD +45 -0
- virtool_workflow-0.0.0.dist-info/WHEEL +4 -0
- virtool_workflow-0.0.0.dist-info/entry_points.txt +3 -0
virtool_workflow/cli.py
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
"""Command Line Interface to virtool_workflow"""
|
2
|
+
import asyncio
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
import click
|
6
|
+
|
7
|
+
from virtool_workflow.runtime.run import start_runtime
|
8
|
+
|
9
|
+
|
10
|
+
# NOTE: decorator order matters for click. ``@click.command()`` is applied first
# (it is closest to the function), turning ``run_workflow`` into a Command; the
# ``@click.option`` decorators above it then append parameters to that Command.
# Do not reorder without checking the rendered ``--help`` output.
@click.option(
    "--dev",
    help="Run in development mode.",
    is_flag=True,
)
@click.option(
    "--jobs-api-connection-string",
    help="The URL of the jobs API.",
    default="https://localhost:9950",
)
@click.option(
    "--mem",
    help="The amount of memory to use in GB.",
    type=int,
    default=8,
)
@click.option(
    "--proc",
    help="The number of processes to use.",
    type=int,
    default=2,
)
# NOTE(review): port 6317 differs from Redis's standard port 6379 — confirm this
# default is intentional and not a transposition typo.
@click.option(
    "--redis-connection-string",
    help="The URL for connecting to Redis.",
    default="redis://localhost:6317",
)
@click.option(
    "--redis-list-name",
    help="The name of the Redis list to watch for incoming jobs.",
    required=True,
)
@click.option(
    "--sentry-dsn",
    help="A Sentry DSN. Sentry will not be configured if no value is provided.",
    default=None,
)
# NOTE(review): units for --timeout are not stated; presumably seconds — confirm
# against start_runtime.
@click.option(
    "--timeout",
    help="Maximum time to wait for an incoming job",
    default=1000,
)
@click.option(
    "--work-path",
    default="temp",
    help="The path where temporary files will be stored.",
    type=click.Path(path_type=Path),
)
@click.command()
def run_workflow(**kwargs) -> None:
    """Run a workflow.

    All click options are collected into ``kwargs`` and forwarded unchanged to
    :func:`start_runtime`, which is driven to completion on a fresh event loop.
    """
    asyncio.run(start_runtime(**kwargs))
|
62
|
+
|
63
|
+
|
64
|
+
def cli_main() -> None:
    """Main pip entrypoint.

    Invokes the ``run_workflow`` command. ``auto_envvar_prefix="VT"`` asks click
    to also read option values from environment variables (presumably named like
    ``VT_REDIS_CONNECTION_STRING`` — confirm against click's env var docs).
    """
    run_workflow(auto_envvar_prefix="VT")
|
@@ -0,0 +1,22 @@
|
|
1
|
+
from virtool_workflow.data.analyses import analysis
|
2
|
+
from virtool_workflow.analysis.fastqc import fastqc
|
3
|
+
from virtool_workflow.data.hmms import hmms
|
4
|
+
from virtool_workflow.data.indexes import index
|
5
|
+
from virtool_workflow.data.jobs import job, push_status
|
6
|
+
from virtool_workflow.data.ml import ml
|
7
|
+
from virtool_workflow.data.samples import sample
|
8
|
+
from virtool_workflow.data.subtractions import subtractions
|
9
|
+
from virtool_workflow.data.uploads import uploads
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
"analysis",
|
13
|
+
"fastqc",
|
14
|
+
"hmms",
|
15
|
+
"index",
|
16
|
+
"job",
|
17
|
+
"ml",
|
18
|
+
"push_status",
|
19
|
+
"sample",
|
20
|
+
"subtractions",
|
21
|
+
"uploads",
|
22
|
+
]
|
@@ -0,0 +1,106 @@
|
|
1
|
+
"""A fixture and class for representing the analysis associated with a workflow run."""
|
2
|
+
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
from pyfixtures import fixture
|
7
|
+
from virtool.analyses.models import Analysis, AnalysisSample
|
8
|
+
from virtool.indexes.models import IndexNested
|
9
|
+
from virtool.jobs.models import JobNested
|
10
|
+
from virtool.ml.models import MLModelRelease
|
11
|
+
from virtool.references.models import ReferenceNested
|
12
|
+
from virtool.subtractions.models import SubtractionNested
|
13
|
+
|
14
|
+
from virtool_workflow.api.client import APIClient
|
15
|
+
from virtool_workflow.files import VirtoolFileFormat
|
16
|
+
|
17
|
+
|
18
|
+
class WFAnalysis:
    """The Virtool analysis being populated by the running workflow.

    Wraps the jobs API client and exposes the small set of analysis operations a
    workflow needs: deleting the analysis, uploading files, and uploading the
    final result.
    """

    def __init__(
        self,
        api: APIClient,
        analysis_id: str,
        index: IndexNested,
        ml: MLModelRelease | None,
        reference: ReferenceNested,
        sample: AnalysisSample,
        subtractions: list[SubtractionNested],
        workflow: str,
    ):
        # Client used by the async methods below for all API calls.
        self._api = api

        self.id = analysis_id
        """The unique ID for the analysis."""

        self.index = index
        """The index being used for the analysis."""

        self.ml = ml
        """The ML model release being used for the analysis."""

        self.reference = reference
        """The reference being used for the analysis."""

        self.sample = sample
        """The parent sample for the analysis."""

        self.subtractions = subtractions
        """The subtractions being used for the analysis."""

        self.workflow = workflow
        """The workflow being run to populate the analysis."""

    async def delete(self) -> None:
        """Delete the analysis.

        This method should be called if the workflow fails before a result is uploaded.
        """
        await self._api.delete(f"/analyses/{self.id}")

    async def upload_file(self, path: Path, fmt: VirtoolFileFormat = "unknown") -> None:
        """Upload files in the workflow environment that should be associated with the
        current analysis.

        :param path: the path to the file to upload
        :param fmt: the file format

        """
        await self._api.post_file(
            f"/analyses/{self.id}/files",
            path,
            fmt,
        )

    async def upload_result(self, results: dict[str, Any]) -> None:
        """Upload the results dict for the analysis.

        :param results: the analysis results
        """
        await self._api.patch_json(f"/analyses/{self.id}", {"results": results})
|
82
|
+
|
83
|
+
|
84
|
+
@fixture
async def analysis(
    _api: APIClient,
    job: JobNested,
) -> WFAnalysis:
    """A :class:`.WFAnalysis` object that represents the analysis associated with the
    running workflow.

    The analysis ID is taken from the job args and the full analysis document is
    fetched from the API to populate the returned object.
    """
    analysis_id = job.args["analysis_id"]

    fetched = Analysis(**(await _api.get_json(f"/analyses/{analysis_id}")))

    return WFAnalysis(
        api=_api,
        analysis_id=analysis_id,
        index=fetched.index,
        ml=fetched.ml,
        reference=fetched.reference,
        sample=fetched.sample,
        subtractions=fetched.subtractions,
        workflow=job.workflow,
    )
|
@@ -0,0 +1,109 @@
|
|
1
|
+
"""A class and fixture for accessing Virtool HMM data for use in analysis workflows."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import json
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from functools import cached_property
|
7
|
+
from pathlib import Path
|
8
|
+
from shutil import which
|
9
|
+
|
10
|
+
import aiofiles
|
11
|
+
from pyfixtures import fixture
|
12
|
+
from virtool.hmm.models import HMM
|
13
|
+
from virtool.utils import decompress_file
|
14
|
+
|
15
|
+
from virtool_workflow.api.client import APIClient
|
16
|
+
from virtool_workflow.runtime.run_subprocess import RunSubprocess
|
17
|
+
|
18
|
+
|
19
|
+
@dataclass
class WFHMMs:
    """A class that exposes:

    1. A :class:`dict` that links `HMMER <http://hmmer.org/>`_ cluster IDs to
       Virtool annotation IDs.
    2. The path to the HMM profiles file.

    """

    annotations: list[HMM]
    """All annotations in the HMM dataset."""

    path: Path
    """
    The path to the ``profiles.hmm`` file in the ``work_path`` of the running
    workflow.
    """

    @cached_property
    def cluster_annotation_map(self) -> dict[int, str]:
        """A :class:`dict` mapping the cluster IDs that identify HMMs in
        `HMMER <http://hmmer.org/>`_ to the annotation IDs used in Virtool.

        Computed once on first access and cached on the instance.
        """
        mapping = {}

        for annotation in self.annotations:
            mapping[annotation.cluster] = annotation.id

        return mapping

    @property
    def profiles_path(self):
        """The path to the ``profiles.hmm`` file.

        It can be provided directly to HMMER.
        """
        return self.path / "profiles.hmm"

    def get_id_by_cluster(self, cluster: int) -> str:
        """Look up the Virtool HMM annotation ID for a given cluster ID.

        :param cluster: a cluster ID
        :return: the corresponding annotation ID
        """
        return self.cluster_annotation_map[cluster]
|
60
|
+
|
61
|
+
|
62
|
+
@fixture
async def hmms(
    _api: APIClient,
    proc: int,
    run_subprocess: RunSubprocess,
    work_path: Path,
):
    """A fixture for accessing HMM data.

    The annotation dataset and the ``profiles.hmm`` file are downloaded from the
    API into ``work_path / "hmms"`` and ``hmmpress`` is run to create the binary
    HMM index files.

    Returns a :class:`.WFHMMs` object containing the path to the HMM directory
    and the parsed annotations (from which the cluster-to-ID map is derived).

    :raises: :class:`RuntimeError`: hmmpress is not installed
    :raises: :class:`RuntimeError`: hmmpress command failed

    """
    # Fail fast if the hmmpress executable cannot be found on PATH.
    if await asyncio.to_thread(which, "hmmpress") is None:
        raise RuntimeError("hmmpress is not installed")

    hmms_path = work_path / "hmms"
    await asyncio.to_thread(hmms_path.mkdir, parents=True, exist_ok=True)

    # Download the gzipped annotation dataset, then decompress it beside the
    # archive. ``proc`` is forwarded so decompression can use multiple processes.
    compressed_annotations_path = hmms_path / "annotations.json.gz"
    await _api.get_file("/hmms/files/annotations.json.gz", compressed_annotations_path)

    annotations_path = hmms_path / "annotations.json"
    await asyncio.to_thread(
        decompress_file,
        compressed_annotations_path,
        annotations_path,
        proc,
    )

    profiles_path = hmms_path / "profiles.hmm"
    await _api.get_file("/hmms/files/profiles.hmm", profiles_path)

    async with aiofiles.open(annotations_path) as f:
        annotations = [HMM(**hmm) for hmm in json.loads(await f.read())]

    # hmmpress writes its binary index files alongside profiles.hmm.
    p = await run_subprocess(["hmmpress", str(profiles_path)])

    if p.returncode != 0:
        raise RuntimeError("hmmpress command failed")

    return WFHMMs(annotations, hmms_path)
|
@@ -0,0 +1,319 @@
|
|
1
|
+
import asyncio
|
2
|
+
import json
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from pathlib import Path
|
5
|
+
|
6
|
+
import aiofiles
|
7
|
+
from pyfixtures import fixture
|
8
|
+
from structlog import get_logger
|
9
|
+
from virtool.analyses.models import Analysis
|
10
|
+
from virtool.indexes.models import Index
|
11
|
+
from virtool.jobs.models import Job
|
12
|
+
from virtool.references.models import ReferenceNested
|
13
|
+
from virtool.utils import decompress_file
|
14
|
+
|
15
|
+
from virtool_workflow.api.client import APIClient
|
16
|
+
from virtool_workflow.errors import MissingJobArgumentError
|
17
|
+
from virtool_workflow.files import VirtoolFileFormat
|
18
|
+
|
19
|
+
logger = get_logger("api")
|
20
|
+
|
21
|
+
|
22
|
+
@dataclass
class WFIndex:
    """Represents a Virtool reference index for use in analysis workflows."""

    id: str
    """The ID of the index."""

    path: Path
    """The path to the index directory in the workflow's work directory."""

    manifest: dict[str, int | str]
    """The manifest (OTU ID: OTU Version) for the index."""

    reference: ReferenceNested
    """The parent reference."""

    sequence_lengths: dict[str, int]
    """A dictionary of the lengths of all sequences keyed by their IDs."""

    sequence_otu_map: dict[str, str]
    """A dictionary of the OTU IDs for all sequences keyed by their sequence IDs."""

    @property
    def bowtie_path(self) -> Path:
        """The path to the Bowtie2 index prefix for the Virtool index."""
        return self.path / "reference"

    @property
    def fasta_path(self) -> Path:
        """The path to the complete FASTA file for the reference index in the workflow's
        work directory.

        """
        return self.path / "ref.fa"

    @property
    def json_path(self) -> Path:
        """The path to the JSON representation of the reference index in the workflow's
        work directory.

        """
        return self.path / "otus.json"

    def get_otu_id_by_sequence_id(self, sequence_id: str) -> str:
        """Get the ID of the parent OTU for the given ``sequence_id``.

        :param sequence_id: the sequence ID
        :return: the matching OTU ID
        :raises ValueError: if the sequence ID is not in the index

        """
        try:
            return self.sequence_otu_map[sequence_id]
        except KeyError:
            # Re-raise as ValueError so callers see a domain-level error rather
            # than a raw KeyError.
            raise ValueError("The sequence_id does not exist in the index")

    def get_sequence_length(self, sequence_id: str) -> int:
        """Get the sequence length for the given ``sequence_id``.

        :param sequence_id: the sequence ID
        :return: the length of the sequence
        :raises ValueError: if the sequence ID is not in the index

        """
        try:
            return self.sequence_lengths[sequence_id]
        except KeyError:
            raise ValueError("The sequence_id does not exist in the index")

    async def write_isolate_fasta(
        self,
        otu_ids: list[str],
        path: Path,
    ) -> dict[str, int]:
        """Generate a FASTA file for all the isolates of the OTUs specified by ``otu_ids``.

        The OTU data is read from :attr:`json_path` and every sequence of every
        isolate of each selected OTU is written out as a FASTA record.

        :param otu_ids: the list of OTU IDs for which to generate the FASTA file
        :param path: the path the FASTA file should be written to
        :return: a dictionary of the lengths of all written sequences keyed by their IDs

        """
        unique_otu_ids = set(otu_ids)

        # File reading and writing is synchronous; run it in a worker thread so
        # the event loop is not blocked.
        def func():
            with open(self.json_path) as f:
                otus = [otu for otu in json.load(f) if otu["_id"] in unique_otu_ids]

            lengths = {}

            with open(path, "w") as f:
                for otu in otus:
                    for isolate in otu["isolates"]:
                        for sequence in isolate["sequences"]:
                            f.write(f">{sequence['_id']}\n{sequence['sequence']}\n")
                            lengths[sequence["_id"]] = len(sequence["sequence"])

            return lengths

        return await asyncio.to_thread(func)
|
119
|
+
|
120
|
+
|
121
|
+
class WFNewIndex:
    """Represents a Virtool reference index that is being built by the current job."""

    def __init__(
        self,
        api: APIClient,
        index_id: str,
        manifest: dict[str, int | str],
        path: Path,
        reference: ReferenceNested,
    ):
        # Client used by the async methods below for all API calls.
        self._api = api

        self.id = index_id
        """The ID of the index."""

        self.manifest = manifest
        """The manifest (OTU ID: OTU Version) for the index."""

        self.path = path
        """The path to the index directory in the workflow's work directory."""

        self.reference = reference
        """The parent reference."""

    async def delete(self) -> None:
        """Delete the index being built.

        This should be called if the build fails.
        """
        await self._api.delete(f"/indexes/{self.id}")

    async def finalize(self) -> None:
        """Finalize the current index."""
        await self._api.patch_json(f"/indexes/{self.id}", {})

    async def upload(
        self,
        path: Path,
        fmt: VirtoolFileFormat = "fasta",
        name: str | None = None,
    ):
        """Upload a file to associate with the index being built.

        Allowed file names are:

        - reference.json.gz
        - reference.fa.gz
        - reference.1.bt2
        - reference.2.bt2
        - reference.3.bt2
        - reference.4.bt2
        - reference.rev.1.bt2
        - reference.rev.2.bt2

        :param path: The path to the file.
        :param fmt: The format of the file.
        :param name: An optional name for the file different from its name on disk.
        :return: A :class:`VirtoolFile` object.
        """
        return await self._api.put_file(
            f"/indexes/{self.id}/files/{name or path.name}",
            path,
            fmt,
        )

    @property
    def otus_json_path(self) -> Path:
        """The path to the JSON representation of the reference index in the workflow's
        work directory.

        """
        return self.path / "otus.json"
|
188
|
+
|
189
|
+
|
190
|
+
@fixture
async def index(
    _api: APIClient,
    analysis: Analysis,
    proc: int,
    work_path: Path,
) -> WFIndex:
    """The :class:`WFIndex` for the current analysis job.

    Downloads all reference index files into the workflow's work directory,
    decompresses the FASTA and OTU JSON files, and builds the sequence length
    and sequence-to-OTU lookup tables from the OTU JSON.
    """
    id_ = analysis.index.id

    log = logger.bind(id=id_, resource="index")

    log.info("loading index")

    index_json = await _api.get_json(f"/indexes/{id_}")
    index_ = Index(**index_json)

    log.info("got index json")

    index_work_path = work_path / "indexes" / index_.id
    await asyncio.to_thread(index_work_path.mkdir, parents=True, exist_ok=True)

    log.info("created index directory")

    for name in (
        "otus.json.gz",
        "reference.json.gz",
        "reference.fa.gz",
        "reference.1.bt2",
        "reference.2.bt2",
        "reference.3.bt2",
        "reference.4.bt2",
        "reference.rev.1.bt2",
        "reference.rev.2.bt2",
    ):
        await _api.get_file(f"/indexes/{id_}/files/{name}", index_work_path / name)
        log.info("downloaded index file", name=name)

    await asyncio.to_thread(
        decompress_file,
        index_work_path / "reference.fa.gz",
        index_work_path / "reference.fa",
        proc,
    )

    log.info("decompressed reference fasta")

    json_path = index_work_path / "otus.json"

    # BUG FIX: the decompression target was previously ``index_work_path / json_path``.
    # ``json_path`` is already rooted at ``index_work_path``, so when ``work_path``
    # was a relative path the target was doubled (``.../indexes/<id>/.../indexes/
    # <id>/otus.json``) and the read of ``json_path`` below failed. Decompress
    # directly to ``json_path``.
    await asyncio.to_thread(
        decompress_file,
        index_work_path / "otus.json.gz",
        json_path,
        proc,
    )

    log.info("decompressed reference otus json")

    async with aiofiles.open(json_path) as f:
        data = json.loads(await f.read())

    sequence_lengths = {}
    sequence_otu_map = {}

    for otu in data:
        for isolate in otu["isolates"]:
            for sequence in isolate["sequences"]:
                sequence_id = sequence["_id"]

                sequence_otu_map[sequence_id] = otu["_id"]
                sequence_lengths[sequence_id] = len(sequence["sequence"])

    log.info("parsed and loaded maps from otus json")

    return WFIndex(
        id=id_,
        path=index_work_path,
        manifest=index_.manifest,
        reference=index_.reference,
        sequence_lengths=sequence_lengths,
        sequence_otu_map=sequence_otu_map,
    )
|
272
|
+
|
273
|
+
|
274
|
+
@fixture
async def new_index(
    _api: APIClient,
    job: Job,
    proc: int,
    work_path: Path,
) -> WFNewIndex:
    """The :class:`.WFNewIndex` for an index being created by the current job.

    Fetches the index document, prepares a working directory for it, and
    downloads and decompresses the OTU JSON the build is based on.
    """
    try:
        id_ = job.args["index_id"]
    except KeyError:
        raise MissingJobArgumentError("Missing jobs args key 'index_id'")

    log = logger.bind(resource="new_index", id=id_, job_id=job.id)
    log.info("loading index")

    index_ = Index(**(await _api.get_json(f"/indexes/{id_}")))

    log.info("got index json")

    index_work_path = work_path / "indexes" / index_.id
    await asyncio.to_thread(index_work_path.mkdir, parents=True, exist_ok=True)

    log.info("created index directory")

    gz_path = index_work_path / "otus.json.gz"

    await _api.get_file(f"/indexes/{id_}/files/otus.json.gz", gz_path)
    log.info("downloaded otus json")

    await asyncio.to_thread(
        decompress_file,
        gz_path,
        index_work_path / "otus.json",
        processes=proc,
    )

    log.info("decompressed otus json")

    return WFNewIndex(
        api=_api,
        index_id=id_,
        manifest=index_.manifest,
        path=index_work_path,
        reference=index_.reference,
    )
|
@@ -0,0 +1,62 @@
|
|
1
|
+
import traceback
|
2
|
+
|
3
|
+
from pyfixtures import fixture
|
4
|
+
from structlog import get_logger
|
5
|
+
from virtool.jobs.models import JobAcquired, Job, JobState
|
6
|
+
|
7
|
+
from virtool_workflow import Workflow, WorkflowStep
|
8
|
+
from virtool_workflow.api.client import APIClient
|
9
|
+
|
10
|
+
MAX_TB = 50
|
11
|
+
|
12
|
+
logger = get_logger("api")
|
13
|
+
|
14
|
+
|
15
|
+
@fixture
async def job(_api: APIClient, _job: JobAcquired) -> Job:
    """The acquired job re-parsed as a plain :class:`Job` model.

    NOTE(review): ``_api`` is not used in the body — presumably declared so the
    API client fixture is resolved before the job is exposed; confirm.
    """
    return Job.parse_obj(_job)
|
18
|
+
|
19
|
+
|
20
|
+
@fixture(scope="function")
async def push_status(
    _api: APIClient,
    _job: JobAcquired,
    _error: Exception | None,
    _state: JobState,
    _step: WorkflowStep | None,
    _workflow: Workflow,
):
    """A fixture returning an async function that reports the workflow's current
    state, step, progress, and any error to the jobs API.
    """
    error = None

    if _error:
        # Serialize the exception into the JSON shape the status endpoint
        # expects; the traceback is capped at MAX_TB frames.
        error = {
            "type": _error.__class__.__name__,
            "traceback": traceback.format_tb(_error.__traceback__, MAX_TB),
            "details": [str(arg) for arg in _error.args],
        }

        logger.critical("reporting error to api", error=_error)

    # Derive a coarse progress percentage from the job state and, while
    # running, from the position of the current step in the workflow.
    if _state in (JobState.WAITING, JobState.PREPARING):
        progress = 0
    elif _state == JobState.COMPLETE:
        progress = 100
    else:
        progress = (100 // len(_workflow.steps)) * _workflow.steps.index(_step)

    # ``_step`` is None outside of step execution (eg. before the first step);
    # fall back to empty strings in the payload in that case.
    step_name = _step.display_name if _step is not None else ""

    payload = {
        "error": error,
        "progress": progress,
        "stage": _step.function.__name__ if _step is not None else "",
        "state": _state.value,
        "step_description": _step.description if _step is not None else "",
        "step_name": step_name,
    }

    # The payload is captured at fixture-resolution time; calling the returned
    # function performs the actual POST.
    async def func():
        await _api.post_json(f"/jobs/{_job.id}/status", payload)
        logger.info("reported status to api", step=step_name, state=_state)

    return func
|