virtool-workflow 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virtool_workflow/__init__.py +13 -0
- virtool_workflow/analysis/__init__.py +1 -0
- virtool_workflow/analysis/fastqc.py +467 -0
- virtool_workflow/analysis/skewer.py +265 -0
- virtool_workflow/analysis/trimming.py +56 -0
- virtool_workflow/analysis/utils.py +27 -0
- virtool_workflow/api/__init__.py +0 -0
- virtool_workflow/api/acquire.py +66 -0
- virtool_workflow/api/client.py +132 -0
- virtool_workflow/api/utils.py +109 -0
- virtool_workflow/cli.py +66 -0
- virtool_workflow/data/__init__.py +22 -0
- virtool_workflow/data/analyses.py +106 -0
- virtool_workflow/data/hmms.py +109 -0
- virtool_workflow/data/indexes.py +319 -0
- virtool_workflow/data/jobs.py +62 -0
- virtool_workflow/data/ml.py +82 -0
- virtool_workflow/data/samples.py +190 -0
- virtool_workflow/data/subtractions.py +244 -0
- virtool_workflow/data/uploads.py +35 -0
- virtool_workflow/decorators.py +47 -0
- virtool_workflow/errors.py +62 -0
- virtool_workflow/files.py +40 -0
- virtool_workflow/hooks.py +140 -0
- virtool_workflow/pytest_plugin/__init__.py +35 -0
- virtool_workflow/pytest_plugin/data.py +197 -0
- virtool_workflow/pytest_plugin/utils.py +9 -0
- virtool_workflow/runtime/__init__.py +0 -0
- virtool_workflow/runtime/config.py +21 -0
- virtool_workflow/runtime/discover.py +95 -0
- virtool_workflow/runtime/events.py +7 -0
- virtool_workflow/runtime/hook.py +129 -0
- virtool_workflow/runtime/path.py +19 -0
- virtool_workflow/runtime/ping.py +54 -0
- virtool_workflow/runtime/redis.py +65 -0
- virtool_workflow/runtime/run.py +276 -0
- virtool_workflow/runtime/run_subprocess.py +168 -0
- virtool_workflow/runtime/sentry.py +28 -0
- virtool_workflow/utils.py +90 -0
- virtool_workflow/workflow.py +90 -0
- virtool_workflow-0.0.0.dist-info/LICENSE +21 -0
- virtool_workflow-0.0.0.dist-info/METADATA +71 -0
- virtool_workflow-0.0.0.dist-info/RECORD +45 -0
- virtool_workflow-0.0.0.dist-info/WHEEL +4 -0
- virtool_workflow-0.0.0.dist-info/entry_points.txt +3 -0
virtool_workflow/analysis/skewer.py
@@ -0,0 +1,265 @@

"""Utilities and a fixture for using `Skewer <https://github.com/relipmoc/skewer>`_ to
trim reads.
"""

import asyncio
import os
import shutil
from asyncio.subprocess import Process
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from tempfile import mkdtemp
from typing import Protocol

from pyfixtures import fixture
from virtool.models.enums import LibraryType

from virtool_workflow import RunSubprocess
from virtool_workflow.analysis.utils import ReadPaths
from virtool_workflow.data.samples import WFSample


class SkewerMode(str, Enum):
    """The mode to run Skewer in."""

    PAIRED_END = "pe"
    """Run Skewer in paired-end mode."""

    SINGLE_END = "any"
    """Run Skewer in single-end mode."""


@dataclass
class SkewerConfiguration:
    """A configuration for running Skewer."""

    min_length: int
    """The minimum length of a trimmed read."""

    mode: SkewerMode
    """The mode to run Skewer in."""

    end_quality: int = 20
    """The minimum quality score for the end of a trimmed read."""

    max_error_rate: float = 0.1
    """
    The maximum error rate for a trimmed read. Reads that exceed the rate will be
    discarded.
    """

    max_indel_rate: float = 0.03
    """
    The maximum indel rate for a trimmed read. Reads that exceed the rate will be
    discarded.
    """

    mean_quality: int = 25
    """The minimum mean quality score for a trimmed read. Reads with a lower mean
    quality will be discarded."""

    number_of_processes: int = 1
    """The number of processes to use when running Skewer."""

    quiet: bool = True
    """Whether to run Skewer in quiet mode."""

    other_options: tuple[str, ...] = ("-n", "-z")
    """Other options to pass to Skewer."""


@dataclass
class SkewerResult:
    """Represents the result of running Skewer to trim a paired or unpaired FASTQ dataset."""

    command: list[str]
    """The command used to run Skewer."""

    output_path: Path
    """The path to the directory containing the trimmed reads."""

    process: Process
    """The process that ran Skewer."""

    read_paths: ReadPaths
    """The paths to the trimmed reads."""

    @property
    def left(self) -> Path:
        """The path to one of:

        - the FASTQ trimming result for an unpaired Illumina dataset
        - the FASTQ trimming result for the left reads of a paired Illumina dataset

        """
        return self.read_paths[0]

    @property
    def right(self) -> Path | None:
        """The path to the right reads of a paired Illumina dataset.

        ``None`` if the dataset is unpaired.

        :type: :class:`.Path`

        """
        try:
            return self.read_paths[1]
        except IndexError:
            return None


def calculate_skewer_trimming_parameters(
    sample: WFSample,
    min_read_length: int,
) -> SkewerConfiguration:
    """Calculate trimming parameters based on the library type and the minimum
    allowed trim length.

    :param sample: The sample to calculate trimming parameters for.
    :param min_read_length: The minimum length of a read before it is discarded.
    :return: the trimming parameters
    """
    config = SkewerConfiguration(
        min_length=min_read_length,
        mode=SkewerMode.PAIRED_END if sample.paired else SkewerMode.SINGLE_END,
    )

    if sample.library_type == LibraryType.amplicon:
        config.end_quality = 0
        config.mean_quality = 0
        config.min_length = min_read_length

        return config

    if sample.library_type == LibraryType.srna:
        config.max_length = 22
        config.min_length = 20

        return config

    raise ValueError(f"Unknown library type: {sample.library_type}")


class SkewerRunner(Protocol):
    """A protocol describing callables that can be used to run Skewer."""

    async def __call__(
        self,
        config: SkewerConfiguration,
        paths: ReadPaths,
        output_path: Path,
    ) -> SkewerResult: ...


@fixture
def skewer(proc: int, run_subprocess: RunSubprocess) -> SkewerRunner:
    """Provides an asynchronous function that can run Skewer.

    The provided function takes a :class:`.SkewerConfiguration` and a tuple of paths to
    the left and right reads to trim. If a single-member tuple is provided, the dataset
    is assumed to be unpaired.

    The Skewer process will automatically be assigned the number of processes
    configured for the workflow run.

    Example:
    -------
    .. code-block:: python

        @step
        async def step_one(skewer: SkewerRunner, work_path: Path):
            config = SkewerConfiguration(
                min_length=50,
                mode=SkewerMode.PAIRED_END,
                mean_quality=30,
            )

            skewer_result = await skewer(
                config,
                (
                    work_path / "test_1.fq.gz",
                    work_path / "test_2.fq.gz",
                ),
                work_path / "trimmed",
            )

    """
    if shutil.which("skewer") is None:
        raise RuntimeError("skewer is not installed.")

    async def func(
        config: SkewerConfiguration,
        read_paths: ReadPaths,
        output_path: Path,
    ) -> SkewerResult:
        temp_path = Path(await asyncio.to_thread(mkdtemp, suffix="_virtool_skewer"))

        await asyncio.to_thread(output_path.mkdir, exist_ok=True, parents=True)

        command = [
            str(a)
            for a in [
                "skewer",
                "-r",
                config.max_error_rate,
                "-d",
                config.max_indel_rate,
                "-m",
                config.mode.value,
                "-l",
                config.min_length,
                "-q",
                config.end_quality,
                "-Q",
                config.mean_quality,
                "-t",
                proc,
                # Skewer spams the console with progress updates. Set quiet to avoid.
                "--quiet",
                # Compress the trimmed output.
                "-z",
                "-o",
                f"{temp_path}/reads",
                *read_paths,
            ]
        ]

        process = await run_subprocess(
            command,
            cwd=read_paths[0].parent,
            env={**os.environ, "LD_LIBRARY_PATH": "/usr/lib/x86_64-linux-gnu"},
        )

        read_paths = await asyncio.to_thread(
            _rename_trimming_results,
            temp_path,
            output_path,
        )

        return SkewerResult(command, output_path, process, read_paths)

    return func


def _rename_trimming_results(temp_path: Path, output_path: Path) -> ReadPaths:
    """Rename Skewer output to the simple names used in Virtool.

    :param temp_path: The path containing the results from Skewer.
    :param output_path: The path to move the renamed results to.
    """
    shutil.move(
        temp_path / "reads-trimmed.log",
        output_path / "trim.log",
    )

    try:
        return (
            shutil.move(
                temp_path / "reads-trimmed.fastq.gz",
                output_path / "reads_1.fq.gz",
            ),
        )
    except FileNotFoundError:
        return (
            shutil.move(
                temp_path / "reads-trimmed-pair1.fastq.gz",
                output_path / "reads_1.fq.gz",
            ),
            shutil.move(
                temp_path / "reads-trimmed-pair2.fastq.gz",
                output_path / "reads_2.fq.gz",
            ),
        )
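Taken together, a workflow step derives a configuration from the sample and hands it to the skewer fixture along with read paths and an output directory. A minimal sketch, assuming the runtime injects the sample, skewer, and work_path fixtures, that the step decorator is importable from the package root as the docstring example suggests, and that raw reads already sit under work_path / "reads":

from pathlib import Path

from virtool_workflow import step
from virtool_workflow.analysis.skewer import (
    SkewerRunner,
    calculate_skewer_trimming_parameters,
)
from virtool_workflow.analysis.utils import make_read_paths
from virtool_workflow.data.samples import WFSample


@step
async def trim(sample: WFSample, skewer: SkewerRunner, work_path: Path):
    # Only amplicon and srna library types are handled; anything else raises
    # ValueError per calculate_skewer_trimming_parameters above.
    config = calculate_skewer_trimming_parameters(sample, min_read_length=50)

    # Trim reads_1.fq.gz (and reads_2.fq.gz if the sample is paired) into
    # work_path / "trimmed".
    result = await skewer(
        config,
        make_read_paths(work_path / "reads", sample.paired),
        work_path / "trimmed",
    )

    # result.left is always set; result.right is None for unpaired data.
    return result.left, result.right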
virtool_workflow/analysis/trimming.py
@@ -0,0 +1,56 @@

"""Calculate trimming parameters which are passed to the Skewer read trimming tool."""

import hashlib
import json

from virtool.models.enums import LibraryType

from virtool_workflow.data.samples import WFSample


def calculate_trimming_cache_key(
    sample_id: str,
    trimming_parameters: dict,
    program: str = "skewer",
):
    """Compute a unique cache key.

    **This is not currently used.**

    :param sample_id: The ID of the sample being trimmed.
    :param trimming_parameters: The trimming parameters.
    :param program: The name of the trimming program.
    :return: A unique cache key.

    """
    raw_key = "reads-" + json.dumps(
        {
            "id": sample_id,
            "parameters": trimming_parameters,
            "program": program,
        },
        sort_keys=True,
    )

    return hashlib.sha256(raw_key.encode()).hexdigest()


def calculate_trimming_min_length(sample: WFSample) -> int:
    """Calculate the minimum trimming length that should be used for the passed sample.

    This takes into account the library type (:class:`.LibraryType`) and the maximum
    observed read length in the sample.

    :param sample: the sample
    :return: the minimum allowed trimmed read length
    """
    if sample.library_type == LibraryType.amplicon:
        return round(0.95 * sample.max_length)

    if sample.max_length < 80:
        return 35

    if sample.max_length < 160:
        return 100

    return 160
virtool_workflow/analysis/utils.py
@@ -0,0 +1,27 @@

from pathlib import Path
from typing import Callable, TypeAlias

ReadPaths: TypeAlias = tuple[Path] | tuple[Path, Path]
"""A tuple of paths to FASTQ files. There may be one or two paths, depending on whether the dataset is paired."""


def _make_paired_paths(
    dir_path: Path, paired: bool, mkstr: Callable[[int], str]
) -> ReadPaths:
    path1 = dir_path / mkstr(1)
    return (path1, dir_path / mkstr(2)) if paired else (path1,)


def make_read_paths(reads_dir_path: Path, paired: bool) -> ReadPaths:
    """
    Get the path(s) locating the compressed fastq files containing the read data.

    :param reads_dir_path: The directory containing the fastq file(s).
    :param paired: A boolean indicating if the sequence is paired (two fastq files).
    :return: A :class:`Tuple[Path]` if :obj:`paired` is `False`, else a :class:`Tuple[Path, Path]`.
    """
    return _make_paired_paths(reads_dir_path, paired, lambda n: f"reads_{n}.fq.gz")


def make_legacy_read_paths(reads_dir_path: Path, paired: bool) -> ReadPaths:
    return _make_paired_paths(reads_dir_path, paired, lambda n: f"reads_{n}.fastq")
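For reference, these helpers only build paths; they never check that the files exist:

>>> from pathlib import Path
>>> from virtool_workflow.analysis.utils import make_read_paths
>>> make_read_paths(Path("/work/reads"), paired=True)
(PosixPath('/work/reads/reads_1.fq.gz'), PosixPath('/work/reads/reads_2.fq.gz'))
>>> make_read_paths(Path("/work/reads"), paired=False)
(PosixPath('/work/reads/reads_1.fq.gz'),)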
virtool_workflow/api/__init__.py
File without changes
virtool_workflow/api/acquire.py
@@ -0,0 +1,66 @@

import asyncio

from aiohttp import ClientConnectionError, ClientSession, TCPConnector
from structlog import get_logger
from virtool.jobs.models import JobAcquired

from virtool_workflow.errors import (
    JobAlreadyAcquiredError,
    JobsAPIError,
    JobsAPIServerError,
)

logger = get_logger("api")


async def acquire_job_by_id(
    jobs_api_connection_string: str,
    job_id: str,
) -> JobAcquired:
    """Acquire the job with a given ID via the API.

    :param jobs_api_connection_string: The URL for the jobs API.
    :param job_id: The ID of the job to acquire.
    :return: a job including its API key
    """
    async with ClientSession(
        connector=TCPConnector(force_close=True, limit=100),
    ) as session:
        attempts = 4

        while attempts > 0:
            try:
                async with session.patch(
                    f"{jobs_api_connection_string}/jobs/{job_id}",
                    json={"acquired": True},
                ) as resp:
                    logger.info("acquiring job", remaining_attempts=attempts, id=job_id)

                    if resp.status == 200:
                        job_json = await resp.json()
                        logger.info("acquired job", id=job_id)
                        return JobAcquired(**job_json)

                    if resp.status == 400:
                        if "already acquired" in await resp.text():
                            raise JobAlreadyAcquiredError(await resp.json())

                    logger.critical(
                        "unexpected api error during job acquisition",
                        status=resp.status,
                        body=await resp.text(),
                    )

                    raise JobsAPIError("Unexpected API error during job acquisition")

            except ClientConnectionError:
                logger.warning(
                    "unable to connect to server. retrying in 1 second.",
                    remaining_attempts=attempts,
                    id=job_id,
                )
                await asyncio.sleep(1)

            attempts -= 1

        raise JobsAPIServerError("Unable to connect to server.")
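A sketch of calling this at runner startup. The connection string and job ID are hypothetical, and the key attribute on the returned JobAcquired is assumed from the docstring's "a job including its API key":

import asyncio

from virtool_workflow.api.acquire import acquire_job_by_id


async def main():
    job = await acquire_job_by_id("http://localhost:9950/api", "zzpugkyt")

    # The acquired job carries the per-job API key (field name assumed) that
    # authenticates later requests; see api_client in virtool_workflow/api/client.py.
    print(job.key)


asyncio.run(main())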
virtool_workflow/api/client.py
@@ -0,0 +1,132 @@

from contextlib import asynccontextmanager
from pathlib import Path

import aiofiles
from aiohttp import BasicAuth, ClientSession

from virtool_workflow.api.utils import (
    decode_json_response,
    raise_exception_by_status_code,
)
from virtool_workflow.errors import JobsAPIError
from virtool_workflow.files import VirtoolFileFormat

CHUNK_SIZE = 1024 * 1024 * 2


class APIClient:
    def __init__(self, http: ClientSession, jobs_api_connection_string: str):
        self.http = http
        self.jobs_api_connection_string = jobs_api_connection_string

    async def get_json(self, path: str) -> dict:
        """Get the JSON response from the provided API ``path``."""
        async with self.http.get(f"{self.jobs_api_connection_string}{path}") as resp:
            await raise_exception_by_status_code(resp)
            return await decode_json_response(resp)

    async def get_file(self, path: str, target_path: Path):
        """Download the file at URL ``path`` to the local filesystem path
        ``target_path``.
        """
        async with self.http.get(f"{self.jobs_api_connection_string}{path}") as resp:
            if resp.status != 200:
                raise JobsAPIError(
                    f"Encountered {resp.status} while downloading '{path}'",
                )
            async with aiofiles.open(target_path, "wb") as f:
                async for chunk in resp.content.iter_chunked(CHUNK_SIZE):
                    await f.write(chunk)

        return target_path

    async def patch_json(self, path: str, data: dict) -> dict:
        """Make a patch request against the provided API ``path`` and return the
        response as a dictionary of decoded JSON.

        :param path: the API path to make the request against
        :param data: the data to send with the request
        :return: the response as a dictionary of decoded JSON
        """
        async with self.http.patch(
            f"{self.jobs_api_connection_string}{path}", json=data,
        ) as resp:
            await raise_exception_by_status_code(resp)
            return await decode_json_response(resp)

    async def post_file(
        self,
        path: str,
        file_path: Path,
        file_format: VirtoolFileFormat,
        params: dict | None = None,
    ):
        if not params:
            params = {"name": file_path.name}

        if file_format is not None:
            params.update(format=file_format)

        async with self.http.post(
            f"{self.jobs_api_connection_string}{path}",
            data={"file": open(file_path, "rb")},
            params=params,
        ) as response:
            await raise_exception_by_status_code(response)

    async def post_json(self, path: str, data: dict) -> dict:
        async with self.http.post(
            f"{self.jobs_api_connection_string}{path}", json=data,
        ) as resp:
            await raise_exception_by_status_code(resp)
            return await decode_json_response(resp)

    async def put_file(
        self,
        path: str,
        file_path: Path,
        file_format: VirtoolFileFormat,
        params: dict | None = None,
    ):
        if not params:
            params = {"name": file_path.name}

        if file_format is not None:
            params.update(format=file_format)

        async with self.http.put(
            f"{self.jobs_api_connection_string}{path}",
            data={"file": open(file_path, "rb")},
            params=params,
        ) as response:
            await raise_exception_by_status_code(response)

    async def put_json(self, path: str, data: dict) -> dict:
        async with self.http.put(
            f"{self.jobs_api_connection_string}{path}", json=data,
        ) as resp:
            await raise_exception_by_status_code(resp)
            return await decode_json_response(resp)

    async def delete(self, path: str) -> dict | None:
        """Make a delete request against the provided API ``path``."""
        async with self.http.delete(f"{self.jobs_api_connection_string}{path}") as resp:
            await raise_exception_by_status_code(resp)

            try:
                return await decode_json_response(resp)
            except ValueError:
                return None


@asynccontextmanager
async def api_client(
    jobs_api_connection_string: str,
    job_id: str,
    key: str,
):
    """An authenticated :class:`APIClient` to make requests against the jobs API."""
    async with ClientSession(
        auth=BasicAuth(login=f"job-{job_id}", password=key),
    ) as http:
        yield APIClient(http, jobs_api_connection_string)
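A usage sketch: api_client authenticates as job-{job_id} with the acquired key, and every APIClient method joins its path onto the connection string. The endpoint paths and IDs below are hypothetical:

from pathlib import Path

from virtool_workflow.api.client import api_client


async def download_sample_reads(connection_string: str, job_id: str, key: str):
    async with api_client(connection_string, job_id, key) as client:
        sample = await client.get_json("/samples/sample_1")

        await client.get_file(
            "/samples/sample_1/reads/reads_1.fq.gz",
            Path("reads_1.fq.gz"),
        )

    return sample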
virtool_workflow/api/utils.py
@@ -0,0 +1,109 @@

import asyncio
import functools

from aiohttp import (
    ClientConnectorError,
    ClientResponse,
    ContentTypeError,
    ServerDisconnectedError,
)
from structlog import get_logger

from virtool_workflow.errors import (
    JobsAPIBadRequestError,
    JobsAPIConflictError,
    JobsAPIForbiddenError,
    JobsAPINotFoundError,
    JobsAPIServerError,
)

logger = get_logger("api")


def retry(func):
    """Retry an API call up to five times when encountering the following exceptions:

    * ``ConnectionRefusedError``
    * ``ClientConnectorError``
    * ``ServerDisconnectedError``

    These are probably due to transient issues in the cluster network.

    """

    @functools.wraps(func)
    async def wrapped(*args, **kwargs):
        attempts = 0

        # Retry up to five times before re-raising.
        while True:
            try:
                return await func(*args, **kwargs)
            except (
                ConnectionRefusedError,
                ClientConnectorError,
                ServerDisconnectedError,
            ) as err:
                if attempts == 5:
                    raise

                attempts += 1
                get_logger("runtime").info(
                    f"Encountered {type(err).__name__}. Retrying in 5 seconds.",
                )
                await asyncio.sleep(5)

    return wrapped


async def decode_json_response(resp: ClientResponse) -> dict | list | None:
    """Decode a JSON response from a :class:`ClientResponse`.

    Raise a :class:`ValueError` if the response is not JSON.

    :param resp: the response to decode
    :return: the decoded JSON
    """
    try:
        return await resp.json()
    except ContentTypeError:
        raise ValueError(f"Response from {resp.url} was not JSON. {await resp.text()}")


async def raise_exception_by_status_code(resp: ClientResponse):
    """Raise an exception based on the status code of the response.

    :param resp: the response to check
    :raise JobsAPIBadRequestError: the response status code is 400
    :raise JobsAPIForbiddenError: the response status code is 403
    :raise JobsAPINotFoundError: the response status code is 404
    :raise JobsAPIConflictError: the response status code is 409
    :raise JobsAPIServerError: the response status code is 500
    """
    status_exception_map = {
        400: JobsAPIBadRequestError,
        403: JobsAPIForbiddenError,
        404: JobsAPINotFoundError,
        409: JobsAPIConflictError,
        500: JobsAPIServerError,
    }

    try:
        resp_json: dict | None = await resp.json()
    except ContentTypeError:
        resp_json = None

    if resp.status not in range(200, 300):
        if resp_json is None:
            try:
                message = await resp.text()
            except UnicodeDecodeError:
                message = "Could not decode response message"
        else:
            message = resp_json["message"] if "message" in resp_json else str(resp_json)

        if resp.status in status_exception_map:
            raise status_exception_map[resp.status](message)

        raise ValueError(
            f"Status code {resp.status} not handled for response\n {resp}",
        )