virtool-workflow 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virtool_workflow-0.0.0/LICENSE +21 -0
- virtool_workflow-0.0.0/PKG-INFO +71 -0
- virtool_workflow-0.0.0/README.md +48 -0
- virtool_workflow-0.0.0/pyproject.toml +87 -0
- virtool_workflow-0.0.0/virtool_workflow/__init__.py +13 -0
- virtool_workflow-0.0.0/virtool_workflow/analysis/__init__.py +1 -0
- virtool_workflow-0.0.0/virtool_workflow/analysis/fastqc.py +467 -0
- virtool_workflow-0.0.0/virtool_workflow/analysis/skewer.py +265 -0
- virtool_workflow-0.0.0/virtool_workflow/analysis/trimming.py +56 -0
- virtool_workflow-0.0.0/virtool_workflow/analysis/utils.py +27 -0
- virtool_workflow-0.0.0/virtool_workflow/api/__init__.py +0 -0
- virtool_workflow-0.0.0/virtool_workflow/api/acquire.py +66 -0
- virtool_workflow-0.0.0/virtool_workflow/api/client.py +132 -0
- virtool_workflow-0.0.0/virtool_workflow/api/utils.py +109 -0
- virtool_workflow-0.0.0/virtool_workflow/cli.py +66 -0
- virtool_workflow-0.0.0/virtool_workflow/data/__init__.py +22 -0
- virtool_workflow-0.0.0/virtool_workflow/data/analyses.py +106 -0
- virtool_workflow-0.0.0/virtool_workflow/data/hmms.py +109 -0
- virtool_workflow-0.0.0/virtool_workflow/data/indexes.py +319 -0
- virtool_workflow-0.0.0/virtool_workflow/data/jobs.py +62 -0
- virtool_workflow-0.0.0/virtool_workflow/data/ml.py +82 -0
- virtool_workflow-0.0.0/virtool_workflow/data/samples.py +190 -0
- virtool_workflow-0.0.0/virtool_workflow/data/subtractions.py +244 -0
- virtool_workflow-0.0.0/virtool_workflow/data/uploads.py +35 -0
- virtool_workflow-0.0.0/virtool_workflow/decorators.py +47 -0
- virtool_workflow-0.0.0/virtool_workflow/errors.py +62 -0
- virtool_workflow-0.0.0/virtool_workflow/files.py +40 -0
- virtool_workflow-0.0.0/virtool_workflow/hooks.py +140 -0
- virtool_workflow-0.0.0/virtool_workflow/pytest_plugin/__init__.py +35 -0
- virtool_workflow-0.0.0/virtool_workflow/pytest_plugin/data.py +197 -0
- virtool_workflow-0.0.0/virtool_workflow/pytest_plugin/utils.py +9 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/__init__.py +0 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/config.py +21 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/discover.py +95 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/events.py +7 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/hook.py +129 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/path.py +19 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/ping.py +54 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/redis.py +65 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/run.py +276 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/run_subprocess.py +168 -0
- virtool_workflow-0.0.0/virtool_workflow/runtime/sentry.py +28 -0
- virtool_workflow-0.0.0/virtool_workflow/utils.py +90 -0
- virtool_workflow-0.0.0/virtool_workflow/workflow.py +90 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 Canadian Food Inspection Agency
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
@@ -0,0 +1,71 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: virtool-workflow
|
3
|
+
Version: 0.0.0
|
4
|
+
Summary: A framework for developing bioinformatics workflows for Virtool.
|
5
|
+
License: MIT
|
6
|
+
Author: Ian Boyes
|
7
|
+
Maintainer: Ian Boyes
|
8
|
+
Requires-Python: >=3.12.3,<3.13.0
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
11
|
+
Requires-Dist: aiofiles (>=0.7.0,<0.8.0)
|
12
|
+
Requires-Dist: aiohttp (>=3.8.1,<4.0.0)
|
13
|
+
Requires-Dist: biopython (>=1.81,<2.0)
|
14
|
+
Requires-Dist: click (>=8.1.7,<9.0.0)
|
15
|
+
Requires-Dist: orjson (>=3.9.9,<4.0.0)
|
16
|
+
Requires-Dist: pydantic-factories (>=1.17.3,<2.0.0)
|
17
|
+
Requires-Dist: pyfixtures (>=1.0.0,<2.0.0)
|
18
|
+
Requires-Dist: sentry-sdk (>=2.3.1,<3.0.0)
|
19
|
+
Requires-Dist: structlog-sentry (>=2.2.1,<3.0.0)
|
20
|
+
Requires-Dist: virtool (>=31.1.3,<32.0.0)
|
21
|
+
Description-Content-Type: text/markdown
|
22
|
+
|
23
|
+
# Virtool Workflow
|
24
|
+
|
25
|
+

|
26
|
+
[![PyPI version](https://badge.fury.io/py/virtool-workflow.svg)](https://badge.fury.io/py/virtool-workflow)
|
27
|
+
|
28
|
+
A framework for developing bioinformatic workflows in Python.
|
29
|
+
|
30
|
+
```python
|
31
|
+
from virtool_workflow import step
|
32
|
+
|
33
|
+
|
34
|
+
@step
|
35
|
+
def step_function():
|
36
|
+
...
|
37
|
+
|
38
|
+
|
39
|
+
@step
|
40
|
+
def step_function_2():
|
41
|
+
...
|
42
|
+
```
|
43
|
+
|
44
|
+
## Contributing
|
45
|
+
|
46
|
+
### Commits
|
47
|
+
|
48
|
+
We require specific commit formatting. Any commit that does not follow the guidelines
|
49
|
+
will be squashed at our discretion.
|
50
|
+
|
51
|
+
Read our [commit and release](https://dev.virtool.ca/en/latest/commits_releases.html)
|
52
|
+
documentation for more information.
|
53
|
+
|
54
|
+
### Tests
|
55
|
+
|
56
|
+
Run tests with:
|
57
|
+
|
58
|
+
```shell
|
59
|
+
# Bring up Redis and the test container.
|
60
|
+
docker compose up -d
|
61
|
+
|
62
|
+
# Run tests in the test container.
|
63
|
+
docker compose exec test poetry run pytest
|
64
|
+
```
|
65
|
+
|
66
|
+
Run specific tests like:
|
67
|
+
|
68
|
+
```shell
|
69
|
+
docker compose exec test poetry run pytest tests/test_status.py
|
70
|
+
```
|
71
|
+
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Virtool Workflow
|
2
|
+
|
3
|
+

|
4
|
+
[![PyPI version](https://badge.fury.io/py/virtool-workflow.svg)](https://badge.fury.io/py/virtool-workflow)
|
5
|
+
|
6
|
+
A framework for developing bioinformatic workflows in Python.
|
7
|
+
|
8
|
+
```python
|
9
|
+
from virtool_workflow import step
|
10
|
+
|
11
|
+
|
12
|
+
@step
|
13
|
+
def step_function():
|
14
|
+
...
|
15
|
+
|
16
|
+
|
17
|
+
@step
|
18
|
+
def step_function_2():
|
19
|
+
...
|
20
|
+
```
|
21
|
+
|
22
|
+
## Contributing
|
23
|
+
|
24
|
+
### Commits
|
25
|
+
|
26
|
+
We require specific commit formatting. Any commit that does not follow the guidelines
|
27
|
+
will be squashed at our discretion.
|
28
|
+
|
29
|
+
Read our [commit and release](https://dev.virtool.ca/en/latest/commits_releases.html)
|
30
|
+
documentation for more information.
|
31
|
+
|
32
|
+
### Tests
|
33
|
+
|
34
|
+
Run tests with:
|
35
|
+
|
36
|
+
```shell
|
37
|
+
# Bring up Redis and the test container.
|
38
|
+
docker compose up -d
|
39
|
+
|
40
|
+
# Run tests in the test container.
|
41
|
+
docker compose exec test poetry run pytest
|
42
|
+
```
|
43
|
+
|
44
|
+
Run specific tests like:
|
45
|
+
|
46
|
+
```shell
|
47
|
+
docker compose exec test poetry run pytest tests/test_status.py
|
48
|
+
```
|
@@ -0,0 +1,87 @@
|
|
1
|
+
[project]
|
2
|
+
name = "virtool-workflow"
|
3
|
+
description = "A framework for developing bioinformatics workflows for Virtool."
|
4
|
+
version = "0.0.0"
|
5
|
+
authors = [
|
6
|
+
{name = "Ian Boyes"},
|
7
|
+
{name = "Reece Hoffman"},
|
8
|
+
{name = "Blake Smith"},
|
9
|
+
{name = "Ryan Fang"},
|
10
|
+
{name = "Matt Curtis"},
|
11
|
+
{name = "Aman Monga"},
|
12
|
+
{name = "Bryce Davidson"},
|
13
|
+
{name = "Christine Wong Chong"},
|
14
|
+
{name = "Lilly Roberts"},
|
15
|
+
{name = "Markus Swoveland"},
|
16
|
+
{name = "Ryan Fang"},
|
17
|
+
]
|
18
|
+
dynamic = ["dependencies"]
|
19
|
+
license = "MIT"
|
20
|
+
maintainers = [
|
21
|
+
{name = "Ian Boyes"},
|
22
|
+
{name = "Reece Hoffman"},
|
23
|
+
]
|
24
|
+
readme = "README.md"
|
25
|
+
repository = "https://github.com/virtool/virtool-workflow"
|
26
|
+
requires-python = ">=3.12.3,<3.13.0"
|
27
|
+
|
28
|
+
[project.scripts]
|
29
|
+
run-workflow = "virtool_workflow.cli:cli_main"
|
30
|
+
|
31
|
+
[tool.poetry]
|
32
|
+
packages = [
|
33
|
+
{ include = "virtool_workflow" },
|
34
|
+
]
|
35
|
+
|
36
|
+
[tool.poetry.dependencies]
|
37
|
+
aiofiles = "^0.7.0"
|
38
|
+
aiohttp = "^3.8.1"
|
39
|
+
biopython = "^1.81"
|
40
|
+
click = "^8.1.7"
|
41
|
+
orjson = "^3.9.9"
|
42
|
+
pydantic-factories = "^1.17.3"
|
43
|
+
pyfixtures = "^1.0.0"
|
44
|
+
sentry-sdk = "^2.3.1"
|
45
|
+
structlog-sentry = "^2.2.1"
|
46
|
+
virtool = "^31.1.3"
|
47
|
+
|
48
|
+
[tool.poetry.group.dev.dependencies]
|
49
|
+
piccolo-theme = "^0.24.0"
|
50
|
+
pytest = "^7.4.2"
|
51
|
+
pytest-asyncio = "^0.21.0"
|
52
|
+
pytest-mock = "^3.10.0"
|
53
|
+
pytest-structlog = "^1.0"
|
54
|
+
ruff = "^0.4.6"
|
55
|
+
sphinx = "^4.0.2"
|
56
|
+
sphinx-autobuild = "^2021.3.14"
|
57
|
+
sphinx-nameko-theme = "^0.0.3"
|
58
|
+
sphinx-toolbox = "^3.5.0"
|
59
|
+
syrupy = "^3.0.5"
|
60
|
+
pytest-aiohttp = "^1.1.0"
|
61
|
+
|
62
|
+
[tool.pytest.ini_options]
|
63
|
+
asyncio_mode = "auto"
|
64
|
+
|
65
|
+
[tool.ruff]
|
66
|
+
exclude = [
|
67
|
+
"docs",
|
68
|
+
".eggs",
|
69
|
+
".git",
|
70
|
+
".github",
|
71
|
+
".mypy_cache",
|
72
|
+
".pytest-cache",
|
73
|
+
".ruff_cache",
|
74
|
+
"__pypackages__",
|
75
|
+
]
|
76
|
+
|
77
|
+
[tool.ruff.lint]
|
78
|
+
ignore = [
|
79
|
+
"ANN101",
|
80
|
+
"D203",
|
81
|
+
"D213"
|
82
|
+
]
|
83
|
+
select = ["ALL"]
|
84
|
+
|
85
|
+
[build-system]
|
86
|
+
requires = ["poetry-core>=1.0.0"]
|
87
|
+
build-backend = "poetry.core.masonry.api"
|
@@ -0,0 +1,13 @@
|
|
1
|
+
"""
|
2
|
+
A framework for defining Virtool workflows.
|
3
|
+
"""
|
4
|
+
from virtool_workflow.decorators import step
|
5
|
+
from virtool_workflow.runtime.run_subprocess import RunSubprocess
|
6
|
+
from virtool_workflow.workflow import Workflow, WorkflowStep
|
7
|
+
|
8
|
+
__all__ = [
|
9
|
+
"step",
|
10
|
+
"RunSubprocess",
|
11
|
+
"Workflow",
|
12
|
+
"WorkflowStep",
|
13
|
+
]
|
@@ -0,0 +1 @@
|
|
1
|
+
"""Tools for workflows relating to sequence analysis."""
|
@@ -0,0 +1,467 @@
|
|
1
|
+
"""Utilities and fixtures for running FastQC."""
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import asyncio
|
5
|
+
import shutil
|
6
|
+
import statistics
|
7
|
+
import tempfile
|
8
|
+
from dataclasses import dataclass
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import IO, Protocol, TextIO
|
11
|
+
|
12
|
+
from pyfixtures import fixture
|
13
|
+
|
14
|
+
from virtool_workflow import RunSubprocess
|
15
|
+
from virtool_workflow.analysis.utils import ReadPaths
|
16
|
+
|
17
|
+
|
18
|
+
@dataclass
class NucleotidePoint:
    """The nucleotide composition values parsed for a single base position.

    Instances are created positionally as ``NucleotidePoint(g, a, t, c)`` by
    :class:`NucleotideCompositionParser`, so the field order is part of the
    interface.
    """

    # Value parsed from the first data column of a composition row (G).
    g: float
    # Value parsed from the second data column of a composition row (A).
    a: float
    # Value parsed from the third data column of a composition row (T).
    t: float
    # Value parsed from the fourth data column of a composition row (C).
    c: float
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass
class QualityPoint:
    """The quality summary values parsed for a single base position.

    The fields are populated, in this order, from the data columns of a row in
    the per-base quality section handled by :class:`BaseQualityParser`.
    """

    # Mean quality at the position.
    mean: float
    # Median quality at the position.
    median: float
    # Lower quartile of quality at the position.
    lower_quartile: float
    # Upper quartile of quality at the position.
    upper_quartile: float
    # Tenth percentile of quality at the position.
    tenth_percentile: float
    # Ninetieth percentile of quality at the position.
    ninetieth_percentile: float
|
34
|
+
|
35
|
+
|
36
|
+
class BaseQualityParser:
    """Parse the section of FastQC output containing per-base quality data."""

    pattern = ">>Per base sequence quality"

    def __init__(self):
        # One ``QualityPoint`` per base position, in position order.
        self.data: list[QualityPoint] = []

    def composite(self, parser: BaseQualityParser):
        """Make a composite dataset given another :class:`.BaseQualityParser`.

        Each field of the returned parser's points is the mean of the
        corresponding fields in ``self`` and ``parser``. Points are paired with
        ``zip``, so the result is as long as the shorter of the two datasets.

        :param parser: the parser to combine with this one
        :return: a new parser containing the averaged data
        """
        p = BaseQualityParser()

        p.data = [
            QualityPoint(
                mean=statistics.mean([this.mean, other.mean]),
                median=statistics.mean([this.median, other.median]),
                lower_quartile=statistics.mean(
                    [this.lower_quartile, other.lower_quartile],
                ),
                upper_quartile=statistics.mean(
                    [this.upper_quartile, other.upper_quartile],
                ),
                tenth_percentile=statistics.mean(
                    [this.tenth_percentile, other.tenth_percentile],
                ),
                ninetieth_percentile=statistics.mean(
                    [this.ninetieth_percentile, other.ninetieth_percentile],
                ),
            )
            for this, other in zip(self.data, parser.data)
        ]

        return p

    def handle(self, f: TextIO):
        """Read the per-base quality rows from ``f`` into ``self.data``.

        Reading starts at the current position of ``f`` and stops after the
        ``>>END_MODULE`` line or at end-of-file, whichever comes first. Rows
        that cover a range of positions (eg. ``10-14``) produce one point per
        position.

        :param f: the open FastQC text data file
        :raises ValueError: if a value cannot be parsed or the base positions
            are not contiguous
        """
        max_index = -1

        while True:
            raw_line = f.readline()

            # ``readline`` only returns an empty string at end-of-file. Without
            # this check a file missing ``>>END_MODULE`` would loop forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if line == ">>END_MODULE":
                break

            # Skip blank lines and the commented column header.
            if not line or line[0] == "#":
                continue

            split = line.split()

            # Convert all fields except the first (the base label) to floats.
            # NOTE(review): ``float("NaN")`` parses successfully, so this
            # fallback only runs for tokens ``float`` rejects with an error
            # message that mentions "NaN" - confirm that is what is intended.
            try:
                values = [float(value) for value in split[1:]]
            except ValueError as err:
                if "NaN" not in str(err):
                    raise

                values = _handle_base_quality_nan(split)

            (
                mean,
                median,
                lower_quartile,
                upper_quartile,
                tenth_percentile,
                ninetieth_percentile,
            ) = values

            for i in _calculate_index_range(split[0]):
                # Every position must directly follow the previous one. Check
                # before appending so a failure does not leave a bad point.
                if i - max_index != 1:
                    raise ValueError("Non-contiguous index")

                self.data.append(
                    QualityPoint(
                        mean=mean,
                        median=median,
                        lower_quartile=lower_quartile,
                        upper_quartile=upper_quartile,
                        tenth_percentile=tenth_percentile,
                        ninetieth_percentile=ninetieth_percentile,
                    ),
                )

                max_index = i
|
119
|
+
|
120
|
+
|
121
|
+
class BasicStatisticsParser:
    """Parse the section of FastQC output containing basic statistics."""

    pattern = ">>Basic Statistics"

    def __init__(self):
        # Value of the "Total Sequences" row.
        self.count = 0
        # Value of the "Encoding" row.
        self.encoding = None
        # Value of the "%GC" row as a float.
        self.gc = None
        # ``[shortest, longest]`` taken from the "Sequence length" row.
        self.length = None

        self._populated = False

    def composite(self, parser: BasicStatisticsParser):
        """Make a composite dataset given another :class:`.BasicStatisticsParser`.

        Counts are summed, GC values are averaged, the length range is widened
        to cover both datasets, and the encoding is taken from ``self``.

        :param parser: the parser to combine with this one
        :return: a new parser containing the combined statistics
        """
        p = BasicStatisticsParser()

        p.count = self.count + parser.count
        p.encoding = self.encoding
        p.gc = (self.gc + parser.gc) / 2

        # Both ``length`` values are two-item lists, so concatenating them
        # gives every bound to take the overall minimum and maximum from.
        lengths = self.length + parser.length
        p.length = [min(lengths), max(lengths)]

        return p

    def handle(self, f: IO):
        """Read the basic statistics rows from ``f`` into this parser.

        Reading starts at the current position of ``f`` and stops after the
        ``>>END_MODULE`` line or at end-of-file, whichever comes first.

        :param f: the open FastQC text data file
        """
        while True:
            raw_line = f.readline()

            # ``readline`` only returns an empty string at end-of-file. Without
            # this check a file missing ``>>END_MODULE`` would loop forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if line.startswith("#"):
                continue

            if line == ">>END_MODULE":
                break

            if "Total Sequences" in line:
                self.count = int(line.split("\t")[1])

            elif "Encoding" in line:
                self.encoding = line.split("\t")[1]

            elif "Sequence length" in line:
                # The value is either a single length or a ``min-max`` range.
                length_range = [int(s) for s in line.split("\t")[1].split("-")]
                self.length = [min(length_range), max(length_range)]

            # GC-content
            elif "%GC" in line and "#" not in line:
                self.gc = float(line.split("\t")[1])
|
170
|
+
|
171
|
+
|
172
|
+
class NucleotideCompositionParser:
    """Parse the section of FastQC output containing per-base nucleotide composition."""

    pattern = ">>Per base sequence content"

    def __init__(self):
        # One ``NucleotidePoint`` per base position, in position order.
        self.data: list[NucleotidePoint] = []

    def composite(self, parser: NucleotideCompositionParser):
        """Make a composite dataset given another :class:`.NucleotideCompositionParser`.

        Each value in the returned parser is the mean of the corresponding
        values in ``self`` and ``parser``. Points are paired with ``zip``, so
        the result is as long as the shorter of the two datasets.

        :param parser: the parser to combine with this one
        :return: a new parser containing the averaged data
        """
        p = NucleotideCompositionParser()

        p.data = [
            NucleotidePoint(
                g=(this.g + other.g) / 2,
                a=(this.a + other.a) / 2,
                t=(this.t + other.t) / 2,
                c=(this.c + other.c) / 2,
            )
            for this, other in zip(self.data, parser.data)
        ]

        return p

    def handle(self, f: TextIO):
        """Read the nucleotide composition rows from ``f`` into ``self.data``.

        Reading starts at the current position of ``f`` and stops after the
        ``>>END_MODULE`` line or at end-of-file, whichever comes first. Rows
        that cover a range of positions (eg. ``10-14``) produce one point per
        position.

        :param f: the open FastQC text data file
        :raises ValueError: if a value cannot be parsed or the base positions
            are not contiguous
        """
        max_index = -1

        while True:
            raw_line = f.readline()

            # ``readline`` only returns an empty string at end-of-file. Without
            # this check a file missing ``>>END_MODULE`` would loop forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if line == ">>END_MODULE":
                break

            # Skip blank lines and the commented column header.
            if not line or line[0] == "#":
                continue

            split = line.split()

            # The first field is the base label; the rest are G, A, T and C.
            try:
                g, a, t, c = (float(value) for value in split[1:])
            except ValueError as err:
                if "NaN" not in str(err):
                    raise

                g, a, t, c = _handle_base_quality_nan(split)

            for i in _calculate_index_range(split[0]):
                # Every position must directly follow the previous one. Check
                # before appending so a failure does not leave a bad point.
                if i - max_index != 1:
                    raise ValueError("Non-contiguous index")

                self.data.append(NucleotidePoint(g, a, t, c))

                max_index = i
|
227
|
+
|
228
|
+
|
229
|
+
class SequenceQualityParser:
    """Parse the section of FastQC output containing per-sequence quality data."""

    pattern = ">>Per sequence quality scores"

    def __init__(self):
        # ``data[quality]`` is the number of sequences with that quality
        # score. Scores that never appear in the input stay at zero.
        self.data = [0] * 50

    def composite(self, parser: SequenceQualityParser):
        """Make a composite dataset given another :class:`.SequenceQualityParser`.

        :param parser: the parser to combine with this one
        :return: a new parser whose counts are the element-wise sums
        """
        p = SequenceQualityParser()
        p.data = [sum(both) for both in zip(self.data, parser.data)]

        return p

    def handle(self, f: TextIO):
        """Read the per-sequence quality rows from ``f`` into ``self.data``.

        Reading starts at the current position of ``f`` and stops after the
        ``>>END_MODULE`` line or at end-of-file, whichever comes first.

        :param f: the open FastQC text data file
        :raises IndexError: if a quality score does not fit in ``self.data``
        """
        while True:
            raw_line = f.readline()

            # ``readline`` only returns an empty string at end-of-file. Without
            # this check a file missing ``>>END_MODULE`` would loop forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            # Skip blank lines and the commented column header.
            if not line or line.startswith("#"):
                continue

            if line == ">>END_MODULE":
                break

            fields = line.split()

            quality = int(fields[0])

            # Counts are parsed through ``float`` first so values written with
            # a decimal point (eg. ``250.0``) are accepted.
            count = int(float(fields[1]))

            self.data[quality] = count
|
259
|
+
|
260
|
+
|
261
|
+
@dataclass
class FastQCSide:
    """The parsers populated from a single FastQC text data file."""

    # Parser holding the per-base quality section.
    base_quality: BaseQualityParser
    # Parser holding the basic statistics section.
    basic_statistics: BasicStatisticsParser
    # Parser holding the per-base nucleotide composition section.
    nucleotide_composition: NucleotideCompositionParser
    # Parser holding the per-sequence quality section.
    sequence_quality: SequenceQualityParser
|
267
|
+
|
268
|
+
|
269
|
+
def _calculate_index_range(base: str) -> range:
|
270
|
+
pos = [int(x) for x in base.split("-")]
|
271
|
+
|
272
|
+
if len(pos) > 1:
|
273
|
+
return range(pos[0] - 1, pos[1])
|
274
|
+
|
275
|
+
return range(pos[0] - 1, pos[0])
|
276
|
+
|
277
|
+
|
278
|
+
def _handle_base_quality_nan(split_line: list) -> list:
|
279
|
+
"""Parse a per-base quality line from FastQC containing NaN values.
|
280
|
+
|
281
|
+
:param split_line: the quality line split into a :class:`.List`
|
282
|
+
:return: replacement values
|
283
|
+
|
284
|
+
"""
|
285
|
+
values = split_line[1:]
|
286
|
+
|
287
|
+
for value in values:
|
288
|
+
try:
|
289
|
+
return [value for _ in values]
|
290
|
+
except ValueError:
|
291
|
+
pass
|
292
|
+
|
293
|
+
# Return all zeroes if none of the quality values are numbers.
|
294
|
+
if set(values) == {"NaN"}:
|
295
|
+
return [0] * 4
|
296
|
+
|
297
|
+
joined = ",".join(split_line)
|
298
|
+
|
299
|
+
raise ValueError(f"Could not parse base quality values '{joined}'")
|
300
|
+
|
301
|
+
|
302
|
+
def _fastqc_side_to_dict(side: FastQCSide) -> dict:
    """Build the result dictionary for the parsed FastQC data in ``side``."""
    return {
        "bases": [
            [
                round(n, 3)
                for n in [
                    point.mean,
                    point.median,
                    point.lower_quartile,
                    point.upper_quartile,
                    point.tenth_percentile,
                    point.ninetieth_percentile,
                ]
            ]
            for point in side.base_quality.data
        ],
        "composition": [
            [round(n, 1) for n in [point.g, point.a, point.t, point.c]]
            for point in side.nucleotide_composition.data
        ],
        "count": side.basic_statistics.count,
        "encoding": side.basic_statistics.encoding,
        "gc": side.basic_statistics.gc,
        "length": side.basic_statistics.length,
        "sequences": side.sequence_quality.data,
    }


def _parse_fastqc(fastqc_path: Path, output_path: Path) -> dict:
    """Parse the FastQC results at `fastqc_path`.

    Each subdirectory of `fastqc_path` is checked for a ``fastqc_data.txt``
    file. Every file found is moved to `output_path` as
    ``<subdirectory name>.txt`` and then parsed. When two files are found,
    their data are combined into a single result.

    :param fastqc_path: the FastQC output data path
    :param output_path: the FastQC text output files will be moved here
    :return: a dict containing a representation of the parsed FastQC data
    :raises ValueError: if the number of FastQC data files found is not one
        or two

    """
    output_path.mkdir(exist_ok=True, parents=True)

    sides = []

    # Sort the entries so the order of the sides does not depend on the order
    # the filesystem happens to list them in.
    for path in sorted(fastqc_path.iterdir()):
        if not path.is_dir():
            continue

        file_path = path / "fastqc_data.txt"

        if not file_path.is_file():
            continue

        new_path = output_path / f"{path.name}.txt"

        shutil.move(file_path, new_path)

        side = FastQCSide(
            base_quality=BaseQualityParser(),
            basic_statistics=BasicStatisticsParser(),
            nucleotide_composition=NucleotideCompositionParser(),
            sequence_quality=SequenceQualityParser(),
        )

        with open(new_path) as f:
            # Each handler consumes the lines of its own section from ``f``, so
            # this loop only ever sees the lines between sections.
            while line := f.readline():
                if side.basic_statistics.pattern in line:
                    side.basic_statistics.handle(f)

                if side.base_quality.pattern in line:
                    side.base_quality.handle(f)

                if side.nucleotide_composition.pattern in line:
                    side.nucleotide_composition.handle(f)

                if side.sequence_quality.pattern in line:
                    side.sequence_quality.handle(f)

        sides.append(side)

    if len(sides) == 1:
        return _fastqc_side_to_dict(sides[0])

    if len(sides) != 2:
        raise ValueError(
            f"Expected FastQC data for one or two read files, found {len(sides)}",
        )

    left, right = sides

    # The composite basic statistics keep the encoding of the left side.
    return _fastqc_side_to_dict(
        FastQCSide(
            base_quality=left.base_quality.composite(right.base_quality),
            basic_statistics=left.basic_statistics.composite(right.basic_statistics),
            nucleotide_composition=left.nucleotide_composition.composite(
                right.nucleotide_composition,
            ),
            sequence_quality=left.sequence_quality.composite(right.sequence_quality),
        ),
    )
|
422
|
+
|
423
|
+
|
424
|
+
class FastQCRunner(Protocol):
    """A protocol describing callables that can be used to run FastQC."""

    async def __call__(self, paths: ReadPaths, output_path: Path) -> dict:
        """Run FastQC on the read files at ``paths``.

        The function provided by the ``fastqc`` fixture has this signature. It
        passes ``paths`` to the ``fastqc`` command and returns the parsed
        results, using ``output_path`` as the destination for the FastQC text
        data files.

        :param paths: the paths to the read files
        :param output_path: the directory the FastQC text data is moved to
        :return: a dict containing a representation of the parsed FastQC data
        """
        ...
|
429
|
+
|
430
|
+
|
431
|
+
@fixture
async def fastqc(run_subprocess: RunSubprocess):
    """Provide an asynchronous function that runs FastQC as a subprocess.

    The function takes one or two paths to FASTQ read files
    (:class:`.ReadPaths`) in a tuple, and the path of the directory the FastQC
    text data files should be moved to. It returns the parsed FastQC data as a
    dict.

    Example:
    -------
    .. code-block:: python

        @step
        async def step_one(fastqc: FastQCRunner, work_path: Path):
            fastqc_result = await fastqc(
                (work_path / "reads_1.fq", work_path / "reads_2.fq"),
                work_path / "fastqc",
            )

    """
    # A single temporary directory receives the raw FastQC output for every
    # call made through this fixture instance.
    temp_path = Path(await asyncio.to_thread(tempfile.mkdtemp))

    async def func(paths: ReadPaths, output_path: Path) -> dict:
        read_args = [str(path) for path in paths]

        await run_subprocess(
            ["fastqc", "-f", "fastq", "-o", str(temp_path), "--extract", *read_args],
        )

        return _parse_fastqc(temp_path, output_path)

    return func
|