virtool-workflow 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- virtool_workflow/__init__.py +13 -0
- virtool_workflow/analysis/__init__.py +1 -0
- virtool_workflow/analysis/fastqc.py +467 -0
- virtool_workflow/analysis/skewer.py +265 -0
- virtool_workflow/analysis/trimming.py +56 -0
- virtool_workflow/analysis/utils.py +27 -0
- virtool_workflow/api/__init__.py +0 -0
- virtool_workflow/api/acquire.py +66 -0
- virtool_workflow/api/client.py +132 -0
- virtool_workflow/api/utils.py +109 -0
- virtool_workflow/cli.py +66 -0
- virtool_workflow/data/__init__.py +22 -0
- virtool_workflow/data/analyses.py +106 -0
- virtool_workflow/data/hmms.py +109 -0
- virtool_workflow/data/indexes.py +319 -0
- virtool_workflow/data/jobs.py +62 -0
- virtool_workflow/data/ml.py +82 -0
- virtool_workflow/data/samples.py +190 -0
- virtool_workflow/data/subtractions.py +244 -0
- virtool_workflow/data/uploads.py +35 -0
- virtool_workflow/decorators.py +47 -0
- virtool_workflow/errors.py +62 -0
- virtool_workflow/files.py +40 -0
- virtool_workflow/hooks.py +140 -0
- virtool_workflow/pytest_plugin/__init__.py +35 -0
- virtool_workflow/pytest_plugin/data.py +197 -0
- virtool_workflow/pytest_plugin/utils.py +9 -0
- virtool_workflow/runtime/__init__.py +0 -0
- virtool_workflow/runtime/config.py +21 -0
- virtool_workflow/runtime/discover.py +95 -0
- virtool_workflow/runtime/events.py +7 -0
- virtool_workflow/runtime/hook.py +129 -0
- virtool_workflow/runtime/path.py +19 -0
- virtool_workflow/runtime/ping.py +54 -0
- virtool_workflow/runtime/redis.py +65 -0
- virtool_workflow/runtime/run.py +276 -0
- virtool_workflow/runtime/run_subprocess.py +168 -0
- virtool_workflow/runtime/sentry.py +28 -0
- virtool_workflow/utils.py +90 -0
- virtool_workflow/workflow.py +90 -0
- virtool_workflow-0.0.0.dist-info/LICENSE +21 -0
- virtool_workflow-0.0.0.dist-info/METADATA +71 -0
- virtool_workflow-0.0.0.dist-info/RECORD +45 -0
- virtool_workflow-0.0.0.dist-info/WHEEL +4 -0
- virtool_workflow-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,13 @@
|
|
1
|
+
"""A framework for defining Virtool workflows."""

from virtool_workflow.decorators import step
from virtool_workflow.runtime.run_subprocess import RunSubprocess
from virtool_workflow.workflow import Workflow, WorkflowStep

# Names re-exported as the package's public API.
__all__ = ["step", "RunSubprocess", "Workflow", "WorkflowStep"]
|
@@ -0,0 +1 @@
|
|
1
|
+
"""Tools for workflows relating to sequence analysis."""
|
@@ -0,0 +1,467 @@
|
|
1
|
+
"""Utilities and fixtures for running FastQC."""
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import asyncio
|
5
|
+
import shutil
|
6
|
+
import statistics
|
7
|
+
import tempfile
|
8
|
+
from dataclasses import dataclass
|
9
|
+
from pathlib import Path
|
10
|
+
from typing import IO, Protocol, TextIO
|
11
|
+
|
12
|
+
from pyfixtures import fixture
|
13
|
+
|
14
|
+
from virtool_workflow import RunSubprocess
|
15
|
+
from virtool_workflow.analysis.utils import ReadPaths
|
16
|
+
|
17
|
+
|
18
|
+
@dataclass
class NucleotidePoint:
    """The G, A, T and C values recorded for a single base position.

    The fields are declared in ``g, a, t, c`` order, so positional construction
    must supply the values in that order.
    """

    # Value for guanine.
    g: float
    # Value for adenine.
    a: float
    # Value for thymine.
    t: float
    # Value for cytosine.
    c: float
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass
class QualityPoint:
    """The quality summary statistics recorded for a single base position.

    The fields are declared in the order ``mean, median, lower_quartile,
    upper_quartile, tenth_percentile, ninetieth_percentile``.
    """

    # Mean quality at the position.
    mean: float
    # Median quality at the position.
    median: float
    # Lower quartile of quality at the position.
    lower_quartile: float
    # Upper quartile of quality at the position.
    upper_quartile: float
    # Tenth percentile of quality at the position.
    tenth_percentile: float
    # Ninetieth percentile of quality at the position.
    ninetieth_percentile: float
|
34
|
+
|
35
|
+
|
36
|
+
class BaseQualityParser:
    """Parse the section of FastQC output containing per-base quality data.

    Parsed values are collected in :attr:`data` as one :class:`QualityPoint`
    per base position, in position order.
    """

    # The text that marks the start of this section in the FastQC data file.
    pattern = ">>Per base sequence quality"

    def __init__(self):
        self.data: list[QualityPoint] = []

    def composite(self, parser: BaseQualityParser):
        """Make a composite dataset given another :class:`.BaseQualityParser`.

        Every field of the resulting points is the mean of the corresponding
        fields from the two parsers. Points are paired with ``zip``, so the
        result is only as long as the shorter of the two datasets.

        :param parser: the parser to combine with this one
        :return: a new parser holding the averaged points
        """
        p = BaseQualityParser()

        p.data = [
            QualityPoint(
                mean=statistics.mean([this.mean, other.mean]),
                median=statistics.mean([this.median, other.median]),
                lower_quartile=statistics.mean(
                    [this.lower_quartile, other.lower_quartile],
                ),
                upper_quartile=statistics.mean(
                    [this.upper_quartile, other.upper_quartile],
                ),
                tenth_percentile=statistics.mean(
                    [this.tenth_percentile, other.tenth_percentile],
                ),
                ninetieth_percentile=statistics.mean(
                    [this.ninetieth_percentile, other.ninetieth_percentile],
                ),
            )
            for this, other in zip(self.data, parser.data)
        ]

        return p

    def handle(self, f: TextIO):
        """Read the section body from ``f`` up to the ``>>END_MODULE`` line.

        Blank lines and lines starting with ``#`` are skipped. The first field
        of every other line is a base position or ``start-end`` range; one
        point is appended to :attr:`data` for each position it covers.

        :param f: a text stream positioned just after the section header
        :raises ValueError: if the stream ends before ``>>END_MODULE``, if a
            field cannot be parsed, or if the positions are not contiguous
        """
        max_index = -1

        while True:
            raw = f.readline()

            # ``readline`` only returns an empty string at end of file. Without
            # this check a truncated file would be read forever.
            if not raw:
                raise ValueError("FastQC data ended before '>>END_MODULE'")

            line = raw.rstrip()

            if line == ">>END_MODULE":
                break

            if not line or line[0] == "#":
                continue

            split = line.split()

            # Convert every field after the first to a float.
            try:
                values = [float(value) for value in split[1:]]
            except ValueError as err:
                # Only conversion errors that mention NaN get the fallback.
                if "NaN" not in str(err):
                    raise

                values = _handle_base_quality_nan(split)

            (
                mean,
                median,
                lower_quartile,
                upper_quartile,
                tenth_percentile,
                ninetieth_percentile,
            ) = values

            for i in _calculate_index_range(split[0]):
                # Check contiguity first so a bad index is never stored.
                if i - max_index != 1:
                    raise ValueError("Non-contiguous index")

                self.data.append(
                    QualityPoint(
                        mean=mean,
                        median=median,
                        lower_quartile=lower_quartile,
                        upper_quartile=upper_quartile,
                        tenth_percentile=tenth_percentile,
                        ninetieth_percentile=ninetieth_percentile,
                    ),
                )

                max_index = i
|
119
|
+
|
120
|
+
|
121
|
+
class BasicStatisticsParser:
    """Parse the section of FastQC output containing basic statistics.

    After :meth:`handle` has run, the parsed values are available as
    :attr:`count`, :attr:`encoding`, :attr:`gc` and :attr:`length` (a
    ``[min, max]`` list).
    """

    # The text that marks the start of this section in the FastQC data file.
    pattern = ">>Basic Statistics"

    def __init__(self):
        self.count = 0
        self.encoding = None
        self.gc = None
        self.length = None

        # Not read or updated anywhere in this class.
        self._populated = False

    def composite(self, parser: BasicStatisticsParser):
        """Make a composite dataset given another :class:`.BasicStatisticsParser`.

        Counts are summed, GC values are averaged, and the length range is
        widened to cover both inputs. The encoding is taken from this parser.

        :param parser: the parser to combine with this one
        :return: a new parser holding the combined statistics
        """
        p = BasicStatisticsParser()

        p.count = self.count + parser.count
        p.encoding = self.encoding
        p.gc = (self.gc + parser.gc) / 2

        # Both lengths are ``[min, max]`` lists; concatenate them to find the
        # overall extremes.
        all_lengths = self.length + parser.length
        p.length = [min(all_lengths), max(all_lengths)]

        return p

    def handle(self, f: IO):
        """Read the section body from ``f`` up to the ``>>END_MODULE`` line.

        :param f: a text stream positioned just after the section header
        :raises ValueError: if the stream ends before ``>>END_MODULE``
        """
        while True:
            raw = f.readline()

            # ``readline`` only returns an empty string at end of file. Without
            # this check a truncated file would be read forever.
            if not raw:
                raise ValueError("FastQC data ended before '>>END_MODULE'")

            line = raw.rstrip()

            if line.startswith("#"):
                continue

            if line == ">>END_MODULE":
                break

            if "Total Sequences" in line:
                self.count = int(line.split("\t")[1])

            elif "Encoding" in line:
                self.encoding = line.split("\t")[1]

            elif "Sequence length" in line:
                # The value is either a single length or a ``min-max`` range.
                length_range = [int(s) for s in line.split("\t")[1].split("-")]
                self.length = [min(length_range), max(length_range)]

            # GC-content
            elif "%GC" in line and "#" not in line:
                self.gc = float(line.split("\t")[1])
|
170
|
+
|
171
|
+
|
172
|
+
class NucleotideCompositionParser:
    """Parse the section of FastQC output containing per-base nucleotide composition.

    Parsed values are collected in :attr:`data` as one
    :class:`NucleotidePoint` per base position, in position order.
    """

    # The text that marks the start of this section in the FastQC data file.
    pattern = ">>Per base sequence content"

    def __init__(self):
        self.data: list[NucleotidePoint] = []

    def composite(self, parser: NucleotideCompositionParser):
        """Make a composite dataset given another :class:`.NucleotideCompositionParser`.

        Each field of the resulting points is the average of the corresponding
        fields from the two parsers. Points are paired with ``zip``, so the
        result is only as long as the shorter of the two datasets.

        :param parser: the parser to combine with this one
        :return: a new parser holding the averaged points
        """
        p = NucleotideCompositionParser()

        p.data = [
            NucleotidePoint(
                g=(this.g + other.g) / 2,
                a=(this.a + other.a) / 2,
                t=(this.t + other.t) / 2,
                c=(this.c + other.c) / 2,
            )
            for this, other in zip(self.data, parser.data)
        ]

        return p

    def handle(self, f: TextIO):
        """Read the section body from ``f`` up to the ``>>END_MODULE`` line.

        Blank lines and lines starting with ``#`` are skipped. The first field
        of every other line is a base position or ``start-end`` range; one
        point is appended to :attr:`data` for each position it covers.

        :param f: a text stream positioned just after the section header
        :raises ValueError: if the stream ends before ``>>END_MODULE``, if a
            field cannot be parsed, or if the positions are not contiguous
        """
        max_index = -1

        while True:
            raw = f.readline()

            # ``readline`` only returns an empty string at end of file. Without
            # this check a truncated file would be read forever.
            if not raw:
                raise ValueError("FastQC data ended before '>>END_MODULE'")

            line = raw.rstrip()

            if line == ">>END_MODULE":
                break

            if not line or line[0] == "#":
                continue

            split = line.split()

            try:
                g, a, t, c = (float(value) for value in split[1:])
            except ValueError as err:
                # Only conversion errors that mention NaN get the fallback.
                if "NaN" not in str(err):
                    raise

                g, a, t, c = _handle_base_quality_nan(split)

            for i in _calculate_index_range(split[0]):
                # Check contiguity first so a bad index is never stored.
                if i - max_index != 1:
                    raise ValueError("Non-contiguous index")

                self.data.append(NucleotidePoint(g, a, t, c))

                max_index = i
|
227
|
+
|
228
|
+
|
229
|
+
class SequenceQualityParser:
    """Parse the section of FastQC output containing per-sequence quality data.

    :attr:`data` is a list of 50 counts indexed by quality score. Scores that
    do not appear in the parsed section keep a count of zero.
    """

    # The text that marks the start of this section in the FastQC data file.
    pattern = ">>Per sequence quality scores"

    def __init__(self):
        self.data = [0] * 50

    def composite(self, parser: SequenceQualityParser):
        """Make a composite dataset given another :class:`.SequenceQualityParser`.

        :param parser: the parser to combine with this one
        :return: a new parser whose counts are the per-score sums of both inputs
        """
        p = SequenceQualityParser()
        p.data = [sum(both) for both in zip(self.data, parser.data)]

        return p

    def handle(self, f: TextIO):
        """Read the section body from ``f`` up to the ``>>END_MODULE`` line.

        Each data line holds a quality score followed by a count. The count is
        parsed as a float and truncated to an integer.

        :param f: a text stream positioned just after the section header
        :raises ValueError: if the stream ends before ``>>END_MODULE``
        :raises IndexError: if a quality score is 50 or greater
        """
        while True:
            raw = f.readline()

            # ``readline`` only returns an empty string at end of file. Without
            # this check a truncated file would be read forever.
            if not raw:
                raise ValueError("FastQC data ended before '>>END_MODULE'")

            line = raw.rstrip()

            if not line or line.startswith("#"):
                continue

            if line == ">>END_MODULE":
                break

            fields = line.split()

            quality = int(fields[0])
            count = int(float(fields[1]))

            self.data[quality] = count
|
259
|
+
|
260
|
+
|
261
|
+
@dataclass
class FastQCSide:
    """The four section parsers populated from one FastQC data file."""

    # Parser for the per-base sequence quality section.
    base_quality: BaseQualityParser
    # Parser for the basic statistics section.
    basic_statistics: BasicStatisticsParser
    # Parser for the per-base sequence content section.
    nucleotide_composition: NucleotideCompositionParser
    # Parser for the per-sequence quality scores section.
    sequence_quality: SequenceQualityParser
|
267
|
+
|
268
|
+
|
269
|
+
def _calculate_index_range(base: str) -> range:
|
270
|
+
pos = [int(x) for x in base.split("-")]
|
271
|
+
|
272
|
+
if len(pos) > 1:
|
273
|
+
return range(pos[0] - 1, pos[1])
|
274
|
+
|
275
|
+
return range(pos[0] - 1, pos[0])
|
276
|
+
|
277
|
+
|
278
|
+
def _handle_base_quality_nan(split_line: list) -> list:
|
279
|
+
"""Parse a per-base quality line from FastQC containing NaN values.
|
280
|
+
|
281
|
+
:param split_line: the quality line split into a :class:`.List`
|
282
|
+
:return: replacement values
|
283
|
+
|
284
|
+
"""
|
285
|
+
values = split_line[1:]
|
286
|
+
|
287
|
+
for value in values:
|
288
|
+
try:
|
289
|
+
return [value for _ in values]
|
290
|
+
except ValueError:
|
291
|
+
pass
|
292
|
+
|
293
|
+
# Return all zeroes if none of the quality values are numbers.
|
294
|
+
if set(values) == {"NaN"}:
|
295
|
+
return [0] * 4
|
296
|
+
|
297
|
+
joined = ",".join(split_line)
|
298
|
+
|
299
|
+
raise ValueError(f"Could not parse base quality values '{joined}'")
|
300
|
+
|
301
|
+
|
302
|
+
def _quality_rows(points) -> list:
    """Turn per-base quality points into rows of six values rounded to 3 places."""
    return [
        [
            round(n, 3)
            for n in (
                point.mean,
                point.median,
                point.lower_quartile,
                point.upper_quartile,
                point.tenth_percentile,
                point.ninetieth_percentile,
            )
        ]
        for point in points
    ]


def _composition_rows(points) -> list:
    """Turn nucleotide composition points into ``[g, a, t, c]`` rows rounded to 1 place."""
    return [
        [round(n, 1) for n in (point.g, point.a, point.t, point.c)]
        for point in points
    ]


def _parse_fastqc(fastqc_path: Path, output_path: Path) -> dict:
    """Parse the FastQC results at `fastqc_path`.

    Each subdirectory of `fastqc_path` that contains a ``fastqc_data.txt`` file
    is treated as one side of the read data. The data file is moved to
    `output_path` and named after its subdirectory before it is parsed. When
    two sides are found, their values are combined into a single result.

    :param fastqc_path: the FastQC output data path
    :param output_path: the FastQC text output files will be moved here
    :return: a dict containing a representation of the parsed FastQC data
    :raises ValueError: if data is not found for exactly one or two sides

    """
    output_path.mkdir(exist_ok=True, parents=True)

    sides = []

    # Get the text data files from the FastQC output. The directories are
    # sorted so that the side treated as "left" does not depend on the
    # arbitrary order returned by the filesystem.
    for path in sorted(fastqc_path.iterdir()):
        if not path.is_dir():
            continue

        file_path = path / "fastqc_data.txt"

        if not file_path.exists():
            continue

        new_path = output_path / f"{path.name}.txt"

        shutil.move(file_path, new_path)

        base_quality = BaseQualityParser()
        basic_statistics = BasicStatisticsParser()
        nucleotide_composition = NucleotideCompositionParser()
        sequence_quality = SequenceQualityParser()

        with open(new_path) as f:
            # Each handler consumes lines from the same stream up to the end
            # of its section, so scanning resumes after that section.
            while line := f.readline():
                if basic_statistics.pattern in line:
                    basic_statistics.handle(f)

                elif base_quality.pattern in line:
                    base_quality.handle(f)

                elif nucleotide_composition.pattern in line:
                    nucleotide_composition.handle(f)

                elif sequence_quality.pattern in line:
                    sequence_quality.handle(f)

        sides.append(
            FastQCSide(
                base_quality=base_quality,
                basic_statistics=basic_statistics,
                nucleotide_composition=nucleotide_composition,
                sequence_quality=sequence_quality,
            ),
        )

    if len(sides) not in (1, 2):
        raise ValueError(
            f"Expected FastQC data for 1 or 2 read files, found {len(sides)}",
        )

    if len(sides) == 1:
        left = sides[0]

        return {
            "bases": _quality_rows(left.base_quality.data),
            "composition": _composition_rows(left.nucleotide_composition.data),
            "count": left.basic_statistics.count,
            "encoding": left.basic_statistics.encoding,
            "gc": left.basic_statistics.gc,
            "length": left.basic_statistics.length,
            "sequences": left.sequence_quality.data,
        }

    left, right = sides

    basic = left.basic_statistics.composite(right.basic_statistics)

    return {
        "bases": _quality_rows(
            left.base_quality.composite(right.base_quality).data,
        ),
        "composition": _composition_rows(
            left.nucleotide_composition.composite(
                right.nucleotide_composition,
            ).data,
        ),
        "count": basic.count,
        "length": basic.length,
        "encoding": left.basic_statistics.encoding,
        "gc": basic.gc,
        "sequences": left.sequence_quality.composite(right.sequence_quality).data,
    }
|
422
|
+
|
423
|
+
|
424
|
+
class FastQCRunner(Protocol):
    """A protocol describing callables that can be used to run FastQC.

    A conforming object is awaited with the read paths and an output path and
    returns a dict.
    """

    async def __call__(self, paths: ReadPaths, output_path: Path) -> dict:
        """Run FastQC on ``paths`` and return the result as a dict."""
        ...
|
429
|
+
|
430
|
+
|
431
|
+
@fixture
async def fastqc(run_subprocess: RunSubprocess):
    """Provide an asynchronous function that runs FastQC as a subprocess.

    The returned function takes one or two paths to FASTQ read files
    (:class:`.ReadPaths`) and an output path. It runs FastQC with its output
    extracted into a temporary directory created by this fixture, then returns
    the parsed result as a dict.

    Example:
    -------
    .. code-block:: python

        @step
        async def step_one(fastqc: FastQCRunner, work_path: Path):
            fastqc_result = await fastqc(
                (work_path / "reads_1.fq", work_path / "reads_2.fq"),
                work_path / "fastqc",
            )

    """
    # ``mkdtemp`` touches the filesystem, so it is run in a worker thread.
    scratch_dir = await asyncio.to_thread(tempfile.mkdtemp)
    temp_path = Path(scratch_dir)

    async def func(paths: ReadPaths, output_path: Path) -> dict:
        read_args = [str(path) for path in paths]

        await run_subprocess(
            ["fastqc", "-f", "fastq", "-o", str(temp_path), "--extract", *read_args],
        )

        return _parse_fastqc(temp_path, output_path)

    return func
|