virtool-workflow 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. virtool_workflow/__init__.py +13 -0
  2. virtool_workflow/analysis/__init__.py +1 -0
  3. virtool_workflow/analysis/fastqc.py +467 -0
  4. virtool_workflow/analysis/skewer.py +265 -0
  5. virtool_workflow/analysis/trimming.py +56 -0
  6. virtool_workflow/analysis/utils.py +27 -0
  7. virtool_workflow/api/__init__.py +0 -0
  8. virtool_workflow/api/acquire.py +66 -0
  9. virtool_workflow/api/client.py +132 -0
  10. virtool_workflow/api/utils.py +109 -0
  11. virtool_workflow/cli.py +66 -0
  12. virtool_workflow/data/__init__.py +22 -0
  13. virtool_workflow/data/analyses.py +106 -0
  14. virtool_workflow/data/hmms.py +109 -0
  15. virtool_workflow/data/indexes.py +319 -0
  16. virtool_workflow/data/jobs.py +62 -0
  17. virtool_workflow/data/ml.py +82 -0
  18. virtool_workflow/data/samples.py +190 -0
  19. virtool_workflow/data/subtractions.py +244 -0
  20. virtool_workflow/data/uploads.py +35 -0
  21. virtool_workflow/decorators.py +47 -0
  22. virtool_workflow/errors.py +62 -0
  23. virtool_workflow/files.py +40 -0
  24. virtool_workflow/hooks.py +140 -0
  25. virtool_workflow/pytest_plugin/__init__.py +35 -0
  26. virtool_workflow/pytest_plugin/data.py +197 -0
  27. virtool_workflow/pytest_plugin/utils.py +9 -0
  28. virtool_workflow/runtime/__init__.py +0 -0
  29. virtool_workflow/runtime/config.py +21 -0
  30. virtool_workflow/runtime/discover.py +95 -0
  31. virtool_workflow/runtime/events.py +7 -0
  32. virtool_workflow/runtime/hook.py +129 -0
  33. virtool_workflow/runtime/path.py +19 -0
  34. virtool_workflow/runtime/ping.py +54 -0
  35. virtool_workflow/runtime/redis.py +65 -0
  36. virtool_workflow/runtime/run.py +276 -0
  37. virtool_workflow/runtime/run_subprocess.py +168 -0
  38. virtool_workflow/runtime/sentry.py +28 -0
  39. virtool_workflow/utils.py +90 -0
  40. virtool_workflow/workflow.py +90 -0
  41. virtool_workflow-0.0.0.dist-info/LICENSE +21 -0
  42. virtool_workflow-0.0.0.dist-info/METADATA +71 -0
  43. virtool_workflow-0.0.0.dist-info/RECORD +45 -0
  44. virtool_workflow-0.0.0.dist-info/WHEEL +4 -0
  45. virtool_workflow-0.0.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,13 @@
1
+ """
2
+ A framework for defining Virtool workflows.
3
+ """
4
+ from virtool_workflow.decorators import step
5
+ from virtool_workflow.runtime.run_subprocess import RunSubprocess
6
+ from virtool_workflow.workflow import Workflow, WorkflowStep
7
+
8
+ __all__ = [
9
+ "step",
10
+ "RunSubprocess",
11
+ "Workflow",
12
+ "WorkflowStep",
13
+ ]
@@ -0,0 +1 @@
1
+ """Tools for workflows relating to sequence analysis."""
@@ -0,0 +1,467 @@
1
+ """Utilities and fixtures for running FastQC."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import shutil
6
+ import statistics
7
+ import tempfile
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import IO, Protocol, TextIO
11
+
12
+ from pyfixtures import fixture
13
+
14
+ from virtool_workflow import RunSubprocess
15
+ from virtool_workflow.analysis.utils import ReadPaths
16
+
17
+
18
@dataclass
class NucleotidePoint:
    """One row of per-base nucleotide composition data.

    Holds the G, A, T and C values for a single base position. The field order
    is significant because instances are also built positionally as
    ``NucleotidePoint(g, a, t, c)``.
    """

    # Value from the G column.
    g: float
    # Value from the A column.
    a: float
    # Value from the T column.
    t: float
    # Value from the C column.
    c: float
24
+
25
+
26
@dataclass
class QualityPoint:
    """The per-base quality statistics recorded for a single base position."""

    # Mean quality at this position.
    mean: float
    # Median quality at this position.
    median: float
    # Lower quartile of quality at this position.
    lower_quartile: float
    # Upper quartile of quality at this position.
    upper_quartile: float
    # 10th percentile of quality at this position.
    tenth_percentile: float
    # 90th percentile of quality at this position.
    ninetieth_percentile: float
34
+
35
+
36
class BaseQualityParser:
    """Parse the section of FastQC output containing per-base quality data."""

    pattern = ">>Per base sequence quality"

    def __init__(self):
        self.data: list[QualityPoint] = []

    def composite(self, parser: BaseQualityParser):
        """Return a new parser holding the position-wise average of two parsers.

        Each statistic of the resulting points is the mean of the matching
        statistic in this parser and in ``parser``. Positions are paired with
        ``zip``, so the result is as long as the shorter of the two datasets.

        :param parser: the parser to combine with this one
        :return: a new parser containing the averaged points
        """
        p = BaseQualityParser()

        p.data = [
            QualityPoint(
                mean=statistics.mean([this.mean, other.mean]),
                median=statistics.mean([this.median, other.median]),
                lower_quartile=statistics.mean(
                    [this.lower_quartile, other.lower_quartile],
                ),
                upper_quartile=statistics.mean(
                    [this.upper_quartile, other.upper_quartile],
                ),
                tenth_percentile=statistics.mean(
                    [this.tenth_percentile, other.tenth_percentile],
                ),
                ninetieth_percentile=statistics.mean(
                    [this.ninetieth_percentile, other.ninetieth_percentile],
                ),
            )
            for this, other in zip(self.data, parser.data)
        ]

        return p

    def handle(self, f: TextIO):
        """Read per-base quality rows from ``f`` until the module ends.

        Reading stops at the ``>>END_MODULE`` line, or at end-of-file if the
        marker is missing. Blank lines and lines starting with ``#`` are
        skipped. A row whose first field is a range such as ``10-14`` yields
        one point per position in that range.

        :param f: a text stream positioned just after the module header line
        :raises ValueError: if a row cannot be parsed or positions are not
            contiguous
        """
        max_index = -1

        while True:
            raw_line = f.readline()

            # ``readline`` returns an empty string only at end-of-file. Without
            # this check a truncated file would be skipped as "blank" forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if line == ">>END_MODULE":
                break

            if not line or line[0] == "#":
                continue

            split = line.split()

            # Convert all fields except the first (the position) to floats.
            try:
                values = [float(value) for value in split[1:]]
            except ValueError as err:
                if "NaN" not in str(err):
                    raise

                values = _handle_base_quality_nan(split)

            (
                mean,
                median,
                lower_quartile,
                upper_quartile,
                tenth_percentile,
                ninetieth_percentile,
            ) = values

            for i in _calculate_index_range(split[0]):
                # Validate before storing so a bad row never leaves a stray
                # point in ``self.data``.
                if i - max_index != 1:
                    raise ValueError("Non-contiguous index")

                self.data.append(
                    QualityPoint(
                        mean=mean,
                        median=median,
                        lower_quartile=lower_quartile,
                        upper_quartile=upper_quartile,
                        tenth_percentile=tenth_percentile,
                        ninetieth_percentile=ninetieth_percentile,
                    ),
                )

                max_index = i
119
+
120
+
121
class BasicStatisticsParser:
    """Parse the section of FastQC output containing basic statistics."""

    pattern = ">>Basic Statistics"

    def __init__(self):
        self.count = 0
        self.encoding = None
        self.gc = None
        self.length = None

        self._populated = False

    def composite(self, parser: BasicStatisticsParser):
        """Return a new parser combining this parser's statistics with another's.

        Counts are summed, GC values are averaged, and the length range spans
        the smallest and largest length seen in either parser. The encoding is
        taken from this parser.

        :param parser: the parser to combine with this one
        :return: a new parser containing the combined statistics
        """
        p = BasicStatisticsParser()

        p.count = self.count + parser.count
        p.encoding = self.encoding
        p.gc = (self.gc + parser.gc) / 2
        # Both lengths are ``[min, max]`` lists; concatenate them to find the
        # overall extremes.
        p.length = [
            min(self.length + parser.length),
            max(self.length + parser.length),
        ]

        return p

    def handle(self, f: IO):
        """Read basic statistics rows from ``f`` until the module ends.

        Reading stops at the ``>>END_MODULE`` line, or at end-of-file if the
        marker is missing. Values are taken from the second tab-separated
        field of the recognized rows.

        :param f: a text stream positioned just after the module header line
        """
        while True:
            raw_line = f.readline()

            # ``readline`` returns an empty string only at end-of-file. Without
            # this check a truncated file would loop forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if line.startswith("#"):
                continue

            if line == ">>END_MODULE":
                break

            if "Total Sequences" in line:
                self.count = int(line.split("\t")[1])

            elif "Encoding" in line:
                self.encoding = line.split("\t")[1]

            elif "Sequence length" in line:
                # The value is either a single length or a ``min-max`` range.
                length_range = [int(s) for s in line.split("\t")[1].split("-")]
                self.length = [min(length_range), max(length_range)]

            # GC-content
            elif "%GC" in line and "#" not in line:
                self.gc = float(line.split("\t")[1])
170
+
171
+
172
class NucleotideCompositionParser:
    """Parse the section of FastQC output containing per-base nucleotide composition."""

    pattern = ">>Per base sequence content"

    def __init__(self):
        self.data: list[NucleotidePoint] = []

    def composite(self, parser: NucleotideCompositionParser):
        """Make a composite dataset given another :class:`.NucleotideCompositionParser`.

        Each value of the resulting points is the average of the matching
        values in this parser and in ``parser``. Positions are paired with
        ``zip``, so the result is as long as the shorter of the two datasets.

        :param parser: the parser to combine with this one
        :return: a new parser containing the averaged points
        """
        p = NucleotideCompositionParser()

        p.data = [
            NucleotidePoint(
                g=(this.g + other.g) / 2,
                a=(this.a + other.a) / 2,
                t=(this.t + other.t) / 2,
                c=(this.c + other.c) / 2,
            )
            for this, other in zip(self.data, parser.data)
        ]

        return p

    def handle(self, f: TextIO):
        """Read nucleotide composition rows from ``f`` until the module ends.

        Reading stops at the ``>>END_MODULE`` line, or at end-of-file if the
        marker is missing. Blank lines and lines starting with ``#`` are
        skipped. A row whose first field is a range such as ``10-14`` yields
        one point per position in that range.

        :param f: a text stream positioned just after the module header line
        :raises ValueError: if a row cannot be parsed or positions are not
            contiguous
        """
        max_index = -1

        while True:
            raw_line = f.readline()

            # ``readline`` returns an empty string only at end-of-file. Without
            # this check a truncated file would be skipped as "blank" forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if line == ">>END_MODULE":
                break

            if not line or line[0] == "#":
                continue

            split = line.split()

            try:
                g, a, t, c = (float(value) for value in split[1:])
            except ValueError as err:
                if "NaN" not in str(err):
                    raise

                g, a, t, c = _handle_base_quality_nan(split)

            for i in _calculate_index_range(split[0]):
                # Validate before storing so a bad row never leaves a stray
                # point in ``self.data``.
                if i - max_index != 1:
                    raise ValueError("Non-contiguous index")

                self.data.append(NucleotidePoint(g, a, t, c))

                max_index = i
227
+
228
+
229
class SequenceQualityParser:
    """Parse the section of FastQC output containing per-sequence quality data."""

    pattern = ">>Per sequence quality scores"

    def __init__(self):
        # One slot per quality score; the score is used directly as the index.
        self.data = [0] * 50

    def composite(self, parser: SequenceQualityParser):
        """Return a new parser whose counts are the sums of two parsers' counts.

        :param parser: the parser to combine with this one
        :return: a new parser containing the summed counts
        """
        p = SequenceQualityParser()
        p.data = [sum(both) for both in zip(self.data, parser.data)]

        return p

    def handle(self, f: TextIO):
        """Read per-sequence quality rows from ``f`` until the module ends.

        Reading stops at the ``>>END_MODULE`` line, or at end-of-file if the
        marker is missing. Blank lines and lines starting with ``#`` are
        skipped. Each row supplies a quality score and the count stored at
        that score's index.

        :param f: a text stream positioned just after the module header line
        """
        while True:
            raw_line = f.readline()

            # ``readline`` returns an empty string only at end-of-file. Without
            # this check a truncated file would be skipped as "blank" forever.
            if not raw_line:
                break

            line = raw_line.rstrip()

            if not line or line.startswith("#"):
                continue

            if line == ">>END_MODULE":
                break

            fields = line.split()

            quality = int(fields[0])
            # Counts may be written in float notation, so parse as float first.
            count = int(float(fields[1]))

            self.data[quality] = count
259
+
260
+
261
@dataclass
class FastQCSide:
    """The group of section parsers populated from one FastQC data file."""

    # Parser for the per-base quality section.
    base_quality: BaseQualityParser
    # Parser for the basic statistics section.
    basic_statistics: BasicStatisticsParser
    # Parser for the per-base nucleotide composition section.
    nucleotide_composition: NucleotideCompositionParser
    # Parser for the per-sequence quality section.
    sequence_quality: SequenceQualityParser
267
+
268
+
269
+ def _calculate_index_range(base: str) -> range:
270
+ pos = [int(x) for x in base.split("-")]
271
+
272
+ if len(pos) > 1:
273
+ return range(pos[0] - 1, pos[1])
274
+
275
+ return range(pos[0] - 1, pos[0])
276
+
277
+
278
+ def _handle_base_quality_nan(split_line: list) -> list:
279
+ """Parse a per-base quality line from FastQC containing NaN values.
280
+
281
+ :param split_line: the quality line split into a :class:`.List`
282
+ :return: replacement values
283
+
284
+ """
285
+ values = split_line[1:]
286
+
287
+ for value in values:
288
+ try:
289
+ return [value for _ in values]
290
+ except ValueError:
291
+ pass
292
+
293
+ # Return all zeroes if none of the quality values are numbers.
294
+ if set(values) == {"NaN"}:
295
+ return [0] * 4
296
+
297
+ joined = ",".join(split_line)
298
+
299
+ raise ValueError(f"Could not parse base quality values '{joined}'")
300
+
301
+
302
def _read_fastqc_side(data_path: Path) -> FastQCSide:
    """Parse one FastQC text data file into a :class:`FastQCSide`."""
    parsers = (
        BasicStatisticsParser(),
        BaseQualityParser(),
        NucleotideCompositionParser(),
        SequenceQualityParser(),
    )

    with open(data_path) as f:
        while line := f.readline():
            # Each parser consumes lines from ``f`` up to the end of its own
            # section, so the outer loop resumes after that section.
            for parser in parsers:
                if parser.pattern in line:
                    parser.handle(f)

    basic_statistics, base_quality, nucleotide_composition, sequence_quality = parsers

    return FastQCSide(
        base_quality=base_quality,
        basic_statistics=basic_statistics,
        nucleotide_composition=nucleotide_composition,
        sequence_quality=sequence_quality,
    )


def _summarize_fastqc_side(side: FastQCSide) -> dict:
    """Convert parsed FastQC data into the plain dict returned to callers."""
    return {
        "bases": [
            [
                round(n, 3)
                for n in [
                    point.mean,
                    point.median,
                    point.lower_quartile,
                    point.upper_quartile,
                    point.tenth_percentile,
                    point.ninetieth_percentile,
                ]
            ]
            for point in side.base_quality.data
        ],
        "composition": [
            [round(n, 1) for n in [point.g, point.a, point.t, point.c]]
            for point in side.nucleotide_composition.data
        ],
        "count": side.basic_statistics.count,
        "encoding": side.basic_statistics.encoding,
        "gc": side.basic_statistics.gc,
        "length": side.basic_statistics.length,
        "sequences": side.sequence_quality.data,
    }


def _parse_fastqc(fastqc_path: Path, output_path: Path) -> dict:
    """Parse the FastQC results at `fastqc_path`.

    Each subdirectory of ``fastqc_path`` that contains a ``fastqc_data.txt``
    file is treated as the result for one read file. The text file is moved to
    ``output_path`` as ``<subdirectory name>.txt`` and then parsed. When two
    results are found they are combined into a single composite result.

    This function does not delete anything else under ``fastqc_path``.

    :param fastqc_path: the FastQC output data path
    :param output_path: the FastQC text output files will be moved here
    :return: a dict containing a representation of the parsed FastQC data
    :raises ValueError: if the number of FastQC results found is not one or two

    """
    output_path.mkdir(exist_ok=True, parents=True)

    sides = []

    # Directory listing order is arbitrary; sort so that the "left" side (which
    # supplies the encoding) is chosen deterministically.
    for path in sorted(fastqc_path.iterdir()):
        if not path.is_dir():
            continue

        file_path = path / "fastqc_data.txt"

        if not file_path.exists():
            continue

        new_path = output_path / f"{path.name}.txt"

        shutil.move(file_path, new_path)

        sides.append(_read_fastqc_side(new_path))

    if len(sides) == 1:
        return _summarize_fastqc_side(sides[0])

    if len(sides) != 2:
        raise ValueError(
            f"Expected FastQC data for one or two read files, found {len(sides)}",
        )

    left, right = sides

    return _summarize_fastqc_side(
        FastQCSide(
            base_quality=left.base_quality.composite(right.base_quality),
            basic_statistics=left.basic_statistics.composite(right.basic_statistics),
            nucleotide_composition=left.nucleotide_composition.composite(
                right.nucleotide_composition,
            ),
            sequence_quality=left.sequence_quality.composite(right.sequence_quality),
        ),
    )
422
+
423
+
424
class FastQCRunner(Protocol):
    """A protocol describing callables that can be used to run FastQC."""

    async def __call__(self, paths: ReadPaths, output_path: Path) -> dict:
        """Run FastQC on ``paths`` and return the parsed result as a dict.

        :param paths: the FASTQ read file paths to analyze
        :param output_path: the path given to the runner for its output
        :return: a dict representation of the FastQC data
        """
429
+
430
+
431
@fixture
async def fastqc(run_subprocess: RunSubprocess):
    """Provides an asynchronous function that can run FastQC as a subprocess.

    The function takes one or two paths to FASTQ read files
    (:class:`.ReadPaths`) in a tuple, plus the path where the FastQC text
    output should be placed. It returns the parsed FastQC data as a dict.

    Each call runs FastQC in its own temporary directory, which is removed
    once the results have been parsed.

    Example:
    -------
    .. code-block:: python

        @step
        async def step_one(fastqc: FastQCRunner, work_path: Path):
            fastqc_result = await fastqc(
                (
                    work_path / "reads_1.fq",
                    work_path / "reads_2.fq",
                ),
                work_path / "fastqc",
            )

    """

    async def func(paths: ReadPaths, output_path: Path) -> dict:
        # A fresh directory per call keeps results from separate calls apart.
        temp_path = Path(await asyncio.to_thread(tempfile.mkdtemp))

        command = [
            "fastqc",
            "-f",
            "fastq",
            "-o",
            str(temp_path),
            "--extract",
            *[str(path) for path in paths],
        ]

        try:
            await run_subprocess(command)

            return _parse_fastqc(temp_path, output_path)
        finally:
            # ``mkdtemp`` never cleans up after itself. The text data files
            # have already been moved to ``output_path`` by ``_parse_fastqc``.
            await asyncio.to_thread(shutil.rmtree, temp_path, ignore_errors=True)

    return func