varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +6 -3
- varvamp/command.py +131 -57
- varvamp/scripts/alignment.py +54 -164
- varvamp/scripts/default_config.py +5 -3
- varvamp/scripts/logging.py +66 -20
- varvamp/scripts/param_estimation.py +84 -62
- varvamp/scripts/primers.py +190 -46
- varvamp/scripts/qpcr.py +141 -117
- varvamp/scripts/reporting.py +41 -34
- varvamp/scripts/scheme.py +101 -52
- varvamp-1.3.dist-info/METADATA +760 -0
- varvamp-1.3.dist-info/RECORD +22 -0
- {varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/WHEEL +1 -1
- varvamp-1.3.dist-info/licenses/LICENSE +674 -0
- varvamp-1.2.2.dist-info/METADATA +0 -87
- varvamp-1.2.2.dist-info/RECORD +0 -21
- {varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/entry_points.txt +0 -0
- {varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/top_level.txt +0 -0
varvamp/__init__.py
CHANGED
@@ -1,3 +1,6 @@
-
-
-
+from importlib.metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("varvamp")
+except PackageNotFoundError:
+    __version__ = "unknown"
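The rewritten `__init__.py` single-sources the package version from the installed distribution metadata instead of hard-coding it. A quick illustration of the resulting behavior (the uninstalled-checkout case is the assumed reason for the fallback):

```python
# After installing the 1.3 wheel:
import varvamp

print(varvamp.__version__)  # "1.3"
# Running from a source tree that was never pip-installed raises
# PackageNotFoundError internally, so __version__ becomes "unknown".
```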
varvamp/command.py
CHANGED
@@ -7,7 +7,6 @@ import sys
 import os
 import datetime
 import argparse
-import multiprocessing
 
 # varVAMP
 from varvamp.scripts import alignment
@@ -22,7 +21,6 @@ from varvamp.scripts import reporting
 from varvamp.scripts import scheme
 from varvamp.scripts import blast
 from varvamp import __version__
-from . import _program
 
 
 def get_args(sysargs):
@@ -30,7 +28,6 @@ def get_args(sysargs):
     arg parsing for varvamp
     """
     parser = argparse.ArgumentParser(
-        prog=_program,
         usage='''\tvarvamp <mode> --help\n\tvarvamp <mode> [mode optional arguments] <alignment> <output dir>''')
     mode_parser = parser.add_subparsers(
         title="varvamp mode",
@@ -49,7 +46,7 @@ def get_args(sysargs):
     QPCR_parser = mode_parser.add_parser(
         "qpcr",
         help="design qPCR primers",
-        usage="varvamp qpcr [optional arguments] <alignment> <output dir>"
+        usage="varvamp qpcr -t <threshold> [optional arguments] <alignment> <output dir>"
     )
     parser.add_argument(
         "input",
@@ -57,20 +54,12 @@ def get_args(sysargs):
         help="alignment file and dir to write results"
     )
     for par in (SINGLE_parser, TILED_parser, QPCR_parser):
-        par.add_argument(
-            "-t",
-            "--threshold",
-            metavar="",
-            type=float,
-            default=None,
-            help="threshold for consensus nucleotides"
-        )
         par.add_argument(
             "-a",
             "--n-ambig",
-            metavar="",
+            metavar="2",
             type=int,
-            default=
+            default=2,
             help="max number of ambiguous characters in a primer"
         )
         par.add_argument(
@@ -96,7 +85,23 @@
             type=str,
             default="varVAMP"
         )
+        par.add_argument(
+            "--compatible-primers",
+            metavar="None",
+            type=str,
+            default=None,
+            help="FASTA primer file with which new primers should not form dimers. Sequences >40 nt are ignored. Can significantly increase runtime."
+        )
+
     for par in (SINGLE_parser, TILED_parser):
+        par.add_argument(
+            "-t",
+            "--threshold",
+            metavar="",
+            type=float,
+            default=None,
+            help="consensus threshold (0-1) - if not set it will be estimated (higher values result in higher specificity at the expense of found primers)"
+        )
         par.add_argument(
             "-ol",
             "--opt-length",
@@ -117,8 +122,8 @@
         "-o",
         "--overlap",
         type=int,
-        metavar="
-        default=
+        metavar="25",
+        default=25,
         help="min overlap of the amplicon inserts"
     )
     SINGLE_parser.add_argument(
@@ -137,6 +142,13 @@
         default=None,
         help="max number of ambiguous characters in a probe"
     )
+    QPCR_parser.add_argument(
+        "-t",
+        "--threshold",
+        required=True,
+        type=float,
+        help="consensus threshold (0-1) - higher values result in higher specificity at the expense of found primers"
+    )
     QPCR_parser.add_argument(
         "-n",
         "--test-n",
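Taken together, these hunks move `-t/--threshold` out of the block shared by all three subparsers: `single` and `tiled` keep it optional (with `default=None` triggering estimation later in the workflow), while `qpcr` now declares it with `required=True` since its threshold is never estimated. A minimal, self-contained sketch of this argparse pattern (not varVAMP's actual parser; names are illustrative):

```python
# Sketch: per-subcommand required vs. optional option, as in the hunks above.
import argparse

parser = argparse.ArgumentParser(prog="demo")
modes = parser.add_subparsers(dest="mode")
single, tiled, qpcr = (modes.add_parser(m) for m in ("single", "tiled", "qpcr"))

for par in (single, tiled):
    # None means "estimate the threshold later in the workflow"
    par.add_argument("-t", "--threshold", type=float, default=None)
# qpcr cannot fall back to estimation, so the threshold must be supplied
qpcr.add_argument("-t", "--threshold", type=float, required=True)

print(parser.parse_args(["qpcr", "-t", "0.9"]).threshold)  # 0.9
# parser.parse_args(["qpcr"]) exits with:
# "error: the following arguments are required: -t/--threshold"
```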
@@ -151,7 +163,7 @@
         type=int,
         metavar="-3",
         default=-3,
-        help="minimum free energy (kcal/mol/K) cutoff at the lowest primer melting
+        help="minimum free energy (kcal/mol/K) cutoff at the lowest primer melting temperature"
     )
     parser.add_argument(
         "--verbose",
@@ -178,19 +190,30 @@ def shared_workflow(args, log_file):
     """
     # start varvamp
     logging.varvamp_progress(log_file, mode=args.mode)
-
     # read in alignment and preprocess
     preprocessed_alignment = alignment.preprocess(args.input[0])
-    #
+    # read in external primer sequences with which new primers should not form dimers
+    if args.compatible_primers is not None:
+        compatible_primers = primers.parse_primer_fasta(args.compatible_primers)
+        if not compatible_primers:
+            logging.raise_error(
+                "no valid primers found in the provided primer file.\n",
+                log_file,
+            )
+    else:
+        compatible_primers = None
+    # check alignment length and number of gaps and report if its significantly more/less than expected
     logging.check_alignment_length(preprocessed_alignment, log_file)
+    logging.check_gaped_sequences(preprocessed_alignment, log_file)
 
     # estimate threshold or number of ambiguous bases if args were not supplied
-    if args.threshold is None
-        args.threshold
+    if args.threshold is None and not args.mode == 'qpcr':
+        args.threshold = param_estimation.get_parameters(preprocessed_alignment, args, log_file)
+    # set the number of ambiguous chars for qPCR probes to one less than for primers if not given
     if args.mode == "qpcr" and args.pn_ambig is None:
         if args.n_ambig == 0:
             args.pn_ambig = 0
-
+        else:
             args.pn_ambig = args.n_ambig - 1
         with open(log_file, "a") as f:
             print(f"Automatic parameter selection set -pa {args.pn_ambig}.", file=f)
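`primers.parse_primer_fasta` itself is not part of this file's diff, but the `--compatible-primers` help text pins down its contract: read primer sequences from a FASTA file and ignore anything longer than 40 nt. A hedged sketch of such a helper using Biopython (which varVAMP already imports elsewhere); the real function in `varvamp/scripts/primers.py` may differ:

```python
# Hypothetical parse_primer_fasta-style helper; illustrative only.
from Bio import SeqIO

def parse_primer_fasta(path, max_len=40):
    primers = {}
    for record in SeqIO.parse(path, "fasta"):
        # help text: "Sequences >40 nt are ignored"
        if len(record.seq) > max_len:
            continue
        primers[record.id] = str(record.seq).upper()
    return primers
```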
@@ -211,7 +234,6 @@ def shared_workflow(args, log_file):
     alignment_cleaned, gaps_to_mask = alignment.process_alignment(
         preprocessed_alignment,
         args.threshold,
-        args.threads
     )
     logging.varvamp_progress(
         log_file,
@@ -237,6 +259,9 @@ def shared_workflow(args, log_file):
         ambiguous_consensus,
         args.n_ambig
     )
+
+    potential_primer_regions = regions.mean(primer_regions, majority_consensus)
+
     if not primer_regions:
         logging.raise_error(
             "no primer regions found. Lower the threshold!",
@@ -247,7 +272,7 @@ def shared_workflow(args, log_file):
         log_file,
         progress=0.4,
         job="Finding primer regions.",
-        progress_text=f"{
+        progress_text=f"{potential_primer_regions} % of the consensus sequence will be evaluated for primers"
     )
 
     # produce kmers for all primer regions
@@ -266,7 +291,8 @@ def shared_workflow(args, log_file):
     left_primer_candidates, right_primer_candidates = primers.find_primers(
         kmers,
         ambiguous_consensus,
-        alignment_cleaned
+        alignment_cleaned,
+        args.threads
     )
     for primer_type, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
         if not primer_candidates:
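`regions.mean(primer_regions, majority_consensus)` feeds the progress message "% of the consensus sequence will be evaluated for primers". A toy version of that percentage, under the assumption that regions are (start, stop) coordinate pairs on the consensus (the real helper may compute it differently):

```python
# Toy sketch of the reported coverage percentage; assumes (start, stop) pairs.
def percent_of_consensus(primer_regions, majority_consensus):
    covered = sum(stop - start for start, stop in primer_regions)
    return round(covered / len(majority_consensus) * 100, 1)

print(percent_of_consensus([(0, 30), (50, 90)], "A" * 200))  # 35.0
```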
@@ -282,20 +308,41 @@ def shared_workflow(args, log_file):
         progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv potential primers"
     )
 
-
+    # filter primers against user-provided list of compatible primers, can use multi-processing
+    if compatible_primers:
+        left_primer_candidates = primers.filter_non_dimer_candidates(
+            left_primer_candidates, compatible_primers, args.threads
+        )
+        right_primer_candidates = primers.filter_non_dimer_candidates(
+            right_primer_candidates, compatible_primers, args.threads
+        )
+        logging.varvamp_progress(
+            log_file,
+            progress=0.65,
+            job="Filtering primers against provided primers.",
+            progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv primers after filtering"
+        )
+
+    return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers
 
 
-def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, data_dir, log_file):
+def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, potential_primer_regions, data_dir, log_file):
     """
     part of the workflow shared by the single and tiled mode
     """
 
     # find best primers and create primer dict
-
+    # depending on the percentage of potential primer regions use high conservation mode
+    if potential_primer_regions >= 90:
+        all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=True)
+        job_text = "Excluding overlapping primers (stringent)."
+    else:
+        all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=False)
+        job_text = "Excluding overlapping primers."
     logging.varvamp_progress(
         log_file,
         progress=0.7,
-        job="
+        job=f"{job_text}",
         progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rv primers"
     )
 
@@ -307,8 +354,7 @@ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
     )
     if not amplicons:
         logging.raise_error(
-            "no amplicons found. Increase the max amplicon length or
-            number of ambiguous bases or lower threshold!\n",
+            "no amplicons found. Increase the max amplicon length or number of ambiguous bases or lower threshold!\n",
             log_file,
             exit=True
         )
@@ -353,7 +399,7 @@ def single_workflow(args, amplicons, log_file):
     return amplicon_scheme
 
 
-def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file
+def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file):
     """
     part of the workflow specific for the tiled mode
     """
@@ -367,21 +413,9 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
         amplicon_graph
     )
 
-    # check for dimers
-    dimers_not_solved = scheme.check_and_solve_heterodimers(
-        amplicon_scheme,
-        left_primer_candidates,
-        right_primer_candidates,
-        all_primers)
-    if dimers_not_solved:
-        logging.raise_error(
-            f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incomaptible amplicons in a sperate reaction.",
-            log_file
-        )
-        reporting.write_dimers(results_dir, dimers_not_solved)
-
     # evaluate coverage
-    # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers,
+    # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers,
+    # but this potential, minor inaccuracy is currently accepted.
     percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
     logging.varvamp_progress(
         log_file,
@@ -398,10 +432,36 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
         "\t - relax primer settings (not recommended)\n",
         log_file
     )
-    return amplicon_scheme
 
+    # check for dimers
+    dimers_not_solved, n_initial_dimers = scheme.check_and_solve_heterodimers(
+        amplicon_scheme,
+        left_primer_candidates,
+        right_primer_candidates,
+        all_primers,
+        args.threads
+    )
+
+    # report dimers solve
+    if n_initial_dimers > 0 and not dimers_not_solved:
+        logging.varvamp_progress(
+            log_file,
+            progress=0.95,
+            job="Trying to solve primer dimers.",
+            progress_text=f"all dimers (n={n_initial_dimers}) could be resolved"
+        )
+    elif dimers_not_solved:
+        logging.varvamp_progress(
+            log_file,
+            progress=0.95,
+            job="Trying to solve primer dimers.",
+            progress_text=f"{len(dimers_not_solved)}/{n_initial_dimers} dimers could not be resolved"
+        )
+
+    return amplicon_scheme, dimers_not_solved
 
-
+
+def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majority_consensus, left_primer_candidates, right_primer_candidates, compatible_primers, log_file):
     """
     part of the workflow specific for the tiled mode
     """
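The dimer check now runs at the end of `tiled_workflow`, returns the unresolved dimers together with the initial dimer count, and reports progress instead of aborting. The thermodynamic test behind `scheme.check_and_solve_heterodimers` is not shown in this diff; a hedged sketch of a pairwise heterodimer screen, assuming the primer3-py bindings and a -3 kcal/mol style cutoff like the one exposed on the CLI above:

```python
# Hypothetical heterodimer screen; not the actual scheme.check_and_solve_heterodimers.
from itertools import combinations
import primer3

def find_heterodimers(primer_seqs, dg_cutoff=-3.0):
    """primer_seqs: dict of name -> sequence; dg_cutoff in kcal/mol."""
    dimers = []
    for (n1, s1), (n2, s2) in combinations(primer_seqs.items(), 2):
        result = primer3.calc_heterodimer(s1, s2)
        # primer3 reports dG in cal/mol, hence the /1000
        if result.structure_found and result.dg / 1000 < dg_cutoff:
            dimers.append((n1, n2, round(result.dg / 1000, 1)))
    return dimers
```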
@@ -412,7 +472,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
     )
     if not probe_regions:
         logging.raise_error(
-            "no regions that
+            "no regions that fulfill probe criteria! lower threshold or increase number of ambiguous chars in probe\n",
             log_file,
             exit=True
         )
@@ -424,7 +484,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
         config.QPROBE_SIZES
     )
     # find potential probes
-    qpcr_probes = qpcr.get_qpcr_probes(probe_kmers, ambiguous_consensus, alignment_cleaned)
+    qpcr_probes = qpcr.get_qpcr_probes(probe_kmers, ambiguous_consensus, alignment_cleaned, args.threads)
     if not qpcr_probes:
         logging.raise_error(
             "no qpcr probes found\n",
@@ -438,8 +498,21 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
         progress_text=f"{len(qpcr_probes)} potential qPCR probes"
     )
 
+    # filter primers against non-dimer sequences if provided
+    if compatible_primers:
+        qpcr_probes = primers.filter_non_dimer_candidates(
+            qpcr_probes, compatible_primers, args.threads)
+        logging.varvamp_progress(
+            log_file,
+            progress=0.75,
+            job="Filtering probes against provided primers.",
+            progress_text=f"{len(qpcr_probes)} potential qPCR probes after filtering"
+        )
+
     # find unique amplicons with a low penalty and an internal probe
-    qpcr_scheme_candidates = qpcr.find_qcr_schemes(
+    qpcr_scheme_candidates = qpcr.find_qcr_schemes(
+        qpcr_probes, left_primer_candidates, right_primer_candidates, majority_consensus, ambiguous_consensus, args.threads
+    )
     if not qpcr_scheme_candidates:
         logging.raise_error(
             "no qPCR scheme candidates found. lower threshold or increase number of ambiguous chars in primer and/or probe\n",
@@ -500,7 +573,7 @@ def main():
     blast.check_BLAST_installation(log_file)
 
     # mode unspecific part of the workflow
-    alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates = shared_workflow(args, log_file)
+    alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers = shared_workflow(args, log_file)
 
     # write files that are shared in all modes
     reporting.write_regions_to_bed(primer_regions, args.name, data_dir)
@@ -522,10 +595,12 @@ def main():
 
     # SINGLE/TILED mode
     if args.mode == "tiled" or args.mode == "single":
+        dimers_not_solved = None
         all_primers, amplicons = single_and_tiled_shared_workflow(
             args,
             left_primer_candidates,
             right_primer_candidates,
+            potential_primer_regions,
            data_dir,
            log_file
        )
@@ -536,7 +611,7 @@ def main():
            log_file
        )
    elif args.mode == "tiled":
-        amplicon_scheme = tiled_workflow(
+        amplicon_scheme, dimers_not_solved = tiled_workflow(
            args,
            amplicons,
            left_primer_candidates,
@@ -544,11 +619,9 @@ def main():
            all_primers,
            ambiguous_consensus,
            log_file,
-            results_dir
        )
 
    # write files
-
    if args.mode == "tiled":
        # assign amplicon numbers from 5' to 3' along the genome
        amplicon_scheme.sort(key=lambda x: x["LEFT"][1])
@@ -562,7 +635,8 @@ def main():
        ambiguous_consensus,
        args.name,
        args.mode,
-        log_file
+        log_file,
+        dimers_not_solved
    )
    reporting.varvamp_plot(
        results_dir,
@@ -584,11 +658,11 @@ def main():
        majority_consensus,
        left_primer_candidates,
        right_primer_candidates,
+        compatible_primers,
        log_file
    )
 
    # write files
-
    # make sure amplicons with no off-target products and with low penalties get the lowest numbers
    final_schemes.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
    reporting.write_regions_to_bed(probe_regions, args.name, data_dir, "probe")
varvamp/scripts/alignment.py
CHANGED
@@ -2,14 +2,11 @@
 alignment preprocessing
 """
 
-# BUILT-INS
-import re
-import multiprocessing
-
 # varVAMP
 from varvamp.scripts import config
 
 # LIBS
+import numpy as np
 from Bio import AlignIO
 from Bio.Seq import Seq
 
@@ -47,181 +44,74 @@ def preprocess(alignment_path):
     return preprocessed_alignment
 
 
-def find_internal_gaps(unique_gaps, gap):
-    """
-    find all unique gaps that
-    lie within the current gap
-    """
-    overlapping_gaps = []
-
-    if gap[1] - gap[0] == 0:
-        # if the gap length = 1 there are
-        # no overlapping gaps
-        overlapping_gaps = [gap]
-    else:
-        # for each unique gap check if the intersection with the
-        # gap is the same as the unique gap -> internal gap of
-        # the current gap
-        for unique_gap in unique_gaps:
-            unique_set = set(range(unique_gap[0], unique_gap[1]))
-            current_range = range(gap[0], gap[1])
-            intersection = unique_set.intersection(current_range)
-            if not intersection:
-                continue
-            if min(intersection) == unique_gap[0] and max(intersection) + 1 == unique_gap[1]:
-                overlapping_gaps.append(unique_gap)
-
-    return overlapping_gaps
-
-
-def find_overlapping_gaps_worker(gap_list, unique_gaps):
-    """
-    Worker function to find overlapping gaps and count their occurrences.
-    """
-    gap_dict_part = {}
-    for gap in gap_list:
-        overlapping_gaps = find_internal_gaps(unique_gaps, gap)
-        for overlapping_gap in overlapping_gaps:
-            if overlapping_gap in gap_dict_part:
-                gap_dict_part[overlapping_gap] += 1
-            else:
-                gap_dict_part[overlapping_gap] = 1
-    return gap_dict_part
-
-
-def create_gap_dictionary(unique_gaps, all_gaps, n_threads):
-    """
-    Creates a dictionary with all gap counts.
-    Counts also all overlapping gaps per gap.
-    Uses multiprocessing for parallelization.
-    """
-
-    with multiprocessing.Pool(processes=n_threads) as pool:
-        results = pool.starmap(find_overlapping_gaps_worker, [(gap_list, unique_gaps) for gap_list in all_gaps])
-
-    gap_dict = {}
-    for gap_dict_part in results:
-        for gap, count in gap_dict_part.items():
-            if gap in gap_dict:
-                gap_dict[gap] += count
-            else:
-                gap_dict[gap] = count
-
-    return gap_dict
-
-
-def find_gaps_to_mask(gap_dict, cutoff):
-    """
-    filters gaps for their freq cutoff.
-    condenses final gaps if there is
-    an overlap.
-    """
-    gaps_to_mask = []
-    potential_gaps = []
-    opened_region = []
-
-    # check for each region if it is covered
-    # by enough sequences
-    for gap in gap_dict:
-        if gap_dict[gap] > cutoff:
-            potential_gaps.append(gap)
-
-    # sort by start and stop
-    potential_gaps = sorted(potential_gaps)
-
-    # get the min and max of overlapping gaps
-    for i, region in enumerate(potential_gaps):
-        region = list(region)
-        if opened_region:
-            # write the opened region if the start of the current region
-            # > opened_region[stop] and the last still opened region
-            if region[0] > opened_region[1] or i == len(potential_gaps) - 1:
-                gaps_to_mask.append(opened_region)
-                opened_region = region
-            else:
-                # 1 case: same start and further stop -> new stop
-                if region[0] == opened_region[0]:
-                    opened_region[1] = region[1]
-                # 2 case: further start and further stop -> new stop
-                if region[0] > opened_region[0] and region[1] > opened_region[1]:
-                    opened_region[1] = region[1]
-        else:
-            opened_region = region
-
-    return gaps_to_mask
-
-
 def clean_gaps(alignment, gaps_to_mask):
     """
-
+    Clean an alignment of large common deletions based on gaps_to_mask.
+    gaps_to_mask: list of [start, end] (inclusive), sorted by start.
     """
     cleaned_alignment = []
+    gaps_to_mask = sorted(gaps_to_mask, key=lambda x: x[0])
 
-    for
+    for seq_id, seq in alignment:
         start = 0
-
-    for
-
-
+        pieces = []
+        # for each seq in the alignment, mask the regions
+        for region_start, region_end in gaps_to_mask:
+            # mask length for this region
+            if (region_end - region_start + 1) >= config.QAMPLICON_DEL_CUTOFF:
                 mask = "NN"
-            # or mask with one N (small deletion)
             else:
                 mask = "N"
-
-
-    #
-
-
-
-
-
-
-
-
-
-        if max(gaps_to_mask)[1] < len(sequence[1]) - 1:
-            # append the last seq if no gap is at
-            # the end of the sequence
-            start = max(gaps_to_mask)[1]
-            stop = len(sequence[1]) - 1
-            masked_seq_temp = sequence[1][start:stop]
-            masked_seq = masked_seq + mask + masked_seq_temp
-        else:
-            # append the mask to the end of the seq
-            masked_seq = masked_seq + mask
-
-        cleaned_alignment.append([sequence[0], masked_seq])
+            # part before region
+            pieces.append(seq[start:region_start])
+            # mask for region
+            pieces.append(mask)
+            # next start is after region
+            start = region_end + 1
+
+        # tail after last masked region
+        if start < len(seq):
+            pieces.append(seq[start:])
+
+        cleaned_alignment.append([seq_id, "".join(pieces)])
 
     return cleaned_alignment
 
 
-def process_alignment(preprocessed_alignment, threshold
+def process_alignment(preprocessed_alignment, threshold):
     """
-
+    - build an array of shape (n_seq, seq_len)
+    - for each column, count how many sequences are '-'
+    - mark columns to mask if count > cutoff
+    - turn those columns into contiguous regions
     """
-
-
-
-    for
-
-
-
-
-
-
-
-
-    if
-
-
-
-
-
-    gaps_to_mask
-
+
+    # build char array
+    seqs = [seq for seq_id, seq in preprocessed_alignment]
+    arr = np.array([list(s) for s in seqs], dtype="U1")
+    n_seq, len_seq = arr.shape
+
+    # per-column gap counts
+    cols_to_mask = (arr == "-").sum(axis=0) > n_seq * (1 - threshold)
+
+    # convert bool mask into list of (start, end) regions (end inclusive)
+    gaps_to_mask = []
+    in_gap = False
+    start = None
+    for i, is_gap in enumerate(cols_to_mask):
+        if is_gap and not in_gap:
+            in_gap = True
+            start = i
+        elif not is_gap and in_gap:
+            in_gap = False
+            gaps_to_mask.append([start, i - 1])
+    if in_gap:
+        gaps_to_mask.append([start, len_seq - 1])
+
+    if not gaps_to_mask:
+        return preprocessed_alignment, []
+
+    alignment_cleaned = clean_gaps(preprocessed_alignment, gaps_to_mask)
 
     return alignment_cleaned, gaps_to_mask
 
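The net effect of this rewrite: the gap-counting helpers built around `multiprocessing` (`find_internal_gaps`, `create_gap_dictionary`, `find_gaps_to_mask`) are replaced by a single vectorised numpy pass over the alignment columns. A toy run of the new column-masking logic (toy alignment and threshold chosen purely for illustration):

```python
# Toy run of the column masking in the new process_alignment.
import numpy as np

alignment = [["seq1", "ATG--ACGT"],
             ["seq2", "ATG--ACGT"],
             ["seq3", "ATGCAAC-T"]]
threshold = 0.5

arr = np.array([list(seq) for _, seq in alignment], dtype="U1")
n_seq = arr.shape[0]
# a column is masked when its gap count exceeds n_seq * (1 - threshold) = 1.5
cols_to_mask = (arr == "-").sum(axis=0) > n_seq * (1 - threshold)
print(np.flatnonzero(cols_to_mask))  # [3 4] - gapped in 2 of 3 sequences
# column 7 stays: only 1 of 3 sequences has a gap there
```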
|