varvamp 0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp-0.3/PKG-INFO +48 -0
- varvamp-0.3/README.md +36 -0
- varvamp-0.3/setup.cfg +4 -0
- varvamp-0.3/setup.py +39 -0
- varvamp-0.3/varvamp/__init__.py +3 -0
- varvamp-0.3/varvamp/__main__.py +5 -0
- varvamp-0.3/varvamp/command.py +263 -0
- varvamp-0.3/varvamp/scripts/__init__.py +0 -0
- varvamp-0.3/varvamp/scripts/alignment.py +223 -0
- varvamp-0.3/varvamp/scripts/config.py +59 -0
- varvamp-0.3/varvamp/scripts/consensus.py +111 -0
- varvamp-0.3/varvamp/scripts/conserved.py +118 -0
- varvamp-0.3/varvamp/scripts/logging.py +321 -0
- varvamp-0.3/varvamp/scripts/primers.py +417 -0
- varvamp-0.3/varvamp/scripts/reporting.py +353 -0
- varvamp-0.3/varvamp/scripts/scheme.py +390 -0
- varvamp-0.3/varvamp.egg-info/PKG-INFO +48 -0
- varvamp-0.3/varvamp.egg-info/SOURCES.txt +21 -0
- varvamp-0.3/varvamp.egg-info/dependency_links.txt +1 -0
- varvamp-0.3/varvamp.egg-info/entry_points.txt +2 -0
- varvamp-0.3/varvamp.egg-info/not-zip-safe +1 -0
- varvamp-0.3/varvamp.egg-info/requires.txt +5 -0
- varvamp-0.3/varvamp.egg-info/top_level.txt +1 -0
varvamp-0.3/PKG-INFO
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: varvamp
|
|
3
|
+
Version: 0.3
|
|
4
|
+
Summary: varvamp
|
|
5
|
+
Home-page: https://github.com/jonas-fuchs/varVAMP
|
|
6
|
+
Author: Dr. Jonas Fuchs
|
|
7
|
+
Author-email: jonas.fuchs@uniklinik-freiburg.de
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
9
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
**var**iable **V**irus**AMP**licons (varVAMP) is a tool to design primers for highly diverse viruses. The input is an alignment of your viral (full-genome) sequences.
|
|
14
|
+
|
|
15
|
+
# varVAMP
|
|
16
|
+
|
|
17
|
+
[](https://www.gnu.org/licenses/gpl-3.0)
|
|
18
|
+
|
|
19
|
+
For a lot of virus genera it is difficult to design pan-specific primers. varVAMP solves this, by introducing ambiguous characters into primers and minimizes mismatches at the 3' end. Primers might not work for some sequences of your input alignment but should recognize the large majority.
|
|
20
|
+
|
|
21
|
+
**varVAMP comes in three different flavors:**
|
|
22
|
+
|
|
23
|
+
<img src="./docs/varvamp.png" alt="varVAMP logo" />
|
|
24
|
+
|
|
25
|
+
**SANGER** *(coming soon)*: varVAMP searches for the very best primers and reports back an amplicon which can be used for PCR-based screening approaches.
|
|
26
|
+
|
|
27
|
+
**TILED**: varVAMP uses a graph based approach to design overlapping amplicons that tile the entire viral genome. This designs amplicons that are suitable for Oxford Nanopore or Illumina based full-genome sequencing.
|
|
28
|
+
|
|
29
|
+
**QPCR** *(coming soon)*: varVAMP searches for small amplicons with an internal primer for the probe. It minimizes temperature differences between the primers.
|
|
30
|
+
|
|
31
|
+
This program is currently being developed and in an alpha state. You are welcome to use this software. If you successfully design primers, drop me a mail. It might be possible to collaborate!
|
|
32
|
+
|
|
33
|
+
# Documentation
|
|
34
|
+
|
|
35
|
+
* [Installation](docs/installation.md)
|
|
36
|
+
* [Preparing the data](docs/preparing_the_data.md)
|
|
37
|
+
* [Usage](docs/usage.md)
|
|
38
|
+
* [Output](docs/output.md)
|
|
39
|
+
* [How it works](docs/how_varvamp_works.md)
|
|
40
|
+
* [FAQ](docs/FAQ.md)
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
**Important disclaimer:**
|
|
45
|
+
*For the primer design, varVAMP uses [primer3](https://pypi.org/project/primer3-py/) to check if digested kmers of a sequence are potential primers. Some of the functions for this were adapted from [primalscheme](https://github.com/aresti/primalscheme) and I do not claim credit.*
|
|
46
|
+
|
|
47
|
+
*The remaining code is under the GPLv3 licence. The code is WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or
|
|
48
|
+
(at your option) any later version.*
|
varvamp-0.3/README.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
**var**iable **V**irus**AMP**licons (varVAMP) is a tool to design primers for highly diverse viruses. The input is an alignment of your viral (full-genome) sequences.
|
|
2
|
+
|
|
3
|
+
# varVAMP
|
|
4
|
+
|
|
5
|
+
[](https://www.gnu.org/licenses/gpl-3.0)
|
|
6
|
+
|
|
7
|
+
For a lot of virus genera it is difficult to design pan-specific primers. varVAMP solves this, by introducing ambiguous characters into primers and minimizes mismatches at the 3' end. Primers might not work for some sequences of your input alignment but should recognize the large majority.
|
|
8
|
+
|
|
9
|
+
**varVAMP comes in three different flavors:**
|
|
10
|
+
|
|
11
|
+
<img src="./docs/varvamp.png" alt="varVAMP logo" />
|
|
12
|
+
|
|
13
|
+
**SANGER** *(coming soon)*: varVAMP searches for the very best primers and reports back an amplicon which can be used for PCR-based screening approaches.
|
|
14
|
+
|
|
15
|
+
**TILED**: varVAMP uses a graph based approach to design overlapping amplicons that tile the entire viral genome. This designs amplicons that are suitable for Oxford Nanopore or Illumina based full-genome sequencing.
|
|
16
|
+
|
|
17
|
+
**QPCR** *(coming soon)*: varVAMP searches for small amplicons with an internal primer for the probe. It minimizes temperature differences between the primers.
|
|
18
|
+
|
|
19
|
+
This program is currently being developed and in an alpha state. You are welcome to use this software. If you successfully design primers, drop me a mail. It might be possible to collaborate!
|
|
20
|
+
|
|
21
|
+
# Documentation
|
|
22
|
+
|
|
23
|
+
* [Installation](docs/installation.md)
|
|
24
|
+
* [Preparing the data](docs/preparing_the_data.md)
|
|
25
|
+
* [Usage](docs/usage.md)
|
|
26
|
+
* [Output](docs/output.md)
|
|
27
|
+
* [How it works](docs/how_varvamp_works.md)
|
|
28
|
+
* [FAQ](docs/FAQ.md)
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
**Important disclaimer:**
|
|
33
|
+
*For the primer design, varVAMP uses [primer3](https://pypi.org/project/primer3-py/) to check if digested kmers of a sequence are potential primers. Some of the functions for this were adapted from [primalscheme](https://github.com/aresti/primalscheme) and I do not claim credit.*
|
|
34
|
+
|
|
35
|
+
*The remaining code is under the GPLv3 licence. The code is WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or
|
|
36
|
+
(at your option) any later version.*
|
varvamp-0.3/setup.cfg
ADDED
varvamp-0.3/setup.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Package build script for varVAMP."""
from setuptools import setup, find_packages
from varvamp import __version__, _program

# read the contents of your README file
from pathlib import Path
this_directory = Path(__file__).parent
# explicit encoding: the platform default may not be UTF-8 (e.g. Windows)
long_description = (this_directory / "README.md").read_text(encoding="utf-8")

setup(
    name='varvamp',
    long_description=long_description,
    long_description_content_type='text/markdown',
    version=__version__,
    python_requires=">=3.9",
    # trailing comma is required: ('licence.txt') is just a parenthesized
    # string, not a one-element tuple
    license_files=('licence.txt',),
    packages=find_packages(),
    install_requires=[
        "biopython>=1.79",
        "matplotlib>=3.5.1",
        "primer3-py>=1.1.0",
        "pandas>=1.4.4",
        "numpy>=1.23.3"
    ],
    description='varvamp',
    url='https://github.com/jonas-fuchs/varVAMP',
    author='Dr. Jonas Fuchs',
    author_email='jonas.fuchs@uniklinik-freiburg.de',
    classifiers=[
        "Programming Language :: Python :: 3.9",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)"
    ],
    # console entry point named after varvamp._program
    entry_points="""
    [console_scripts]
    {program} = varvamp.command:main
    """.format(program=_program),
    include_package_data=True,
    keywords=[],
    zip_safe=False
)
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
main workflow
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# BUILT-INS
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
import time
|
|
9
|
+
import argparse
|
|
10
|
+
|
|
11
|
+
# varVAMP
|
|
12
|
+
from . import _program
|
|
13
|
+
from varvamp import __version__
|
|
14
|
+
from varvamp.scripts import logging
|
|
15
|
+
from varvamp.scripts import alignment
|
|
16
|
+
from varvamp.scripts import config
|
|
17
|
+
from varvamp.scripts import consensus
|
|
18
|
+
from varvamp.scripts import conserved
|
|
19
|
+
from varvamp.scripts import primers
|
|
20
|
+
from varvamp.scripts import reporting
|
|
21
|
+
from varvamp.scripts import scheme
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# DEFs
|
|
25
|
+
def get_args(sysargs):
    """
    Build the varvamp argument parser and parse *sysargs*.

    Prints the full help text and exits with status -1 when called
    with no arguments at all.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description='varvamp: variable virus amplicon design',
        usage='''varvamp <alignment> <output dir> [options]''')

    # positional: alignment file + directory for results
    parser.add_argument(
        "input",
        nargs=2,
        help="alignment file and dir to write results"
    )
    # tunable amplicon/primer parameters; defaults come from config
    parser.add_argument(
        "-ol", "--opt-length",
        type=int,
        default=config.AMPLICON_OPT_LENGTH,
        help="optimal length of the amplicons"
    )
    parser.add_argument(
        "-ml", "--max-length",
        type=int,
        default=config.AMPLICON_MAX_LENGTH,
        help="max length of the amplicons"
    )
    parser.add_argument(
        "-o", "--overlap",
        type=float,
        default=config.AMPLICON_MIN_OVERLAP,
        help="min overlap of the amplicons"
    )
    parser.add_argument(
        "-t", "--threshold",
        type=float,
        default=config.FREQUENCY_THRESHOLD,
        help="threshold for nucleotides in alignment to be considered conserved"
    )
    parser.add_argument(
        "-a", "--allowed-ambiguous",
        type=int,
        default=config.PRIMER_ALLOWED_N_AMB,
        help="number of ambiguous characters that are allowed within a primer"
    )
    # --console / --no-console toggle for terminal output
    parser.add_argument(
        "--console",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="show varvamp console output"
    )
    parser.add_argument(
        "-v", "--version",
        action='version',
        version=f"varvamp {__version__}"
    )

    if not sysargs:
        parser.print_help()
        sys.exit(-1)
    return parser.parse_args(sysargs)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def main(sysargs=None):
    """
    main varvamp workflow

    sysargs: optional list of command line arguments. Defaults to
        sys.argv[1:] resolved at call time. (The previous default of
        ``sysargs=sys.argv[1:]`` was evaluated once at import time,
        so later changes to sys.argv were silently ignored.)
    """
    if sysargs is None:
        sysargs = sys.argv[1:]
    # start varVAMP
    args = get_args(sysargs)
    if not args.console:
        # silence all prints for the remainder of the run
        sys.stdout = open(os.devnull, 'w')
    start_time = time.process_time()
    results_dir, data_dir, log_file = logging.create_dir_structure(args.input[1])
    logging.raise_arg_errors(args, log_file)
    logging.varvamp_progress(log_file)
    # config check
    logging.confirm_config(args, log_file)
    logging.varvamp_progress(
        log_file,
        progress=0.1,
        job="Checking config.",
        progress_text="config file passed"
    )
    # preprocess and clean alignment of gaps
    alignment_cleaned, gaps_to_mask = alignment.process_alignment(
        args.input[0],
        args.threshold
    )
    logging.varvamp_progress(
        log_file,
        progress=0.2,
        job="Preprocessing alignment and cleaning gaps.",
        progress_text=f"{len(gaps_to_mask)} gaps with {alignment.calculate_total_masked_gaps(gaps_to_mask)} nucleotides"
    )
    # create consensus sequences
    majority_consensus, ambiguous_consensus = consensus.create_consensus(
        alignment_cleaned,
        args.threshold
    )
    logging.varvamp_progress(
        log_file,
        progress=0.3,
        job="Creating consensus sequences.",
        progress_text=f"length of the consensus is {len(majority_consensus)} nt"
    )
    # generate conserved region list
    conserved_regions = conserved.find_regions(
        ambiguous_consensus,
        args.allowed_ambiguous
    )
    if not conserved_regions:
        logging.raise_error(
            "nothing conserved. Lower the threshold!",
            log_file,
            exit=True
        )
    logging.varvamp_progress(
        log_file,
        progress=0.4,
        job="Finding conserved regions.",
        progress_text=f"{conserved.mean(conserved_regions, majority_consensus)} % conserved"
    )
    # produce kmers for all conserved regions
    kmers = conserved.produce_kmers(
        conserved_regions,
        majority_consensus
    )
    logging.varvamp_progress(
        log_file,
        progress=0.5,
        job="Digesting into kmers.",
        progress_text=f"{len(kmers)} kmers"
    )
    # find potential primers
    left_primer_candidates, right_primer_candidates = primers.find_primers(
        kmers,
        ambiguous_consensus,
        alignment_cleaned
    )
    # "direction" instead of "type" to avoid shadowing the builtin
    for direction, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
        if not primer_candidates:
            logging.raise_error(
                f"no {direction} primers found.\n",
                log_file,
                exit=True
            )
    logging.varvamp_progress(
        log_file,
        progress=0.6,
        job="Filtering for primers.",
        progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rw potential primers"
    )
    # find best primers and create primer dict
    all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates)
    logging.varvamp_progress(
        log_file,
        progress=0.7,
        job="Considering only high scoring primers.",
        progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rw primers"
    )
    # find all possible amplicons
    amplicons = scheme.find_amplicons(
        all_primers,
        args.opt_length,
        args.max_length
    )
    if not amplicons:
        logging.raise_error(
            "no amplicons found. Increase the max "
            "amplicon length or lower threshold!\n",
            log_file,
            exit=True
        )
    amplicon_graph = scheme.create_amplicon_graph(amplicons, args.overlap)
    logging.varvamp_progress(
        log_file,
        progress=0.8,
        job="Finding potential amplicons.",
        progress_text=str(len(amplicons)) + " potential amplicons"
    )
    # search for amplicon scheme
    coverage, amplicon_scheme = scheme.find_best_covering_scheme(
        amplicons,
        amplicon_graph,
        all_primers
    )
    dimers_not_solved = scheme.check_and_solve_heterodimers(
        amplicon_scheme,
        left_primer_candidates,
        right_primer_candidates,
        all_primers)
    if dimers_not_solved:
        logging.raise_error(
            f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incompatible amplicons in a separate reaction.",
            log_file
        )
        # BUGFIX: previously passed the builtin `dir` instead of the
        # results directory returned by create_dir_structure()
        reporting.write_dimers(results_dir, dimers_not_solved)
    percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
    logging.varvamp_progress(
        log_file,
        progress=0.9,
        job="Creating amplicon scheme.",
        progress_text=f"{percent_coverage} % total coverage with {len(amplicon_scheme[0]) + len(amplicon_scheme[1])} amplicons"
    )
    if percent_coverage < 70:
        logging.raise_error(
            "coverage < 70 %. Possible solutions:\n"
            "\t - lower threshold\n"
            "\t - increase amplicons lengths\n"
            "\t - increase number of ambiguous nucleotides\n"
            "\t - relax primer settings (not recommended)\n",
            log_file
        )
    # write files
    reporting.write_alignment(data_dir, alignment_cleaned)
    reporting.write_fasta(data_dir, "majority_consensus", majority_consensus)
    reporting.write_fasta(results_dir, "ambiguous_consensus", ambiguous_consensus)
    reporting.write_conserved_to_bed(conserved_regions, data_dir)
    reporting.write_all_primers(data_dir, all_primers)
    reporting.write_scheme_to_files(
        results_dir,
        amplicon_scheme,
        ambiguous_consensus
    )
    reporting.varvamp_plot(
        results_dir,
        args.threshold,
        alignment_cleaned,
        conserved_regions,
        all_primers,
        amplicon_scheme,
    )
    logging.varvamp_progress(log_file, progress=1, start_time=start_time)
|
|
File without changes
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
alignment preprocessing
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# BUILT-INS
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# LIBS
|
|
9
|
+
from Bio import AlignIO
|
|
10
|
+
from Bio.Seq import Seq
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def read_alignment(alignment_path):
    """
    Parse the fasta alignment at *alignment_path* with AlignIO and
    return it as a list of [id, sequence-string] pairs.
    """
    return [
        [record.id, str(record.seq)]
        for record in AlignIO.read(alignment_path, "fasta")
    ]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def preprocess(alignment):
    """
    Normalize every sequence of the alignment: lowercase all
    nucleotides and back-transcribe RNA ("u" present) to DNA.
    Returns a new list of [id, sequence-string] pairs.
    """
    normalized = []

    for name, raw_seq in alignment:
        seq = Seq(raw_seq).lower()
        # presence of "u" marks an RNA sequence -> convert to DNA
        if "u" in seq:
            seq = seq.back_transcribe()
        normalized.append([name, str(seq)])

    return normalized
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def find_gaps_in_alignment(alignment):
    """
    Locate every gap stretch ("-" runs) in each aligned sequence.

    Returns one list per sequence of (start, stop) tuples where
    stop is the inclusive index of the last gap character.
    """
    gap_pattern = re.compile("-+")
    per_sequence_gaps = []

    for entry in alignment:
        stretches = [
            (match.start(), match.end() - 1)
            for match in gap_pattern.finditer(entry[1])
        ]
        per_sequence_gaps.append(stretches)

    return per_sequence_gaps
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def find_unique_gaps(all_gaps):
    """
    Flatten the per-sequence gap lists and deduplicate them.
    The order of the returned list is arbitrary (set-based).
    """
    unique = set()
    for gap_list in all_gaps:
        unique.update(gap_list)
    return list(unique)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def find_internal_gaps(unique_gaps, gap):
    """
    Return every unique gap that lies completely inside *gap*.

    A single-position gap can only contain itself. Longer gaps are
    tested via the intersection of their (half-open) position ranges:
    a unique gap is internal when the intersection spans it exactly.
    """
    if gap[1] == gap[0]:
        # length-1 gap: no other gap can be internal to it
        return [gap]

    internal = []
    gap_positions = range(gap[0], gap[1])
    for candidate in unique_gaps:
        shared = set(range(candidate[0], candidate[1])).intersection(gap_positions)
        if not shared:
            continue
        # internal iff the overlap covers the whole candidate gap
        if min(shared) == candidate[0] and max(shared) + 1 == candidate[1]:
            internal.append(candidate)

    return internal
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def create_gap_dictionary(unique_gaps, all_gaps):
    """
    Count how often each unique gap occurs across all sequences,
    including occurrences as an internal part of a larger gap.
    Returns a {(start, stop): count} mapping.
    """
    gap_counts = {}

    for gap_list in all_gaps:
        for gap in gap_list:
            for internal_gap in find_internal_gaps(unique_gaps, gap):
                gap_counts[internal_gap] = gap_counts.get(internal_gap, 0) + 1

    return gap_counts
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def find_gaps_to_mask(gap_dict, cutoff):
    """
    filters gaps for their freq cutoff.
    condenses final gaps if there is
    an overlap.

    gap_dict: {(start, stop): count} mapping from create_gap_dictionary
        (stop is inclusive)
    cutoff: a gap is kept only when strictly more than this many
        sequences share it
    returns a list of merged [start, stop] regions
    """
    gaps_to_mask = []
    potential_gaps = []

    # check for each region if it is covered
    # by enough sequences
    for gap in gap_dict:
        if gap_dict[gap] > cutoff:
            potential_gaps.append(gap)

    # sort by start and stop
    potential_gaps = sorted(potential_gaps)

    # get the min and max of overlapping gaps
    # (sweep over the sorted regions, keeping one "opened" region
    # that grows while subsequent regions overlap it)
    opened_region = []
    gaps_to_mask = []
    for i, region in enumerate(potential_gaps):
        region = list(region)
        if opened_region:
            # write the opened region if the start of the current region
            # > opened_region[stop] and the last still opened region
            # NOTE(review): the `i == len(potential_gaps)-1` shortcut flushes
            # the opened region on the final iteration; the final region
            # itself is then left in opened_region and never appended --
            # confirm this is the intended behavior
            if region[0] > opened_region[1] or i == len(potential_gaps)-1:
                gaps_to_mask.append(opened_region)
                opened_region = region
            else:
                # 1 case: same start and further stop -> new stop
                if region[0] == opened_region[0]:
                    opened_region[1] = region[1]
                # 2 case: further start and further stop -> new stop
                if region[0] > opened_region[0] and region[1] > opened_region[1]:
                    opened_region[1] = region[1]
        else:
            # first region of the sweep opens the first candidate
            opened_region = region

    return gaps_to_mask
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def clean_gaps(alignment, gaps_to_mask):
    """
    clean an alignment of large common deletions.

    alignment: list of [id, sequence-string] pairs
    gaps_to_mask: merged [start, stop] regions (inclusive) to remove;
        assumed non-empty (max() below raises on an empty list) --
        the caller only invokes this when gaps were found
    returns a new alignment where each masked region is replaced by a
    single "N" placed in front of the following retained segment
    """
    cleaned_alignment = []

    for sequence in alignment:
        start = 0
        masked_seq = str()
        for region in gaps_to_mask:
            # keep the stretch between the previous region and this one
            stop = region[0]
            masked_seq_temp = sequence[1][start:stop]
            # check if the deletion is at the start
            # (empty slice -> skip, so no leading "N" for a gap at pos 0)
            if len(masked_seq_temp) != 0:
                masked_seq = (masked_seq + "N" + masked_seq_temp)
            start = region[1]+1
        if max(gaps_to_mask)[1] < len(sequence[1])-1:
            # append the last gaps if it is not
            # the end of the sequence
            # NOTE(review): restarts at max(gaps_to_mask)[1] (not +1) and
            # slices to len-1 (exclusive), which differs by one position
            # from the loop above -- confirm the off-by-one is intended
            start = max(gaps_to_mask)[1]
            stop = len(sequence[1])-1
            masked_seq_temp = sequence[1][start:stop]
            masked_seq = (masked_seq + "N" + masked_seq_temp)
        else:
            # append the mask to the end of the seq
            masked_seq = masked_seq + "N"

        cleaned_alignment.append([sequence[0], masked_seq])

    return cleaned_alignment
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def process_alignment(alignment_path, threshold):
    """
    Read, normalize and gap-clean the alignment at *alignment_path*.

    A gap is masked when strictly more than
    ``len(alignment) * (1 - threshold)`` sequences share it.
    Returns the cleaned alignment and the list of masked gap regions.
    """
    raw_alignment = read_alignment(alignment_path)
    gap_cutoff = len(raw_alignment) * (1 - threshold)

    normalized = preprocess(raw_alignment)
    all_gaps = find_gaps_in_alignment(normalized)
    unique_gaps = find_unique_gaps(all_gaps)

    # no gaps at all -> nothing to mask
    if not unique_gaps:
        return normalized, []

    gap_counts = create_gap_dictionary(unique_gaps, all_gaps)
    gaps_to_mask = find_gaps_to_mask(gap_counts, gap_cutoff)
    cleaned = clean_gaps(normalized, gaps_to_mask)

    return cleaned, gaps_to_mask
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def calculate_total_masked_gaps(gaps_to_mask):
    """
    Return the cumulative number of alignment positions covered by
    the masked gap regions (start/stop are inclusive coordinates).
    """
    if not gaps_to_mask:
        return 0
    return sum(stop - start + 1 for start, stop in gaps_to_mask)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This contains all varVAMP parameters. Options that can be adjusted by arguments
|
|
3
|
+
are FREQUENCY_THRESHOLD, PRIMER_ALLOWED_N_AMB, AMPLICON_MIN_OVERLAP, AMPLICON_OPT_LENGTH,
|
|
4
|
+
AMPLICON_MAX_LENGTH.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# CAN BE CHANGED
|
|
8
|
+
|
|
9
|
+
# alignment and consensus creation threshold
|
|
10
|
+
FREQUENCY_THRESHOLD = 0.9 # freq at which a nucleotide is considered conserved
|
|
11
|
+
PRIMER_ALLOWED_N_AMB = 4 # allowed number of ambiguous chars in primer
|
|
12
|
+
|
|
13
|
+
# basic primer parameters
|
|
14
|
+
PRIMER_TMP = (57, 63, 60) # temperatur (min, max, opt)
|
|
15
|
+
PRIMER_GC_RANGE = (40, 60, 50) # gc (min, max, opt)
|
|
16
|
+
PRIMER_SIZES = (17, 27, 20) # size (min, max, opt)
|
|
17
|
+
PRIMER_MAX_POLYX = 4 # max number of polyx repeats
|
|
18
|
+
PRIMER_MAX_DINUC_REPEATS = 4 # max number of dinucleotide repeats
|
|
19
|
+
PRIMER_HAIRPIN = 47 # max melting temp for secondary structures
|
|
20
|
+
PRIMER_MAX_GC_END = 3 # max GCs in the last 5 bases of the primer
|
|
21
|
+
PRIMER_GC_CLAMP = 1 # min number of GC nucleotides at the very 3' end
|
|
22
|
+
PRIMER_MIN_3_WITHOUT_AMB = 2 # min len of 3' without ambiguous charaters
|
|
23
|
+
PRIMER_MAX_DIMER_TMP = 47 # max melting temp for dimers (homo- or heterodimers)
|
|
24
|
+
|
|
25
|
+
# PCR parameters
|
|
26
|
+
PCR_MV_CONC = 50 # monovalent cations mM
|
|
27
|
+
PCR_DV_CONC = 2 # divalent cations mM
|
|
28
|
+
PCR_DNTP_CONC = 0.8 # dntp concentration mM
|
|
29
|
+
PCR_DNA_CONC = 50 # primer concentration nM
|
|
30
|
+
|
|
31
|
+
# multipliers for primer base penalties
|
|
32
|
+
PRIMER_TM_PENALTY = 2 # temperature penalty
|
|
33
|
+
PRIMER_GC_PENALTY = 0.2 # gc penalty
|
|
34
|
+
PRIMER_SIZE_PENALTY = 0.5 # size penalty
|
|
35
|
+
PRIMER_MAX_BASE_PENALTY = 8 # max base penalty for a primer
|
|
36
|
+
PRIMER_3_PENALTY = (10, 10, 10) # penalties for 3' mismatches
|
|
37
|
+
PRIMER_PERMUTATION_PENALTY = 0.1 # penalty for the number of permutations
|
|
38
|
+
|
|
39
|
+
# amplicon settings
|
|
40
|
+
AMPLICON_MIN_OVERLAP = 100
|
|
41
|
+
AMPLICON_OPT_LENGTH = 1000
|
|
42
|
+
AMPLICON_MAX_LENGTH = 2000
|
|
43
|
+
|
|
44
|
+
# DO NOT CHANGE
|
|
45
|
+
# nucleotide definitions
|
|
46
|
+
nucs = set("atcg")
|
|
47
|
+
ambig_nucs = {
|
|
48
|
+
"r": ["a", "g"],
|
|
49
|
+
"y": ["c", "t"],
|
|
50
|
+
"s": ["g", "c"],
|
|
51
|
+
"w": ["a", "t"],
|
|
52
|
+
"k": ["g", "t"],
|
|
53
|
+
"m": ["a", "c"],
|
|
54
|
+
"b": ["c", "g", "t"],
|
|
55
|
+
"d": ["a", "g", "t"],
|
|
56
|
+
"h": ["a", "c", "t"],
|
|
57
|
+
"v": ["a", "c", "g"],
|
|
58
|
+
"n": ["a", "c", "g", "t"]
|
|
59
|
+
}
|