varvamp 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +3 -0
- varvamp/__main__.py +5 -0
- varvamp/command.py +263 -0
- varvamp/scripts/__init__.py +0 -0
- varvamp/scripts/alignment.py +223 -0
- varvamp/scripts/config.py +59 -0
- varvamp/scripts/consensus.py +111 -0
- varvamp/scripts/conserved.py +118 -0
- varvamp/scripts/logging.py +321 -0
- varvamp/scripts/primers.py +417 -0
- varvamp/scripts/reporting.py +353 -0
- varvamp/scripts/scheme.py +390 -0
- varvamp-0.3.dist-info/METADATA +53 -0
- varvamp-0.3.dist-info/RECORD +17 -0
- varvamp-0.3.dist-info/WHEEL +5 -0
- varvamp-0.3.dist-info/entry_points.txt +2 -0
- varvamp-0.3.dist-info/top_level.txt +1 -0
varvamp/__init__.py
ADDED
varvamp/__main__.py
ADDED
varvamp/command.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
main workflow
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# BUILT-INS
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
import time
|
|
9
|
+
import argparse
|
|
10
|
+
|
|
11
|
+
# varVAMP
|
|
12
|
+
from . import _program
|
|
13
|
+
from varvamp import __version__
|
|
14
|
+
from varvamp.scripts import logging
|
|
15
|
+
from varvamp.scripts import alignment
|
|
16
|
+
from varvamp.scripts import config
|
|
17
|
+
from varvamp.scripts import consensus
|
|
18
|
+
from varvamp.scripts import conserved
|
|
19
|
+
from varvamp.scripts import primers
|
|
20
|
+
from varvamp.scripts import reporting
|
|
21
|
+
from varvamp.scripts import scheme
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# DEFs
|
|
25
|
+
def get_args(sysargs):
    """
    Build the varvamp argument parser and parse *sysargs*.

    When called without any arguments the help text is printed and
    the process exits with status -1.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description='varvamp: variable virus amplicon design',
        usage='''varvamp <alignment> <output dir> [options]''')
    parser.add_argument(
        "input",
        nargs=2,
        help="alignment file and dir to write results")
    parser.add_argument(
        "-ol", "--opt-length",
        type=int,
        default=config.AMPLICON_OPT_LENGTH,
        help="optimal length of the amplicons")
    parser.add_argument(
        "-ml", "--max-length",
        type=int,
        default=config.AMPLICON_MAX_LENGTH,
        help="max length of the amplicons")
    parser.add_argument(
        "-o", "--overlap",
        type=float,
        default=config.AMPLICON_MIN_OVERLAP,
        help="min overlap of the amplicons")
    parser.add_argument(
        "-t", "--threshold",
        type=float,
        default=config.FREQUENCY_THRESHOLD,
        help="threshold for nucleotides in alignment to be considered conserved")
    parser.add_argument(
        "-a", "--allowed-ambiguous",
        type=int,
        default=config.PRIMER_ALLOWED_N_AMB,
        help="number of ambiguous characters that are allowed within a primer")
    parser.add_argument(
        "--console",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="show varvamp console output")
    parser.add_argument(
        "-v", "--version",
        action='version',
        version=f"varvamp {__version__}")

    # no arguments at all -> show the full help instead of an argparse error
    if not sysargs:
        parser.print_help()
        sys.exit(-1)
    return parser.parse_args(sysargs)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def main(sysargs=None):
    """
    Run the main varVAMP workflow.

    Steps: parse arguments, preprocess the alignment, build consensus
    sequences, find conserved regions, digest them into kmers, design
    and filter primers, assemble an amplicon scheme, check for
    heterodimers and write all report files.

    Parameters
    ----------
    sysargs : list of str, optional
        Command line arguments; defaults to ``sys.argv[1:]``.
        (Previously the default was evaluated at import time.)
    """
    # start varVAMP
    if sysargs is None:
        sysargs = sys.argv[1:]
    args = get_args(sysargs)
    if not args.console:
        # silence all progress output
        sys.stdout = open(os.devnull, 'w')
    start_time = time.process_time()
    results_dir, data_dir, log_file = logging.create_dir_structure(args.input[1])
    logging.raise_arg_errors(args, log_file)
    logging.varvamp_progress(log_file)
    # config check
    logging.confirm_config(args, log_file)
    logging.varvamp_progress(
        log_file,
        progress=0.1,
        job="Checking config.",
        progress_text="config file passed"
    )
    # preprocess and clean alignment of gaps
    alignment_cleaned, gaps_to_mask = alignment.process_alignment(
        args.input[0],
        args.threshold
    )
    logging.varvamp_progress(
        log_file,
        progress=0.2,
        job="Preprocessing alignment and cleaning gaps.",
        progress_text=f"{len(gaps_to_mask)} gaps with {alignment.calculate_total_masked_gaps(gaps_to_mask)} nucleotides"
    )
    # create consensus sequences
    majority_consensus, ambiguous_consensus = consensus.create_consensus(
        alignment_cleaned,
        args.threshold
    )
    logging.varvamp_progress(
        log_file,
        progress=0.3,
        job="Creating consensus sequences.",
        progress_text=f"length of the consensus is {len(majority_consensus)} nt"
    )
    # generate conserved region list
    conserved_regions = conserved.find_regions(
        ambiguous_consensus,
        args.allowed_ambiguous
    )
    if not conserved_regions:
        logging.raise_error(
            "nothing conserved. Lower the threshold!",
            log_file,
            exit=True
        )
    logging.varvamp_progress(
        log_file,
        progress=0.4,
        job="Finding conserved regions.",
        progress_text=f"{conserved.mean(conserved_regions, majority_consensus)} % conserved"
    )
    # produce kmers for all conserved regions
    kmers = conserved.produce_kmers(
        conserved_regions,
        majority_consensus
    )
    logging.varvamp_progress(
        log_file,
        progress=0.5,
        job="Digesting into kmers.",
        progress_text=f"{len(kmers)} kmers"
    )
    # find potential primers
    left_primer_candidates, right_primer_candidates = primers.find_primers(
        kmers,
        ambiguous_consensus,
        alignment_cleaned
    )
    # renamed loop variable: `type` shadowed the builtin
    for direction, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
        if not primer_candidates:
            logging.raise_error(
                f"no {direction} primers found.\n",
                log_file,
                exit=True
            )
    logging.varvamp_progress(
        log_file,
        progress=0.6,
        job="Filtering for primers.",
        progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rw potential primers"
    )
    # find best primers and create primer dict
    all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates)
    logging.varvamp_progress(
        log_file,
        progress=0.7,
        job="Considering only high scoring primers.",
        progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rw primers"
    )
    # find all possible amplicons
    amplicons = scheme.find_amplicons(
        all_primers,
        args.opt_length,
        args.max_length
    )
    if not amplicons:
        logging.raise_error(
            "no amplicons found. Increase the max "
            "amplicon length or lower threshold!\n",
            log_file,
            exit=True
        )
    amplicon_graph = scheme.create_amplicon_graph(amplicons, args.overlap)
    logging.varvamp_progress(
        log_file,
        progress=0.8,
        job="Finding potential amplicons.",
        progress_text=f"{len(amplicons)} potential amplicons"
    )
    # search for amplicon scheme
    coverage, amplicon_scheme = scheme.find_best_covering_scheme(
        amplicons,
        amplicon_graph,
        all_primers
    )
    dimers_not_solved = scheme.check_and_solve_heterodimers(
        amplicon_scheme,
        left_primer_candidates,
        right_primer_candidates,
        all_primers
    )
    if dimers_not_solved:
        logging.raise_error(
            f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incompatible amplicons in a separate reaction.",
            log_file
        )
        # BUGFIX: previously the *builtin* `dir` was passed as the output
        # directory. Writing to the results dir -- TODO confirm this is the
        # intended location for the dimer report.
        reporting.write_dimers(results_dir, dimers_not_solved)
    percent_coverage = round(coverage / len(ambiguous_consensus) * 100, 2)
    logging.varvamp_progress(
        log_file,
        progress=0.9,
        job="Creating amplicon scheme.",
        progress_text=f"{percent_coverage} % total coverage with {len(amplicon_scheme[0]) + len(amplicon_scheme[1])} amplicons"
    )
    if percent_coverage < 70:
        logging.raise_error(
            "coverage < 70 %. Possible solutions:\n"
            "\t - lower threshold\n"
            "\t - increase amplicons lengths\n"
            "\t - increase number of ambiguous nucleotides\n"
            "\t - relax primer settings (not recommended)\n",
            log_file
        )
    # write files
    reporting.write_alignment(data_dir, alignment_cleaned)
    reporting.write_fasta(data_dir, "majority_consensus", majority_consensus)
    reporting.write_fasta(results_dir, "ambiguous_consensus", ambiguous_consensus)
    reporting.write_conserved_to_bed(conserved_regions, data_dir)
    reporting.write_all_primers(data_dir, all_primers)
    reporting.write_scheme_to_files(
        results_dir,
        amplicon_scheme,
        ambiguous_consensus
    )
    reporting.varvamp_plot(
        results_dir,
        args.threshold,
        alignment_cleaned,
        conserved_regions,
        all_primers,
        amplicon_scheme,
    )
    logging.varvamp_progress(log_file, progress=1, start_time=start_time)
|
|
File without changes
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
alignment preprocessing
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# BUILT-INS
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# LIBS
|
|
9
|
+
from Bio import AlignIO
|
|
10
|
+
from Bio.Seq import Seq
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def read_alignment(alignment_path):
    """
    Parse a fasta alignment with AlignIO and return it as a
    list of [id, sequence-string] pairs.
    """
    return [
        [record.id, str(record.seq)]
        for record in AlignIO.read(alignment_path, "fasta")
    ]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def preprocess(alignment):
    """
    Normalize each aligned sequence: lower-case all nucleotides and
    back-transcribe to DNA when the sequence contains RNA ("u").
    """
    normalized = []

    for entry in alignment:
        nucleotides = Seq(entry[1]).lower()
        if "u" in nucleotides:
            # RNA input -> replace u with t
            nucleotides = nucleotides.back_transcribe()
        normalized.append([entry[0], str(nucleotides)])

    return normalized
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def find_gaps_in_alignment(alignment):
    """
    Locate every run of "-" characters in each aligned sequence.

    Returns one list per sequence of (start, stop) tuples where stop
    is the index of the last gap position (inclusive).
    """
    gap_pattern = re.compile("-{1,}")
    gaps_per_sequence = []

    for record in alignment:
        gaps_per_sequence.append(
            [(match.start(0), match.end(0) - 1)
             for match in gap_pattern.finditer(record[1])]
        )

    return gaps_per_sequence
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def find_unique_gaps(all_gaps):
    """
    Collapse the per-sequence gap lists into a list of distinct gaps.
    """
    distinct = set()
    for gap_list in all_gaps:
        distinct.update(gap_list)
    return list(distinct)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def find_internal_gaps(unique_gaps, gap):
    """
    Return all unique gaps that lie completely within *gap*.

    Containment follows the original half-open range convention: a
    unique gap (u0, u1) counts as internal when range(u0, u1) is
    non-empty and fully contained in range(gap[0], gap[1]). A gap of
    length one has no internal gaps and is returned as is.
    """
    # single-position gap -> nothing can nest inside it
    if gap[1] - gap[0] == 0:
        return [gap]

    contained = []
    for candidate in unique_gaps:
        # an empty candidate range never intersects anything
        if candidate[0] >= candidate[1]:
            continue
        # containment of [c0, c1) in [g0, g1) is equivalent to the
        # original min/max intersection check
        if gap[0] <= candidate[0] and candidate[1] <= gap[1]:
            contained.append(candidate)

    return contained
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def create_gap_dictionary(unique_gaps, all_gaps):
    """
    Count how often each unique gap occurs across all sequences,
    including occurrences nested inside a larger gap of a sequence.
    """
    gap_counts = {}

    for gap_list in all_gaps:
        for gap in gap_list:
            for internal_gap in find_internal_gaps(unique_gaps, gap):
                gap_counts[internal_gap] = gap_counts.get(internal_gap, 0) + 1

    return gap_counts
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def find_gaps_to_mask(gap_dict, cutoff):
    """
    Select gaps supported by enough sequences and merge overlaps.

    Parameters
    ----------
    gap_dict : dict
        (start, stop) gap tuple -> number of sequences supporting it.
    cutoff : float
        gaps are kept only when their count is strictly greater.

    Returns
    -------
    list of [start, stop]
        sorted, non-overlapping regions to mask.

    BUGFIX: the previous merging loop never flushed the last opened
    region (a sole region or an overlapping final region was dropped
    or truncated via the ``i == len(...)-1`` shortcut) and a region
    nested inside the opened one reset its extent. Replaced with a
    standard sorted-interval merge.
    """
    # keep only gaps covered by enough sequences
    frequent_gaps = sorted(
        gap for gap, count in gap_dict.items() if count > cutoff
    )

    # merge overlapping regions (sorted by start, then stop)
    gaps_to_mask = []
    for start, stop in frequent_gaps:
        if gaps_to_mask and start <= gaps_to_mask[-1][1]:
            # overlaps the previous region -> extend it
            gaps_to_mask[-1][1] = max(gaps_to_mask[-1][1], stop)
        else:
            gaps_to_mask.append([start, stop])

    return gaps_to_mask
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def clean_gaps(alignment, gaps_to_mask):
    """
    clean an alignment of large common deletions.

    Every region in gaps_to_mask is cut out of each sequence and
    replaced by a single "N" placeholder (the "N" is inserted before
    the following sequence chunk). Returns a new alignment of
    [id, masked_sequence] pairs; the input is not modified.

    NOTE(review): requires a non-empty gaps_to_mask - max() raises
    ValueError on an empty list (the caller only invokes this when
    gaps were found).
    """
    cleaned_alignment = []

    for sequence in alignment:
        start = 0
        masked_seq = str()
        for region in gaps_to_mask:
            # copy the chunk between the previous region and this one
            stop = region[0]
            masked_seq_temp = sequence[1][start:stop]
            # check if the deletion is at the start
            if len(masked_seq_temp) != 0:
                # "N" marks the masked gap preceding this chunk
                masked_seq = (masked_seq + "N" + masked_seq_temp)
            # resume copying after the masked region
            start = region[1]+1
        if max(gaps_to_mask)[1] < len(sequence[1])-1:
            # append the last gaps if it is not
            # the end of the sequence
            # NOTE(review): the tail is sliced from max(...)[1] to
            # len-1 (not +1 / not len) - confirm the intended
            # boundary handling for the final chunk
            start = max(gaps_to_mask)[1]
            stop = len(sequence[1])-1
            masked_seq_temp = sequence[1][start:stop]
            masked_seq = (masked_seq + "N" + masked_seq_temp)
        else:
            # append the mask to the end of the seq
            masked_seq = masked_seq + "N"

        cleaned_alignment.append([sequence[0], masked_seq])

    return cleaned_alignment
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def process_alignment(alignment_path, threshold):
    """
    Preprocess an alignment and clean it of common gaps.

    Reads the fasta alignment, normalizes the sequences, and masks
    gap regions shared by enough sequences (count must exceed
    len(alignment) * (1 - threshold)). Returns the cleaned alignment
    and the list of masked gap regions.
    """
    raw_alignment = read_alignment(alignment_path)
    gap_cutoff = len(raw_alignment) * (1 - threshold)
    preprocessed = preprocess(raw_alignment)

    all_gaps = find_gaps_in_alignment(preprocessed)
    unique_gaps = find_unique_gaps(all_gaps)
    if not unique_gaps:
        # nothing to mask -> alignment is returned unchanged
        return preprocessed, []

    gap_counts = create_gap_dictionary(unique_gaps, all_gaps)
    gaps_to_mask = find_gaps_to_mask(gap_counts, gap_cutoff)
    cleaned = clean_gaps(preprocessed, gaps_to_mask)

    return cleaned, gaps_to_mask
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def calculate_total_masked_gaps(gaps_to_mask):
    """
    Return the cumulative number of nucleotide positions covered
    by the masked gap regions (inclusive boundaries).
    """
    return sum(stop - start + 1 for start, stop in gaps_to_mask)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""
This contains all varVAMP parameters. Options that can be adjusted by arguments
are FREQUENCY_THRESHOLD, PRIMER_ALLOWED_N_AMB, AMPLICON_MIN_OVERLAP, AMPLICON_OPT_LENGTH,
AMPLICON_MAX_LENGTH.
"""

# CAN BE CHANGED

# alignment and consensus creation threshold
FREQUENCY_THRESHOLD = 0.9  # freq at which a nucleotide is considered conserved
PRIMER_ALLOWED_N_AMB = 4  # allowed number of ambiguous chars in primer

# basic primer parameters
PRIMER_TMP = (57, 63, 60)  # melting temperature (min, max, opt)
PRIMER_GC_RANGE = (40, 60, 50)  # gc content (min, max, opt)
PRIMER_SIZES = (17, 27, 20)  # size (min, max, opt)
PRIMER_MAX_POLYX = 4  # max number of polyx repeats
PRIMER_MAX_DINUC_REPEATS = 4  # max number of dinucleotide repeats
PRIMER_HAIRPIN = 47  # max melting temp for secondary structures
PRIMER_MAX_GC_END = 3  # max GCs in the last 5 bases of the primer
PRIMER_GC_CLAMP = 1  # min number of GC nucleotides at the very 3' end
PRIMER_MIN_3_WITHOUT_AMB = 2  # min len of 3' without ambiguous characters
PRIMER_MAX_DIMER_TMP = 47  # max melting temp for dimers (homo- or heterodimers)

# PCR parameters
PCR_MV_CONC = 50  # monovalent cations mM
PCR_DV_CONC = 2  # divalent cations mM
PCR_DNTP_CONC = 0.8  # dntp concentration mM
PCR_DNA_CONC = 50  # primer concentration nM

# multipliers for primer base penalties
PRIMER_TM_PENALTY = 2  # temperature penalty
PRIMER_GC_PENALTY = 0.2  # gc penalty
PRIMER_SIZE_PENALTY = 0.5  # size penalty
PRIMER_MAX_BASE_PENALTY = 8  # max base penalty for a primer
PRIMER_3_PENALTY = (10, 10, 10)  # penalties for 3' mismatches
PRIMER_PERMUTATION_PENALTY = 0.1  # penalty for the number of permutations

# amplicon settings
AMPLICON_MIN_OVERLAP = 100  # min overlap between neighbouring amplicons (nt)
AMPLICON_OPT_LENGTH = 1000  # optimal amplicon length (nt)
AMPLICON_MAX_LENGTH = 2000  # maximal amplicon length (nt)

# DO NOT CHANGE
# nucleotide definitions
nucs = set("atcg")
# IUPAC ambiguity codes mapped to the plain nucleotides they represent
ambig_nucs = {
    "r": ["a", "g"],
    "y": ["c", "t"],
    "s": ["g", "c"],
    "w": ["a", "t"],
    "k": ["g", "t"],
    "m": ["a", "c"],
    "b": ["c", "g", "t"],
    "d": ["a", "g", "t"],
    "h": ["a", "c", "t"],
    "v": ["a", "c", "g"],
    "n": ["a", "c", "g", "t"]
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
consensus creation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# BUILT-INS
|
|
6
|
+
import collections
|
|
7
|
+
|
|
8
|
+
# varVAMP
|
|
9
|
+
from varvamp.scripts import config
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def determine_nucleotide_counts(alignment, idx):
    """
    Count the nucleotides at column *idx* of the alignment.

    Ambiguous IUPAC characters are distributed over the plain bases
    they represent, each base receiving count/len(permutations); gap
    characters ("-") are dropped. Returns a dict sorted by decreasing
    count.

    BUGFIX: each ambiguous character previously contributed only
    1/len(permutations) in total, regardless of how many sequences
    carried it; the contribution is now weighted by its occurrence
    count.
    """
    # collect the alignment column
    nucleotide_list = [sequence[1][idx] for sequence in alignment]
    # count occurrences of each character
    counter = dict(collections.Counter(nucleotide_list))

    # expand ambiguous characters into fractional counts of plain bases
    to_delete = []
    temp_dict = {}
    for nucleotide, count in counter.items():
        if nucleotide in config.ambig_nucs:
            to_delete.append(nucleotide)
            permutations = config.ambig_nucs[nucleotide]
            # weight by occurrence count of the ambiguous character
            adjusted_freq = count / len(permutations)
            for permutation in permutations:
                temp_dict[permutation] = temp_dict.get(permutation, 0) + adjusted_freq
        if nucleotide == "-":
            # gaps carry no nucleotide information
            to_delete.append(nucleotide)

    # drop ambiguous/gap entries and fold in the adjusted frequencies
    for nucleotide in to_delete:
        counter.pop(nucleotide)
    for nucleotide, freq in temp_dict.items():
        counter[nucleotide] = counter.get(nucleotide, 0) + freq

    return dict(sorted(counter.items(), key=lambda x: x[1], reverse=True))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_consensus_nucleotides(nucleotide_counts, consensus_cutoff):
    """
    Collect nucleotides for one consensus position.

    Takes nucleotides from *nucleotide_counts* (expected sorted by
    decreasing count) until their cumulative count reaches the
    cutoff; when the dict is exhausted first, all its keys are
    returned.
    """
    consensus_nucleotides = []
    cumulative = 0

    for nucleotide, count in nucleotide_counts.items():
        cumulative += count
        consensus_nucleotides.append(nucleotide)
        if cumulative >= consensus_cutoff:
            break

    return consensus_nucleotides
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def get_ambiguous_char(nucleotides):
    """
    Return the IUPAC ambiguity character whose permutation set equals
    the given nucleotides, or None when there is no match.
    """
    wanted = set(nucleotides)
    for ambiguous, permutations in config.ambig_nucs.items():
        if wanted == set(permutations):
            return ambiguous
    return None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def create_consensus(alignment, threshold):
    """
    Build two consensus sequences from the alignment.

    The majority consensus takes the most frequent nucleotide at each
    column; the ambiguous consensus uses an IUPAC ambiguity character
    whenever no single nucleotide reaches the frequency threshold.

    Returns (majority_consensus, ambiguous_consensus) as strings.
    """
    majority_chars = []
    ambiguous_chars = []

    # a column is resolved once this many sequences are covered
    consensus_cutoff = len(alignment) * threshold
    # consensus length is taken from the first aligned sequence
    length_consensus = len(alignment[0][1])

    for idx in range(length_consensus):
        counts = determine_nucleotide_counts(alignment, idx)
        consensus_nucleotide = get_consensus_nucleotides(counts, consensus_cutoff)
        if len(consensus_nucleotide) > 1:
            # several nucleotides needed -> encode as IUPAC character
            ambiguous_chars.append(get_ambiguous_char(consensus_nucleotide))
        else:
            ambiguous_chars.append(consensus_nucleotide[0])
        # majority consensus always takes the most frequent nucleotide
        majority_chars.append(consensus_nucleotide[0])

    return "".join(majority_chars), "".join(ambiguous_chars)
|