varvamp 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +3 -0
- varvamp/__main__.py +5 -0
- varvamp/command.py +263 -0
- varvamp/scripts/__init__.py +0 -0
- varvamp/scripts/alignment.py +223 -0
- varvamp/scripts/config.py +59 -0
- varvamp/scripts/consensus.py +111 -0
- varvamp/scripts/conserved.py +118 -0
- varvamp/scripts/logging.py +321 -0
- varvamp/scripts/primers.py +417 -0
- varvamp/scripts/reporting.py +353 -0
- varvamp/scripts/scheme.py +390 -0
- varvamp-0.3.dist-info/METADATA +53 -0
- varvamp-0.3.dist-info/RECORD +17 -0
- varvamp-0.3.dist-info/WHEEL +5 -0
- varvamp-0.3.dist-info/entry_points.txt +2 -0
- varvamp-0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
"""
|
|
2
|
+
primer creation and evaluation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# LIBS
|
|
6
|
+
from Bio.Seq import Seq
|
|
7
|
+
import primer3 as p3
|
|
8
|
+
|
|
9
|
+
# varVAMP
|
|
10
|
+
from varvamp.scripts import config
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def calc_gc(seq):
    """
    return the gc content of a sequence in percent.
    only lower case "g" and "c" are counted.
    """
    gc_count = sum(seq.count(nuc) for nuc in ("g", "c"))
    return 100 * gc_count / len(seq)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def calc_temp(seq):
    """
    melting temperature of a sequence as calculated by primer3
    (calc_tm) for the upper case sequence and the PCR
    concentrations defined in the config
    """
    template = seq.upper()
    pcr_conditions = {
        "mv_conc": config.PCR_MV_CONC,
        "dv_conc": config.PCR_DV_CONC,
        "dntp_conc": config.PCR_DNTP_CONC,
        "dna_conc": config.PCR_DNA_CONC,
    }
    return p3.calc_tm(template, **pcr_conditions)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def calc_hairpin(seq):
    """
    hairpin calculation of primer3 (calc_hairpin) for the upper
    case sequence and the PCR concentrations defined in the
    config. returns whatever primer3 returns.
    """
    template = seq.upper()
    pcr_conditions = {
        "mv_conc": config.PCR_MV_CONC,
        "dv_conc": config.PCR_DV_CONC,
        "dntp_conc": config.PCR_DNTP_CONC,
        "dna_conc": config.PCR_DNA_CONC,
    }
    return p3.calc_hairpin(template, **pcr_conditions)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def calc_dimer(seq1, seq2):
    """
    heterodimerization thermodynamics of two DNA sequences as
    calculated by primer3 (calc_heterodimer) for the PCR
    concentrations defined in the config.
    Return primer3 thermo object.
    """
    first, second = seq1.upper(), seq2.upper()
    pcr_conditions = {
        "mv_conc": config.PCR_MV_CONC,
        "dv_conc": config.PCR_DV_CONC,
        "dna_conc": config.PCR_DNA_CONC,
        "dntp_conc": config.PCR_DNTP_CONC,
    }
    return p3.calc_heterodimer(first, second, **pcr_conditions)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def calc_max_polyx(seq):
    """
    calculate the maximum polyx of a seq. counted is how often
    a nucleotide directly repeats its predecessor, so a stretch
    of four identical nucleotides gives 3 and a sequence without
    any stretch gives 0.
    """
    run_nuc = seq[0]
    run_repeats = 0
    longest = 0
    for nuc in seq[1:]:
        if nuc != run_nuc:
            # a new stretch starts with this nucleotide
            run_nuc = nuc
            run_repeats = 0
            continue
        run_repeats += 1
        longest = max(longest, run_repeats)
    return longest
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def calc_max_dinuc_repeats(seq):
    """
    calculate the maximum amount of repeating dinucleotides
    in a sequence. both reading frames (starting at the first
    and at the second nucleotide) are scanned and the longest
    run of both is returned. counted is how often a dinucleotide
    directly repeats its predecessor, so "atatat" gives 2 and a
    sequence without tandem dinucleotides gives 0.
    """
    # shared by both frames so that the result of the first
    # frame is not overwritten by the second one
    max_dinuc = 0

    for frame in (seq, seq[1:]):
        previous_dinuc = frame[0:2]
        counter = 0
        for i in range(2, len(frame), 2):
            current_dinuc = frame[i:i+2]
            if current_dinuc == previous_dinuc:
                counter += 1
                # update within the run, otherwise a repeat that
                # reaches the end of the sequence would be lost
                if counter > max_dinuc:
                    max_dinuc = counter
            else:
                counter = 0
                previous_dinuc = current_dinuc

    return max_dinuc
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def calc_end_gc(seq):
    """
    count the lower case "g" and "c" nucleotides
    within the last 5 bases of the 3' end
    """
    three_prime_end = seq[-5:]
    return sum(three_prime_end.count(nuc) for nuc in ("g", "c"))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def gc_clamp_present(seq):
    """
    checks if a gc clamp is present, i.e. if each of the last
    config.PRIMER_GC_CLAMP nucleotides is a "g" or a "c". if the
    sequence is shorter than the clamp the whole sequence is
    checked. a clamp length of 0 (or less) disables the check and
    always returns True. an empty sequence has no clamp (False).
    """
    clamp_length = config.PRIMER_GC_CLAMP

    if clamp_length <= 0:
        return True

    three_prime_end = seq[-clamp_length:]
    # nothing to check - previously this ended in an unbound variable
    if not three_prime_end:
        return False

    return all(nuc in "cg" for nuc in three_prime_end)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def is_three_prime_ambiguous(amb_seq):
    """
    determine if a sequence contains an ambiguous char within the
    last config.PRIMER_MIN_3_WITHOUT_AMB chars of its 3'prime end.
    each char that is not part of config.nucs counts as ambiguous.
    a length of 0 (or less) disables the check and returns False.
    """
    len_3_prime = config.PRIMER_MIN_3_WITHOUT_AMB

    if len_3_prime <= 0:
        return False

    # slicing with a negative start gives the whole sequence if it is
    # shorter than the window. calculating the start as len - window
    # would wrap around and only check a part of such a sequence.
    three_prime_end = amb_seq[-len_3_prime:]

    return any(nuc not in config.nucs for nuc in three_prime_end)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def rev_complement(seq):
    """
    return the reverse complement of a sequence as a string
    """
    reverse_complemented = Seq(seq).reverse_complement()
    return str(reverse_complemented)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def calc_permutation_penalty(amb_seq):
    """
    number of permutations of a primer with ambiguous nucleotides
    multiplied with config.PRIMER_PERMUTATION_PENALTY. the number
    of permutations is the product of the number of options that
    config.ambig_nucs lists for each ambiguous nucleotide and 0
    if the sequence has no ambiguous nucleotide.
    """
    options_per_amb_nuc = [
        len(config.ambig_nucs[nuc])
        for nuc in amb_seq
        if nuc in config.ambig_nucs
    ]

    permutations = 0
    for n_options in options_per_amb_nuc:
        # the first ambiguous nucleotide starts the product
        permutations = n_options if permutations == 0 else permutations*n_options

    return permutations*config.PRIMER_PERMUTATION_PENALTY
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def calc_base_penalty(seq):
    """
    Calculate intrinsic primer penalty: the deviations of the
    melting temperature, the gc content and the length from
    their config values at index 2, each multiplied with the
    respective penalty weight of the config, are summed up.
    """
    # (observed value, config value to compare with, penalty weight)
    scored_properties = (
        (calc_temp(seq), config.PRIMER_TMP[2], config.PRIMER_TM_PENALTY),
        (calc_gc(seq), config.PRIMER_GC_RANGE[2], config.PRIMER_GC_PENALTY),
        (len(seq), config.PRIMER_SIZES[2], config.PRIMER_SIZE_PENALTY),
    )

    penalty = 0
    for value, optimum, weight in scored_properties:
        if value > optimum:
            penalty += weight*(value - optimum)
        elif value < optimum:
            penalty += weight*(optimum - value)

    return penalty
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def calc_per_base_mismatches(kmer, alignment, ambiguous_consensus):
    """
    calculate for a given kmer with [seq, start, stop] how often
    each kmer position mismatches with the alignment. the kmer is
    represented by the ambiguous consensus between start and stop
    and compared to the same slice of each alignment entry (the
    sequence is read from index 1 of an entry). ambiguous nucs of
    the kmer or the alignment sequence only mismatch if they can
    not encode the same nucleotide. returns a list with the share
    of mismatching sequences (mismatches divided by the number of
    alignment entries, rounded to 2 digits) for each kmer position
    from 5' to 3'.
    """
    start, stop = kmer[1], kmer[2]
    ambig = config.ambig_nucs
    mismatch_counts = [0]*len(kmer[0])
    # kmer with ambiguous nucs
    amb_kmer = ambiguous_consensus[start:stop]

    for sequence in alignment:
        for idx, slice_nuc in enumerate(sequence[1][start:stop]):
            kmer_nuc = amb_kmer[idx]
            if slice_nuc == kmer_nuc:
                continue
            slice_is_amb = slice_nuc in ambig
            kmer_is_amb = kmer_nuc in ambig
            if slice_is_amb and kmer_is_amb:
                # both ambiguous -> mismatch if they share no nucleotide
                is_mismatch = set(ambig[slice_nuc]).isdisjoint(ambig[kmer_nuc])
            elif slice_is_amb:
                # kmer nuc has to be encoded by the ambiguous slice nuc
                is_mismatch = kmer_nuc not in ambig[slice_nuc]
            elif kmer_is_amb:
                # slice nuc has to be encoded by the ambiguous kmer nuc
                is_mismatch = slice_nuc not in ambig[kmer_nuc]
            else:
                # two different unambiguous chars
                is_mismatch = True
            if is_mismatch:
                mismatch_counts[idx] += 1

    return [round(count/len(alignment), 2) for count in mismatch_counts]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def calc_3_prime_penalty(direction, mismatches):
    """
    calculate the penalty for mismatches at the 3' end.
    the more mismatches are closer to the 3' end of the kmer,
    the higher the penalty. uses the previously calculated
    mismatch list. the values of config.PRIMER_3_PENALTY are
    multiplied with the mismatches starting at the last kmer
    position for "+" and at the first kmer position for "-".
    returns 0 if config.PRIMER_3_PENALTY is empty or not set.
    raises a ValueError for any other direction.
    """
    if not config.PRIMER_3_PENALTY:
        return 0

    if direction == "+":
        from_three_prime = mismatches[::-1]
    elif direction == "-":
        from_three_prime = mismatches
    else:
        # previously this ended in an unbound variable
        raise ValueError(
            'direction has to be "+" or "-" but is {!r}'.format(direction)
        )

    # zip stops at the shorter input, so only as many positions
    # as there are penalty values contribute
    return sum(m * p for m, p in zip(from_three_prime, config.PRIMER_3_PENALTY))
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def filter_kmer_direction_independent(seq):
    """
    filter kmer for temperature, gc content, poly x,
    dinucleotide repeats, base penalty and homodimerization.
    returns True if the kmer is within all config limits. the
    checks run in this order and stop at the first failure.
    """
    if not config.PRIMER_TMP[0] <= calc_temp(seq) <= config.PRIMER_TMP[1]:
        return False
    if not config.PRIMER_GC_RANGE[0] <= calc_gc(seq) <= config.PRIMER_GC_RANGE[1]:
        return False
    if not calc_max_polyx(seq) <= config.PRIMER_MAX_POLYX:
        return False
    if not calc_max_dinuc_repeats(seq) <= config.PRIMER_MAX_DINUC_REPEATS:
        return False
    if not calc_base_penalty(seq) <= config.PRIMER_MAX_BASE_PENALTY:
        return False
    # homodimer: the kmer is tested against itself
    return calc_dimer(seq, seq).tm <= config.PRIMER_MAX_DIMER_TMP
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
    """
    filter for 3'ambiguous, hairpin temp, gc clamp and end GC
    content. this differs depending on the direction of the kmer:
    for "+" the kmer with [seq, start, stop] is tested as it is,
    for "-" the reverse complement of the kmer and of its slice
    of the ambiguous consensus are tested. returns True if the
    kmer passes all filters. raises a ValueError for any other
    direction.
    """
    # get the correct kmer and amb kmer to test
    kmer_seq = kmer[0]
    amb_kmer_seq = ambiguous_consensus[kmer[1]:kmer[2]]

    if direction == "-":
        kmer_seq = rev_complement(kmer_seq)
        amb_kmer_seq = rev_complement(amb_kmer_seq)
    elif direction != "+":
        # previously this ended in an unbound variable
        raise ValueError(
            'direction has to be "+" or "-" but is {!r}'.format(direction)
        )

    # filter kmer
    return(
        (calc_hairpin(kmer_seq).tm <= config.PRIMER_HAIRPIN)
        and (calc_end_gc(kmer_seq) <= config.PRIMER_MAX_GC_END)
        and gc_clamp_present(kmer_seq)
        and not is_three_prime_ambiguous(amb_kmer_seq)
    )
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def find_primers(kmers, ambiguous_consensus, alignment):
    """
    filter kmers direction specific and append penalties
    --> potential primers

    each kmer is [seq, start, stop]. a kmer that passes the
    direction independent filter is tested for both directions
    and stored for each direction it passes as
    [seq, start, stop, penalty, per base mismatches]. right ("-")
    primers are stored with their reverse complemented sequence.
    returns the list of left and the list of right candidates.
    """
    left_primer_candidates, right_primer_candidates = [], []

    for kmer in kmers:
        kmer_seq, start, stop = kmer[0], kmer[1], kmer[2]
        # direction independent stats decide if the kmer is considered at all
        if not filter_kmer_direction_independent(kmer_seq):
            continue
        # penalties and mismatches that are the same for both directions
        base_penalty = calc_base_penalty(kmer_seq)
        per_base_mismatches = calc_per_base_mismatches(
            kmer, alignment, ambiguous_consensus
        )
        permutation_penalty = calc_permutation_penalty(
            ambiguous_consensus[start:stop]
        )
        for direction in ("+", "-"):
            if not filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
                continue
            # the 3' penalty depends on the direction
            primer_penalty = (
                base_penalty
                + permutation_penalty
                + calc_3_prime_penalty(direction, per_base_mismatches)
            )
            if direction == "+":
                left_primer_candidates.append(
                    [kmer_seq, start, stop, primer_penalty, per_base_mismatches]
                )
            else:
                right_primer_candidates.append(
                    [rev_complement(kmer_seq), start, stop, primer_penalty, per_base_mismatches]
                )

    return left_primer_candidates, right_primer_candidates
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def create_primer_dictionary(primer_candidates, direction):
    """
    creates a primer dictionary from primer list. the primers
    are named LEFT_<idx> for direction "+" and RIGHT_<idx> for
    direction "-". idx is the position of the primer in the
    list starting at 0. raises a ValueError for any other
    direction.
    """
    # the name prefix is the same for all primers of the list
    if direction == "+":
        direction_name = "LEFT"
    elif direction == "-":
        direction_name = "RIGHT"
    else:
        # previously this ended in an unbound variable
        raise ValueError(
            'direction has to be "+" or "-" but is {!r}'.format(direction)
        )

    return {
        direction_name + "_" + str(primer_idx): primer
        for primer_idx, primer in enumerate(primer_candidates)
    }
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def find_best_primers(left_primer_candidates, right_primer_candidates):
    """
    Primer candidates are likely overlapping. Here, the list of primers
    is sorted for the best to worst scoring (lowest penalty first). Then,
    the next best scoring is retained if it does not have any nucleotides
    that have already been covered by a better scoring primer candidate.
    This significantly reduces the amount of primers while retaining the
    best scoring ones.

    Each candidate is [seq, start, stop, penalty, ...]. Both candidate
    lists are sorted in place. Returns a dictionary with the keys "+" and
    "-" that each hold the primer dictionary of the retained primers
    sorted by their start. A direction without candidates results in an
    empty primer dictionary.
    """
    all_primers = {}

    for direction, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
        # sort the primers for the best scoring
        primer_candidates.sort(key=lambda x: x[3])
        to_retain = []
        covered_positions = set()

        # starting with nothing covered always retains the best scoring
        # primer and also copes with an empty candidate list
        for primer in primer_candidates:
            # stop is included, so primers that directly follow each
            # other are also considered as overlapping
            primer_positions = range(primer[1], primer[2]+1)
            # check if none of the nucleotides of the next primer
            # are already covered by a better primer
            if covered_positions.isdisjoint(primer_positions):
                covered_positions.update(primer_positions)
                to_retain.append(primer)

        # sort by start
        to_retain.sort(key=lambda x: x[1])
        # create dict
        all_primers[direction] = create_primer_dictionary(to_retain, direction)

    return all_primers
|