varvamp 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,417 @@
1
+ """
2
+ primer creation and evaluation
3
+ """
4
+
5
+ # LIBS
6
+ from Bio.Seq import Seq
7
+ import primer3 as p3
8
+
9
+ # varVAMP
10
+ from varvamp.scripts import config
11
+
12
+
13
def calc_gc(seq):
    """
    calculate the gc content of a sequence in percent.

    counting is case-insensitive, so upper case input does not
    silently yield 0. an empty sequence has no defined gc content
    and is reported as 0.0 instead of raising a ZeroDivisionError.
    """
    if not seq:
        return 0.0
    seq = seq.lower()
    return 100*(seq.count("g")+seq.count("c"))/len(seq)
18
+
19
+
20
def calc_temp(seq):
    """
    melting temperature of a sequence as computed by primer3.

    the sequence is handed over in upper case and the reaction
    conditions are read from the PCR_* settings of the config.
    """
    template = seq.upper()
    pcr_conditions = {
        "mv_conc": config.PCR_MV_CONC,
        "dv_conc": config.PCR_DV_CONC,
        "dntp_conc": config.PCR_DNTP_CONC,
        "dna_conc": config.PCR_DNA_CONC,
    }
    return p3.calc_tm(template, **pcr_conditions)
31
+
32
+
33
def calc_hairpin(seq):
    """
    hairpin calculation of a sequence with primer3.

    the sequence is handed over in upper case and the reaction
    conditions are read from the PCR_* settings of the config.
    returns the primer3 result unchanged (callers in this module
    read its .tm attribute).
    """
    template = seq.upper()
    pcr_conditions = {
        "mv_conc": config.PCR_MV_CONC,
        "dv_conc": config.PCR_DV_CONC,
        "dntp_conc": config.PCR_DNTP_CONC,
        "dna_conc": config.PCR_DNA_CONC,
    }
    return p3.calc_hairpin(template, **pcr_conditions)
44
+
45
+
46
def calc_dimer(seq1, seq2):
    """
    heterodimer calculation of two DNA sequences with primer3.

    both sequences are handed over in upper case and the reaction
    conditions are read from the PCR_* settings of the config.
    returns the primer3 result unchanged (callers in this module
    read its .tm attribute).
    """
    first, second = seq1.upper(), seq2.upper()
    pcr_conditions = {
        "mv_conc": config.PCR_MV_CONC,
        "dv_conc": config.PCR_DV_CONC,
        "dna_conc": config.PCR_DNA_CONC,
        "dntp_conc": config.PCR_DNTP_CONC,
    }
    return p3.calc_heterodimer(first, second, **pcr_conditions)
59
+
60
+
61
def calc_max_polyx(seq):
    """
    calculate maximum polyx of a seq.

    returns the highest number of times a nucleotide directly
    repeats the one before it, i.e. the length of the longest
    homopolymer stretch minus one ("aaaa" -> 3, "acgt" -> 0).
    an empty sequence yields 0 instead of raising an IndexError.
    """
    max_polyx = 0
    counter = 0
    # compare every nucleotide with its direct predecessor
    for previous_nuc, nuc in zip(seq, seq[1:]):
        if nuc == previous_nuc:
            counter += 1
            if counter > max_polyx:
                max_polyx = counter
        else:
            # stretch is interrupted, start counting again
            counter = 0
    return max_polyx
77
+
78
+
79
def calc_max_dinuc_repeats(seq):
    """
    calculate the amount of repeating
    dinucleotides in a sequence.

    the sequence is read in both dinucleotide frames (starting at
    the first and at the second nucleotide) and the longest run
    found in either frame is returned. a run is counted as the
    number of times a dinucleotide repeats the one directly before
    it ("atatatat" -> 3, "acgt" -> 0).
    """
    # the maximum has to survive the switch to the second frame,
    # therefore it is initialized only once outside of the frame loop
    max_dinuc = 0
    for s in (seq, seq[1:]):
        previous_dinuc = s[0:2]
        counter = 0
        for i in range(2, len(s), 2):
            current_dinuc = s[i:i+2]
            if current_dinuc == previous_dinuc:
                counter += 1
                # update right away so that a run reaching the
                # end of the sequence is not lost
                if counter > max_dinuc:
                    max_dinuc = counter
            else:
                counter = 0
                previous_dinuc = current_dinuc
    return max_dinuc
97
+
98
+
99
def calc_end_gc(seq):
    """
    check how many gc nucleotides
    are within the last 5 bases of
    the 3' end.

    counting is case-insensitive. sequences shorter than 5
    nucleotides are evaluated over their full length.
    """
    three_prime_end = seq[-5:].lower()
    return three_prime_end.count("g") + three_prime_end.count("c")
106
+
107
+
108
def gc_clamp_present(seq):
    """
    checks if a gc clamp is present.

    the clamp consists of the last config.PRIMER_GC_CLAMP
    nucleotides of the sequence, which all have to be g or c
    (case-insensitive). a clamp size of 0 or less disables the
    check and always returns True. an empty sequence can not have
    a clamp and returns False instead of raising an
    UnboundLocalError.
    """
    clamp_size = config.PRIMER_GC_CLAMP
    if clamp_size <= 0:
        return True

    clamp = seq[-clamp_size:]
    return bool(clamp) and all(nuc in "cgCG" for nuc in clamp)
123
+
124
+
125
def is_three_prime_ambiguous(amb_seq):
    """
    determine if a sequence contains an ambiguous char at the 3'prime.

    the last config.PRIMER_MIN_3_WITHOUT_AMB characters of the
    sequence are tested against config.nucs. every character that
    is not part of config.nucs counts as ambiguous. a window size
    of 0 (or less) disables the check and returns False.
    """
    len_3_prime = config.PRIMER_MIN_3_WITHOUT_AMB
    if len_3_prime <= 0:
        return False

    # slicing with a negative index covers the whole sequence if the
    # window is longer than the sequence. calculating the start as
    # len(amb_seq)-len_3_prime would turn negative in that case and
    # only test a part of the sequence.
    return any(nuc not in config.nucs for nuc in amb_seq[-len_3_prime:])
142
+
143
+
144
def rev_complement(seq):
    """
    return the reverse complement of a sequence as a plain string
    (computed via a biopython Seq object)
    """
    reverse_complemented = Seq(seq).reverse_complement()
    return str(reverse_complemented)
149
+
150
+
151
def calc_permutation_penalty(amb_seq):
    """
    multiply the number of primer permutations caused by ambiguous
    nucleotides with config.PRIMER_PERMUTATION_PENALTY.

    each nucleotide that is a key of config.ambig_nucs contributes
    the number of entries stored for it as a factor. a sequence
    without such nucleotides has a permutation count of 0 and
    therefore no penalty.
    """
    n_permutations = 0

    for nuc in amb_seq:
        if nuc not in config.ambig_nucs:
            continue
        n_options = len(config.ambig_nucs[nuc])
        # the first ambiguous nucleotide sets the count,
        # all following ones multiply it
        n_permutations = n_permutations*n_options if n_permutations else n_options

    return n_permutations*config.PRIMER_PERMUTATION_PENALTY
167
+
168
+
169
def calc_base_penalty(seq):
    """
    Calculate intrinsic primer penalty.

    The penalty is the sum of the weighted absolute deviations of
    melting temperature, gc content and length from the values at
    index 2 of config.PRIMER_TMP, config.PRIMER_GC_RANGE and
    config.PRIMER_SIZES (presumably the optima - confirm in config).
    """
    tm = calc_temp(seq)
    gc = calc_gc(seq)
    size = len(seq)

    # (observed value, target value, weight per unit of deviation)
    terms = (
        (tm, config.PRIMER_TMP[2], config.PRIMER_TM_PENALTY),
        (gc, config.PRIMER_GC_RANGE[2], config.PRIMER_GC_PENALTY),
        (size, config.PRIMER_SIZES[2], config.PRIMER_SIZE_PENALTY),
    )

    penalty = 0
    for observed, target, weight in terms:
        if observed > target:
            penalty += weight*(observed - target)
        elif observed < target:
            penalty += weight*(target - observed)

    return penalty
208
+
209
+
210
def _nucs_are_mismatched(aln_nuc, kmer_nuc):
    """
    decide if a nucleotide of an alignment sequence conflicts with
    the kmer nucleotide at the same position. ambiguous nucleotides
    (keys of config.ambig_nucs) on either side are resolved to the
    nucleotides they stand for.
    """
    if aln_nuc == kmer_nuc:
        return False

    aln_is_amb = aln_nuc in config.ambig_nucs
    kmer_is_amb = kmer_nuc in config.ambig_nucs

    if aln_is_amb and kmer_is_amb:
        # both ambiguous: mismatch only if they share no nucleotide
        return set(config.ambig_nucs[aln_nuc]).isdisjoint(config.ambig_nucs[kmer_nuc])
    if aln_is_amb:
        # only the alignment is ambiguous: kmer nuc has to be covered
        return kmer_nuc not in config.ambig_nucs[aln_nuc]
    if kmer_is_amb:
        # only the kmer is ambiguous: alignment nuc has to be covered
        return aln_nuc not in config.ambig_nucs[kmer_nuc]
    # two different unambiguous nucleotides
    return True


def calc_per_base_mismatches(kmer, alignment, ambiguous_consensus):
    """
    calculate for a given kmer with [seq, start, stop] how often
    each kmer position mismatches with the alignment.

    the kmer is compared in its ambiguous form, i.e. the slice
    start:stop of the ambiguous consensus. each alignment entry is
    expected to hold its sequence at index 1. returns a list with
    one value per kmer position (5' to 3' of the kmer): the fraction
    of alignment sequences with a mismatch, rounded to 2 decimals.
    """
    start, stop = kmer[1], kmer[2]
    amb_kmer = ambiguous_consensus[start:stop]
    mismatch_counts = [0]*len(kmer[0])

    for sequence in alignment:
        # same window of the current alignment sequence
        for idx, aln_nuc in enumerate(sequence[1][start:stop]):
            if _nucs_are_mismatched(aln_nuc, amb_kmer[idx]):
                mismatch_counts[idx] += 1

    n_seqs = len(alignment)

    return [round(count/n_seqs, 2) for count in mismatch_counts]
258
+
259
+
260
def calc_3_prime_penalty(direction, mismatches):
    """
    calculate the penalty for mismatches at the 3' end.

    config.PRIMER_3_PENALTY holds one weight per position, which is
    multiplied with the mismatch value of that position and summed
    up. for "+" the mismatch list is read from its end, for "-" from
    its start (right primers are reverse complemented, so the kmer
    start is their 3' end). positions without a weight are ignored.
    uses the previously calculated mismatch list.

    returns 0 if config.PRIMER_3_PENALTY is empty. raises a
    ValueError for a direction other than "+" or "-", which
    previously surfaced as an UnboundLocalError.
    """
    position_penalties = config.PRIMER_3_PENALTY
    if not position_penalties:
        return 0

    if direction == "+":
        from_three_prime = mismatches[::-1]
    elif direction == "-":
        from_three_prime = mismatches
    else:
        raise ValueError("direction has to be '+' or '-'")

    # zip stops at the shorter input, so only positions that
    # have a weight contribute to the penalty
    return sum(m * p for m, p in zip(from_three_prime, position_penalties))
276
+
277
+
278
def filter_kmer_direction_independent(seq):
    """
    direction independent kmer filter. returns True only if
    temperature, gc content, poly x, dinucleotide repeats,
    base penalty and homodimer temperature are all within
    the limits of the config. checks run in this order and
    stop at the first failure.
    """
    if not (config.PRIMER_TMP[0] <= calc_temp(seq) <= config.PRIMER_TMP[1]):
        return False
    if not (config.PRIMER_GC_RANGE[0] <= calc_gc(seq) <= config.PRIMER_GC_RANGE[1]):
        return False
    if not (calc_max_polyx(seq) <= config.PRIMER_MAX_POLYX):
        return False
    if not (calc_max_dinuc_repeats(seq) <= config.PRIMER_MAX_DINUC_REPEATS):
        return False
    if not (calc_base_penalty(seq) <= config.PRIMER_MAX_BASE_PENALTY):
        return False
    # homodimer: the kmer is tested against itself
    return calc_dimer(seq, seq).tm <= config.PRIMER_MAX_DIMER_TMP
291
+
292
+
293
def filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
    """
    direction dependent kmer filter for hairpin temp, end GC
    content, gc clamp and 3' ambiguous nucleotides. for "-" the
    kmer and its ambiguous version are reverse complemented
    before they are tested.
    """
    amb_window = ambiguous_consensus[kmer[1]:kmer[2]]
    # orient the kmer and its ambiguous version
    if direction == "+":
        kmer_seq, amb_kmer_seq = kmer[0], amb_window
    elif direction == "-":
        kmer_seq, amb_kmer_seq = rev_complement(kmer[0]), rev_complement(amb_window)

    # checks run in this order and stop at the first failure
    if not (calc_hairpin(kmer_seq).tm <= config.PRIMER_HAIRPIN):
        return False
    if not (calc_end_gc(kmer_seq) <= config.PRIMER_MAX_GC_END):
        return False
    if not gc_clamp_present(kmer_seq):
        return False
    return not is_three_prime_ambiguous(amb_kmer_seq)
312
+
313
+
314
def find_primers(kmers, ambiguous_consensus, alignment):
    """
    turn kmers into potential primers.

    kmers that pass the direction independent filter are tested
    for both directions. each passing direction yields a candidate
    [seq, start, stop, penalty, per_base_mismatches] with the
    penalty being the sum of base, permutation and 3' penalty.
    right candidates carry the reverse complemented sequence but
    the start and stop of the kmer. returns the lists of left
    and right candidates.
    """
    left_primer_candidates, right_primer_candidates = [], []

    for kmer in kmers:
        seq = kmer[0]
        if not filter_kmer_direction_independent(seq):
            continue
        start, stop = kmer[1], kmer[2]
        # penalties that do not depend on the direction
        base_penalty = calc_base_penalty(seq)
        per_base_mismatches = calc_per_base_mismatches(
            kmer, alignment, ambiguous_consensus
        )
        permutation_penalty = calc_permutation_penalty(
            ambiguous_consensus[start:stop]
        )
        for direction in ("+", "-"):
            if not filter_kmer_direction_dependend(direction, kmer, ambiguous_consensus):
                continue
            three_prime_penalty = calc_3_prime_penalty(direction, per_base_mismatches)
            primer_penalty = base_penalty + permutation_penalty + three_prime_penalty
            if direction == "+":
                left_primer_candidates.append(
                    [seq, start, stop, primer_penalty, per_base_mismatches]
                )
            else:
                right_primer_candidates.append(
                    [rev_complement(seq), start, stop, primer_penalty, per_base_mismatches]
                )

    return left_primer_candidates, right_primer_candidates
361
+
362
+
363
def create_primer_dictionary(primer_candidates, direction):
    """
    creates a primer dictionary from primer list.

    the primers are named LEFT_<idx> for direction "+" and
    RIGHT_<idx> for direction "-" with idx being the position
    of the primer in the list. raises a ValueError for any other
    direction, which previously surfaced as an UnboundLocalError.
    """
    if direction == "+":
        direction_name = "LEFT"
    elif direction == "-":
        direction_name = "RIGHT"
    else:
        raise ValueError("direction has to be '+' or '-'")

    return {
        direction_name + "_" + str(primer_idx): primer
        for primer_idx, primer in enumerate(primer_candidates)
    }
380
+
381
+
382
def find_best_primers(left_primer_candidates, right_primer_candidates):
    """
    Primer candidates are likely overlapping. Here, the list of primers
    is sorted for the best to worst scoring. Then, the next best scoring
    is retained if it does not have any nucleotides that have already
    been covered by a better scoring primer candidate. This significantly
    reduces the amount of primers while retaining the best scoring ones.

    Both candidate lists are sorted in place by their penalty (index 3).
    Returns a dictionary with the keys "+" and "-" that each hold a
    primer dictionary of the retained primers sorted by their start.
    A direction without any candidates results in an empty primer
    dictionary instead of an IndexError.
    """
    all_primers = {}

    for direction, primer_candidates in (("+", left_primer_candidates), ("-", right_primer_candidates)):
        # sort the primers for the best scoring
        primer_candidates.sort(key=lambda x: x[3])
        to_retain = []
        covered_positions = set()

        for primer in primer_candidates:
            # the stop position itself is treated as covered as well
            primer_positions = range(primer[1], primer[2]+1)
            # retain the primer only if none of its nucleotides are
            # already covered by a better primer. the best scoring
            # primer always passes as nothing is covered yet.
            if covered_positions.isdisjoint(primer_positions):
                covered_positions.update(primer_positions)
                to_retain.append(primer)

        # sort by start
        to_retain.sort(key=lambda x: x[1])
        # create dict
        all_primers[direction] = create_primer_dictionary(to_retain, direction)

    return all_primers