varvamp 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +3 -0
- varvamp/__main__.py +5 -0
- varvamp/command.py +263 -0
- varvamp/scripts/__init__.py +0 -0
- varvamp/scripts/alignment.py +223 -0
- varvamp/scripts/config.py +59 -0
- varvamp/scripts/consensus.py +111 -0
- varvamp/scripts/conserved.py +118 -0
- varvamp/scripts/logging.py +321 -0
- varvamp/scripts/primers.py +417 -0
- varvamp/scripts/reporting.py +353 -0
- varvamp/scripts/scheme.py +390 -0
- varvamp-0.3.dist-info/METADATA +53 -0
- varvamp-0.3.dist-info/RECORD +17 -0
- varvamp-0.3.dist-info/WHEEL +5 -0
- varvamp-0.3.dist-info/entry_points.txt +2 -0
- varvamp-0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""
|
|
2
|
+
data writing and visualization.
|
|
3
|
+
"""
|
|
4
|
+
# BUILT-INS
|
|
5
|
+
import os
|
|
6
|
+
import math
|
|
7
|
+
import itertools
|
|
8
|
+
|
|
9
|
+
# LIBS
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import numpy as np
|
|
12
|
+
import matplotlib.pyplot as plt
|
|
13
|
+
from matplotlib.backends.backend_pdf import PdfPages
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# varVAMP
|
|
17
|
+
from varvamp.scripts import primers
|
|
18
|
+
from varvamp.scripts import config
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def write_fasta(dir, seq_id, seq):
    """
    write a single sequence as fasta file. the file is named
    <seq_id>.fasta and is created (or overwritten) inside dir.
    """
    fasta_path = os.path.join(dir, seq_id + ".fasta")
    with open(fasta_path, 'w') as handle:
        # header line followed by the sequence on one line
        handle.write(f">{seq_id}\n{seq}\n")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def write_alignment(dir, alignment):
    """
    write the alignment as multi fasta file named
    alignment_cleaned.fasta inside dir. every alignment entry is
    indexed as [0] = sequence id and [1] = sequence.
    """
    fasta_path = os.path.join(dir, "alignment_cleaned.fasta")
    with open(fasta_path, "w") as handle:
        handle.writelines(
            f">{record[0]}\n{record[1]}\n" for record in alignment
        )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def write_conserved_to_bed(conserved_regions, dir):
    """
    write conserved regions as bed file (conserved_regions.bed in dir).

    each region is indexed as [0] = start and [1] = stop and is named
    region_<n> in the order in which it is given, starting at 0.
    """
    # os.path.join instead of plain string concatenation so that the file
    # ends up inside dir no matter if dir has a trailing separator or not
    outfile = os.path.join(dir, "conserved_regions.bed")
    with open(outfile, 'w') as o:
        for counter, region in enumerate(conserved_regions):
            print(
                "ambiguous_consensus",
                region[0],
                region[1],
                "region_"+str(counter),
                sep="\t",
                file=o
            )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def write_primers_to_bed(outfile, primer_name, primer_properties, direction):
    """
    append one primer as a bed line to outfile. the file is opened in
    append mode, so existing content is kept.

    primer_properties is indexed as [1] = start, [2] = stop and
    [3] = score (written rounded to one decimal).
    """
    with open(outfile, 'a') as bed:
        bed_fields = (
            "ambiguous_consensus",
            primer_properties[1],
            primer_properties[2],
            primer_name,
            round(primer_properties[3], 1),
            direction,
        )
        bed.write("\t".join(str(field) for field in bed_fields) + "\n")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_all_primers(dir, all_primers):
    """
    write all primers that varVAMP designed as bed file
    (all_primers.bed in dir).

    all_primers maps a direction to a mapping of primer name to primer
    properties. lines are appended, so an already existing file is
    extended and not overwritten.
    """
    # os.path.join instead of plain string concatenation so that the file
    # ends up inside dir no matter if dir has a trailing separator or not
    outfile = os.path.join(dir, "all_primers.bed")

    for direction, direction_primers in all_primers.items():
        for primer_name, primer_properties in direction_primers.items():
            write_primers_to_bed(outfile, primer_name, primer_properties, direction)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def get_permutations(seq):
    """
    expand an ambiguous sequence into all sequences it can stand for.
    needed to correctly report the gc and the temperature.

    characters that are keys of config.ambig_nucs are replaced by each
    of their listed alternatives, all other characters are kept as is.
    """
    options_per_position = [
        config.ambig_nucs[nuc] if nuc in config.ambig_nucs else [nuc]
        for nuc in seq
    ]
    return [''.join(combination) for combination in itertools.product(*options_per_position)]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def write_scheme_to_files(dir, amplicon_scheme, ambiguous_consensus):
    """
    write the final scheme to dir:
    - primers.tsv: one line with stats per primer
    - amplicons.bed: one line per amplicon
    - primer_to_amplicon_assignment.tabular: left and right primer name
      per amplicon
    - primers.bed: one line per primer (appended via write_primers_to_bed)

    amplicon_scheme is indexed as [pool][amplicon] and yields a mapping
    whose first key is the left and whose second key is the right primer.
    primer properties are indexed as [0] = sequence, [1] = start,
    [2] = stop and [3] = score.
    """
    # output paths
    primer_tsv_path = os.path.join(dir, "primers.tsv")
    primer_bed_path = os.path.join(dir, "primers.bed")
    amplicon_bed_path = os.path.join(dir, "amplicons.bed")
    assignment_path = os.path.join(dir, "primer_to_amplicon_assignment.tabular")

    tsv_header = "amlicon_name\tprimer_name\tpool\tstart\tstop\tseq\tsize\tgc_best\ttemp_best\tmean_gc\tmean_temp\tscore"
    # amplicons are numbered consecutively over all pools
    amplicon_number = 0

    with open(primer_tsv_path, "w") as tsv, open(amplicon_bed_path, "w") as bed, open(assignment_path, "w") as tabular:
        print(tsv_header, file=tsv)

        for pool, pool_amplicons in amplicon_scheme.items():
            for amplicon_primers in pool_amplicons.values():
                new_name = f"amplicon_{str(amplicon_number)}"
                amplicon_number += 1
                # first key is the left, second key the right primer
                names = list(amplicon_primers.keys())
                left = (names[0], amplicon_primers[names[0]])
                right = (names[1], amplicon_primers[names[1]])

                # amplicon spans from left primer start to right primer stop
                print("ambiguous_consensus", left[1][1], right[1][2], new_name, pool, sep="\t", file=bed)
                print(left[0], right[0], sep="\t", file=tabular)

                for strand, (primer_name, properties) in (("+", left), ("-", right)):
                    start, stop = properties[1], properties[2]
                    seq = ambiguous_consensus[start:stop]
                    if strand == "-":
                        seq = primers.rev_complement(seq)
                    # mean gc and temp over all permutations of the
                    # ambiguous primer sequence
                    permutations = get_permutations(seq)
                    gc_sum = 0
                    temp_sum = 0
                    for permutation in permutations:
                        gc_sum += primers.calc_gc(permutation)
                        temp_sum += primers.calc_temp(permutation)
                    n_permutations = len(permutations)
                    print(
                        new_name,
                        primer_name,
                        pool,
                        start,
                        stop,
                        seq,
                        len(properties[0]),
                        round(primers.calc_gc(properties[0]), 1),
                        round(primers.calc_temp(properties[0]), 1),
                        round(gc_sum/n_permutations, 1),
                        round(temp_sum/n_permutations, 1),
                        round(properties[3], 1),
                        sep="\t",
                        file=tsv
                    )
                    write_primers_to_bed(primer_bed_path, primer_name, properties, strand)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def write_dimers(dir, not_solved):
    """
    write dimers for which no replacement was found to file
    (unsolvable_primer_dimers.tsv in dir).

    each entry of not_solved is a pair of primers. per primer the code
    reads [0] (written as pool, taken from the first primer), [2]
    (written as primer name) and [3][0] (passed to primers.calc_dimer).
    the header line is written even if not_solved is empty.
    """
    tsv_file = os.path.join(dir, "unsolvable_primer_dimers.tsv")
    # print() needs an open file object - handing it the path string
    # raises an AttributeError, so open the file explicitly
    with open(tsv_file, "w") as tsv:
        print(
            "pool\tprimer_name_1\tprimer_name_2\tdimer melting temp",
            file=tsv
        )
        for dimers in not_solved:
            print(
                dimers[0][0],
                dimers[0][2],
                dimers[1][2],
                round(primers.calc_dimer(dimers[0][3][0], dimers[1][3][0]).tm, 1),
                sep="\t",
                file=tsv
            )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def entropy(pos, states):
    """
    calculate a roughly normalized entropy for the characters in pos
    (a list or string), given the number of possible states.
    returns 0.0 if pos has fewer than two elements or holds only
    one distinct character.
    """
    # only a rough normalization factor, not needed, but gives more
    # beautiful plots
    normalization = -1/(math.log(1/float(states), 10))
    distinct_chars = set(pos)
    n_chars = len(pos)
    if n_chars < 2:
        return 0.0
    # sum up frequency * log(frequency) over all observed characters
    weighted_log_sum = 0.0
    for char in distinct_chars:
        occurrences = pos.count(char)
        if occurrences > 0:
            frequency = float(occurrences)/float(n_chars)
            weighted_log_sum += frequency*math.log(frequency, 50)
    # keep a clean 0.0 instead of scaling (and negating) zero
    return weighted_log_sum if weighted_log_sum == 0 else -weighted_log_sum*normalization
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def alignment_entropy(alignment_cleaned):
    """
    calculate the entropy for every alignment column (4 states).
    the number of columns is taken from the first sequence.

    returns a pandas df with the columns position, entropy and
    average (centered rolling mean of the entropy, window size 10).
    """
    n_columns = len(alignment_cleaned[0][1])
    column_entropies = []
    for column_idx in range(n_columns):
        # collect the character of every sequence at this column
        column = [record[1][column_idx] for record in alignment_cleaned]
        column_entropies.append(entropy(column, 4))
    # create df
    entropy_df = pd.DataFrame()
    entropy_df["position"] = list(range(n_columns))
    entropy_df["entropy"] = column_entropies
    entropy_df["average"] = entropy_df["entropy"].rolling(10, center=True).mean()

    return entropy_df
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def varvamp_plot(dir, threshold, alignment_cleaned, conserved_regions, all_primers, amplicon_scheme):
    """
    creates overview plot for the amplicon design
    and per base coverage plots

    two pdf files are written to dir:
    - amplicon_plot.pdf: alignment entropy on top, below the conserved
      regions, all designed primers and the amplicons of the scheme
    - per_base_mismatches.pdf: one bar plot per scheme primer, four
      primers per page

    primer properties are indexed as [1] = start and [2] = stop. index [4]
    is plotted as per base values (presumably the fraction of sequences
    with a mismatch at that position - verify against the primer search).
    threshold only sets the upper y limit (1-threshold) of the per base
    plots. only pools 0 and 1 have a plotting position assigned.
    """

    amplicon_primers = []
    # first plot: overview
    # - create pdf name
    name = "amplicon_plot.pdf"
    out = os.path.join(dir, name)
    # - create entropy df
    entropy_df = alignment_entropy(alignment_cleaned)

    # - ini figure
    fig, ax = plt.subplots(2, 1, figsize=[22, 6], squeeze=True, sharex=True, gridspec_kw={'height_ratios': [4, 1]})
    fig.subplots_adjust(hspace=0)
    # - entropy plot
    ax[0].fill_between(entropy_df["position"], entropy_df["entropy"], color="gainsboro", label="entropy")
    ax[0].plot(entropy_df["position"], entropy_df["average"], color="black", label="average entropy", linewidth=0.5)
    ax[0].set_ylim((0, 1))
    ax[0].set_xlim(0, max(entropy_df["position"]))
    ax[0].set_ylabel("alignment entropy")
    ax[0].set_title("final amplicon design")
    ax[0].spines['top'].set_visible(False)
    ax[0].spines['right'].set_visible(False)

    # - conserved regions plot
    for region in conserved_regions:
        ax[1].hlines([1], region[0], region[1], linewidth=15, color="darkorange")
    # - conserved legend (zero length line that only carries the label)
    ax[1].hlines([1], conserved_regions[0][1], conserved_regions[0][1], label="possible primer regions", linewidth=5, color="darkorange")

    # - all primer plot
    for direction in all_primers:
        if direction == "-":
            primer_position = 0.85
            primer_color = "darkgrey"
            primer_label = "all right primers"
        elif direction == "+":
            primer_position = 0.8
            primer_color = "dimgrey"
            primer_label = "all left primers"
        for primer in all_primers[direction]:
            ax[1].hlines(primer_position, all_primers[direction][primer][1], all_primers[direction][primer][2], linewidth=5, color=primer_color)
        # - legend (last primer of this direction is drawn again with a label)
        ax[1].hlines(primer_position, all_primers[direction][primer][1], all_primers[direction][primer][2], linewidth=5, color=primer_color, label=primer_label)

    # - amplicon, text and primer plot
    counter = 0
    for pool in amplicon_scheme:
        for amp in amplicon_scheme[pool]:
            # the two pools are drawn at different heights
            if pool == 0:
                position_amp = 0.7
                position_text = 0.6
            elif pool == 1:
                position_amp = 0.6
                position_text = 0.65
            primer_names = list(amplicon_scheme[pool][amp].keys())
            left = amplicon_scheme[pool][amp][primer_names[0]]
            right = amplicon_scheme[pool][amp][primer_names[1]]
            # amplicons
            ax[1].hlines(position_amp, left[1], right[2], linewidth=5)
            # text (amplicon number at the center of the amplicon)
            ax[1].text(right[2] - (right[2]-left[1])/2, position_text, str(counter), fontsize=8)
            # primers
            ax[1].hlines(position_amp, left[1], left[2], linewidth=5, color="red")
            ax[1].hlines(position_amp, right[1], right[2], linewidth=5, color="red")

            counter += 1
            # remember primers and names as they are needed for the last plot
            amplicon_primers.append((primer_names[0], left))
            amplicon_primers.append((primer_names[1], right))

    # - legends (last amplicon is drawn again with labels)
    ax[1].hlines(position_amp, left[1]+config.PRIMER_SIZES[1], right[2]-config.PRIMER_SIZES[1], linewidth=5, label="amplicons")
    ax[1].hlines(position_amp, left[1], left[2], linewidth=5, color="red", label="primers")

    # - finalize
    ax[1].spines['right'].set_visible(False)
    ax[1].spines['left'].set_visible(False)
    ax[1].spines['bottom'].set_visible(False)
    ax[1].axes.get_yaxis().set_visible(False)
    ax[1].set_xlabel("genome position")
    ax[1].set_ylim((0.5, 1))
    fig.legend(loc=(0.83, 0.7))
    # - save fig
    fig.savefig(out, bbox_inches='tight')
    # pyplot keeps every figure alive until it is closed explicitly
    plt.close(fig)

    # second plot: per base primer mismatches
    # - ini name
    name = "per_base_mismatches.pdf"
    out = os.path.join(dir, name)
    # - ini multi pdf
    with PdfPages(out) as pdf:
        # - always print 4 primers to one page
        for i in range(0, len(amplicon_primers), 4):
            # - ini figure
            primers_temp = amplicon_primers[i:i+4]
            fig, ax = plt.subplots(len(primers_temp), figsize=(12, len(primers_temp)*4), squeeze=True)
            fig.suptitle("Per base mismatches", fontsize=18)
            fig.tight_layout(rect=[0.05, 0.05, 1, 0.98])
            fig.subplots_adjust(hspace=0.5)
            # - plotting
            for idx, primer in enumerate(primers_temp):
                # x values are the genome positions covered by the primer
                x = [pos+primer[1][1] for pos in range(0, len(primer[1][4]))]
                ax[idx].bar(x, primer[1][4], color='lightgrey', edgecolor='black')
                ax[idx].set_title(primer[0], loc="left")
                ax[idx].xaxis.set_ticks(np.arange(primer[1][1], primer[1][1]+len(x), 1))
                ax[idx].xaxis.set_ticklabels(x, rotation=45)
                ax[idx].set_ylabel(ylabel="% of sequences")
                ax[idx].set_xlabel("position")
                ax[idx].set_ylim(0, 1-threshold)
            # - to pdf
            pdf.savefig(fig, bbox_inches='tight')
            # free the page figure, otherwise one figure per page piles up
            plt.close(fig)
|