varvamp 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,353 @@
1
+ """
2
+ data writing and visualization.
3
+ """
4
+ # BUILT-INS
5
+ import os
6
+ import math
7
+ import itertools
8
+
9
+ # LIBS
10
+ import pandas as pd
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ from matplotlib.backends.backend_pdf import PdfPages
14
+
15
+
16
+ # varVAMP
17
+ from varvamp.scripts import primers
18
+ from varvamp.scripts import config
19
+
20
+
21
def write_fasta(dir, seq_id, seq):
    """
    Write one sequence as a single-record fasta file.

    The file is created as `<seq_id>.fasta` inside `dir` (an existing
    file of that name is overwritten) and holds a `>seq_id` header line
    followed by the sequence on one line.
    """
    fasta_path = os.path.join(dir, seq_id + ".fasta")
    with open(fasta_path, 'w') as handle:
        handle.write(f">{seq_id}\n{seq}\n")
29
+
30
+
31
def write_alignment(dir, alignment):
    """
    Write the alignment to `alignment_cleaned.fasta` inside `dir`.

    Every element of `alignment` is read by index: element 0 becomes the
    fasta header and element 1 the sequence line. An existing file is
    overwritten.
    """
    fasta_path = os.path.join(dir, "alignment_cleaned.fasta")
    with open(fasta_path, "w") as handle:
        handle.writelines(
            f">{record[0]}\n{record[1]}\n" for record in alignment
        )
40
+
41
+
42
def write_conserved_to_bed(conserved_regions, dir):
    """
    Write the conserved regions as a bed file.

    One tab-separated line is written per region to
    `conserved_regions.bed` inside `dir`:
    `ambiguous_consensus`, region[0], region[1], `region_<n>`, where n
    counts the regions from 0 in iteration order. An existing file is
    overwritten.
    """
    # os.path.join instead of plain string concatenation: the file ends
    # up inside `dir` whether or not `dir` carries a trailing separator.
    outfile = os.path.join(dir, "conserved_regions.bed")
    with open(outfile, 'w') as o:
        for counter, region in enumerate(conserved_regions):
            print(
                "ambiguous_consensus",
                region[0],
                region[1],
                f"region_{counter}",
                sep="\t",
                file=o
            )
59
+
60
+
61
def write_primers_to_bed(outfile, primer_name, primer_properties, direction):
    """
    Append one primer as a bed line to `outfile`.

    The file is opened in append mode, so repeated calls accumulate
    lines. `primer_properties` is read by index: 1 = start, 2 = stop,
    3 = score (rounded to one decimal before writing).
    """
    with open(outfile, 'a') as handle:
        bed_fields = (
            "ambiguous_consensus",
            primer_properties[1],  # start
            primer_properties[2],  # stop
            primer_name,
            round(primer_properties[3], 1),  # score
            direction,
        )
        handle.write("\t".join(map(str, bed_fields)) + "\n")
76
+
77
+
78
def write_all_primers(dir, all_primers):
    """
    Write all primers that varVAMP designed as a bed file.

    `all_primers` maps a direction to a mapping of primer name ->
    primer properties. Every primer is handed to write_primers_to_bed,
    which appends to `all_primers.bed` inside `dir`; an already existing
    file is therefore extended, not replaced.
    """
    # os.path.join instead of plain string concatenation: the file ends
    # up inside `dir` whether or not `dir` carries a trailing separator.
    outfile = os.path.join(dir, "all_primers.bed")

    for direction, primers_by_name in all_primers.items():
        for primer_name, primer_properties in primers_by_name.items():
            write_primers_to_bed(outfile, primer_name, primer_properties, direction)
87
+
88
+
89
def get_permutations(seq):
    """
    Expand an ambiguous sequence into every sequence it can stand for.
    Needed to correctly report the gc and the temperature.

    Characters that are keys of config.ambig_nucs are replaced by each
    of the entries stored for them; all other characters are kept as
    they are. Returns the list of all resulting combinations, in
    itertools.product order.
    """
    choices_per_position = [
        config.ambig_nucs[nuc] if nuc in config.ambig_nucs else [nuc]
        for nuc in seq
    ]
    return [''.join(combo) for combo in itertools.product(*choices_per_position)]
103
+
104
+
105
def write_scheme_to_files(dir, amplicon_scheme, ambiguous_consensus):
    """
    Write the final amplicon scheme to four files inside `dir`:

    - primers.tsv: one row of primer stats per primer
    - primers.bed: one bed line per primer (appended through
      write_primers_to_bed, so an existing file is extended)
    - amplicons.bed: one bed line per amplicon
    - primer_to_amplicon_assignment.tabular: left and right primer name
      per amplicon

    `amplicon_scheme` is read as pool -> amplicon -> primer name ->
    primer properties; the first key of an amplicon is treated as the
    left primer and the second as the right primer. Primer properties
    are read by index: 1 = start, 2 = stop, 3 = score, and 0 is used
    for the size / gc_best / temp_best columns (presumably the primer
    sequence - confirm against the primer search code).
    """
    tsv_file = os.path.join(dir, "primers.tsv")
    primer_bed_file = os.path.join(dir, "primers.bed")
    amplicon_bed_file = os.path.join(dir, "amplicons.bed")
    tabular_file = os.path.join(dir, "primer_to_amplicon_assignment.tabular")

    # NOTE(review): "amlicon_name" looks like a typo for "amplicon_name".
    # It is kept as is because the header is part of the file format.
    tsv_header = "amlicon_name\tprimer_name\tpool\tstart\tstop\tseq\tsize\tgc_best\ttemp_best\tmean_gc\tmean_temp\tscore"

    # amplicons are numbered consecutively across all pools
    amplicon_number = 0

    with open(tsv_file, "w") as tsv, open(amplicon_bed_file, "w") as bed, open(tabular_file, "w") as tabular:
        print(tsv_header, file=tsv)

        for pool in amplicon_scheme:
            for amp in amplicon_scheme[pool]:
                amplicon = amplicon_scheme[pool][amp]
                new_name = f"amplicon_{amplicon_number}"
                amplicon_number += 1

                # (name, properties) pairs of the left and right primer
                primer_names = list(amplicon.keys())
                left = (primer_names[0], amplicon[primer_names[0]])
                right = (primer_names[1], amplicon[primer_names[1]])

                # amplicon spans from the left primer start to the right primer stop
                print("ambiguous_consensus", left[1][1], right[1][2], new_name, pool, sep="\t", file=bed)
                print(left[0], right[0], sep="\t", file=tabular)

                for direction, (primer_name, properties) in (("+", left), ("-", right)):
                    start, stop = properties[1], properties[2]
                    seq = ambiguous_consensus[start:stop]
                    if direction == "-":
                        seq = primers.rev_complement(seq)

                    # mean gc and temperature over all sequences the
                    # ambiguous primer can stand for
                    permutations = get_permutations(seq)
                    gc_sum = 0
                    temp_sum = 0
                    for permutation in permutations:
                        gc_sum += primers.calc_gc(permutation)
                        temp_sum += primers.calc_temp(permutation)

                    tsv_row = (
                        new_name,
                        primer_name,
                        pool,
                        start,
                        stop,
                        seq,
                        len(properties[0]),
                        round(primers.calc_gc(properties[0]), 1),
                        round(primers.calc_temp(properties[0]), 1),
                        round(gc_sum/len(permutations), 1),
                        round(temp_sum/len(permutations), 1),
                        round(properties[3], 1),
                    )
                    print(*tsv_row, sep="\t", file=tsv)

                    write_primers_to_bed(primer_bed_file, primer_name, properties, direction)
172
+
173
+
174
def write_dimers(dir, not_solved):
    """
    Write the primer dimers for which no replacement was found to
    `unsolvable_primer_dimers.tsv` inside `dir`.

    The file starts with a header line; an existing file is
    overwritten. Each element of `not_solved` is a pair of primer
    entries read by index: [0] = pool, [2] = primer name and [3][0] is
    passed to primers.calc_dimer (presumably the primer sequence -
    confirm against the dimer search code). The `.tm` of the calculated
    dimer is written rounded to one decimal.
    """
    tsv_file = os.path.join(dir, "unsolvable_primer_dimers.tsv")
    # print() needs an open file object; handing it the path string
    # fails with AttributeError on the first write.
    with open(tsv_file, "w") as tsv:
        print(
            "pool\tprimer_name_1\tprimer_name_2\tdimer melting temp",
            file=tsv
        )
        for dimers in not_solved:
            print(
                dimers[0][0],
                dimers[0][2],
                dimers[1][2],
                round(primers.calc_dimer(dimers[0][3][0], dimers[1][3][0]).tm, 1),
                sep="\t",
                file=tsv
            )
192
+
193
+
194
def entropy(pos, states):
    """
    Calculate the relative Shannon entropy of the characters in `pos`.

    `pos` is a string or list of characters (e.g. one alignment column)
    and `states` the number of possible states, which is used as the
    base of the logarithm. A column in which `states` different
    characters are equally frequent therefore has an entropy of 1, and
    a column with only one kind of character has an entropy of 0.
    `states` must be larger than 1.
    """
    n_chars = len(pos)
    # fewer than two characters cannot vary
    if n_chars < 2:
        return 0.0
    # unique characters in order of first appearance, which keeps the
    # floating point summation order deterministic
    unique_chars = list(dict.fromkeys(pos))
    if len(unique_chars) < 2:
        return 0.0
    ent = 0.0
    for char in unique_chars:
        freq = pos.count(char)/n_chars
        # using `states` as the base is the normalization; a fixed base
        # combined with a factor derived from a different base is not
        ent -= freq*math.log(freq, states)
    return ent
214
+
215
+
216
def alignment_entropy(alignment_cleaned):
    """
    Calculate the entropy of every column of an alignment.

    The number of columns is taken from the sequence (element 1) of the
    first alignment entry. Returns a pandas df with the columns
    "position", "entropy" (entropy() of the column with 4 states) and
    "average" (centered rolling mean of the entropy over 10 positions).
    """
    positions = list(range(len(alignment_cleaned[0][1])))
    # collect the characters of all sequences at each position
    entropies = [
        entropy([record[1][nuc_pos] for record in alignment_cleaned], 4)
        for nuc_pos in positions
    ]

    entropy_df = pd.DataFrame()
    entropy_df["position"] = positions
    entropy_df["entropy"] = entropies
    entropy_df["average"] = entropy_df["entropy"].rolling(10, center=True).mean()

    return entropy_df
237
+
238
+
239
def varvamp_plot(dir, threshold, alignment_cleaned, conserved_regions, all_primers, amplicon_scheme):
    """
    Create the overview plot for the amplicon design and the per base
    mismatch plots.

    Two pdf files are written to `dir`:

    - amplicon_plot.pdf: alignment entropy on top; below it the
      conserved regions, all designed primers and the amplicons of the
      final scheme with their primers.
    - per_base_mismatches.pdf: one bar plot per primer of the final
      scheme, four primers per page.

    Primer properties are read by index: 1 = start, 2 = stop, and 4 is
    plotted as the bar heights (presumably the per base mismatch
    frequencies - confirm against the primer search code). `threshold`
    sets the upper y limit of the bar plots to 1-threshold.

    Every figure is closed after it has been saved, and legend entries
    are only drawn for collections that are not empty.
    """

    amplicon_primers = []
    # first plot: overview
    # - create pdf name
    name = "amplicon_plot.pdf"
    out = os.path.join(dir, name)
    # - create entropy df
    entropy_df = alignment_entropy(alignment_cleaned)

    # - ini figure
    fig, ax = plt.subplots(2, 1, figsize=[22, 6], squeeze=True, sharex=True, gridspec_kw={'height_ratios': [4, 1]})
    fig.subplots_adjust(hspace=0)
    # - entropy plot
    ax[0].fill_between(entropy_df["position"], entropy_df["entropy"], color="gainsboro", label="entropy")
    ax[0].plot(entropy_df["position"], entropy_df["average"], color="black", label="average entropy", linewidth=0.5)
    ax[0].set_ylim((0, 1))
    ax[0].set_xlim(0, max(entropy_df["position"]))
    ax[0].set_ylabel("alignment entropy")
    ax[0].set_title("final amplicon design")
    ax[0].spines['top'].set_visible(False)
    ax[0].spines['right'].set_visible(False)

    # - conserved regions plot
    for region in conserved_regions:
        ax[1].hlines([1], region[0], region[1], linewidth=15, color="darkorange")
    # - conserved legend: a zero-length line that only carries the label
    if conserved_regions:
        ax[1].hlines([1], conserved_regions[0][1], conserved_regions[0][1], label="possible primer regions", linewidth=5, color="darkorange")

    # - all primer plot
    for direction in all_primers:
        if direction == "-":
            primer_position = 0.85
            primer_color = "darkgrey"
            primer_label = "all right primers"
        elif direction == "+":
            primer_position = 0.8
            primer_color = "dimgrey"
            primer_label = "all left primers"
        for primer in all_primers[direction]:
            ax[1].hlines(primer_position, all_primers[direction][primer][1], all_primers[direction][primer][2], linewidth=5, color=primer_color)
        # - legend: the last primer of this direction is drawn once more
        #   with a label, so that the legend gets a single entry
        if all_primers[direction]:
            ax[1].hlines(primer_position, all_primers[direction][primer][1], all_primers[direction][primer][2], linewidth=5, color=primer_color, label=primer_label)

    # - amplicon, text and primer plot
    counter = 0
    for pool in amplicon_scheme:
        for amp in amplicon_scheme[pool]:
            # the two pools are drawn on different heights
            if pool == 0:
                position_amp = 0.7
                position_text = 0.6
            elif pool == 1:
                position_amp = 0.6
                position_text = 0.65
            primer_names = list(amplicon_scheme[pool][amp].keys())
            left = amplicon_scheme[pool][amp][primer_names[0]]
            right = amplicon_scheme[pool][amp][primer_names[1]]
            # amplicons
            ax[1].hlines(position_amp, left[1], right[2], linewidth=5)
            # text: amplicon number at the middle of the amplicon
            ax[1].text(right[2] - (right[2]-left[1])/2, position_text, str(counter), fontsize=8)
            # primers
            ax[1].hlines(position_amp, left[1], left[2], linewidth=5, color="red")
            ax[1].hlines(position_amp, right[1], right[2], linewidth=5, color="red")

            counter += 1
            # remember primers and names as they are needed for the last plot
            amplicon_primers.append((primer_names[0], left))
            amplicon_primers.append((primer_names[1], right))

    # - legends: the last amplicon is drawn once more with labels; this
    #   is only possible if at least one amplicon has been plotted
    if amplicon_primers:
        ax[1].hlines(position_amp, left[1]+config.PRIMER_SIZES[1], right[2]-config.PRIMER_SIZES[1], linewidth=5, label="amplicons")
        ax[1].hlines(position_amp, left[1], left[2], linewidth=5, color="red", label="primers")

    # - finalize
    ax[1].spines['right'].set_visible(False)
    ax[1].spines['left'].set_visible(False)
    ax[1].spines['bottom'].set_visible(False)
    ax[1].axes.get_yaxis().set_visible(False)
    ax[1].set_xlabel("genome position")
    ax[1].set_ylim((0.5, 1))
    fig.legend(loc=(0.83, 0.7))
    # - save fig
    fig.savefig(out, bbox_inches='tight')
    # - pyplot keeps every figure alive until it is closed explicitly
    plt.close(fig)

    # second plot: per base primer mismatches
    # - ini name
    name = "per_base_mismatches.pdf"
    out = os.path.join(dir, name)
    # - ini multi pdf
    with PdfPages(out) as pdf:
        # - always print 4 primers to one page
        for i in range(0, len(amplicon_primers), 4):
            # - ini figure
            primers_temp = amplicon_primers[i:i+4]
            # squeeze=False always returns a 2D array of axes, so that
            # indexing also works for a page with a single primer
            fig, ax = plt.subplots(len(primers_temp), figsize=(12, len(primers_temp)*4), squeeze=False)
            fig.suptitle("Per base mismatches", fontsize=18)
            fig.tight_layout(rect=[0.05, 0.05, 1, 0.98])
            fig.subplots_adjust(hspace=0.5)
            # - plotting
            for idx, primer in enumerate(primers_temp):
                primer_ax = ax[idx][0]
                # genome positions covered by the primer
                x = [pos+primer[1][1] for pos in range(0, len(primer[1][4]))]
                primer_ax.bar(x, primer[1][4], color='lightgrey', edgecolor='black')
                primer_ax.set_title(primer[0], loc="left")
                primer_ax.xaxis.set_ticks(np.arange(primer[1][1], primer[1][1]+len(x), 1))
                primer_ax.xaxis.set_ticklabels(x, rotation=45)
                primer_ax.set_ylabel(ylabel="% of sequences")
                primer_ax.set_xlabel("position")
                primer_ax.set_ylim(0, 1-threshold)
            # - to pdf
            pdf.savefig(fig, bbox_inches='tight')
            # - one figure per page: close it to free its memory
            plt.close(fig)