varvamp 0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ """
2
+ finding and digesting conserved regions.
3
+ """
4
+
5
+ # varVAMP
6
+ from varvamp.scripts import config
7
+
8
+
9
def find_regions(consensus_amb, allowed_ambiguous):
    """
    finds conserved regions as specified by a
    certain amount of ambiguous bases in a given
    sequence length

    consensus_amb: consensus sequence, converted with str() before
    scanning. Every char that is not in config.nucs is treated as
    ambiguous, "N" is handled as a hard stop.
    allowed_ambiguous: ambiguous-char count at which an open window
    becomes a candidate for writing.

    returns a list of [start, stop] lists. stop is the index of the
    ambiguous char that closed the window.
    """
    # NOTE(review): the nesting of this function was reconstructed from a
    # whitespace-stripped view of the file - confirm against the release.
    # init the variables
    current_window = []
    writable = False
    in_ambiguous_region = True
    last_amb = 0
    conserved_regions = []

    # pad with N so a window that is still open at the end of the
    # sequence is evaluated as well (N always makes a window writable)
    seq = str(consensus_amb) + 2*'N'
    for idx, nuc in enumerate(seq):
        if in_ambiguous_region and nuc in config.nucs:
            in_ambiguous_region = False
            # just entered a new stretch of non-ambiguous bases
            # may be time to open a new window
            if not current_window:
                # create new window if none is there. First element
                # keeps track of the start of the window, second element
                # is a counter of ambiguous chars that is reset if two
                # ambiguous chars are far enough apart. amb_pos tracks
                # all non-N ambiguous positions since the window opened
                current_window = [idx, 0]
                amb_pos = []
            continue
        if nuc not in config.nucs:
            if current_window:
                in_ambiguous_region = True
                # distance to the previously seen ambiguous char
                amb_to_amb_len = idx - last_amb
                if nuc != "N":
                    # track the current amb pos only if it is not a N, as a
                    # N always makes the region writable
                    amb_pos.append(idx)
                if current_window[1] >= allowed_ambiguous or nuc == "N":
                    # check if there were too many previous amb char in subwindow
                    # and make it writable. Always make it writable if N is
                    # reached
                    writable = True
                    if amb_to_amb_len >= config.PRIMER_SIZES[0] and nuc != "N":
                        # check if the last amb is sufficiently far, if yes keep
                        # window open and set amb counter to 0, reset also the
                        # list of amb positions and track only the current pos
                        current_window[1] = 0
                        writable = False
                        amb_pos = [idx]

                # count the current ambiguous char
                current_window[1] += 1

                if writable:
                    writable = False
                    window_length = idx-current_window[0]
                    if window_length >= config.PRIMER_SIZES[0]:
                        # check if the writable window has a sufficient length.
                        conserved_regions.append([current_window[0], idx])
                        # reset the window after it was written. amb_pos is
                        # re-initialised when the next window opens
                        current_window = []
                    elif nuc == "N":
                        # if nuc was a N and region was not written also
                        # discard the window so a new one can open
                        current_window = []
                    else:
                        # else move the start to the position after the
                        # oldest tracked amb pos, drop that amb pos and
                        # lower the counter by one. The shortened window
                        # is evaluated at the next ambiguous char
                        current_window[0] = amb_pos[0]+1
                        current_window[1] = current_window[1]-1
                        amb_pos.pop(0)
            # remember this ambiguous position for the next distance check
            last_amb = idx

    return conserved_regions
82
+
83
+
84
def mean(conserved_regions, consensus):
    """
    calculate the percentage of the consensus
    that is covered by conserved regions

    conserved_regions: iterable of [start, stop] entries
    consensus: sequence whose length is used as the denominator

    returns the percentage rounded to one decimal. An empty
    consensus yields 0.0 instead of a division by zero.
    """
    consensus_length = len(consensus)
    if consensus_length == 0:
        # nothing can be conserved in an empty sequence
        return 0.0
    # builtin sum instead of a local variable that shadows it
    covered = sum(region[1]-region[0] for region in conserved_regions)
    return round(covered/consensus_length*100, 1)
93
+
94
+
95
def digest_seq(seq, kmer_size):
    """
    cut the sequence into all overlapping kmers of the given size

    returns a list of [kmer, start, stop] lists, where stop is
    start plus the length of the kmer
    """
    digested = []
    last_start = len(seq) - kmer_size
    for start in range(last_start + 1):
        kmer = seq[start:start + kmer_size]
        digested.append([kmer, start, start + len(kmer)])
    return digested
100
+
101
+
102
def produce_kmers(conserved_regions, consensus):
    """
    digest every conserved region into kmers of all sizes between
    config.PRIMER_SIZES[0] and config.PRIMER_SIZES[1] (inclusive)

    returns one flat list of [kmer, start, stop] lists with the
    positions given relative to the consensus
    """
    all_kmers = []

    for region in conserved_regions:
        offset = region[0]
        region_seq = consensus[offset:region[1]]
        smallest, largest = config.PRIMER_SIZES[0], config.PRIMER_SIZES[1]
        for size in range(smallest, largest+1):
            # digest_seq reports positions relative to the slice,
            # shift them back to consensus coordinates
            all_kmers.extend(
                [kmer, start+offset, stop+offset]
                for kmer, start, stop in digest_seq(region_seq, size)
            )

    return all_kmers
@@ -0,0 +1,321 @@
1
+ """
2
+ varVAMP logging and raising errors
3
+ """
4
+
5
+ # BUILT-INS
6
+ import sys
7
+ import os
8
+ import shutil
9
+ import time
10
+ import datetime
11
+
12
+ # varVAMP
13
+ from varvamp.scripts import config
14
+
15
+
16
+ def create_dir_structure(dir):
17
+ """
18
+ create output folders and log file
19
+ """
20
+ cwd = os.getcwd()
21
+ results_dir = os.path.join(cwd, dir)
22
+ data_dir = os.path.join(results_dir, "data/")
23
+ # create folders
24
+ if not os.path.exists(results_dir):
25
+ os.makedirs(results_dir)
26
+ else:
27
+ shutil.rmtree(results_dir)
28
+ os.makedirs(results_dir)
29
+ os.makedirs(data_dir)
30
+
31
+ log_file = os.path.join(results_dir, "varvamp_log.txt")
32
+
33
+ return results_dir, data_dir, log_file
34
+
35
+
36
+ def varvamp_progress(log_file, start_time=None, progress=0, job="", progress_text=""):
37
+ """
38
+ progress bar, main progress logging and folder creation
39
+ """
40
+ barLength = 40
41
+ block = int(round(barLength*progress))
42
+
43
+ if progress == 0:
44
+ print(
45
+ "\nStarting \033[31m\033[1mvarVAMP ◥(ºwº)◤\033[0m primer design\n",
46
+ flush=True
47
+ )
48
+ with open(log_file, 'w') as f:
49
+ f.write('VARVAMP log \n\n')
50
+ else:
51
+ if progress == 1:
52
+ stop_time = str(round(time.process_time() - start_time, 2))
53
+ progress_text = f"all done \n\n\rvarVAMP created an amplicon scheme in {stop_time} sec!\n{datetime.datetime.now()}"
54
+ job = "Finalizing output."
55
+ print(
56
+ "\rJob:\t\t " + job + "\nProgress: \t [{0}] {1}%".format("█"*block + "-"*(barLength-block), progress*100) + "\t" + progress_text,
57
+ flush=True
58
+ )
59
+ with open(log_file, 'a') as f:
60
+ print(
61
+ f"\rJob:\t {job} \nResult:\t {progress_text}",
62
+ file=f
63
+ )
64
+
65
+
66
+ def raise_error(message, log_file, exit=False):
67
+ """
68
+ raises warnings or errors, writes to log
69
+ """
70
+ # print to log
71
+ with open(log_file, 'a') as f:
72
+ if exit:
73
+ print(f"ERROR: {message}", file=f)
74
+ else:
75
+ print(f"WARNING: {message}", file=f)
76
+ # print to console
77
+ if exit:
78
+ sys.exit(f"\n\033[31m\033[1mERROR:\033[0m {message}")
79
+ else:
80
+ print(f"\033[31m\033[1mWARNING:\033[0m {message}")
81
+
82
+
83
+ def raise_arg_errors(args, log_file):
84
+ """
85
+ checks arguments for non-valid input and raises warnings
86
+ """
87
+ # threshold error
88
+ if args.threshold > 1 or args.threshold < 0:
89
+ raise_error(
90
+ "threshold can only be between 0-1",
91
+ log_file,
92
+ exit=True
93
+ )
94
+ if args.allowed_ambiguous < 0:
95
+ raise_error(
96
+ "number of ambiguous chars can not be negative",
97
+ log_file,
98
+ exit=True
99
+ )
100
+ if args.allowed_ambiguous > 4:
101
+ raise_error(
102
+ "high number of ambiguous nucleotides in primer leads to a high "
103
+ "degeneracy. Consider reducing.",
104
+ log_file
105
+ )
106
+ if args.opt_length > args.max_length:
107
+ raise_error(
108
+ "optimal length can not be higher than the maximum amplicon length.",
109
+ log_file,
110
+ exit=True
111
+ )
112
+ if args.opt_length < 0 or args.max_length < 0:
113
+ raise_error(
114
+ "amplicon lengths can not be negative.",
115
+ log_file,
116
+ exit=True
117
+ )
118
+ if args.opt_length < 200 or args.max_length < 200:
119
+ raise_error(
120
+ "your amplicon lengths might be to small. Consider increasing",
121
+ log_file
122
+ )
123
+ if args.overlap < 0:
124
+ raise_error(
125
+ "overlap size can not be negative.",
126
+ log_file,
127
+ exit=True
128
+ )
129
+ if args.overlap < 50:
130
+ raise_error(
131
+ "small overlaps might hinder downstream analyses. Consider increasing.",
132
+ log_file
133
+ )
134
+ if args.overlap > args.max_length/2 - config.PRIMER_SIZES[1]:
135
+ raise_error(
136
+ "min overlap must be lower than half of your maximum length - maximum primer length. To achieve optimal results reduce it to at least half of your optimal length",
137
+ log_file,
138
+ exit=True
139
+ )
140
+ if args.overlap > args.opt_length:
141
+ raise_error(
142
+ "overlap can not be higher than your optimal length.",
143
+ log_file,
144
+ exit=True
145
+ )
146
+ if args.overlap > args.opt_length/2:
147
+ raise_error(
148
+ "your intended overlap is higher than half of your optimal length. This reduces how well varvamps will find overlapping amplicons. Consider decreasing.",
149
+ log_file
150
+ )
151
+
152
+
153
+ def confirm_config(args, log_file):
154
+ """
155
+ checks the config. raises error and warnings
156
+ if nececarry. writes settings to log
157
+ """
158
+ error = False
159
+
160
+ # check if all variables exists
161
+ all_vars = [
162
+ # arg dependent
163
+ "FREQUENCY_THRESHOLD",
164
+ "PRIMER_ALLOWED_N_AMB",
165
+ "AMPLICON_OPT_LENGTH",
166
+ "AMPLICON_MAX_LENGTH",
167
+ "AMPLICON_MIN_OVERLAP",
168
+ # arg independent
169
+ "PRIMER_TMP",
170
+ "PRIMER_GC_RANGE",
171
+ "PRIMER_SIZES",
172
+ "PRIMER_HAIRPIN",
173
+ "PRIMER_MAX_POLYX",
174
+ "PRIMER_MAX_DINUC_REPEATS",
175
+ "PRIMER_MAX_DIMER_TMP",
176
+ "PRIMER_MIN_3_WITHOUT_AMB",
177
+ "PCR_MV_CONC",
178
+ "PCR_DV_CONC",
179
+ "PCR_DNTP_CONC",
180
+ "PCR_DNA_CONC",
181
+ "PRIMER_TM_PENALTY",
182
+ "PRIMER_GC_PENALTY",
183
+ "PRIMER_SIZE_PENALTY",
184
+ "PRIMER_MAX_BASE_PENALTY",
185
+ "PRIMER_3_PENALTY",
186
+ "PRIMER_PERMUTATION_PENALTY",
187
+ ]
188
+
189
+ for var in all_vars:
190
+ if var not in vars(config):
191
+ raise_error(
192
+ f"{var} does not exist in config!",
193
+ log_file
194
+ )
195
+ error = True
196
+ # exit if variables are not defined
197
+ if error:
198
+ raise_error(
199
+ "config is missing parameters. Look at the above warnings!",
200
+ log_file,
201
+ exit=True
202
+ )
203
+ # confirm tuples
204
+ for type, tup in [("temp", config.PRIMER_TMP), ("gc", config.PRIMER_GC_RANGE), ("size", config.PRIMER_SIZES)]:
205
+ if len(tup) != 3:
206
+ raise_error(
207
+ f"{type} tuple has to have the form (min, max, opt)!",
208
+ log_file
209
+ )
210
+ error = True
211
+ if tup[0] > tup[1]:
212
+ raise_error(
213
+ f"min {type} should not exeed max {type}!",
214
+ log_file
215
+ )
216
+ error = True
217
+ if tup[0] > tup[2]:
218
+ raise_error(
219
+ f"min {type} should not exeed opt {type}!",
220
+ log_file
221
+ )
222
+ error = True
223
+ if tup[2] > tup[1]:
224
+ raise_error(
225
+ f"opt {type} should not exeed max {type}!",
226
+ log_file
227
+ )
228
+ error = True
229
+ if any(map(lambda var: var < 0, tup)):
230
+ raise_error(
231
+ f"{type} can not contain negative values!",
232
+ log_file
233
+ )
234
+ error = True
235
+
236
+ # check values that cannot be zero
237
+ non_negative_var = [
238
+ ("max polyx repeats", config.PRIMER_MAX_POLYX),
239
+ ("max dinucleotide repeats", config.PRIMER_MAX_DINUC_REPEATS),
240
+ ("max GCs at the 3' end", config.PRIMER_MAX_GC_END),
241
+ ("GC clamp", config.PRIMER_GC_CLAMP),
242
+ ("min number of 3 prime nucleotides without ambiguous nucleotides", config.PRIMER_MIN_3_WITHOUT_AMB),
243
+ ("monovalent cation concentration", config.PCR_MV_CONC),
244
+ ("divalent cation concentration", config.PCR_DV_CONC),
245
+ ("dNTP concentration", config.PCR_DNTP_CONC),
246
+ ("primer temperatur penalty", config.PRIMER_TM_PENALTY),
247
+ ("primer gc penalty", config.PRIMER_GC_PENALTY),
248
+ ("primer size penalty", config.PRIMER_SIZE_PENALTY),
249
+ ("max base penalty", config.PRIMER_MAX_BASE_PENALTY),
250
+ ("primer permutation penalty", config.PRIMER_PERMUTATION_PENALTY)
251
+ ]
252
+ for type, var in non_negative_var:
253
+ if var < 0:
254
+ raise_error(
255
+ f"{type} can not be negative!",
256
+ log_file
257
+ )
258
+ error = True
259
+ if any(map(lambda var: var < 0, config.PRIMER_3_PENALTY)):
260
+ raise_error(
261
+ "3' penalties can not be zero!",
262
+ log_file
263
+ )
264
+ error = True
265
+ # exit if variables are not properly defined
266
+ if error:
267
+ raise_error(
268
+ "config has flaws. Look at the above warnings!",
269
+ log_file,
270
+ exit=True
271
+ )
272
+ # specific warnings
273
+ if config.PRIMER_HAIRPIN < 0:
274
+ raise_error(
275
+ "decreasing hairpin melting temp to negative values "
276
+ "will influence successful primer search!",
277
+ log_file
278
+ )
279
+ if config.PRIMER_MAX_DIMER_TMP < 0:
280
+ raise_error(
281
+ "there is no need to set max dimer melting temp below 0.",
282
+ log_file
283
+ )
284
+ if config.PRIMER_MAX_BASE_PENALTY < 8:
285
+ raise_error(
286
+ "decreasing the base penalty will filter out more primers.",
287
+ log_file
288
+ )
289
+ if config.PRIMER_GC_CLAMP > 3:
290
+ raise_error(
291
+ "large GC clamps will results in too high 3'end stability",
292
+ log_file
293
+ )
294
+ if config.PRIMER_MAX_GC_END < 5 and config.PRIMER_MAX_GC_END < config.PRIMER_GC_CLAMP:
295
+ raise_error(
296
+ f"GC clamp of {config.PRIMER_GC_CLAMP} length will not be enforced as there are only {config.PRIMER_MAX_GC_END} gc characters allowed at the 3' end",
297
+ log_file
298
+ )
299
+ if config.PRIMER_MAX_GC_END > 5:
300
+ raise_error(
301
+ "only the last 5 nucleotides of the 3' end are considered for GC 3'end calculation.",
302
+ log_file
303
+ )
304
+
305
+ # write all settings to file
306
+ var_dic = vars(config)
307
+ with open(log_file, 'a') as f:
308
+ print(
309
+ "settings that can be adjusted via arguments\n",
310
+ f"PRIMER_OPT_LENGTH = {args.opt_length}",
311
+ f"PRIMER_MAX_LENGTH = {args.max_length}",
312
+ f"PRIMER_MIN_OVERLAP = {args.overlap}",
313
+ f"PRIMER_THRESHOLD = {args.threshold}",
314
+ f"PRIMER_ALLOWED_N_AMB = {args.allowed_ambiguous}",
315
+ "\nconfig settings\n",
316
+ sep="\n",
317
+ file=f
318
+ )
319
+ for var in all_vars[5:]:
320
+ print(f"{var} = {var_dic[var]}", file=f)
321
+ print("\nprogress\n", file=f)