varvamp 0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +3 -0
- varvamp/__main__.py +5 -0
- varvamp/command.py +263 -0
- varvamp/scripts/__init__.py +0 -0
- varvamp/scripts/alignment.py +223 -0
- varvamp/scripts/config.py +59 -0
- varvamp/scripts/consensus.py +111 -0
- varvamp/scripts/conserved.py +118 -0
- varvamp/scripts/logging.py +321 -0
- varvamp/scripts/primers.py +417 -0
- varvamp/scripts/reporting.py +353 -0
- varvamp/scripts/scheme.py +390 -0
- varvamp-0.3.dist-info/METADATA +53 -0
- varvamp-0.3.dist-info/RECORD +17 -0
- varvamp-0.3.dist-info/WHEEL +5 -0
- varvamp-0.3.dist-info/entry_points.txt +2 -0
- varvamp-0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
finding and digesting conserved regions.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# varVAMP
|
|
6
|
+
from varvamp.scripts import config
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def find_regions(consensus_amb, allowed_ambiguous):
    """
    scan the consensus for conserved regions.

    a window is opened at the first character found in config.nucs.
    Ambiguous characters (everything not in config.nucs) are counted
    while the window is open. The window is evaluated when a N is
    reached or when the counter has reached allowed_ambiguous, unless
    the previous ambiguous character is at least config.PRIMER_SIZES[0]
    positions away (then the counter restarts instead).

    consensus_amb: consensus sequence, converted with str()
    allowed_ambiguous: counter value at which a window is evaluated
    returns: list of [start, stop] index pairs, stop being the index
    of the ambiguous character that closed the window
    """
    min_primer_len = config.PRIMER_SIZES[0]
    regions = []
    # [start index, ambiguous counter]; an empty list means "no open window"
    window = []
    # positions of the non-N ambiguous characters seen in the open window
    ambiguous_seen = []
    awaiting_clean_base = True
    previous_amb = 0

    # the appended Ns force a window that is still open at the end of
    # the sequence to be evaluated
    for pos, base in enumerate(str(consensus_amb) + "NN"):
        if base in config.nucs:
            if awaiting_clean_base:
                # first base after an ambiguous stretch, open a window
                # if none is currently open
                awaiting_clean_base = False
                if not window:
                    window = [pos, 0]
                    ambiguous_seen = []
            continue

        # from here on the current character is ambiguous
        if window:
            awaiting_clean_base = True
            is_n = base == "N"
            gap = pos - previous_amb
            if not is_n:
                ambiguous_seen.append(pos)

            # a N always closes the window, otherwise only a full counter
            must_close = window[1] >= allowed_ambiguous or is_n
            if must_close and not is_n and gap >= min_primer_len:
                # previous ambiguous char is far enough away: keep the
                # window open and restart counting at the current pos
                window[1] = 0
                ambiguous_seen = [pos]
                must_close = False

            window[1] += 1

            if must_close:
                if pos - window[0] >= min_primer_len:
                    # long enough, report it and start over
                    regions.append([window[0], pos])
                    window = []
                elif is_n:
                    # too short and ended by a N, discard the window
                    window = []
                else:
                    # too short, move the start behind the oldest
                    # tracked ambiguous pos and keep going
                    window[0] = ambiguous_seen.pop(0) + 1
                    window[1] -= 1
        previous_amb = pos

    return regions
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def mean(conserved_regions, consensus):
    """
    calculate the percentage of the consensus length
    that is covered by conserved regions

    conserved_regions: iterable of [start, stop] pairs, each one
    contributes stop - start positions
    consensus: sequence whose length is used as the denominator
    returns: percentage rounded to one decimal, 0.0 if the
    consensus is empty
    """
    consensus_length = len(consensus)
    if consensus_length == 0:
        # nothing can be covered, avoid dividing by zero
        return 0.0
    covered = sum(region[1] - region[0] for region in conserved_regions)
    return round(covered/consensus_length*100, 1)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def digest_seq(seq, kmer_size):
    """
    cut the sequence into all overlapping kmers of the given size.

    returns a list of [kmer, start, stop] entries with start and
    stop being positions relative to seq (stop = start + kmer length)
    """
    kmers = []
    for start in range(len(seq) - kmer_size + 1):
        kmer = seq[start:start + kmer_size]
        kmers.append([kmer, start, start + len(kmer)])
    return kmers
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def produce_kmers(conserved_regions, consensus):
    """
    digest every conserved region into kmers of all sizes between
    config.PRIMER_SIZES[0] and config.PRIMER_SIZES[1] (inclusive).

    returns one flat list of [kmer, start, stop] entries, start and
    stop are given relative to the consensus
    """
    all_kmers = []

    for region in conserved_regions:
        offset = region[0]
        region_seq = consensus[offset:region[1]]
        for size in range(config.PRIMER_SIZES[0], config.PRIMER_SIZES[1]+1):
            # digest_seq reports positions relative to the slice,
            # shift them back to consensus coordinates
            all_kmers.extend(
                [kmer, begin + offset, end + offset]
                for kmer, begin, end in digest_seq(region_seq, size)
            )

    return all_kmers
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
"""
|
|
2
|
+
varVAMP logging and raising errors
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# BUILT-INS
|
|
6
|
+
import sys
|
|
7
|
+
import os
|
|
8
|
+
import shutil
|
|
9
|
+
import time
|
|
10
|
+
import datetime
|
|
11
|
+
|
|
12
|
+
# varVAMP
|
|
13
|
+
from varvamp.scripts import config
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_dir_structure(dir):
    """
    set up a fresh results folder with a data subfolder.

    dir is joined onto the current working directory. An already
    existing results folder is deleted with all its content before
    it is created again.

    returns the results folder, the data folder and the path of the
    log file (the log file itself is not created here)
    """
    results_dir = os.path.join(os.getcwd(), dir)
    # always start from an empty results folder
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)
    os.makedirs(results_dir)

    data_dir = os.path.join(results_dir, "data/")
    os.makedirs(data_dir)

    return results_dir, data_dir, os.path.join(results_dir, "varvamp_log.txt")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def varvamp_progress(log_file, start_time=None, progress=0, job="", progress_text=""):
    """
    progress bar and main progress logging

    log_file: path of the log. It is (re)created when progress is 0,
    otherwise the job and its result are appended
    start_time: time.process_time() value taken at the start of the
    run, used for the runtime message when progress is 1. If it is
    None the runtime is left out of the message
    progress: fraction of the work that is done (0 = start, 1 = done)
    job: name of the step that was finished
    progress_text: result of the step
    """
    barLength = 40
    # clamp so that an out-of-range progress can not distort the bar
    block = min(max(int(round(barLength*progress)), 0), barLength)

    if progress == 0:
        print(
            "\nStarting \033[31m\033[1mvarVAMP ◥(ºwº)◤\033[0m primer design\n",
            flush=True
        )
        with open(log_file, 'w') as f:
            f.write('VARVAMP log \n\n')
        return

    if progress == 1:
        if start_time is None:
            # no start time was handed over, runtime can not be reported
            summary = "varVAMP created an amplicon scheme!"
        else:
            stop_time = str(round(time.process_time() - start_time, 2))
            summary = f"varVAMP created an amplicon scheme in {stop_time} sec!"
        progress_text = f"all done \n\n\r{summary}\n{datetime.datetime.now()}"
        job = "Finalizing output."

    # round to get rid of float noise (0.3*100 is 30.000000000000004)
    percent = round(progress*100, 1)
    print(
        "\rJob:\t\t " + job + "\nProgress: \t [{0}] {1}%".format("█"*block + "-"*(barLength-block), percent) + "\t" + progress_text,
        flush=True
    )
    with open(log_file, 'a') as f:
        print(
            f"\rJob:\t {job} \nResult:\t {progress_text}",
            file=f
        )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def raise_error(message, log_file, exit=False):
    """
    report a problem in the log and on the console.

    with exit=False the message is a warning and the run goes on,
    with exit=True it is an error and the program is terminated via
    sys.exit after the log entry was written
    """
    level = "ERROR" if exit else "WARNING"

    # log entry first, so that it is written even if we terminate
    with open(log_file, 'a') as f:
        print(f"{level}: {message}", file=f)

    # console output with highlighted level
    console_line = f"\033[31m\033[1m{level}:\033[0m {message}"
    if exit:
        sys.exit("\n" + console_line)
    print(console_line)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def raise_arg_errors(args, log_file):
    """
    validate the command line arguments.

    every check is a (condition, message, fatal) entry. The checks
    run in the listed order, fatal ones terminate the program via
    raise_error, the others are only reported as warnings.
    Reads threshold, allowed_ambiguous, opt_length, max_length and
    overlap from args.
    """
    # conditions are wrapped in lambdas so that a later check is only
    # evaluated if no earlier fatal check has terminated the program
    checks = (
        (
            lambda: args.threshold > 1 or args.threshold < 0,
            "threshold can only be between 0-1",
            True,
        ),
        (
            lambda: args.allowed_ambiguous < 0,
            "number of ambiguous chars can not be negative",
            True,
        ),
        (
            lambda: args.allowed_ambiguous > 4,
            "high number of ambiguous nucleotides in primer leads to a high "
            "degeneracy. Consider reducing.",
            False,
        ),
        (
            lambda: args.opt_length > args.max_length,
            "optimal length can not be higher than the maximum amplicon length.",
            True,
        ),
        (
            lambda: args.opt_length < 0 or args.max_length < 0,
            "amplicon lengths can not be negative.",
            True,
        ),
        (
            lambda: args.opt_length < 200 or args.max_length < 200,
            "your amplicon lengths might be to small. Consider increasing",
            False,
        ),
        (
            lambda: args.overlap < 0,
            "overlap size can not be negative.",
            True,
        ),
        (
            lambda: args.overlap < 50,
            "small overlaps might hinder downstream analyses. Consider increasing.",
            False,
        ),
        (
            lambda: args.overlap > args.max_length/2 - config.PRIMER_SIZES[1],
            "min overlap must be lower than half of your maximum length - maximum primer length. To achieve optimal results reduce it to at least half of your optimal length",
            True,
        ),
        (
            lambda: args.overlap > args.opt_length,
            "overlap can not be higher than your optimal length.",
            True,
        ),
        (
            lambda: args.overlap > args.opt_length/2,
            "your intended overlap is higher than half of your optimal length. This reduces how well varvamps will find overlapping amplicons. Consider decreasing.",
            False,
        ),
    )

    for is_violated, message, fatal in checks:
        if is_violated():
            raise_error(message, log_file, exit=fatal)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def confirm_config(args, log_file):
    """
    checks the config. raises errors and warnings
    if necessary. writes settings to log

    args: parsed arguments, opt_length, max_length, overlap,
    threshold and allowed_ambiguous are written to the log
    log_file: path of the log that warnings, errors and the
    settings are appended to

    the program is terminated (via raise_error with exit=True) if
    parameters are missing in the config or have invalid values
    """
    error = False
    var_dic = vars(config)

    # settings that are overwritten by command line arguments
    arg_vars = [
        "FREQUENCY_THRESHOLD",
        "PRIMER_ALLOWED_N_AMB",
        "AMPLICON_OPT_LENGTH",
        "AMPLICON_MAX_LENGTH",
        "AMPLICON_MIN_OVERLAP",
    ]
    # settings that can only be changed in the config itself
    config_vars = [
        "PRIMER_TMP",
        "PRIMER_GC_RANGE",
        "PRIMER_SIZES",
        "PRIMER_HAIRPIN",
        "PRIMER_MAX_POLYX",
        "PRIMER_MAX_DINUC_REPEATS",
        # both are read further down, so they have to be checked as well
        "PRIMER_MAX_GC_END",
        "PRIMER_GC_CLAMP",
        "PRIMER_MAX_DIMER_TMP",
        "PRIMER_MIN_3_WITHOUT_AMB",
        "PCR_MV_CONC",
        "PCR_DV_CONC",
        "PCR_DNTP_CONC",
        "PCR_DNA_CONC",
        "PRIMER_TM_PENALTY",
        "PRIMER_GC_PENALTY",
        "PRIMER_SIZE_PENALTY",
        "PRIMER_MAX_BASE_PENALTY",
        "PRIMER_3_PENALTY",
        "PRIMER_PERMUTATION_PENALTY",
    ]

    # check if all variables exist
    for var in arg_vars + config_vars:
        if var not in var_dic:
            raise_error(
                f"{var} does not exist in config!",
                log_file
            )
            error = True
    # exit if variables are not defined
    if error:
        raise_error(
            "config is missing parameters. Look at the above warnings!",
            log_file,
            exit=True
        )

    # confirm tuples
    tuples_to_check = [
        ("temp", config.PRIMER_TMP),
        ("gc", config.PRIMER_GC_RANGE),
        ("size", config.PRIMER_SIZES),
    ]
    for label, tup in tuples_to_check:
        if len(tup) != 3:
            raise_error(
                f"{label} tuple has to have the form (min, max, opt)!",
                log_file
            )
            error = True
            # the checks below index the tuple and would fail or be
            # meaningless for a tuple of the wrong length
            continue
        if tup[0] > tup[1]:
            raise_error(
                f"min {label} should not exeed max {label}!",
                log_file
            )
            error = True
        if tup[0] > tup[2]:
            raise_error(
                f"min {label} should not exeed opt {label}!",
                log_file
            )
            error = True
        if tup[2] > tup[1]:
            raise_error(
                f"opt {label} should not exeed max {label}!",
                log_file
            )
            error = True
        if any(value < 0 for value in tup):
            raise_error(
                f"{label} can not contain negative values!",
                log_file
            )
            error = True

    # check values that cannot be negative
    non_negative_var = [
        ("max polyx repeats", config.PRIMER_MAX_POLYX),
        ("max dinucleotide repeats", config.PRIMER_MAX_DINUC_REPEATS),
        ("max GCs at the 3' end", config.PRIMER_MAX_GC_END),
        ("GC clamp", config.PRIMER_GC_CLAMP),
        ("min number of 3 prime nucleotides without ambiguous nucleotides", config.PRIMER_MIN_3_WITHOUT_AMB),
        ("monovalent cation concentration", config.PCR_MV_CONC),
        ("divalent cation concentration", config.PCR_DV_CONC),
        ("dNTP concentration", config.PCR_DNTP_CONC),
        ("primer temperatur penalty", config.PRIMER_TM_PENALTY),
        ("primer gc penalty", config.PRIMER_GC_PENALTY),
        ("primer size penalty", config.PRIMER_SIZE_PENALTY),
        ("max base penalty", config.PRIMER_MAX_BASE_PENALTY),
        ("primer permutation penalty", config.PRIMER_PERMUTATION_PENALTY)
    ]
    for label, value in non_negative_var:
        if value < 0:
            raise_error(
                f"{label} can not be negative!",
                log_file
            )
            error = True
    if any(penalty < 0 for penalty in config.PRIMER_3_PENALTY):
        # message now matches the check (negative, not zero)
        raise_error(
            "3' penalties can not be negative!",
            log_file
        )
        error = True
    # exit if variables are not properly defined
    if error:
        raise_error(
            "config has flaws. Look at the above warnings!",
            log_file,
            exit=True
        )

    # specific warnings
    if config.PRIMER_HAIRPIN < 0:
        raise_error(
            "decreasing hairpin melting temp to negative values "
            "will influence successful primer search!",
            log_file
        )
    if config.PRIMER_MAX_DIMER_TMP < 0:
        raise_error(
            "there is no need to set max dimer melting temp below 0.",
            log_file
        )
    if config.PRIMER_MAX_BASE_PENALTY < 8:
        raise_error(
            "decreasing the base penalty will filter out more primers.",
            log_file
        )
    if config.PRIMER_GC_CLAMP > 3:
        raise_error(
            "large GC clamps will results in too high 3'end stability",
            log_file
        )
    if config.PRIMER_MAX_GC_END < 5 and config.PRIMER_MAX_GC_END < config.PRIMER_GC_CLAMP:
        raise_error(
            f"GC clamp of {config.PRIMER_GC_CLAMP} length will not be enforced as there are only {config.PRIMER_MAX_GC_END} gc characters allowed at the 3' end",
            log_file
        )
    if config.PRIMER_MAX_GC_END > 5:
        raise_error(
            "only the last 5 nucleotides of the 3' end are considered for GC 3'end calculation.",
            log_file
        )

    # write all settings to file
    # NOTE(review): the labels of the argument settings do not match the
    # config names in arg_vars (AMPLICON_*/FREQUENCY_THRESHOLD). They are
    # kept as they are in case the log is parsed elsewhere - confirm.
    with open(log_file, 'a') as f:
        print(
            "settings that can be adjusted via arguments\n",
            f"PRIMER_OPT_LENGTH = {args.opt_length}",
            f"PRIMER_MAX_LENGTH = {args.max_length}",
            f"PRIMER_MIN_OVERLAP = {args.overlap}",
            f"PRIMER_THRESHOLD = {args.threshold}",
            f"PRIMER_ALLOWED_N_AMB = {args.allowed_ambiguous}",
            "\nconfig settings\n",
            sep="\n",
            file=f
        )
        for var in config_vars:
            print(f"{var} = {var_dic[var]}", file=f)
        print("\nprogress\n", file=f)
|