varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,7 @@ __all__ = [
9
9
  'PCR_DNA_CONC', 'PCR_DNTP_CONC', 'PCR_DV_CONC', 'PCR_MV_CONC',
10
10
  'PRIMER_3_PENALTY', 'PRIMER_GC_END', 'PRIMER_GC_PENALTY',
11
11
  'PRIMER_GC_RANGE', 'PRIMER_HAIRPIN', 'PRIMER_MAX_BASE_PENALTY',
12
- 'PRIMER_MAX_DIMER_TMP', 'PRIMER_MAX_DINUC_REPEATS', 'PRIMER_MAX_POLYX',
12
+ 'PRIMER_MAX_DIMER_TMP', 'PRIMER_MAX_DIMER_DELTAG', 'PRIMER_MAX_DINUC_REPEATS', 'PRIMER_MAX_POLYX',
13
13
  'PRIMER_MIN_3_WITHOUT_AMB', 'PRIMER_PERMUTATION_PENALTY',
14
14
  'PRIMER_SIZES', 'PRIMER_SIZE_PENALTY',
15
15
  'PRIMER_TMP', 'PRIMER_TM_PENALTY',
@@ -30,7 +30,10 @@ PRIMER_MAX_DINUC_REPEATS = 4 # max number of polyXY
30
30
  PRIMER_HAIRPIN = 47 # max melting temp for secondary structure
31
31
  PRIMER_GC_END = (1, 3) # min/max GCs in the last 5 bases of the 3' end
32
32
  PRIMER_MIN_3_WITHOUT_AMB = 3 # min len of 3' without ambiguous characters
33
- PRIMER_MAX_DIMER_TMP = 47 # max melting temp for dimers (homo- or heterodimers)
33
+ # primer dimer constraints
34
+ PRIMER_MAX_DIMER_TMP = 35 # max melting temp for dimers, lower temperature means more stringent filtering
35
+ PRIMER_MAX_DIMER_DELTAG = -9000 # max allowed gibbs free energy for dimer formation, higher values mean more stringent filtering
36
+ END_OVERLAP = 5 # maximum allowed nt overlap between ends of primers to be considered a dimer
34
37
 
35
38
  # QPCR parameters
36
39
  # basic probe parameters
@@ -42,7 +45,6 @@ QPROBE_GC_END = (0, 4)
42
45
  QPRIMER_DIFF = 2 # maximal temperature diff of qPCR primers
43
46
  QPROBE_TEMP_DIFF = (5, 10) # min/max temp diff between probe and primers
44
47
  QPROBE_DISTANCE = (4, 15) # min/max distance to the primer on the same strand
45
- END_OVERLAP = 5 # maximum allowed nt overlap between the ends of probe and primer
46
48
  QAMPLICON_LENGTH = (70, 200) # min/max length of the qPCR amplicon
47
49
  QAMPLICON_GC = (40, 60) # GC min/max of the qPCR amplicon
48
50
  QAMPLICON_DEL_CUTOFF = 4 # consider regions of the alignment for deltaG calculation if they have smaller deletions than cutoff
@@ -9,6 +9,7 @@ import shutil
9
9
  import datetime
10
10
  import random
11
11
  import statistics
12
+ import re
12
13
 
13
14
  # varVAMP
14
15
  from varvamp.scripts import config
@@ -106,13 +107,12 @@ def raise_arg_errors(args, log_file):
106
107
  "degeneracy. Consider reducing.",
107
108
  log_file
108
109
  )
109
- if args.database is not None:
110
- if args.threads < 1:
111
- raise_error(
112
- "number of threads cannot be smaller than 1.",
113
- log_file,
114
- exit=True
115
- )
110
+ if args.threads < 1:
111
+ raise_error(
112
+ "number of threads cannot be smaller than 1.",
113
+ log_file,
114
+ exit=True
115
+ )
116
116
  if args.mode in ("tiled", "single"):
117
117
  if args.opt_length > args.max_length:
118
118
  raise_error(
@@ -152,9 +152,9 @@ def raise_arg_errors(args, log_file):
152
152
  "small overlaps might hinder downstream analyses. Consider increasing.",
153
153
  log_file
154
154
  )
155
- if args.overlap > args.max_length / 2 - config.PRIMER_SIZES[1]:
155
+ if args.overlap > args.max_length / 2 - config.PRIMER_SIZES[0] * 2:
156
156
  raise_error(
157
- "min overlap must be lower than half of your maximum length - maximum primer length. To achieve optimal results reduce it to at least half of your optimal length",
157
+ "min overlap must be lower than your maximum length / 2 - 2 * min primer length.",
158
158
  log_file,
159
159
  exit=True
160
160
  )
@@ -164,9 +164,9 @@ def raise_arg_errors(args, log_file):
164
164
  log_file,
165
165
  exit=True
166
166
  )
167
- if args.overlap > args.opt_length / 2:
167
+ if args.overlap > args.opt_length / 2 - config.PRIMER_SIZES[0] * 2:
168
168
  raise_error(
169
- "your intended overlap is higher than half of your optimal length. This reduces how well varvamps will find overlapping amplicons. Consider decreasing.",
169
+ "your min overlap is lower than your optimal length / 2 - 2 * min primer length. This reduces how well varvamps will find overlapping amplicons. Consider decreasing.",
170
170
  log_file
171
171
  )
172
172
  # QPCR specific warnings
@@ -243,6 +243,34 @@ def check_alignment_length(preprocessed_alignment, log_file):
243
243
  )
244
244
 
245
245
 
246
+ def check_gaped_sequences(preprocessed_alignment, log_file):
247
+ """
248
+ checks the number of gaps in each sequence of the alignment and reports a warning
249
+ if the number of gaps is smaller than the mean - 3std
250
+ """
251
+ number_of_gaps = {}
252
+
253
+ for seq in preprocessed_alignment:
254
+ # count the gap stretches (runs of one or more '-') in each sequence with a regular expression
255
+ number_of_gaps[seq[0]] = len(list(re.finditer(r"-+", seq[1])))
256
+
257
+ # calc mean and std
258
+ mean_gaps, mean_gaps_std = statistics.mean(number_of_gaps.values()), statistics.stdev(number_of_gaps.values())
259
+
260
+ warning = []
261
+
262
+ for name, n_gaps in number_of_gaps.items():
263
+ if n_gaps < mean_gaps - 3 * mean_gaps_std:
264
+ warning.append(f"{name} ({n_gaps} gaps)\n")
265
+
266
+ if warning:
267
+ raise_error(
268
+ f"The following sequences contain less gaps than the alignment mean ({round(mean_gaps)} gaps) and might overproportionally gap the alignment:\n{"".join(warning)}",
269
+ log_file,
270
+ exit=False
271
+ )
272
+
273
+
246
274
  def confirm_config(args, log_file):
247
275
  """
248
276
  checks the config. raises error and warnings
@@ -252,7 +280,7 @@ def confirm_config(args, log_file):
252
280
 
253
281
  # check if all variables exists
254
282
  all_vars = [
255
- # arg independent TILED, SINGLE mode
283
+ # arg independent all modes
256
284
  (
257
285
  "PRIMER_TMP",
258
286
  "PRIMER_GC_RANGE",
@@ -262,6 +290,7 @@ def confirm_config(args, log_file):
262
290
  "PRIMER_MAX_DINUC_REPEATS",
263
291
  "PRIMER_GC_END",
264
292
  "PRIMER_MAX_DIMER_TMP",
293
+ "PRIMER_MAX_DIMER_DELTAG",
265
294
  "PRIMER_MIN_3_WITHOUT_AMB",
266
295
  "PCR_MV_CONC",
267
296
  "PCR_DV_CONC",
@@ -272,7 +301,8 @@ def confirm_config(args, log_file):
272
301
  "PRIMER_SIZE_PENALTY",
273
302
  "PRIMER_MAX_BASE_PENALTY",
274
303
  "PRIMER_3_PENALTY",
275
- "PRIMER_PERMUTATION_PENALTY"
304
+ "PRIMER_PERMUTATION_PENALTY",
305
+ "END_OVERLAP"
276
306
  ),
277
307
  # arg independent QPCR mode
278
308
  (
@@ -283,7 +313,6 @@ def confirm_config(args, log_file):
283
313
  "QPRIMER_DIFF",
284
314
  "QPROBE_TEMP_DIFF",
285
315
  "QPROBE_DISTANCE",
286
- "END_OVERLAP",
287
316
  "QAMPLICON_LENGTH",
288
317
  "QAMPLICON_GC",
289
318
  "QAMPLICON_DEL_CUTOFF"
@@ -379,7 +408,7 @@ def confirm_config(args, log_file):
379
408
  ("max base penalty", config.PRIMER_MAX_BASE_PENALTY),
380
409
  ("primer permutation penalty", config.PRIMER_PERMUTATION_PENALTY),
381
410
  ("qpcr flanking primer difference", config.QPRIMER_DIFF),
382
- ("probe and primer end overlap", config.END_OVERLAP),
411
+ ("end overlap", config.END_OVERLAP),
383
412
  ("qpcr deletion size still considered for deltaG calculation", config.QAMPLICON_DEL_CUTOFF),
384
413
  ("maximum difference between primer and blast db", config.BLAST_MAX_DIFF),
385
414
  ("multiplier of the maximum length for non-specific amplicons", config.BLAST_SIZE_MULTI),
@@ -417,15 +446,20 @@ def confirm_config(args, log_file):
417
446
  )
418
447
  if config.PRIMER_HAIRPIN < 0:
419
448
  raise_error(
420
- "decreasing hairpin melting temp to negative values "
421
- "will influence successful primer search!",
449
+ "decreasing hairpin melting temp to negative values will influence successful primer search!",
450
+ log_file
451
+ )
452
+ if config.PRIMER_MAX_DIMER_TMP < 21:
453
+ raise_error(
454
+ "there is no need to set max dimer melting temp below room temperature.",
422
455
  log_file
423
456
  )
424
- if config.PRIMER_MAX_DIMER_TMP < 0:
457
+ if config.PRIMER_MAX_DIMER_DELTAG > -6000:
425
458
  raise_error(
426
- "there is no need to set max dimer melting temp below 0.",
459
+ "primer interactions with deltaG values higher than -6000 are generally considered unproblematic.",
427
460
  log_file
428
461
  )
462
+
429
463
  if config.PRIMER_MAX_BASE_PENALTY < 8:
430
464
  raise_error(
431
465
  "decreasing the base penalty will filter out more primers.",
@@ -566,7 +600,6 @@ def goodbye_message():
566
600
  messages = [
567
601
  "Thank you. Come again.",
568
602
  ">Placeholder for your advertisement<",
569
- "Make primers great again!",
570
603
  "Ciao cacao!",
571
604
  "And now lets pray to the PCR gods.",
572
605
  "**bibobibobop** task finished",
@@ -582,6 +615,19 @@ def goodbye_message():
582
615
  "Task failed successfully.",
583
616
  "Never gonna give you up, never gonna let you down.",
584
617
  "Have you tried turning it off and on again?",
618
+ "Just try it. PCR is magic!",
619
+ "One goat was sacrificed for this primer design to work.",
620
+ "You seem trustworthy. Here's a cookie!",
621
+ "Owwww yeah, primers done!",
622
+ "These primers were designed without AI assistance.",
623
+ "Research is fun (if you ignore the pipetting).",
624
+ "Balance your primers, balance your life.",
625
+ "Keep calm and PCR on.",
626
+ "In silico we trust.",
627
+ "May the primers be with you.",
628
+ "Designing primers like a boss!",
629
+ "Primer design completed. Time for a break!",
630
+ "Eureka! Your primers are ready.",
585
631
  "Look, I am your primer scheme.",
586
632
  "Quod erat demonstrandum.",
587
633
  "Miau?",
@@ -1,36 +1,38 @@
1
1
  """
2
- estimate varVAMP threshold and max n of ambiguous bases if none are given
2
+ estimate varVAMP threshold if not given
3
3
  """
4
4
 
5
+ # libs
6
+ import numpy as np
7
+
5
8
  # varVAMP
6
9
  from varvamp.scripts import config
7
10
 
8
11
 
9
12
  def calculate_frequencies(preprocessed_alignment):
10
13
  """
11
- calculate the max nucleotide freq at each pos
14
+ calculate individual frequencies for a, t, c, g, and gaps at each position
15
+ returns two 1D numpy arrays: the highest frequency among a, c, t, g and the gap (-) frequency at each position
12
16
  """
13
- all_freqs = []
17
+ # convert alignment to numpy array
18
+ alignment_array = np.array([list(seq.lower()) for _, seq in preprocessed_alignment])
19
+ num_sequences, sequence_length = alignment_array.shape
20
+
21
+ # calculate frequencies
22
+ frequencies = np.zeros((5, sequence_length))
23
+ nucleotides = ['a', 'c', 't', 'g', '-']
14
24
 
15
- for i in range(0, len(preprocessed_alignment[0][1])):
16
- nuc_dict = {
17
- "a": 0,
18
- "c": 0,
19
- "t": 0,
20
- "g": 0,
21
- }
22
- for seq in preprocessed_alignment:
23
- if seq[1][i] in nuc_dict:
24
- nuc_dict[seq[1][i]] += 1
25
- highest_freq = max(nuc_dict.values())
26
- all_freqs.append(highest_freq/len(preprocessed_alignment))
25
+ for i in range(sequence_length):
26
+ column = alignment_array[:, i]
27
+ for nuc_idx, nuc in enumerate(nucleotides):
28
+ frequencies[nuc_idx, i] = np.count_nonzero(column == nuc) / num_sequences
27
29
 
28
- return all_freqs
30
+ return np.max(frequencies[0:4, :], axis=0), frequencies[4, :]
29
31
 
30
32
 
31
- def calculate_distances(all_freqs, threshold):
33
+ def calculate_distances(highest_freq, deletion_freq, threshold):
32
34
  """
33
- calc the distance for each nuc freq to the prior
35
+ calc the distance for each nuc freq to the prior
34
36
  nuc freq that fell below the cutoff
35
37
  """
36
38
 
@@ -38,11 +40,14 @@ def calculate_distances(all_freqs, threshold):
38
40
  previous_dis = 0
39
41
  all_dis = []
40
42
 
41
- for idx, freq in enumerate(all_freqs):
43
+ for idx, (freq, del_freq) in enumerate(zip(highest_freq, deletion_freq)):
42
44
  if freq < threshold:
43
45
  current_dis = 0
46
+ # ignore gaps -> they will be excluded later and might skew distance calc
47
+ if del_freq > 1-threshold:
48
+ continue
44
49
  current_dis += 1
45
- if current_dis <= previous_dis or idx == len(all_freqs)-1:
50
+ if current_dis <= previous_dis or idx == len(highest_freq)-1:
46
51
  all_dis.append(previous_dis)
47
52
 
48
53
  previous_dis = current_dis
@@ -54,53 +59,70 @@ def get_parameters(preprocessed_alignment, args, log_file):
54
59
  """
55
60
  give an estimate for number of ambiguous chars and/or threshold
56
61
  writes to log file
62
+
63
+ How it works:
64
+
65
+ - calculate a np array with frequencies per position for ATCG and gaps (-)
66
+ - increment for different threshold starting at 0.1
67
+ - for each increment calculate:
68
+ - the lengths of conseq. nucleotides that are >= threshold
69
+ - skip all positions with gaps >= 1 - threshold
70
+ - this results in an array of stretch lengths
71
+ - Then check for each stretch if it could be considered as a primer region:
72
+ e.g. 4 ambiguous bases:
73
+ stretches: [1,1,1,1,6,10,5,3,1,1]
74
+ - first check for the current stretch if the sum of the next n stretches (with n being
75
+ the n ambiguous bases + 1) is larger than the minimal length of a primer
76
+ - as a result, the positions marked with stars would pass:
77
+ [1,1,1,1*,6*,10*,5,3,1,1]
78
+ - now calculate the distance between the current start of a stretch and the previous stop
79
+ - if this is larger than the minimal amplicon length - exit the optimization
80
+ - reset the threshold to second prior iteration to make it more robust (allow more primer regions) as
81
+ sometimes the optimization would fail with just one iteration
82
+
83
+ Manual optimization can be beneficial in some cases.
57
84
  """
58
85
  # set coverage to max
59
- coverage = 1
60
- # read in the alignment and calc freqs
61
- frequencies = calculate_frequencies(preprocessed_alignment)
62
- # if no args for both threshold and n_ambig are given
63
- # set the n_ambig to 2 and optimize threshold
64
- if args.threshold is None:
65
- args.threshold = 0.1
66
- if args.n_ambig is None:
67
- args.n_ambig = 2
68
- text = f"AUTOMATIC PARAMETER SELECTION\nvarVAMP estimates the threshold at {args.n_ambig} ambiguous bases"
69
- fixed = False
70
- # if threshold is given, optimize n_ambig (number of ambiguous chars)
71
- else:
72
- args.n_ambig = config.PRIMER_SIZES[0]
73
- text = f"varVAMP estimates the number of ambiguous bases at a threshold of {args.threshold}"
74
- fixed = True
86
+ args.threshold = 0.1
87
+ # calc freqs
88
+ highest_frequencies, deletion_frequency = calculate_frequencies(preprocessed_alignment)
75
89
  # write to log
76
90
  with open(log_file, 'a') as f:
77
- print(f"{text}\nto consider ~50% of the alignment for potential primers:\n\n-t\t-a\testimated coverage", file=f)
78
-
79
- # optimize until less than 50 % is covered
80
- while coverage >= 0.5 and args.threshold < 1:
81
- distances = calculate_distances(frequencies, args.threshold)
82
- # calculate the cummulative sum of the sum of n conseq. streches
83
- # that are together larger than the min primer length
84
- covered_pos = sum(
85
- [distances[x] for x in range(0, len(distances)) if sum(distances[x:x+args.n_ambig+1]) >= config.PRIMER_SIZES[0]]
86
- )
87
- # calculate coverage
88
- coverage = (covered_pos+1)/len(preprocessed_alignment[0][1])
89
- # change the non fixed param if threshold has not been reached
90
- if coverage >= 0.5:
91
- # write each iteration to log
92
- print(round(args.threshold, 2), args.n_ambig, round(coverage*100, 1), sep="\t", file=f)
93
- if fixed:
94
- args.n_ambig -= 1
95
- else:
91
+ print(f"AUTOMATIC THRESHOLD SELECTION\n", file=f)
92
+ print(f"-t\tmaximum non-covered region", file=f)
93
+ # calculate distance between passing potential primer regions
94
+ previous_stop = 0
95
+ while args.threshold < 1:
96
+ distances = calculate_distances(highest_frequencies, deletion_frequency, args.threshold)
97
+ max_distance_between_passing, previous_passing = 0, 0
98
+ # check if the distance between potential primer regions is not larger than:
99
+ # args.opt_length - 2 * args.overlap
100
+ for idx, dis in enumerate(distances):
101
+ if sum(distances[idx:idx + 1 + args.n_ambig]) >= config.PRIMER_SIZES[1]:
102
+ # the stretch start in the gap-excluded alignment is the sum of all prior distances including the current
103
+ # minus the distance of the current stretch
104
+ stretch_start = sum(distances[:idx])
105
+ # then the distance between the prior stop and current start is calculated
106
+ current_dis = stretch_start - previous_stop
107
+ # and the max is updated if necessary
108
+ if max_distance_between_passing < current_dis:
109
+ max_distance_between_passing = current_dis
110
+ # update previous stop position
111
+ previous_stop = stretch_start + distances[idx]
112
+ # write each iteration to log
113
+ print(round(args.threshold, 2), max_distance_between_passing, sep="\t", file=f)
114
+ # check if the distance is acceptable
115
+ distance_threshold = args.opt_length - 2 * args.overlap if args.mode == 'tiled' else args.opt_length
116
+ if max_distance_between_passing < distance_threshold:
117
+ # never exceed 0.99
118
+ if args.threshold < 0.99:
96
119
  args.threshold += 0.01
97
- # or reset to the param of the prior iteration
98
- else:
99
- if fixed:
100
- args.n_ambig += 1
101
120
  else:
102
- args.threshold -= 0.01
121
+ break
122
+ # or reset to the param of the two previous iterations
123
+ else:
124
+ args.threshold -= 0.02
103
125
  break
104
- print(f"Automatic parameter selection set -t {round(args.threshold, 2)} and -a {args.n_ambig}.", file=f)
126
+ print(f"Automatic parameter selection set -t {round(args.threshold, 2)} at -a {args.n_ambig}.", file=f)
105
127
 
106
- return round(args.threshold, 2), args.n_ambig
128
+ return round(args.threshold, 2)