varvamp 1.2.1__py3-none-any.whl → 1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varvamp/__init__.py +6 -3
- varvamp/command.py +134 -60
- varvamp/scripts/alignment.py +54 -164
- varvamp/scripts/default_config.py +5 -3
- varvamp/scripts/logging.py +67 -21
- varvamp/scripts/param_estimation.py +84 -62
- varvamp/scripts/primers.py +190 -46
- varvamp/scripts/qpcr.py +141 -117
- varvamp/scripts/reporting.py +45 -40
- varvamp/scripts/scheme.py +101 -52
- varvamp-1.3.dist-info/METADATA +760 -0
- varvamp-1.3.dist-info/RECORD +22 -0
- {varvamp-1.2.1.dist-info → varvamp-1.3.dist-info}/WHEEL +1 -1
- varvamp-1.3.dist-info/licenses/LICENSE +674 -0
- varvamp-1.2.1.dist-info/METADATA +0 -78
- varvamp-1.2.1.dist-info/RECORD +0 -21
- {varvamp-1.2.1.dist-info → varvamp-1.3.dist-info}/entry_points.txt +0 -0
- {varvamp-1.2.1.dist-info → varvamp-1.3.dist-info}/top_level.txt +0 -0

varvamp/scripts/default_config.py
CHANGED

@@ -9,7 +9,7 @@ __all__ = [
     'PCR_DNA_CONC', 'PCR_DNTP_CONC', 'PCR_DV_CONC', 'PCR_MV_CONC',
     'PRIMER_3_PENALTY', 'PRIMER_GC_END', 'PRIMER_GC_PENALTY',
     'PRIMER_GC_RANGE', 'PRIMER_HAIRPIN', 'PRIMER_MAX_BASE_PENALTY',
-    'PRIMER_MAX_DIMER_TMP', 'PRIMER_MAX_DINUC_REPEATS', 'PRIMER_MAX_POLYX',
+    'PRIMER_MAX_DIMER_TMP', 'PRIMER_MAX_DIMER_DELTAG', 'PRIMER_MAX_DINUC_REPEATS', 'PRIMER_MAX_POLYX',
     'PRIMER_MIN_3_WITHOUT_AMB', 'PRIMER_PERMUTATION_PENALTY',
     'PRIMER_SIZES', 'PRIMER_SIZE_PENALTY',
     'PRIMER_TMP', 'PRIMER_TM_PENALTY',

@@ -30,7 +30,10 @@ PRIMER_MAX_DINUC_REPEATS = 4 # max number of polyXY
 PRIMER_HAIRPIN = 47 # max melting temp for secondary structure
 PRIMER_GC_END = (1, 3) # min/max GCs in the last 5 bases of the 3' end
 PRIMER_MIN_3_WITHOUT_AMB = 3 # min len of 3' without ambiguous charaters
-
+# primer dimer constraints
+PRIMER_MAX_DIMER_TMP = 35 # max melting temp for dimers, lower temperature means more stringent filtering
+PRIMER_MAX_DIMER_DELTAG = -9000 # max allowed gibbs free energy for dimer formation, higher values mean more stringent filtering
+END_OVERLAP = 5 # maximum allowed nt overlap between ends of primers to be considered a dimer

 # QPCR parameters
 # basic probe parameters

@@ -42,7 +45,6 @@ QPROBE_GC_END = (0, 4)
 QPRIMER_DIFF = 2 # maximal temperature diff of qPCR primers
 QPROBE_TEMP_DIFF = (5, 10) # min/max temp diff between probe and primers
 QPROBE_DISTANCE = (4, 15) # min/max distance to the primer on the same strand
-END_OVERLAP = 5 # maximum allowed nt overlap between the ends of probe and primer
 QAMPLICON_LENGTH = (70, 200) # min/max length of the qPCR amplicon
 QAMPLICON_GC = (40, 60) # GC min/max of the qPCR amplicon
 QAMPLICON_DEL_CUTOFF = 4 # consider regions of the alignment for deltaG calculation if they have smaller deletions than cutoff
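
The dimer section added above replaces the qPCR-only END_OVERLAP setting with a general primer-dimer block: a predicted dimer is now judged by its melting temperature (PRIMER_MAX_DIMER_TMP), its Gibbs free energy (PRIMER_MAX_DIMER_DELTAG), and how many 3'-terminal bases of two primers may overlap before the pair is treated as a dimer (END_OVERLAP). As a rough sketch of how such thresholds can be combined, the snippet below uses primer3-py (which varVAMP builds on for melting-temperature and dimer calculations); the helper name and the exact pass/fail rule are illustrative, not varVAMP's actual filtering code:

# Illustrative only: forms_problematic_dimer() is a hypothetical helper, not part of varVAMP.
from primer3 import bindings

PRIMER_MAX_DIMER_TMP = 35        # degrees C, as in the new default_config.py
PRIMER_MAX_DIMER_DELTAG = -9000  # cal/mol, the unit primer3 reports deltaG in


def forms_problematic_dimer(primer1: str, primer2: str) -> bool:
    """Flag a pair whose predicted heterodimer is both stable and energetically favourable."""
    result = bindings.calc_heterodimer(primer1, primer2)
    return (
        result.structure_found
        and result.tm > PRIMER_MAX_DIMER_TMP
        and result.dg < PRIMER_MAX_DIMER_DELTAG
    )


# two perfectly complementary primers are an obvious positive control
print(forms_problematic_dimer("AGCGGATAACAATTTCACACAGG", "CCTGTGTGAAATTGTTATCCGCT"))

Because all three values live in default_config.py, tightening or relaxing the dimer filter is a configuration change rather than a code change.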
varvamp/scripts/logging.py
CHANGED
@@ -9,6 +9,7 @@ import shutil
 import datetime
 import random
 import statistics
+import re

 # varVAMP
 from varvamp.scripts import config

@@ -106,13 +107,12 @@ def raise_arg_errors(args, log_file):
             "degeneracy. Consider reducing.",
             log_file
         )
-    if args.
-
-
-
-
-
-        )
+    if args.threads < 1:
+        raise_error(
+            "number of threads cannot be smaller than 1.",
+            log_file,
+            exit=True
+        )
     if args.mode in ("tiled", "single"):
         if args.opt_length > args.max_length:
             raise_error(

@@ -147,14 +147,14 @@ def raise_arg_errors(args, log_file):
                 log_file,
                 exit=True
             )
-        if args.overlap <
+        if args.overlap < 10:
             raise_error(
                 "small overlaps might hinder downstream analyses. Consider increasing.",
                 log_file
             )
-        if args.overlap > args.max_length / 2 - config.PRIMER_SIZES[
+        if args.overlap > args.max_length / 2 - config.PRIMER_SIZES[0] * 2:
             raise_error(
-                "min overlap must be lower than
+                "min overlap must be lower than your maximum length / 2 - 2 * min primer length.",
                 log_file,
                 exit=True
             )

@@ -164,9 +164,9 @@ def raise_arg_errors(args, log_file):
                 log_file,
                 exit=True
             )
-        if args.overlap > args.opt_length / 2:
+        if args.overlap > args.opt_length / 2 - config.PRIMER_SIZES[0] * 2:
             raise_error(
-                "your
+                "your min overlap is lower than your optimal length / 2 - 2 * min primer length. This reduces how well varvamps will find overlapping amplicons. Consider decreasing.",
                 log_file
             )
     # QPCR specific warnings
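
Both overlap checks above now subtract twice the minimum primer length (config.PRIMER_SIZES[0]) from half of the maximum (hard error) and half of the optimal (warning) amplicon length, so the requested overlap always leaves room for primers on either side. A short worked example; the lengths below are illustrative, not varVAMP defaults:

# Illustrative values only: they show what the two overlap checks compare against.
max_length = 1000      # maximum amplicon length
opt_length = 700       # optimal amplicon length
min_primer_size = 18   # stands in for config.PRIMER_SIZES[0]

hard_limit = max_length / 2 - 2 * min_primer_size   # 464.0 -> larger overlaps raise an error
warn_limit = opt_length / 2 - 2 * min_primer_size   # 314.0 -> larger overlaps only trigger a warning

for overlap in (50, 400, 500):
    status = "error" if overlap > hard_limit else "warning" if overlap > warn_limit else "ok"
    print(overlap, status)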
@@ -243,6 +243,34 @@ def check_alignment_length(preprocessed_alignment, log_file):
         )


+def check_gaped_sequences(preprocessed_alignment, log_file):
+    """
+    checks the number of gaps in each sequence of the alignment and reports a warning
+    if the number of gaps is larger than the mean + 3std
+    """
+    number_of_gaps = {}
+
+    for seq in preprocessed_alignment:
+        # find all gaps for all sequences with regular expression -{min}
+        number_of_gaps[seq[0]] = len(list(re.finditer(r"-+", seq[1])))
+
+    # clac mean and std
+    mean_gaps, mean_gaps_std = statistics.mean(number_of_gaps.values()), statistics.stdev(number_of_gaps.values())
+
+    warning = []
+
+    for name, n_gaps in number_of_gaps.items():
+        if n_gaps < mean_gaps - 3 * mean_gaps_std:
+            warning.append(f"{name} ({n_gaps} gaps)\n")
+
+    if warning:
+        raise_error(
+            f"The following sequences contain less gaps than the alignment mean ({round(mean_gaps)} gaps) and might overproportionally gap the alignment:\n{"".join(warning)}",
+            log_file,
+            exit=False
+        )
+
+
 def confirm_config(args, log_file):
     """
     checks the config. raises error and warnings
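
The new check_gaped_sequences() counts runs of consecutive "-" characters per aligned sequence and warns about sequences whose gap-run count falls more than three standard deviations below the mean, because a sequence that itself contains few gaps (for example due to large insertions) may be the one forcing gaps into all the others. A small self-contained illustration of the same bookkeeping on a made-up alignment:

# Standalone toy example mirroring the gap-run counting above; the alignment is made up.
import re
import statistics

alignment = [(f"seq{i}", "atgc----atgcatgc--atgc") for i in range(30)]
alignment.append(("divergent", "atgcatgcatgcatgcatgcat"))

# one entry per run of consecutive gaps, i.e. a 4 nt deletion counts once
gap_runs = {name: len(list(re.finditer(r"-+", seq))) for name, seq in alignment}

mean_gaps = statistics.mean(gap_runs.values())
std_gaps = statistics.stdev(gap_runs.values())

# sequences with far fewer gap runs than the rest get reported
outliers = [name for name, n in gap_runs.items() if n < mean_gaps - 3 * std_gaps]
print(outliers)  # ['divergent'] with this toy data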
@@ -252,7 +280,7 @@ def confirm_config(args, log_file):

     # check if all variables exists
     all_vars = [
-        # arg independent
+        # arg independent all modes
         (
             "PRIMER_TMP",
             "PRIMER_GC_RANGE",

@@ -262,6 +290,7 @@ def confirm_config(args, log_file):
             "PRIMER_MAX_DINUC_REPEATS",
             "PRIMER_GC_END",
             "PRIMER_MAX_DIMER_TMP",
+            "PRIMER_MAX_DIMER_DELTAG",
             "PRIMER_MIN_3_WITHOUT_AMB",
             "PCR_MV_CONC",
             "PCR_DV_CONC",

@@ -272,7 +301,8 @@ def confirm_config(args, log_file):
             "PRIMER_SIZE_PENALTY",
             "PRIMER_MAX_BASE_PENALTY",
             "PRIMER_3_PENALTY",
-            "PRIMER_PERMUTATION_PENALTY"
+            "PRIMER_PERMUTATION_PENALTY",
+            "END_OVERLAP"
         ),
         # arg independent QPCR mode
         (

@@ -283,7 +313,6 @@ def confirm_config(args, log_file):
             "QPRIMER_DIFF",
             "QPROBE_TEMP_DIFF",
             "QPROBE_DISTANCE",
-            "END_OVERLAP",
             "QAMPLICON_LENGTH",
             "QAMPLICON_GC",
             "QAMPLICON_DEL_CUTOFF"

@@ -379,7 +408,7 @@ def confirm_config(args, log_file):
         ("max base penalty", config.PRIMER_MAX_BASE_PENALTY),
         ("primer permutation penalty", config.PRIMER_PERMUTATION_PENALTY),
         ("qpcr flanking primer difference", config.QPRIMER_DIFF),
-        ("
+        ("end overlap", config.END_OVERLAP),
         ("qpcr deletion size still considered for deltaG calculation", config.QAMPLICON_DEL_CUTOFF),
         ("maximum difference between primer and blast db", config.BLAST_MAX_DIFF),
         ("multiplier of the maximum length for non-specific amplicons", config.BLAST_SIZE_MULTI),

@@ -417,15 +446,20 @@ def confirm_config(args, log_file):
         )
     if config.PRIMER_HAIRPIN < 0:
         raise_error(
-            "decreasing hairpin melting temp to negative values "
-
+            "decreasing hairpin melting temp to negative values will influence successful primer search!",
+            log_file
+        )
+    if config.PRIMER_MAX_DIMER_TMP < 21:
+        raise_error(
+            "there is no need to set max dimer melting temp below room temperature.",
             log_file
         )
-    if config.
+    if config.PRIMER_MAX_DIMER_DELTAG > -6000:
         raise_error(
-            "
+            "primer interactions with deltaG values higher than -6000 are generally considered unproblematic.",
             log_file
         )
+
     if config.PRIMER_MAX_BASE_PENALTY < 8:
         raise_error(
             "decreasing the base penalty will filter out more primers.",

@@ -566,7 +600,6 @@ def goodbye_message():
     messages = [
         "Thank you. Come again.",
         ">Placeholder for your advertisement<",
-        "Make primers great again!",
         "Ciao cacao!",
         "And now lets pray to the PCR gods.",
         "**bibobibobop** task finished",

@@ -582,6 +615,19 @@ def goodbye_message():
         "Task failed successfully.",
         "Never gonna give you up, never gonna let you down.",
         "Have you tried turning it off and on again?",
+        "Just try it. PCR is magic!",
+        "One goat was sacrificed for this primer design to work.",
+        "You seem trustworthy. Here's a cookie!",
+        "Owwww yeah, primers done!",
+        "These primers were designed without AI assistance.",
+        "Research is fun (if you ignore the pipetting).",
+        "Balance your primers, balance your life.",
+        "Keep calm and PCR on.",
+        "In silico we trust.",
+        "May the primers be with you.",
+        "Designing primers like a boss!",
+        "Primer design completed. Time for a break!",
+        "Eureka! Your primers are ready.",
         "Look, I am your primer scheme.",
         "Quod erat demonstrandum.",
         "Miau?",

varvamp/scripts/param_estimation.py
CHANGED

@@ -1,36 +1,38 @@
 """
-estimate varVAMP threshold
+estimate varVAMP threshold if not given
 """

+# libs
+import numpy as np
+
 # varVAMP
 from varvamp.scripts import config


 def calculate_frequencies(preprocessed_alignment):
     """
-    calculate
+    calculate individual frequencies for a, t, c, g, and gaps at each position
+    returns a 2D numpy array where each column is a position and rows are [a, c, t, g, -]
     """
-
+    # convert alignment to numpy array
+    alignment_array = np.array([list(seq.lower()) for _, seq in preprocessed_alignment])
+    num_sequences, sequence_length = alignment_array.shape
+
+    # calculate frequencies
+    frequencies = np.zeros((5, sequence_length))
+    nucleotides = ['a', 'c', 't', 'g', '-']

-    for i in range(
-
-
-
-            "t": 0,
-            "g": 0,
-        }
-        for seq in preprocessed_alignment:
-            if seq[1][i] in nuc_dict:
-                nuc_dict[seq[1][i]] += 1
-        highest_freq = max(nuc_dict.values())
-        all_freqs.append(highest_freq/len(preprocessed_alignment))
+    for i in range(sequence_length):
+        column = alignment_array[:, i]
+        for nuc_idx, nuc in enumerate(nucleotides):
+            frequencies[nuc_idx, i] = np.count_nonzero(column == nuc) / num_sequences

-    return
+    return np.max(frequencies[0:4, :], axis=0), frequencies[4, :]


-def calculate_distances(
+def calculate_distances(highest_freq, deletion_freq, threshold):
     """
-    calc the distance for each
+    calc the distance for each nuc freq to the prior
     nuc freq that fell below the cutoff
     """

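
The rewritten calculate_frequencies() vectorises the per-column counting with numpy and returns two arrays: the highest single-base frequency per alignment column and the gap frequency per column. A minimal standalone version of the same idea on a toy alignment:

# Toy illustration of the per-column frequency arrays returned above.
import numpy as np

alignment = [("s1", "ATGCA-"), ("s2", "ATGAA-"), ("s3", "TTGCA-")]

alignment_array = np.array([list(seq.lower()) for _, seq in alignment])
num_sequences, sequence_length = alignment_array.shape

nucleotides = ["a", "c", "t", "g", "-"]
frequencies = np.array(
    [(alignment_array == nuc).sum(axis=0) / num_sequences for nuc in nucleotides]
)

highest_freq = frequencies[0:4, :].max(axis=0)  # best-supported base per column: 2/3, 1, 1, 2/3, 1, 0
deletion_freq = frequencies[4, :]               # gap fraction per column: 0, 0, 0, 0, 0, 1

print(highest_freq)
print(deletion_freq)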
@@ -38,11 +40,14 @@ def calculate_distances(all_freqs, threshold):
     previous_dis = 0
     all_dis = []

-    for idx, freq in enumerate(
+    for idx, (freq, del_freq) in enumerate(zip(highest_freq, deletion_freq)):
         if freq < threshold:
             current_dis = 0
+            # ignore gaps -> they will be excluded later and might skew distance calc
+            if del_freq > 1-threshold:
+                continue
         current_dis += 1
-        if current_dis <= previous_dis or idx == len(
+        if current_dis <= previous_dis or idx == len(highest_freq)-1:
             all_dis.append(previous_dis)

         previous_dis = current_dis

@@ -54,53 +59,70 @@ def get_parameters(preprocessed_alignment, args, log_file):
     """
     give an estimate for number of ambiguous chars and/or threshold
     writes to log file
+
+    How it works:
+
+    - calculate a np array with frequencies per position for ATCG and gaps (-)
+    - increment for different threshold starting at 0.1
+    - for each increment calculate:
+        - the lengths of conseq. nucleotides that are >= threshold
+        - skip all positions with gaps >= 1 - threshold
+        - this results in an array of stretch lengths
+    - Then check for each stretch if it could be considered as a primer region:
+        e.g. 4 ambiguous bases:
+        stretches: [1,1,1,1,6,10,5,3,1,1]
+    - first check for the current stretch if the sum of the next n stretches (with n being
+      the n ambiguous bases + 1) is larger than the minimal length of a primer
+    - this would result that the with stars marked positions would pass:
+        [1,1,1,1*,6*,10*,5,3,1,1]
+    - now calculate the distance between the current start of a stretch and the previous stop
+    - if this is larger than the minimal amplicon length - exit the optimization
+    - reset the threshold to second prior iteration to make it more robust (allow more primer regions) as
+      sometimes the optimization would fail with just one iteration
+
+    Manual optimization can be beneficial in some cases.
     """
     # set coverage to max
-
-    #
-
-    # if no args for both threshold and n_ambig are given
-    # set the n_ambig to 2 and optimize threshold
-    if args.threshold is None:
-        args.threshold = 0.1
-        if args.n_ambig is None:
-            args.n_ambig = 2
-        text = f"AUTOMATIC PARAMETER SELECTION\nvarVAMP estimates the threshold at {args.n_ambig} ambiguous bases"
-        fixed = False
-    # if threshold is given, optimize n_ambig (number of ambiguous chars)
-    else:
-        args.n_ambig = config.PRIMER_SIZES[0]
-        text = f"varVAMP estimates the number of ambiguous bases at a threshold of {args.threshold}"
-        fixed = True
+    args.threshold = 0.1
+    # calc freqs
+    highest_frequencies, deletion_frequency = calculate_frequencies(preprocessed_alignment)
     # write to log
     with open(log_file, 'a') as f:
-        print(f"
-
-        #
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
+        print(f"AUTOMATIC THRESHOLD SELECTION\n", file=f)
+        print(f"-t\tmaximum non-covered region", file=f)
+        # calculate distance between passing potential primer regions
+        previous_stop = 0
+        while args.threshold < 1:
+            distances = calculate_distances(highest_frequencies, deletion_frequency, args.threshold)
+            max_distance_between_passing, previous_passing = 0, 0
+            # check if the distance between potential primer regions is not larger than:
+            # args.opt_length - 2 * args.overlap
+            for idx, dis in enumerate(distances):
+                if sum(distances[idx:idx + 1 + args.n_ambig]) >= config.PRIMER_SIZES[1]:
+                    # the stretch start in the gap-excluded alignment is the sum of all prior distances including the current
+                    # minus the distance of the current stretch
+                    stretch_start = sum(distances[:idx])
+                    # then the distance between the prior stop and current start is calculated
+                    current_dis = stretch_start - previous_stop
+                    # and the max is updated if necessary
+                    if max_distance_between_passing < current_dis:
+                        max_distance_between_passing = current_dis
+                    # update previous stop position
+                    previous_stop = stretch_start + distances[idx]
+            # write each iteration to log
+            print(round(args.threshold, 2), max_distance_between_passing, sep="\t", file=f)
+            # check if the distance is acceptable
+            distance_threshold = args.opt_length - 2 * args.overlap if args.mode == 'tiled' else args.opt_length
+            if max_distance_between_passing < distance_threshold:
+                # never exceed 0.99
+                if args.threshold < 0.99:
                     args.threshold += 0.01
-            # or reset to the param of the prior iteration
-            else:
-                if fixed:
-                    args.n_ambig += 1
                 else:
-
+                    break
+            # or reset to the param of the two previous iterations
+            else:
+                args.threshold -= 0.02
                 break
-        print(f"Automatic parameter selection set -t {round(args.threshold, 2)}
+        print(f"Automatic parameter selection set -t {round(args.threshold, 2)} at -a {args.n_ambig}.", file=f)

-    return round(args.threshold, 2)
+    return round(args.threshold, 2)