PyPI - varvamp - Versions diffs - 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl - Mend

varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

varvamp/__init__.py +6 -3
varvamp/command.py +131 -57
varvamp/scripts/alignment.py +54 -164
varvamp/scripts/default_config.py +5 -3
varvamp/scripts/logging.py +66 -20
varvamp/scripts/param_estimation.py +84 -62
varvamp/scripts/primers.py +190 -46
varvamp/scripts/qpcr.py +141 -117
varvamp/scripts/reporting.py +41 -34
varvamp/scripts/scheme.py +101 -52
varvamp-1.3.dist-info/METADATA +760 -0
varvamp-1.3.dist-info/RECORD +22 -0
{varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/WHEEL +1 -1
varvamp-1.3.dist-info/licenses/LICENSE +674 -0
varvamp-1.2.2.dist-info/METADATA +0 -87
varvamp-1.2.2.dist-info/RECORD +0 -21
{varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/entry_points.txt +0 -0
{varvamp-1.2.2.dist-info → varvamp-1.3.dist-info}/top_level.txt +0 -0

varvamp/scripts/reporting.py CHANGED Viewed

@@ -4,7 +4,6 @@ data writing and visualization.
 # BUILT-INS
 import os
 import math
-import itertools
 # LIBS
 import pandas as pd
@@ -94,20 +93,6 @@ def write_all_primers(path, scheme_name, all_primers):
             write_primers_to_bed(outfile, scheme_name, primer, all_primers[direction][primer], round(all_primers[direction][primer][3], 2), direction)
-def get_permutations(seq):
-    """
-    get all permutations of an ambiguous sequence. needed to
-    correctly report the gc and the temperature.
-    """
-    groups = itertools.groupby(seq, lambda char: char not in config.AMBIG_NUCS)
-    splits = []
-    for b, group in groups:
-        if b:
-            splits.extend([[g] for g in group])
-        else:
-            for nuc in group:
-                splits.append(config.AMBIG_NUCS[nuc])
-    return[''.join(p) for p in itertools.product(*splits)]
 def calc_mean_stats(permutations):
@@ -190,7 +175,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
                 else:
                     direction = "+"
-                permutations = get_permutations(seq)
+                permutations = primers.get_permutations(seq)
                 gc, temp = calc_mean_stats(permutations)
                 primer_name = f"{amp_name}_{oligo_type}"
@@ -224,7 +209,7 @@ def write_qpcr_to_files(path, final_schemes, ambiguous_consensus, scheme_name, l
                 print(f">{primer_name}\n{seq.upper()}", file=fasta)
-def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file):
+def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_name, mode, log_file, primer_dimers=None):
     """
     write all relevant bed files and a tsv file with all primer stats
     """
@@ -233,6 +218,9 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
     amplicon_bed_file = os.path.join(path, "amplicons.bed")
     tabular_file = os.path.join(path, "primer_to_amplicon_assignment.tabular")
+    # Map old primer names to new amplicon-based names
+    name_mapping = {}
     # open files to write
     with open(tsv_file, "w") as tsv, open(amplicon_bed_file, "w") as bed, open(tabular_file, "w") as tabular:
         # write header for primer tsv
@@ -248,11 +236,11 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
             if mode == "single":
                 primer_fasta_file = os.path.join(path, "primers.fasta")
             else:
-                primer_fasta_file = os.path.join(path, f"primers_pool_{pool+1}.fasta")
+                primer_fasta_file = os.path.join(path, f"primers_pool_{pool + 1}.fasta")
             with open(primer_fasta_file, "w") as primer_fasta:
                 for counter, amp in enumerate(amplicon_scheme[pool::len(pools)]):
                     # give a new amplicon name
-                    amplicon_index = counter*len(pools) + pool
+                    amplicon_index = counter * len(pools) + pool
                     amp_name = f"{scheme_name}_{amplicon_index}"
                     # get left and right primers and their names
                     amp_length = amp["RIGHT"][2] - amp["LEFT"][1]
@@ -266,7 +254,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                         amplicon_has_off_target = "n.d."
                     # write amplicon bed
                     if mode == "tiled":
-                        bed_score = pool+1
+                        bed_score = pool + 1
                     elif mode == "single":
                         bed_score = round(amp["LEFT"][3] + amp["RIGHT"][3], 1)
                     amplicon_bed_records.append(
@@ -284,6 +272,10 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                             (f"{amp_name}_LEFT", f"{amp_name}_RIGHT")
                         )
                     )
+                    # Build name mapping for dimers
+                    name_mapping[amp["LEFT"][-1]] = f"{amp_name}_LEFT"
+                    name_mapping[amp["RIGHT"][-1]] = f"{amp_name}_RIGHT"
                     # write primer tsv and primer bed
                     for direction, primer in [("+", amp["LEFT"]), ("-", amp["RIGHT"])]:
                         seq = ambiguous_consensus[primer[1]:primer[2]]
@@ -295,7 +287,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                         # write primers to fasta pool file
                         print(f">{primer_name}\n{seq.upper()}", file=primer_fasta)
                         # calc primer parameters for all permutations
-                        permutations = get_permutations(seq)
+                        permutations = primers.get_permutations(seq)
                         gc, temp = calc_mean_stats(permutations)
                         # write tsv file
                         print(
@@ -303,7 +295,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                             amp_length,
                             primer_name,
                             primer[-1],
-                            pool+1,
+                            pool + 1,
                             primer[1] + 1,
                             primer[2],
                             seq.upper(),
@@ -321,7 +313,7 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                             (
                                 # will need amplicon_index for sorting
                                 amplicon_index,
-                                (primer_name, primer, pool+1, direction, seq.upper())
+                                (primer_name, primer, pool + 1, direction, seq.upper())
                             )
                         )
         # write amplicon bed with amplicons sorted by start position
@@ -348,26 +340,41 @@ def write_scheme_to_files(path, amplicon_scheme, ambiguous_consensus, scheme_nam
                 *record[1]
             )
+    # Write dimers with renamed primers
+    if primer_dimers:
+        write_dimers(path, primer_dimers, name_mapping)
-def write_dimers(path, primer_dimers):
+def write_dimers(path, primer_dimers, name_mapping):
     """
     write dimers for which no replacement was found to file
     """
-    tsv_file = os.path.join(path, "unsolvable_primer_dimers.tsv")
-    with open(tsv_file, "w") as tsv:
-        print(
-            "pool\tprimer_name_1\tprimer_name_2\tdimer melting temp",
-            file=tsv
-        )
+    file = os.path.join(path, "unsolvable_primer_dimers.txt")
+    with open(file, "w") as f:
         for pool, primer1, primer2 in primer_dimers:
+            dimer_result = primers.calc_dimer(primer1[2][0], primer2[2][0], structure=True)
+            print(
+                "pool\tprimer 1\tprimer 2\tdimer melting temp\tdeltaG",
+                file=f
+            )
             print(
                 pool+1,
-                primer1[1],
-                primer2[1],
-                round(primers.calc_dimer(primer1[2][0], primer2[2][0]).tm, 1),
+                name_mapping[primer1[1]],
+                name_mapping[primer2[1]],
+                round(dimer_result.tm, 1),
+                dimer_result.dg,
                 sep="\t",
-                file=tsv
+                file=f
             )
+            structure = [x[4:] for x in dimer_result.ascii_structure_lines]
+            print("\nDimer structure:", file=f)
+            for line in structure:
+                print(
+                    line,
+                    file=f
+                )
+            print(file=f)
 def entropy(chars, states):
     """

varvamp/scripts/scheme.py CHANGED Viewed

@@ -5,6 +5,8 @@ amplicon search
 # BUILT-INS
 import heapq
 import math
+import multiprocessing
+import functools
 # varVAMP
 from varvamp.scripts import config, primers
@@ -73,7 +75,7 @@ def find_amplicons(all_primers, opt_len, max_len):
             amplicon_length = right_primer[2] - left_primer[1]
             if not opt_len <= amplicon_length <= max_len:
                 continue
-            if primers.calc_dimer(left_primer[0], right_primer[0]).tm > config.PRIMER_MAX_DIMER_TMP:
+            if primers.is_dimer(left_primer[0], right_primer[0]):
                 continue
             # calculate length-dependent amplicon costs as the cumulative primer
             # penalty multiplied by the e^(fold length of the optimal length).
@@ -92,6 +94,26 @@ def find_amplicons(all_primers, opt_len, max_len):
     return amplicons
+def has_qualifying_overlap(current_amplicon, next_amplicon, min_overlap):
+    """
+    check if two amplicons overlap sufficiently to connect them in the graph
+    """
+    # connect amplicons if they sufficiently overlap because:
+    # ... the start of next amplicon lies in the second half of the prior amplicon
+    if next_amplicon["LEFT"][1] < current_amplicon["LEFT"][1] + current_amplicon["length"] / 2:
+        return False
+    # ... the stop of the left primer of the next amplicon does not lie in the minimum amplicon insert
+    if next_amplicon["LEFT"][2] > current_amplicon["RIGHT"][1] - min_overlap:
+        return False
+    # ... half of the next amplicon does not overlap with the previous amplicon --> enough space for a
+    # further amplicon that lies in the second half next amplicon and cannot overlap with a primer of the
+    # current amplicon
+    if next_amplicon["RIGHT"][2] <= current_amplicon["RIGHT"][2] + next_amplicon["length"] / 2:
+        return False
+    return True
 def create_amplicon_graph(amplicons, min_overlap):
     """
     creates the amplicon graph.
@@ -100,34 +122,26 @@ def create_amplicon_graph(amplicons, min_overlap):
     amplicon_graph = {}
     nodes = []
-    # add the maximum len of a primer to ensure that possible amplicon starts
-    # before the min overlap
-    min_overlap = min_overlap + config.PRIMER_SIZES[1]
     for current_amplicon in amplicons:
         # remember all vertices
         amplicon_id = current_amplicon["id"]
         nodes.append(amplicon_id)
-        start = current_amplicon["LEFT"][1] + current_amplicon["length"]/2
-        stop = current_amplicon["RIGHT"][1] - min_overlap
         for next_amplicon in amplicons:
-            # check if the next amplicon lies within the start/stop range of
-            # the current amplicon and if its non-overlapping part is large
-            # enough to ensure space for a primer and the min overlap of the
-            # following amplicon.
-            if start <= next_amplicon["LEFT"][1] <= stop and next_amplicon["RIGHT"][2] > current_amplicon["RIGHT"][2] + next_amplicon["length"]/2:
-                if amplicon_id not in amplicon_graph:
-                    amplicon_graph[amplicon_id] = {
-                        next_amplicon["id"]: (
-                            next_amplicon.get("off_targets", False),
-                            next_amplicon["penalty"]
-                        )
-                    }
-                else:
-                    amplicon_graph[amplicon_id][next_amplicon["id"]] = (
+            if not has_qualifying_overlap(current_amplicon, next_amplicon, min_overlap):
+                continue
+            # --> write to graph
+            if amplicon_id not in amplicon_graph:
+                amplicon_graph[amplicon_id] = {
+                    next_amplicon["id"]: (
                         next_amplicon.get("off_targets", False),
                         next_amplicon["penalty"]
                     )
+                }
+            else:
+                amplicon_graph[amplicon_id][next_amplicon["id"]] = (
+                    next_amplicon.get("off_targets", False),
+                    next_amplicon["penalty"]
+                )
     # return a graph object
     return Graph(nodes, amplicon_graph)
@@ -274,6 +288,7 @@ def find_best_covering_scheme(amplicons, amplicon_graph):
         # if no previous nodes are found but the single amplicon results in the largest
         # coverage - return as the best scheme
         amplicon_path = [best_start_node]
     return best_coverage, create_scheme(amplicon_path, amps_by_id)
@@ -283,8 +298,15 @@ def test_scheme_for_dimers(amplicon_scheme):
     """
     primer_dimers = []
-    pools = {amp["pool"] for amp in amplicon_scheme}
-    for pool in pools:
+    non_dimers = {amp["pool"]:set() for amp in amplicon_scheme}
+    # write all primer sequences in the respective pools -->
+    # these primers should not be violated by primer switching
+    # and primers are only switched later if no primer dimers
+    # with the existing 'good' scheme are created
+    for amp in amplicon_scheme:
+        non_dimers[amp["pool"]].add(amp["LEFT"][0])
+        non_dimers[amp["pool"]].add(amp["RIGHT"][0])
+    for pool in non_dimers:
         # test the primer dimers only within the respective pools
         tested_primers = []
         for amp_index, amp in enumerate(amplicon_scheme):
@@ -297,13 +319,16 @@ def test_scheme_for_dimers(amplicon_scheme):
                 current_seq = current_primer[2][0]
                 for tested in tested_primers:
                     tested_seq = tested[2][0]
-                    if primers.calc_dimer(current_seq, tested_seq).tm <= config.PRIMER_MAX_DIMER_TMP:
+                    if not primers.is_dimer(current_seq, tested_seq):
                         continue
                     primer_dimers.append((current_primer, tested))
+                    non_dimers[pool].discard(current_seq)
+                    non_dimers[pool].discard(tested_seq)
                 # and remember all tested primers
                 tested_primers.append(current_primer)
-    return primer_dimers
+    # report both dimers and non-dimers
+    return primer_dimers, non_dimers
 def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates):
@@ -317,13 +342,16 @@ def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidat
     # test each primer in dimer
     for amp_index, primer_name, primer in dimer:
         overlapping_primers_temp = []
-        thirds_len = int((primer[2] - primer[1]) / 3)
-        # get the middle third of the primer (here are the previously excluded primers)
-        overlap_set = set(range(primer[1] + thirds_len, primer[2] - thirds_len))
-        # check in which list to look for them
+        # as switching could violate overlap criteria,
+        # only consider primers that overlap in the left half (LEFT primers)
+        # or right half (RIGHT primers) respectively, however this can result in slightly
+        # longer amplicons than allowed.
+        half_length = int((primer[2] - primer[1]) / 2)
         if "RIGHT" in primer_name:
+            overlap_set = set(range(primer[1] + half_length, primer[2]))
             primers_to_test = right_primer_candidates
         else:
+            overlap_set = set(range(primer[1], primer[1] + half_length))
             primers_to_test = left_primer_candidates
         # and check this list for all primers that overlap
         for potential_new in primers_to_test:
@@ -337,40 +365,60 @@ def get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidat
     return overlapping_primers
-def test_overlaps_for_dimers(overlapping_primers):
+def test_overlaps_for_dimers(overlapping_primers, non_dimers):
     """
-    test the overlapping primers for dimers. return new primers.
+    test all possible overlapping primers against each other for dimers
+    and return the first pair that doesn't form a dimer with each other
+    and with all non-dimer forming primers in the pool.
     """
     for first_overlap in overlapping_primers[0]:
+        if any(primers.is_dimer(seq, first_overlap[2][0]) for seq in non_dimers):
+            continue
         for second_overlap in overlapping_primers[1]:
-            # return the first match. primers are sorted by penalty.
-            # first pair that makes it has the lowest penalty
-            if primers.calc_dimer(first_overlap[2][0], second_overlap[2][0]).tm <= config.PRIMER_MAX_DIMER_TMP:
+            if any(primers.is_dimer(seq, second_overlap[2][0]) for seq in non_dimers):
+                continue
+            if not primers.is_dimer(first_overlap[2][0], second_overlap[2][0]):
                 return [first_overlap, second_overlap]
-def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_primer_candidates, all_primers):
+def _solve_single_dimer(amplicon_scheme, left_primer_candidates, right_primer_candidates, non_dimers_all_pools, dimer):
+    """
+    Helper function for multiprocessing: solve a single dimer independently.
+    Returns (amp_index, primer_name, new_primer) tuples or empty list if no solution.
+    """
+    pool = amplicon_scheme[dimer[0][0]]["pool"]
+    non_dimers = non_dimers_all_pools[pool]
+    overlapping_primers = get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates)
+    new_primers = test_overlaps_for_dimers(overlapping_primers, non_dimers)
+    return new_primers if new_primers else []
+def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_primer_candidates, all_primers, num_processes):
     """
     check scheme for heterodimers, try to find
     new primers that overlap and replace the existing ones.
-    this can lead to new primer dimers. therefore the scheme
-    is checked a second time. if there are still primer dimers
-    present the non-solvable dimers are returned
+    Uses multiprocessing to solve dimers in parallel.
     """
+    primer_dimers, non_dimers_all_pools = test_scheme_for_dimers(amplicon_scheme)
+    n_initial_dimers = len(primer_dimers)
-    primer_dimers = test_scheme_for_dimers(amplicon_scheme)
+    if not primer_dimers:
+        return [], 0
-    if primer_dimers:
-        print(f"varVAMP found {len(primer_dimers)} dimer pairs in scheme ... trying to find replacements")
-    else:
-        return []
-    for dimer in primer_dimers:
-        # get overlapping primers that have not been considered
-        overlapping_primers = get_overlapping_primers(dimer, left_primer_candidates, right_primer_candidates)
-        # test all possible primers against each other for dimers
-        new_primers = test_overlaps_for_dimers(overlapping_primers)
-        # now change these primers in the scheme
+    # Prepare arguments for each dimer
+    callable_f = functools.partial(
+        _solve_single_dimer,
+    amplicon_scheme, left_primer_candidates, right_primer_candidates, non_dimers_all_pools
+    )
+    # Solve dimers in parallel
+    with multiprocessing.Pool(processes=num_processes) as pool:
+        results = pool.map(callable_f, primer_dimers)
+    # Apply all solutions to the scheme
+    for new_primers in results:
         if new_primers:
             for amp_index, primer_name, primer in new_primers:
                 # overwrite in final scheme
@@ -386,12 +434,13 @@ def check_and_solve_heterodimers(amplicon_scheme, left_primer_candidates, right_
                 # and in all primers
                 all_primers[strand][primer_name] = primer
     # get remaining dimers in the revised scheme and add pool identifier for reporting
+    remaining_primer_dimers, _ = test_scheme_for_dimers(amplicon_scheme)
     primer_dimers = [
         (amplicon_scheme[primer1[0]]["pool"], primer1, primer2)
-        for primer1, primer2 in test_scheme_for_dimers(amplicon_scheme)
+        for primer1, primer2 in remaining_primer_dimers
     ]
-    return primer_dimers
+    return primer_dimers, n_initial_dimers
 def find_single_amplicons(amplicons, n):

varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl

varvamp 1.2.2__py3-none-any.whl → 1.3__py3-none-any.whl