PyPI - varvamp - Versions diffs - 1.1.3__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

varvamp 1.1.3py3-none-any.whl → 1.2.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

varvamp/__init__.py +1 -1
varvamp/command.py +38 -20
varvamp/scripts/blast.py +36 -66
varvamp/scripts/default_config.py +1 -2
varvamp/scripts/logging.py +0 -7
varvamp/scripts/primers.py +2 -2
varvamp/scripts/qpcr.py +38 -36
varvamp/scripts/reporting.py +156 -118
varvamp/scripts/scheme.py +115 -107
{varvamp-1.1.3.dist-info → varvamp-1.2.0.dist-info}/METADATA +1 -1
varvamp-1.2.0.dist-info/RECORD +21 -0
varvamp-1.1.3.dist-info/RECORD +0 -21
{varvamp-1.1.3.dist-info → varvamp-1.2.0.dist-info}/WHEEL +0 -0
{varvamp-1.1.3.dist-info → varvamp-1.2.0.dist-info}/entry_points.txt +0 -0
{varvamp-1.1.3.dist-info → varvamp-1.2.0.dist-info}/top_level.txt +0 -0

varvamp/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Tool to design amplicons for highly variable virusgenomes"""
 _program = "varvamp"
-__version__ = "1.1.3"
+__version__ = "1.2.0"

varvamp/command.py CHANGED Viewed

@@ -314,9 +314,9 @@ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
     if args.database is not None:
         # create blast query
-        query_path = blast.create_BLAST_query(all_primers, amplicons, data_dir)
+        query_path = blast.create_BLAST_query(amplicons, data_dir)
         # perform primer blast
-        amplicons, off_target_amplicons = blast.primer_blast(
+        amplicons = blast.primer_blast(
             data_dir,
             args.database,
             query_path,
@@ -326,10 +326,8 @@ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
             log_file,
             mode="single_tiled"
         )
-    else:
-        off_target_amplicons = []
-    return all_primers, amplicons, off_target_amplicons
+    return all_primers, amplicons
 def single_workflow(args, amplicons, all_primers, log_file):
@@ -337,12 +335,12 @@ def single_workflow(args, amplicons, all_primers, log_file):
     workflow part specific for single mode
     """
-    amplicon_scheme = scheme.find_single_amplicons(amplicons, all_primers, args.report_n)
+    amplicon_scheme = scheme.find_single_amplicons(amplicons, args.report_n)
     logging.varvamp_progress(
         log_file,
         progress=0.9,
         job="Finding amplicons with low penalties.",
-        progress_text=f"{len(amplicon_scheme[0])} amplicons."
+        progress_text=f"{len(amplicon_scheme)} amplicons."
     )
     return amplicon_scheme
@@ -359,8 +357,7 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
     # search for amplicon scheme
     coverage, amplicon_scheme = scheme.find_best_covering_scheme(
         amplicons,
-        amplicon_graph,
-        all_primers
+        amplicon_graph
     )
     # check for dimers
@@ -377,12 +374,13 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
         reporting.write_dimers(results_dir, dimers_not_solved)
     # evaluate coverage
+    # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers, but this potential, minor inaccuracy is currently accepted.
     percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
     logging.varvamp_progress(
         log_file,
         progress=0.9,
         job="Creating amplicon scheme.",
-        progress_text=f"{percent_coverage} % total coverage with {len(amplicon_scheme[0]) + len(amplicon_scheme[1])} amplicons"
+        progress_text=f"{percent_coverage} % total coverage with {len(amplicon_scheme)} amplicons"
     )
     if percent_coverage < 70:
         logging.raise_error(
@@ -450,9 +448,9 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
     # run blast if db is given
     if args.database is not None:
         # create blast query
-        query_path = blast.create_BLAST_query_qpcr(qpcr_scheme_candidates, data_dir)
+        query_path = blast.create_BLAST_query(qpcr_scheme_candidates, data_dir, mode="qpcr")
         # perform primer blast
-        amplicons, off_target_amplicons = blast.primer_blast(
+        qpcr_scheme_candidates = blast.primer_blast(
             data_dir,
             args.database,
             query_path,
@@ -470,9 +468,6 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
             log_file,
             exit=True
         )
-    # report potential blast warnings
-    if args.database is not None:
-        blast.write_BLAST_warning(off_target_amplicons, final_schemes, log_file)
     logging.varvamp_progress(
         log_file,
         progress=0.9,
@@ -506,9 +501,21 @@ def main(sysargs=sys.argv[1:]):
     reporting.write_fasta(data_dir, "majority_consensus", majority_consensus)
     reporting.write_fasta(results_dir, "ambiguous_consensus", ambiguous_consensus)
+    # Functions called from here on return lists of amplicons that are refined step-wise into final schemes.
+    # These lists, which are passed between functions and later used for reporting, consist of dictionary elements,
+    # which represent individual amplicons. A minimal amplicon dict could take the form:
+    # {
+    #     "id": amplicon_name,
+    #     "penalty": amplicon_cost,
+    #     "length": amplicon_length,
+    #     "LEFT": [left primer data],
+    #     "RIGHT": [right primer data]
+    # }
+    # to which different functions may add additional information.
     # SINGLE/TILED mode
     if args.mode == "tiled" or args.mode == "single":
-        all_primers, amplicons, off_target_amplicons = single_and_tiled_shared_workflow(
+        all_primers, amplicons = single_and_tiled_shared_workflow(
             args,
             left_primer_candidates,
             right_primer_candidates,
@@ -533,15 +540,22 @@ def main(sysargs=sys.argv[1:]):
                 log_file,
                 results_dir
             )
-        if args.database is not None:
-            blast.write_BLAST_warning(off_target_amplicons, amplicon_scheme, log_file)
         # write files
+        if args.mode == "tiled":
+            # assign amplicon numbers from 5' to 3' along the genome
+            amplicon_scheme.sort(key=lambda x: x["LEFT"][1])
+        else:
+            # make sure amplicons with no off-target products and with low penalties get the lowest numbers
+            amplicon_scheme.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
         reporting.write_all_primers(data_dir, all_primers)
         reporting.write_scheme_to_files(
             results_dir,
             amplicon_scheme,
             ambiguous_consensus,
-            args.mode
+            args.mode,
+            log_file
         )
         reporting.varvamp_plot(
             results_dir,
@@ -564,9 +578,13 @@ def main(sysargs=sys.argv[1:]):
             right_primer_candidates,
             log_file
         )
         # write files
+        # make sure amplicons with no off-target products and with low penalties get the lowest numbers
+        final_schemes.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
         reporting.write_regions_to_bed(probe_regions, data_dir, "probe")
-        reporting.write_qpcr_to_files(results_dir, final_schemes, ambiguous_consensus)
+        reporting.write_qpcr_to_files(results_dir, final_schemes, ambiguous_consensus, log_file)
         reporting.varvamp_plot(
             results_dir,
             alignment_cleaned,

varvamp/scripts/blast.py CHANGED Viewed

@@ -29,41 +29,24 @@ def check_BLAST_installation(log_file):
         logging.raise_error("BLASTN is not installed", log_file, exit=True)
-def create_BLAST_query(all_primers, amplicons, data_dir):
+def create_BLAST_query(amplicons, data_dir, mode="single_tiled"):
     """
-    create a query for the BLAST search (tiled, single mode)
+    create a query for the BLAST search
     """
-    already_written = []
     query_path = os.path.join(data_dir, "BLAST_query.fasta")
-    with open(query_path, "w") as query:
-        for amp in amplicons:
-            fw_primer, rv_primer = amplicons[amp][2], amplicons[amp][3]
-            if fw_primer not in already_written:
-                print(f">{fw_primer}\n{all_primers['+'][fw_primer][0]}", file=query)
-                already_written.append(fw_primer)
-            if rv_primer not in already_written:
-                print(f">{rv_primer}\n{all_primers['-'][rv_primer][0]}", file=query)
-                already_written.append(rv_primer)
-    return query_path
-def create_BLAST_query_qpcr(qpcr_scheme_candidates, data_dir):
-    """
-    create a query for the BLAST search (qpcr mode)
-    """
-    already_written = []
+    if mode == "single_tiled":
+        primer_types = ["LEFT", "RIGHT"]
+    elif mode == "qpcr":
+        primer_types = ["PROBE", "LEFT", "RIGHT"]
+    already_written = set()
-    query_path = os.path.join(data_dir, "BLAST_query.fasta")
     with open(query_path, "w") as query:
-        for amp in qpcr_scheme_candidates:
-            for primer_type in ["PROBE", "LEFT", "RIGHT"]:
-                name = f"{primer_type}_{qpcr_scheme_candidates[amp][primer_type][1]}_{qpcr_scheme_candidates[amp][primer_type][2]}"
-                if name in already_written:
-                    continue
-                print(f">{name}\n{qpcr_scheme_candidates[amp][primer_type][0]}", file=query)
-                already_written.append(name)
+        for amp in amplicons:
+            for primer_type in primer_types:
+                name = f"{primer_type}_{amp[primer_type][1]}_{amp[primer_type][2]}"
+                if name not in already_written:
+                    print(f">{name}\n{amp[primer_type][0]}", file=query)
+                    already_written.add(name)
     return query_path
@@ -168,21 +151,24 @@ def predict_non_specific_amplicons_worker(amp, blast_df, max_length, mode):
     """
     Worker function to predict unspecific targets for a single amplicon.
     """
-    name, data = amp
     # get correct primers
     if mode == "single_tiled":
-        primers = [data[2], data[3]]
+        primer_types = ["LEFT", "RIGHT"]
     elif mode == "qpcr":
-        primers = []
-        for primer_type in ["PROBE", "LEFT", "RIGHT"]:
-            primers.append(f"{primer_type}_{data[primer_type][1]}_{data[primer_type][2]}")
+        primer_types = ["PROBE", "LEFT", "RIGHT"]
+    primers = []
+    for primer_type in primer_types:
+        primers.append(f"{primer_type}_{amp[primer_type][1]}_{amp[primer_type][2]}")
     # subset df for primers
     df_amp_primers = blast_df[blast_df["query"].isin(primers)]
     # sort by reference and ref start
     df_amp_primers_sorted = df_amp_primers.sort_values(["ref", "ref_start"])
     # check for off-targets for specific primers
     if check_off_targets(df_amp_primers_sorted, max_length, primers):
-        return name
+        amp["off_targets"] = True
+    else:
+        amp["off_targets"] = False
+    return amp
 def predict_non_specific_amplicons(amplicons, blast_df, max_length, mode, n_threads):
@@ -190,22 +176,16 @@ def predict_non_specific_amplicons(amplicons, blast_df, max_length, mode, n_thre
     Main function to predict unspecific targets within a size range and give
     these primers a high penalty. Uses multiprocessing for parallelization.
     """
-    off_targets = []
     # process amplicons concurrently
     with multiprocessing.Pool(processes=n_threads) as pool:
-        amp_items = amplicons.items()
-        results = pool.starmap(predict_non_specific_amplicons_worker, [(amp, blast_df, max_length, mode) for amp in amp_items])
-    # check results
-    for off_target in results:
-        if off_target is None:
-            continue
-        off_targets.append(off_target)
-        if mode == "single_tiled":
-            amplicons[off_target][5] = amplicons[off_target][5] + config.BLAST_PENALTY
-        elif mode == "qpcr":
-            amplicons[off_target]["penalty"] = amplicons[off_target]["penalty"] + config.BLAST_PENALTY
-    return off_targets, amplicons
+        annotated_amps = [
+            result for result in pool.starmap(
+                predict_non_specific_amplicons_worker,
+                [(amp, blast_df, max_length, mode) for amp in amplicons]
+            ) if result is not None
+        ]
+    n_off_targets = sum(amp["off_targets"] for amp in annotated_amps)
+    return n_off_targets, annotated_amps
 def primer_blast(data_dir, db, query_path, amplicons, max_length, n_threads, log_file, mode):
@@ -237,14 +217,17 @@ def primer_blast(data_dir, db, query_path, amplicons, max_length, n_threads, log
     blast_df = parse_and_filter_BLAST_output(blast_out)
     print("Predicting non-specific amplicons...")
-    off_target_amplicons, amplicons = predict_non_specific_amplicons(
+    n_off_targets, amplicons = predict_non_specific_amplicons(
         amplicons,
         blast_df,
         max_length,
         mode,
         n_threads
     )
-    success_text = f"varVAMP successfully predicted non-specific amplicons:\n\t> {len(off_target_amplicons)}/{len(amplicons)} amplicons could produce amplicons with the blast db.\n\t> raised their amplicon penalty by {config.BLAST_PENALTY}"
+    if n_off_targets > 0:
+        success_text = f"varVAMP predicted non-specific amplicons:\n\t> {n_off_targets}/{len(amplicons)} amplicons could produce amplicons with the blast db.\n\t> will attempt to avoid them in the final list of amplicons"
+    else:
+        success_text = f"NO off-target amplicons found with the blast db and a total of {len(amplicons)} amplicons"
     print(success_text)
     with open(log_file, 'a') as f:
         print(
@@ -253,18 +236,5 @@ def primer_blast(data_dir, db, query_path, amplicons, max_length, n_threads, log
         )
     print("\n#### off-target search finished ####\n")
-    return amplicons, off_target_amplicons
+    return amplicons
-def write_BLAST_warning(off_target_amplicons, amplicon_scheme, log_file):
-    """
-    for each primer pair that has potential unspecific amplicons
-    write warnings to file.
-    """
-    for amp in off_target_amplicons:
-        if amp in amplicon_scheme:
-            logging.raise_error(
-                f"{amp} could produce off-targets. No better amplicon in this area was found.",
-                log_file,
-                exit=False,
-            )

varvamp/scripts/default_config.py CHANGED Viewed

@@ -4,7 +4,7 @@ This contains all varVAMP parameters.
 # List of all known parameters. DO NOT CHANGE!
 __all__ = [
-    'BLAST_MAX_DIFF', 'BLAST_PENALTY', 'BLAST_SETTINGS', 'BLAST_SIZE_MULTI',
+    'BLAST_MAX_DIFF', 'BLAST_SETTINGS', 'BLAST_SIZE_MULTI',
     'END_OVERLAP',
     'PCR_DNA_CONC', 'PCR_DNTP_CONC', 'PCR_DV_CONC', 'PCR_MV_CONC',
     'PRIMER_3_PENALTY', 'PRIMER_GC_END', 'PRIMER_GC_PENALTY',
@@ -74,7 +74,6 @@ BLAST_SETTINGS = {  # blast settings for query search
 }
 BLAST_MAX_DIFF = 0.5  # min percent match between primer and BLAST hit (coverage and/or mismatches)
 BLAST_SIZE_MULTI = 2  # multiplier for the max_amp size of off targets (in relation to max amp size)
-BLAST_PENALTY = 50  # amplicon penalty increase -> considered only if no other possibilities
 # nucleotide definitions, do NOT change
 NUCS = set("atcg")

varvamp/scripts/logging.py CHANGED Viewed

@@ -291,7 +291,6 @@ def confirm_config(args, log_file):
         (
             "BLAST_MAX_DIFF",
             "BLAST_SIZE_MULTI",
-            "BLAST_PENALTY"
         )
     ]
@@ -384,7 +383,6 @@ def confirm_config(args, log_file):
         ("qpcr deletion size still considered for deltaG calculation", config.QAMPLICON_DEL_CUTOFF),
         ("maximum difference between primer and blast db", config.BLAST_MAX_DIFF),
         ("multiplier of the maximum length for non-specific amplicons", config.BLAST_SIZE_MULTI),
-        ("blast penalty for off targets", config.BLAST_PENALTY)
     ]
     for var_type, var in non_negative_var:
         if var < 0:
@@ -468,11 +466,6 @@ def confirm_config(args, log_file):
             log_file,
             exit=True
         )
-    if config.BLAST_PENALTY < 10:
-        raise_error(
-            "giving a too small penalty could result in the selection of off-target producing amplicons in the final scheme.",
-            log_file,
-        )
     # confirm proper BLAST settings in dictionary
     if not isinstance(config.BLAST_SETTINGS, dict):
         raise_error(

varvamp/scripts/primers.py CHANGED Viewed

@@ -386,13 +386,13 @@ def find_best_primers(left_primer_candidates, right_primer_candidates):
         primer_candidates.sort(key=lambda x: (x[3], x[1]))
         # ini everything with the primer with the lowest penalty
         to_retain = [primer_candidates[0]]
-        primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]+1))
+        primer_ranges = list(range(primer_candidates[0][1], primer_candidates[0][2]))
         primer_set = set(primer_ranges)
         for primer in primer_candidates:
             # get the thirds of the primer, only consider the middle
             thirds_len = int((primer[2] - primer[1])/3)
-            primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len + 1))
+            primer_positions = list(range(primer[1] + thirds_len, primer[2] - thirds_len))
             # check if none of the nucleotides of the next primer
             # are already covered by a better primer
             if not any(x in primer_positions for x in primer_set):

varvamp/scripts/qpcr.py CHANGED Viewed

@@ -211,13 +211,13 @@ def assess_amplicons(left_subset, right_subset, qpcr_probes, probe, majority_con
             if "LEFT" in probe:
                 if not qpcr_probes[probe][1] in range(
                         left_primer[2] + config.QPROBE_DISTANCE[0],
-                        left_primer[2] + config.QPROBE_DISTANCE[1] + 1
+                        left_primer[2] + config.QPROBE_DISTANCE[1]
                 ):
                     continue
             elif "RIGHT" in probe:
                 if not right_primer[1] in range(
                         qpcr_probes[probe][2] + config.QPROBE_DISTANCE[0],
-                        qpcr_probes[probe][2] + config.QPROBE_DISTANCE[1] + 1
+                        qpcr_probes[probe][2] + config.QPROBE_DISTANCE[1]
                 ):
                     continue
@@ -258,7 +258,7 @@ def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidate
     there is no need to consider this primer probe combination.
     """
-    qpcr_scheme_candidates = {}
+    qpcr_scheme_candidates = []
     found_amplicons = []
     amplicon_nr = -1
@@ -279,15 +279,16 @@ def find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidate
         # populate the primer dictionary:
         amplicon_nr += 1
         found_amplicons.append(primer_combination)
-        qpcr_scheme_candidates[f"AMPLICON_{amplicon_nr}"] = {
-            "penalty": qpcr_probes[probe][3] + primer_combination[0][3] + primer_combination[1][3],
-            "PROBE": qpcr_probes[probe],
-            "LEFT": primer_combination[0],
-            "RIGHT": primer_combination[1]
-        }
+        qpcr_scheme_candidates.append(
+            {
+                "id": f"AMPLICON_{amplicon_nr}",
+                "penalty": qpcr_probes[probe][3] + primer_combination[0][3] + primer_combination[1][3],
+                "PROBE": qpcr_probes[probe],
+                "LEFT": primer_combination[0],
+                "RIGHT": primer_combination[1]
+            }
+        )
     # and again sort by total penalty (left + right + probe)
-    qpcr_scheme_candidates = dict(sorted(qpcr_scheme_candidates.items(), key=lambda x: x[1]["penalty"]))
     return qpcr_scheme_candidates
@@ -296,21 +297,17 @@ def process_single_amplicon_deltaG(amplicon, majority_consensus):
     Process a single amplicon to test its deltaG and apply filtering.
     This function will be called concurrently by multiple threads.
     """
-    name, data = amplicon
-    start = data["LEFT"][1]
-    stop = data["RIGHT"][2]
-    seq = majority_consensus[start:stop]
+    seq = majority_consensus[amplicon["LEFT"][1]:amplicon["RIGHT"][2]]
     seq = seq.replace("N", "")
     seq = seq.replace("n", "")
-    amp_positions = list(range(start, stop + 1))
     # check if the amplicon overlaps with an amplicon that was previously
     # found and had a high enough deltaG
-    min_temp = min((primers.calc_temp(data["LEFT"][0]),
-                    primers.calc_temp(data["RIGHT"][0])))
+    min_temp = min((primers.calc_temp(amplicon["LEFT"][0]),
+                    primers.calc_temp(amplicon["RIGHT"][0])))
     # calculate deltaG at the minimal primer temp
-    deltaG = seqfold.dg(seq, min_temp)
+    amplicon["deltaG"] = seqfold.dg(seq, min_temp)
-    return deltaG, amp_positions, name
+    return amplicon
 def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n_to_test, deltaG_cutoff, n_threads):
@@ -319,29 +316,34 @@ def test_amplicon_deltaG_parallel(qpcr_schemes_candidates, majority_consensus, n
     and filters if they fall below the cutoff. Multiple processes are used
     for processing amplicons in parallel.
     """
-    final_schemes = {}
-    passed_counter = 0  # counter for re-naming amplicons that passed deltaG cutoff
-    amplicon_set = set()
+    final_amplicons = []
     # Create a pool of processes to handle the concurrent processing
     with multiprocessing.Pool(processes=n_threads) as pool:
         # Create a list of the first n amplicon tuples for processing
-        amplicons = itertools.islice(qpcr_schemes_candidates.items(), n_to_test)
+        # The list is sorted first on whether off-targets were predicted for the amplicon,
+        # then by penalty. This ensures that amplicons with off-targets are always considered last
+        amplicons = itertools.islice(
+            sorted(qpcr_schemes_candidates, key=lambda x: (x.get("offset_targets", False), x["penalty"])),
+            n_to_test
+        )
         # process amplicons concurrently
         results = pool.starmap(process_single_amplicon_deltaG, [(amp, majority_consensus) for amp in amplicons])
         # Process the results
-        for deltaG, amp_positions, amp_name in results:
+        retained_ranges = []
+        for amp in results:
             # check if the amplicon overlaps with an amplicon that was previously
             # found and had a high enough deltaG
-            if any(x in amp_positions for x in amplicon_set):
+            if amp["deltaG"] <= deltaG_cutoff:
                 continue
-            # and if this passes cutoff make a dict entry and do not allow further
-            # amplicons in that region (they will have a lower penalty)
-            if deltaG > deltaG_cutoff:
-                new_name = f"QPCR_SCHEME_{passed_counter}"
-                final_schemes[new_name] = qpcr_schemes_candidates[amp_name]
-                final_schemes[new_name]["deltaG"] = deltaG
-                amplicon_set.update(amp_positions)
-                passed_counter += 1
-    return final_schemes
+            amp_range = range(amp["LEFT"][1], amp["RIGHT"][2])
+            overlaps_retained = False
+            for r in retained_ranges:
+                if amp_range.start < r.stop and r.start < amp_range.stop:
+                    overlaps_retained = True
+                    break
+            if not overlaps_retained:
+                final_amplicons.append(amp)
+                retained_ranges.append(amp_range)
+    return final_amplicons

varvamp 1.1.3__py3-none-any.whl → 1.2.0__py3-none-any.whl

varvamp 1.1.3py3-none-any.whl → 1.2.0py3-none-any.whl