varvamp 1.2.1__py3-none-any.whl → 1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
varvamp/__init__.py CHANGED
@@ -1,3 +1,6 @@
- """Tool to design amplicons for highly variable virusgenomes"""
- _program = "varvamp"
- __version__ = "1.2.1"
+ from importlib.metadata import version, PackageNotFoundError
+
+ try:
+     __version__ = version("varvamp")
+ except PackageNotFoundError:
+     __version__ = "unknown"
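Note: the rewritten __init__.py resolves __version__ from the installed package metadata instead of a hardcoded string, so the version number only has to be maintained in the build configuration. A minimal standalone sketch of this lookup (demo script, not part of the package):

from importlib.metadata import version, PackageNotFoundError

try:
    # reads the version recorded in the installed distribution's metadata
    print(version("varvamp"))  # e.g. "1.3" for this release
except PackageNotFoundError:
    # raised when varvamp was never installed, e.g. a bare source checkout
    print("unknown")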
varvamp/command.py CHANGED
@@ -7,7 +7,6 @@ import sys
  import os
  import datetime
  import argparse
- import multiprocessing

  # varVAMP
  from varvamp.scripts import alignment
@@ -22,7 +21,6 @@ from varvamp.scripts import reporting
  from varvamp.scripts import scheme
  from varvamp.scripts import blast
  from varvamp import __version__
- from . import _program


  def get_args(sysargs):
@@ -30,7 +28,6 @@ def get_args(sysargs):
      arg parsing for varvamp
      """
      parser = argparse.ArgumentParser(
-         prog=_program,
          usage='''\tvarvamp <mode> --help\n\tvarvamp <mode> [mode optional arguments] <alignment> <output dir>''')
      mode_parser = parser.add_subparsers(
          title="varvamp mode",
@@ -49,7 +46,7 @@ def get_args(sysargs):
      QPCR_parser = mode_parser.add_parser(
          "qpcr",
          help="design qPCR primers",
-         usage="varvamp qpcr [optional arguments] <alignment> <output dir>"
+         usage="varvamp qpcr -t <threshold> [optional arguments] <alignment> <output dir>"
      )
      parser.add_argument(
          "input",
@@ -57,20 +54,12 @@ def get_args(sysargs):
          help="alignment file and dir to write results"
      )
      for par in (SINGLE_parser, TILED_parser, QPCR_parser):
-         par.add_argument(
-             "-t",
-             "--threshold",
-             metavar="",
-             type=float,
-             default=None,
-             help="threshold for consensus nucleotides"
-         )
          par.add_argument(
              "-a",
              "--n-ambig",
-             metavar="",
+             metavar="2",
              type=int,
-             default=None,
+             default=2,
              help="max number of ambiguous characters in a primer"
          )
          par.add_argument(
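Note: varVAMP surfaces defaults in --help through argparse's metavar (the placeholder rendered after the option strings), which is why metavar is set to the default value "2" above. A minimal standalone sketch of the effect (prog name and demo values are illustrative):

import argparse

parser = argparse.ArgumentParser(prog="demo")
parser.add_argument("-a", "--n-ambig", metavar="2", type=int, default=2,
                    help="max number of ambiguous characters in a primer")
# --help renders the option as "-a 2, --n-ambig 2  max number of ..."
print(parser.parse_args([]).n_ambig)           # -> 2
print(parser.parse_args(["-a", "3"]).n_ambig)  # -> 3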
@@ -96,7 +85,23 @@ def get_args(sysargs):
              type=str,
              default="varVAMP"
          )
+         par.add_argument(
+             "--compatible-primers",
+             metavar="None",
+             type=str,
+             default=None,
+             help="FASTA primer file with which new primers should not form dimers. Sequences >40 nt are ignored. Can significantly increase runtime."
+         )
+
      for par in (SINGLE_parser, TILED_parser):
+         par.add_argument(
+             "-t",
+             "--threshold",
+             metavar="",
+             type=float,
+             default=None,
+             help="consensus threshold (0-1) - if not set it will be estimated (higher values result in higher specificity at the expense of found primers)"
+         )
          par.add_argument(
              "-ol",
              "--opt-length",
@@ -117,9 +122,9 @@ def get_args(sysargs):
          "-o",
          "--overlap",
          type=int,
-         metavar="100",
-         default=100,
-         help="min overlap of the amplicons"
+         metavar="25",
+         default=25,
+         help="min overlap of the amplicon inserts"
      )
      SINGLE_parser.add_argument(
          "-n",
@@ -137,6 +142,13 @@ def get_args(sysargs):
          default=None,
          help="max number of ambiguous characters in a probe"
      )
+     QPCR_parser.add_argument(
+         "-t",
+         "--threshold",
+         required=True,
+         type=float,
+         help="consensus threshold (0-1) - higher values result in higher specificity at the expense of found primers"
+     )
      QPCR_parser.add_argument(
          "-n",
          "--test-n",
@@ -151,7 +163,7 @@ def get_args(sysargs):
          type=int,
          metavar="-3",
          default=-3,
-         help="minimum free energy (kcal/mol/K) cutoff at the lowest primer melting temp"
+         help="minimum free energy (kcal/mol/K) cutoff at the lowest primer melting temperature"
      )
      parser.add_argument(
          "--verbose",
@@ -178,19 +190,30 @@ def shared_workflow(args, log_file):
      """
      # start varvamp
      logging.varvamp_progress(log_file, mode=args.mode)
-
      # read in alignment and preprocess
      preprocessed_alignment = alignment.preprocess(args.input[0])
-     # check alignment length distribution
+     # read in external primer sequences with which new primers should not form dimers
+     if args.compatible_primers is not None:
+         compatible_primers = primers.parse_primer_fasta(args.compatible_primers)
+         if not compatible_primers:
+             logging.raise_error(
+                 "no valid primers found in the provided primer file.\n",
+                 log_file,
+             )
+     else:
+         compatible_primers = None
+     # check alignment length and number of gaps and report if its significantly more/less than expected
      logging.check_alignment_length(preprocessed_alignment, log_file)
+     logging.check_gaped_sequences(preprocessed_alignment, log_file)

      # estimate threshold or number of ambiguous bases if args were not supplied
-     if args.threshold is None or args.n_ambig is None:
-         args.threshold, args.n_ambig = param_estimation.get_parameters(preprocessed_alignment, args, log_file)
+     if args.threshold is None and not args.mode == 'qpcr':
+         args.threshold = param_estimation.get_parameters(preprocessed_alignment, args, log_file)
+     # set the number of ambiguous chars for qPCR probes to one less than for primers if not given
      if args.mode == "qpcr" and args.pn_ambig is None:
          if args.n_ambig == 0:
              args.pn_ambig = 0
-         if args.n_ambig > 0:
+         else:
              args.pn_ambig = args.n_ambig - 1
          with open(log_file, "a") as f:
              print(f"Automatic parameter selection set -pa {args.pn_ambig}.", file=f)
@@ -211,7 +234,6 @@ def shared_workflow(args, log_file):
      alignment_cleaned, gaps_to_mask = alignment.process_alignment(
          preprocessed_alignment,
          args.threshold,
-         args.threads
      )
      logging.varvamp_progress(
          log_file,
@@ -237,6 +259,9 @@ def shared_workflow(args, log_file):
          ambiguous_consensus,
          args.n_ambig
      )
+
+     potential_primer_regions = regions.mean(primer_regions, majority_consensus)
+
      if not primer_regions:
          logging.raise_error(
              "no primer regions found. Lower the threshold!",
@@ -247,7 +272,7 @@ def shared_workflow(args, log_file):
          log_file,
          progress=0.4,
          job="Finding primer regions.",
-         progress_text=f"{regions.mean(primer_regions, majority_consensus)} % of the consensus sequence will be evaluated for primers"
+         progress_text=f"{potential_primer_regions} % of the consensus sequence will be evaluated for primers"
      )

      # produce kmers for all primer regions
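Note: regions.mean is now computed once and kept in potential_primer_regions because the percentage is reused later to choose the primer-selection mode. Its implementation is not part of this diff; the sketch below is only an assumed reading based on how the value is used:

def mean_sketch(primer_regions, majority_consensus):
    # assumption: percent of consensus positions covered by primer regions
    covered = sum(stop - start for start, stop in primer_regions)
    return round(covered / len(majority_consensus) * 100, 1)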
@@ -266,7 +291,8 @@ def shared_workflow(args, log_file):
      left_primer_candidates, right_primer_candidates = primers.find_primers(
          kmers,
          ambiguous_consensus,
-         alignment_cleaned
+         alignment_cleaned,
+         args.threads
      )
      for primer_type, primer_candidates in [("+", left_primer_candidates), ("-", right_primer_candidates)]:
          if not primer_candidates:
@@ -282,20 +308,41 @@ def shared_workflow(args, log_file):
          progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv potential primers"
      )

-     return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates
+     # filter primers against user-provided list of compatible primers, can use multi-processing
+     if compatible_primers:
+         left_primer_candidates = primers.filter_non_dimer_candidates(
+             left_primer_candidates, compatible_primers, args.threads
+         )
+         right_primer_candidates = primers.filter_non_dimer_candidates(
+             right_primer_candidates, compatible_primers, args.threads
+         )
+         logging.varvamp_progress(
+             log_file,
+             progress=0.65,
+             job="Filtering primers against provided primers.",
+             progress_text=f"{len(left_primer_candidates)} fw and {len(right_primer_candidates)} rv primers after filtering"
+         )
+
+     return alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers


- def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, data_dir, log_file):
+ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_candidates, potential_primer_regions, data_dir, log_file):
      """
      part of the workflow shared by the single and tiled mode
      """

      # find best primers and create primer dict
-     all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates)
+     # depending on the percentage of potential primer regions use high conservation mode
+     if potential_primer_regions >= 90:
+         all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=True)
+         job_text = "Excluding overlapping primers (stringent)."
+     else:
+         all_primers = primers.find_best_primers(left_primer_candidates, right_primer_candidates, high_conservation=False)
+         job_text = "Excluding overlapping primers."
      logging.varvamp_progress(
          log_file,
          progress=0.7,
-         job="Considering primers with low penalties.",
+         job=f"{job_text}",
          progress_text=f"{len(all_primers['+'])} fw and {len(all_primers['-'])} rv primers"
      )

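Note: primers.filter_non_dimer_candidates (used above) drops candidates that could dimerise with the user-supplied primers; its actual thermodynamic test is not shown in this diff. The seed-match heuristic below is only an illustrative stand-in for the idea (assumes uppercase ACGT sequences; all names are hypothetical):

COMPLEMENT = str.maketrans("ACGT", "TGCA")

def revcomp(seq):
    return seq.translate(COMPLEMENT)[::-1]

def could_dimerise(candidate, other, seed=8):
    # crude proxy: an 8-mer of the candidate appearing in the reverse
    # complement of the other primer suggests possible hybridisation
    rc = revcomp(other)
    return any(candidate[i:i + seed] in rc for i in range(len(candidate) - seed + 1))

def filter_candidates(candidates, compatible_primers):
    return [c for c in candidates
            if not any(could_dimerise(c, p) for p in compatible_primers)]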
@@ -307,8 +354,7 @@ def single_and_tiled_shared_workflow(args, left_primer_candidates, right_primer_
      )
      if not amplicons:
          logging.raise_error(
-             "no amplicons found. Increase the max amplicon length or \
-             number of ambiguous bases or lower threshold!\n",
+             "no amplicons found. Increase the max amplicon length or number of ambiguous bases or lower threshold!\n",
              log_file,
              exit=True
          )
@@ -353,7 +399,7 @@ def single_workflow(args, amplicons, log_file):
      return amplicon_scheme


- def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file, results_dir):
+ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candidates, all_primers, ambiguous_consensus, log_file):
      """
      part of the workflow specific for the tiled mode
      """
@@ -367,21 +413,9 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
          amplicon_graph
      )

-     # check for dimers
-     dimers_not_solved = scheme.check_and_solve_heterodimers(
-         amplicon_scheme,
-         left_primer_candidates,
-         right_primer_candidates,
-         all_primers)
-     if dimers_not_solved:
-         logging.raise_error(
-             f"varVAMP found {len(dimers_not_solved)} primer dimers without replacements. Check the dimer file and perform the PCR for incomaptible amplicons in a sperate reaction.",
-             log_file
-         )
-         reporting.write_dimers(results_dir, dimers_not_solved)
-
      # evaluate coverage
-     # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers, but this potential, minor inaccuracy is currently accepted.
+     # ATTENTION: Genome coverage of the scheme might still change slightly through resolution of primer dimers,
+     # but this potential, minor inaccuracy is currently accepted.
      percent_coverage = round(coverage/len(ambiguous_consensus)*100, 2)
      logging.varvamp_progress(
          log_file,
@@ -398,10 +432,36 @@ def tiled_workflow(args, amplicons, left_primer_candidates, right_primer_candida
              "\t - relax primer settings (not recommended)\n",
              log_file
          )
-     return amplicon_scheme

+     # check for dimers
+     dimers_not_solved, n_initial_dimers = scheme.check_and_solve_heterodimers(
+         amplicon_scheme,
+         left_primer_candidates,
+         right_primer_candidates,
+         all_primers,
+         args.threads
+     )
+
+     # report dimers solve
+     if n_initial_dimers > 0 and not dimers_not_solved:
+         logging.varvamp_progress(
+             log_file,
+             progress=0.95,
+             job="Trying to solve primer dimers.",
+             progress_text=f"all dimers (n={n_initial_dimers}) could be resolved"
+         )
+     elif dimers_not_solved:
+         logging.varvamp_progress(
+             log_file,
+             progress=0.95,
+             job="Trying to solve primer dimers.",
+             progress_text=f"{len(dimers_not_solved)}/{n_initial_dimers} dimers could not be resolved"
+         )
+
+     return amplicon_scheme, dimers_not_solved

- def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majority_consensus, left_primer_candidates, right_primer_candidates, log_file):
+
+ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majority_consensus, left_primer_candidates, right_primer_candidates, compatible_primers, log_file):
      """
      part of the workflow specific for the tiled mode
      """
@@ -412,7 +472,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
      )
      if not probe_regions:
          logging.raise_error(
-             "no regions that fullfill probe criteria! lower threshold or increase number of ambiguous chars in probe\n",
+             "no regions that fulfill probe criteria! lower threshold or increase number of ambiguous chars in probe\n",
              log_file,
              exit=True
          )
@@ -424,7 +484,7 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
          config.QPROBE_SIZES
      )
      # find potential probes
-     qpcr_probes = qpcr.get_qpcr_probes(probe_kmers, ambiguous_consensus, alignment_cleaned)
+     qpcr_probes = qpcr.get_qpcr_probes(probe_kmers, ambiguous_consensus, alignment_cleaned, args.threads)
      if not qpcr_probes:
          logging.raise_error(
              "no qpcr probes found\n",
@@ -438,8 +498,21 @@ def qpcr_workflow(args, data_dir, alignment_cleaned, ambiguous_consensus, majori
          progress_text=f"{len(qpcr_probes)} potential qPCR probes"
      )

+     # filter primers against non-dimer sequences if provided
+     if compatible_primers:
+         qpcr_probes = primers.filter_non_dimer_candidates(
+             qpcr_probes, compatible_primers, args.threads)
+         logging.varvamp_progress(
+             log_file,
+             progress=0.75,
+             job="Filtering probes against provided primers.",
+             progress_text=f"{len(qpcr_probes)} potential qPCR probes after filtering"
+         )
+
      # find unique amplicons with a low penalty and an internal probe
-     qpcr_scheme_candidates = qpcr.find_qcr_schemes(qpcr_probes, left_primer_candidates, right_primer_candidates, majority_consensus, ambiguous_consensus)
+     qpcr_scheme_candidates = qpcr.find_qcr_schemes(
+         qpcr_probes, left_primer_candidates, right_primer_candidates, majority_consensus, ambiguous_consensus, args.threads
+     )
      if not qpcr_scheme_candidates:
          logging.raise_error(
              "no qPCR scheme candidates found. lower threshold or increase number of ambiguous chars in primer and/or probe\n",
@@ -500,13 +573,13 @@ def main():
      blast.check_BLAST_installation(log_file)

      # mode unspecific part of the workflow
-     alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates = shared_workflow(args, log_file)
+     alignment_cleaned, majority_consensus, ambiguous_consensus, primer_regions, left_primer_candidates, right_primer_candidates, potential_primer_regions, compatible_primers = shared_workflow(args, log_file)

      # write files that are shared in all modes
      reporting.write_regions_to_bed(primer_regions, args.name, data_dir)
      reporting.write_alignment(data_dir, alignment_cleaned)
-     reporting.write_fasta(data_dir, f"majority_consensus", f"{args.name}_consensus",majority_consensus)
-     reporting.write_fasta(results_dir, f"ambiguous_consensus", f"{args.name}_consensus", ambiguous_consensus)
+     reporting.write_fasta(data_dir, f"majority_consensus", f"{args.name}_majority_consensus",majority_consensus)
+     reporting.write_fasta(results_dir, f"ambiguous_consensus", f"{args.name}_ambiguous_consensus", ambiguous_consensus)

      # Functions called from here on return lists of amplicons that are refined step-wise into final schemes.
      # These lists that are passed between functions and later used for reporting consist of dictionary elemnts,
@@ -522,10 +595,12 @@

      # SINGLE/TILED mode
      if args.mode == "tiled" or args.mode == "single":
+         dimers_not_solved = None
          all_primers, amplicons = single_and_tiled_shared_workflow(
              args,
              left_primer_candidates,
              right_primer_candidates,
+             potential_primer_regions,
              data_dir,
              log_file
          )
@@ -536,7 +611,7 @@
              log_file
          )
      elif args.mode == "tiled":
-         amplicon_scheme = tiled_workflow(
+         amplicon_scheme, dimers_not_solved = tiled_workflow(
              args,
              amplicons,
              left_primer_candidates,
@@ -544,11 +619,9 @@
              all_primers,
              ambiguous_consensus,
              log_file,
-             results_dir
          )

      # write files
-
      if args.mode == "tiled":
          # assign amplicon numbers from 5' to 3' along the genome
          amplicon_scheme.sort(key=lambda x: x["LEFT"][1])
@@ -562,7 +635,8 @@
              ambiguous_consensus,
              args.name,
              args.mode,
-             log_file
+             log_file,
+             dimers_not_solved
          )
          reporting.varvamp_plot(
              results_dir,
@@ -584,11 +658,11 @@
              majority_consensus,
              left_primer_candidates,
              right_primer_candidates,
+             compatible_primers,
              log_file
          )

          # write files
-
          # make sure amplicons with no off-target products and with low penalties get the lowest numbers
          final_schemes.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
          reporting.write_regions_to_bed(probe_regions, args.name, data_dir, "probe")
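Note on the sort above: Python compares tuples elementwise and False < True, so schemes without off-target products are numbered first, ordered by ascending penalty within each group. Toy illustration:

schemes = [
    {"off_targets": True, "penalty": 1.0},
    {"penalty": 3.0},                        # missing key counts as False
    {"off_targets": False, "penalty": 2.0},
]
schemes.sort(key=lambda x: (x.get("off_targets", False), x["penalty"]))
# order: penalty 2.0, then penalty 3.0 (both without off-targets), then the off-target scheme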
varvamp/scripts/alignment.py CHANGED
@@ -2,14 +2,11 @@
  alignment preprocessing
  """

- # BUILT-INS
- import re
- import multiprocessing
-
  # varVAMP
  from varvamp.scripts import config

  # LIBS
+ import numpy as np
  from Bio import AlignIO
  from Bio.Seq import Seq

@@ -47,181 +44,74 @@ def preprocess(alignment_path):
      return preprocessed_alignment


- def find_internal_gaps(unique_gaps, gap):
-     """
-     find all unique gaps that
-     lie within the current gap
-     """
-     overlapping_gaps = []
-
-     if gap[1] - gap[0] == 0:
-         # if the gap length = 1 there are
-         # no overlapping gaps
-         overlapping_gaps = [gap]
-     else:
-         # for each unique gap check if the intersection with the
-         # gap is the same as the unique gap -> internal gap of
-         # the current gap
-         for unique_gap in unique_gaps:
-             unique_set = set(range(unique_gap[0], unique_gap[1]))
-             current_range = range(gap[0], gap[1])
-             intersection = unique_set.intersection(current_range)
-             if not intersection:
-                 continue
-             if min(intersection) == unique_gap[0] and max(intersection) + 1 == unique_gap[1]:
-                 overlapping_gaps.append(unique_gap)
-
-     return overlapping_gaps
-
-
- def find_overlapping_gaps_worker(gap_list, unique_gaps):
-     """
-     Worker function to find overlapping gaps and count their occurrences.
-     """
-     gap_dict_part = {}
-     for gap in gap_list:
-         overlapping_gaps = find_internal_gaps(unique_gaps, gap)
-         for overlapping_gap in overlapping_gaps:
-             if overlapping_gap in gap_dict_part:
-                 gap_dict_part[overlapping_gap] += 1
-             else:
-                 gap_dict_part[overlapping_gap] = 1
-     return gap_dict_part
-
-
- def create_gap_dictionary(unique_gaps, all_gaps, n_threads):
-     """
-     Creates a dictionary with all gap counts.
-     Counts also all overlapping gaps per gap.
-     Uses multiprocessing for parallelization.
-     """
-
-     with multiprocessing.Pool(processes=n_threads) as pool:
-         results = pool.starmap(find_overlapping_gaps_worker, [(gap_list, unique_gaps) for gap_list in all_gaps])
-
-     gap_dict = {}
-     for gap_dict_part in results:
-         for gap, count in gap_dict_part.items():
-             if gap in gap_dict:
-                 gap_dict[gap] += count
-             else:
-                 gap_dict[gap] = count
-
-     return gap_dict
-
-
- def find_gaps_to_mask(gap_dict, cutoff):
-     """
-     filters gaps for their freq cutoff.
-     condenses final gaps if there is
-     an overlap.
-     """
-     gaps_to_mask = []
-     potential_gaps = []
-     opened_region = []
-
-     # check for each region if it is covered
-     # by enough sequences
-     for gap in gap_dict:
-         if gap_dict[gap] > cutoff:
-             potential_gaps.append(gap)
-
-     # sort by start and stop
-     potential_gaps = sorted(potential_gaps)
-
-     # get the min and max of overlapping gaps
-     for i, region in enumerate(potential_gaps):
-         region = list(region)
-         if opened_region:
-             # write the opened region if the start of the current region
-             # > opened_region[stop] and the last still opened region
-             if region[0] > opened_region[1] or i == len(potential_gaps) - 1:
-                 gaps_to_mask.append(opened_region)
-                 opened_region = region
-             else:
-                 # 1 case: same start and further stop -> new stop
-                 if region[0] == opened_region[0]:
-                     opened_region[1] = region[1]
-                 # 2 case: further start and further stop -> new stop
-                 if region[0] > opened_region[0] and region[1] > opened_region[1]:
-                     opened_region[1] = region[1]
-         else:
-             opened_region = region
-
-     return gaps_to_mask
-
-
  def clean_gaps(alignment, gaps_to_mask):
      """
-     clean an alignment of large common deletions.
+     Clean an alignment of large common deletions based on gaps_to_mask.
+     gaps_to_mask: list of [start, end] (inclusive), sorted by start.
      """
      cleaned_alignment = []
+     gaps_to_mask = sorted(gaps_to_mask, key=lambda x: x[0])

-     for sequence in alignment:
+     for seq_id, seq in alignment:
          start = 0
-         masked_seq = str()
-         for region in gaps_to_mask:
-             # check if it is three bases or more and mask with 2 Ns
-             if region[1] - region[0] >= config.QAMPLICON_DEL_CUTOFF:
+         pieces = []
+         # for each seq in the alignment, mask the regions
+         for region_start, region_end in gaps_to_mask:
+             # mask length for this region
+             if (region_end - region_start + 1) >= config.QAMPLICON_DEL_CUTOFF:
                  mask = "NN"
-             # or mask with one N (small deletion)
              else:
                  mask = "N"
-             stop = region[0]
-             masked_seq_temp = sequence[1][start:stop]
-             # check if the deletion is at the start
-             if start == 0 and len(masked_seq_temp) == 0:
-                 masked_seq = mask
-             # check if deletion is not at start
-             elif start == 0 and len(masked_seq_temp) != 0:
-                 masked_seq = masked_seq_temp
-             # else we are in the middle of the alignment
-             else:
-                 masked_seq = masked_seq + mask + masked_seq_temp
-             start = region[1] + 1
-         if max(gaps_to_mask)[1] < len(sequence[1]) - 1:
-             # append the last seq if no gap is at
-             # the end of the sequence
-             start = max(gaps_to_mask)[1]
-             stop = len(sequence[1]) - 1
-             masked_seq_temp = sequence[1][start:stop]
-             masked_seq = masked_seq + mask + masked_seq_temp
-         else:
-             # append the mask to the end of the seq
-             masked_seq = masked_seq + mask
-
-         cleaned_alignment.append([sequence[0], masked_seq])
+             # part before region
+             pieces.append(seq[start:region_start])
+             # mask for region
+             pieces.append(mask)
+             # next start is after region
+             start = region_end + 1
+
+         # tail after last masked region
+         if start < len(seq):
+             pieces.append(seq[start:])
+
+         cleaned_alignment.append([seq_id, "".join(pieces)])

      return cleaned_alignment


- def process_alignment(preprocessed_alignment, threshold, n_threads):
+ def process_alignment(preprocessed_alignment, threshold):
      """
-     proprocesses alignment and cleans gaps
+     - build an array of shape (n_seq, seq_len)
+     - for each column, count how many sequences are '-'
+     - mark columns to mask if count > cutoff
+     - turn those columns into contiguous regions
      """
-     all_gaps = []
-
-     gap_cutoff = len(preprocessed_alignment) * (1 - threshold)
-     for seq in preprocessed_alignment:
-         # find all gaps for all sequences with regular expression -{min}
-         all_gaps.append(
-             [(gap.start(0), gap.end(0) - 1) for gap in re.finditer(
-                 "-{1,}", seq[1])]
-         )
-     unique_gaps = list(set(gaps for gap_list in all_gaps for gaps in gap_list))
-
-     if unique_gaps:
-         gap_dic = create_gap_dictionary(unique_gaps, all_gaps, n_threads)
-         gaps_to_mask = find_gaps_to_mask(gap_dic, gap_cutoff)
-         if gaps_to_mask:
-             alignment_cleaned = clean_gaps(
-                 preprocessed_alignment, gaps_to_mask
-             )
-         else:
-             alignment_cleaned = preprocessed_alignment
-     else:
-         gaps_to_mask = []
-         alignment_cleaned = preprocessed_alignment
+
+     # build char array
+     seqs = [seq for seq_id, seq in preprocessed_alignment]
+     arr = np.array([list(s) for s in seqs], dtype="U1")
+     n_seq, len_seq = arr.shape
+
+     # per-column gap counts
+     cols_to_mask = (arr == "-").sum(axis=0) > n_seq * (1 - threshold)
+
+     # convert bool mask into list of (start, end) regions (end inclusive)
+     gaps_to_mask = []
+     in_gap = False
+     start = None
+     for i, is_gap in enumerate(cols_to_mask):
+         if is_gap and not in_gap:
+             in_gap = True
+             start = i
+         elif not is_gap and in_gap:
+             in_gap = False
+             gaps_to_mask.append([start, i - 1])
+     if in_gap:
+         gaps_to_mask.append([start, len_seq - 1])
+
+     if not gaps_to_mask:
+         return preprocessed_alignment, []
+
+     alignment_cleaned = clean_gaps(preprocessed_alignment, gaps_to_mask)

      return alignment_cleaned, gaps_to_mask