PyPI - mgnify-pipelines-toolkit - Versions diffs - 1.2.1__tar.gz → 1.2.3__tar.gz - Mend

mgnify-pipelines-toolkit 1.2.1.tar.gz → 1.2.3.tar.gz

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (55) hide show

{mgnify_pipelines_toolkit-1.2.1 → mgnify_pipelines_toolkit-1.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.2.1
+Version: 1.2.3
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

{mgnify_pipelines_toolkit-1.2.1 → mgnify_pipelines_toolkit-1.2.3}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py RENAMED Viewed

@@ -22,12 +22,12 @@ import os
 import logging
 import json
 import time
+import numpy as np
 from mgnify_pipelines_toolkit.constants.thresholds import (
     MIN_OVERLAP,
     MIN_SEQ_COUNT,
     MAX_ERROR_PROPORTION,
-    MAX_INTERNAL_PRIMER_PROPORTION,
 )
 from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
     REGIONS_16S_BACTERIA,
@@ -62,8 +62,16 @@ def get_multiregion(raw_sequence_coords, regions):
     Returns:
         amplified_region: Amplified variable regions.
+        region_coverages: Coverage of all detected variable regions
     """
+    region_coverages = defaultdict(float)
+    for region, limits in regions.items():
+        overlap = calc_overlap(raw_sequence_coords, limits)
+        region_coverages[region] = overlap
     # check if any of the coords are inside the region
     matched_regions = [
         region
@@ -76,7 +84,7 @@ def get_multiregion(raw_sequence_coords, regions):
         amplified_region = matched_regions[0]
     else:
         amplified_region = ""
-    return amplified_region
+    return amplified_region, region_coverages
 def check_primer_position(raw_sequence_coords, regions):
@@ -90,7 +98,7 @@ def check_primer_position(raw_sequence_coords, regions):
     """
     result_flag = False
-    margin = 3  # allowed margin of error
+    margin = 10  # allowed margin of error
     for coord in raw_sequence_coords:
         for region in regions.values():
             if coord in range(region[0] + margin, region[1] - margin):
@@ -342,22 +350,30 @@ def retrieve_regions(
         per_read_info = (
             dict()
         )  # dictionary will contain read names for each variable region
+        all_region_coverages = defaultdict(lambda: defaultdict(list))
         for read in data:
+            # Example structure of `read`
+            # ('ERR14650515.1', 'SSU_rRNA_archaea', 'RF01959', 'hmm', '3', '525', '1', '518', '+', '-', '6', '0.55', '0.6', '363.6', '7.8e-107')
             regions = determine_cm(read[2])
             sequence_counter_total += 1
             limits = list(map(int, read[4:6]))
             domain = determine_domain(read[2])
             marker_gene = determine_marker_gene(domain)
             if not regions == "unsupported":
-                multiregion_matches.setdefault(read[2], []).append(
-                    get_multiregion(limits, regions)
-                )
+                matches, coverages = get_multiregion(limits, regions)
+                [
+                    all_region_coverages[domain][region].append(coverage)
+                    for region, coverage in coverages.items()
+                ]
+                multiregion_matches.setdefault(read[2], []).append(matches)
                 if check_primer_position(limits, regions):
                     primer_inside_vr += 1
                 sequence_counter_useful += 1
-                per_read_info.setdefault(
-                    marker_gene + "." + get_multiregion(limits, regions), []
-                ).append(read[0])
+                per_read_info.setdefault(marker_gene + "." + matches, []).append(
+                    read[0]
+                )
             else:
                 unsupported_matches += 1
@@ -385,18 +401,6 @@ def retrieve_regions(
             )
             continue
-        # filter out runs with too many sequences starting/ending inside variable regions
-        internal_seq_fract = primer_inside_vr / len(data)
-        if internal_seq_fract > MAX_INTERNAL_PRIMER_PROPORTION:
-            failed_run_counter += 1
-            logging.info("No output will be produced - too many internal mappings")
-            logging.info(
-                "Excluded due to high proportion of internal primers:\t{}\t{}\n".format(
-                    tblout_file, "{0:.2f}".format(internal_seq_fract)
-                )
-            )
-            continue
         normalised_matches[run_id] = dict()
         region_counter = defaultdict(int)
@@ -432,14 +436,12 @@ def retrieve_regions(
             multiregion_matches[model] = new_value
         [multiregion_matches.pop(model) for model in models_to_remove]
-        print(multiregion_matches)
         run_status = "one"
         run_result = dict()
         total_useful_sequences = 0.0
         temp_seq_counter = dict()
         for model, model_regions in multiregion_matches.items():
-            print(model)
             result = normalise_results(model_regions)
             if result is None:
                 run_status = "ambiguous"
@@ -469,6 +471,16 @@ def retrieve_regions(
             logging.info("No output will be produced - the run is ambiguous.")
             continue
+    coverage_fw = open(f"{outfile_prefix}_all_coverages.txt", "w")
+    for domain, regions in all_region_coverages.items():
+        for region in regions:
+            if len(regions[region]) < MIN_SEQ_COUNT:
+                continue
+            region_coverage = float(np.mean(regions[region]))
+            if region_coverage > 0:
+                coverage_fw.write(f"{domain}:{region}: {region_coverage}\n")
     json_outfile = "{}.json".format(outfile_prefix)
     tsv_outfile = "{}.tsv".format(outfile_prefix)
     with open(json_outfile, "w") as f:

{mgnify_pipelines_toolkit-1.2.1 → mgnify_pipelines_toolkit-1.2.3}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py RENAMED Viewed

@@ -300,23 +300,24 @@ def main():
     if paired_end:
         rev_fr.close()
-    ref_db = ""
-    if len(taxa_df.columns) == 9:
-        tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
-        ref_db = "silva"
-    elif len(taxa_df.columns) == 10:
-        tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
-        ref_db = "pr2"
-    with open(f"./{sample}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
-        for tax_assignment, count in tax_assignment_dict.items():
-            fw.write(f"{count}\t{tax_assignment}\n")
-    asv_count_df = generate_asv_count_dict(asv_dict)
-    asv_count_df.to_csv(
-        f"./{sample}_{amp_region}_asv_read_counts.tsv", sep="\t", index=False
-    )
+    if asv_dict:   # if there are matches between taxonomic and ASV annotations
+        ref_db = ""
+        if len(taxa_df.columns) == 9:
+            tax_assignment_dict = make_tax_assignment_dict_silva(taxa_df, asv_dict)
+            ref_db = "silva"
+        elif len(taxa_df.columns) == 10:
+            tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
+            ref_db = "pr2"
+        with open(f"./{sample}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
+            for tax_assignment, count in tax_assignment_dict.items():
+                fw.write(f"{count}\t{tax_assignment}\n")
+        asv_count_df = generate_asv_count_dict(asv_dict)
+        asv_count_df.to_csv(
+            f"./{sample}_{amp_region}_asv_read_counts.tsv", sep="\t", index=False
+        )
 if __name__ == "__main__":

{mgnify_pipelines_toolkit-1.2.1 → mgnify_pipelines_toolkit-1.2.3}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py RENAMED Viewed

@@ -19,6 +19,7 @@ from collections import defaultdict
 import re
 from Bio import SeqIO
+from Bio.Seq import Seq
 import pandas as pd
 from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
@@ -49,18 +50,26 @@ def parse_args():
         help="Path to concatenated primers fasta file",
     )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
+    parser.add_argument(
+        "--se",
+        action=argparse.BooleanOptionalAction,
+        help="Flag for if run is single-end",
+    )
     args = parser.parse_args()
     input = args.input
     fasta = args.fasta
     sample = args.sample
+    single_end = args.se
-    return input, fasta, sample
+    return input, fasta, sample, single_end
 def get_amp_region(beg, end, strand, model):
     prev_region = ""
+    margin = -10
     for region, region_coords in model.items():
         region_beg = region_coords[0]
@@ -68,10 +77,10 @@ def get_amp_region(beg, end, strand, model):
         end_diff = region_beg - end
         if strand == STRAND_FWD:
-            if beg_diff > 0 and end_diff > 0:
+            if beg_diff >= margin and end_diff >= margin:
                 return region
         else:
-            if beg_diff > 0 and end_diff > 0:
+            if beg_diff >= margin and end_diff >= margin:
                 return prev_region
         prev_region = region
@@ -80,10 +89,14 @@ def get_amp_region(beg, end, strand, model):
 def main():
-    input, fasta, sample = parse_args()
+    input, fasta, sample, single_end = parse_args()
     res_dict = defaultdict(list)
     fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
+    fwd_primers_fw = open("./fwd_primers.fasta", "w")
+    rev_primers_fw = open("./rev_primers.fasta", "w")
     with open(input, "r") as fr:
         for line in fr:
             line = line.strip()
@@ -104,8 +117,12 @@ def main():
             elif rfam == "RF01960":
                 gene = "18S"
                 model = REGIONS_18S
-            else:
-                continue
+            else:  # For cases when it's a std primer but for some reason hasn't matched the model
+                if primer_name == "F_auto" or primer_name == "R_auto":
+                    continue
+                gene = "Unknown"
+                amp_region = "Unknown"
+                model = ""
             res_dict["Run"].append(sample)
             res_dict["AssertionEvidence"].append("ECO_0000363")
@@ -113,12 +130,13 @@ def main():
             strand = ""
-            if "F" in primer_name:
+            if primer_name == "F_auto" or primer_name[-1] == "F":
                 strand = STRAND_FWD
-            elif "R" in primer_name:
+            elif primer_name == "R_auto" or primer_name[-1] == "R":
                 strand = STRAND_REV
-            amp_region = get_amp_region(beg, end, strand, model)
+            if model:
+                amp_region = get_amp_region(beg, end, strand, model)
             primer_seq = str(fasta_dict[primer_name].seq)
             res_dict["Gene"].append(gene)
@@ -127,9 +145,19 @@ def main():
             res_dict["PrimerStrand"].append(strand)
             res_dict["PrimerSeq"].append(primer_seq)
+            if strand == STRAND_FWD:
+                fwd_primers_fw.write(f">{primer_name}\n{primer_seq}\n")
+            elif strand == STRAND_REV:
+                if single_end:
+                    primer_seq = Seq(primer_seq).reverse_complement()
+                rev_primers_fw.write(f">{primer_name}\n{primer_seq}\n")
     res_df = pd.DataFrame.from_dict(res_dict)
     res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
+    fwd_primers_fw.close()
+    rev_primers_fw.close()
 if __name__ == "__main__":
     main()

{mgnify_pipelines_toolkit-1.2.1 → mgnify_pipelines_toolkit-1.2.3}/mgnify_pipelines_toolkit.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.2.1
+Version: 1.2.3
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

{mgnify_pipelines_toolkit-1.2.1 → mgnify_pipelines_toolkit-1.2.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "mgnify_pipelines_toolkit"
-version = "1.2.1"
+version = "1.2.3"
 readme = "README.md"
 license = { text = "Apache Software License 2.0" }
 authors = [