mgnify-pipelines-toolkit 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +71 -40
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +140 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.5.dist-info/RECORD +33 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/entry_points.txt +3 -0
- mgnify_pipelines_toolkit-0.1.3.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.3.dist-info → mgnify_pipelines_toolkit-0.1.5.dist-info}/top_level.txt +0 -0
--- a/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py
@@ -20,21 +20,29 @@ import argparse
 import pandas as pd
 import numpy as np
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to mcp tsv file to find inflection points",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    path = args.input
+    sample = args.sample
+    output = args.output
+
+    return path, sample, output
+
 
 def find_mcp_inf_points(mcp_df):
     """
@@ -50,45 +58,54 @@ def find_mcp_inf_points(mcp_df):
     """
 
     inf_point_dict = defaultdict(list)
-    start_indices = [
+    start_indices = [int(i) for i in mcp_df.columns.tolist()]
 
-    for i in range(len(mcp_df)):
+    for i in range(len(mcp_df)):  # Loop through both possible strands of the mcp_df
         strand = mcp_df.index[i]
         props = mcp_df.iloc[i].tolist()
-        props = [
+        props = [-val for val in props]
 
-        prop_diff = np.diff(props)/np.diff(start_indices)
-        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+        prop_diff = np.diff(props) / np.diff(start_indices)  # Get the derivative
+        infl_points = np.where(prop_diff > np.percentile(prop_diff, 80))[
+            0
+        ]  # Grab points above 80th percentile
 
         for ind in infl_points:
             inf_point = start_indices[ind]
 
-            if
-
-
-
-
+            if (
+                inf_point < 10 or inf_point > 20
+            ):  # Rule to facilitate results - won't accept
+                continue  # points below index 10 or above index 20
+                # 10 means a cutoff of 15 and 20 a cutoff of 25
+                # literature points to no primers existing that are
+                # shorter or bigger than these lengths
+
+            inf_point_dict["strand"].append(strand)
+            inf_point_dict["inf_point"].append(inf_point)
 
-            inf_point_dict['strand'].append(strand)
-            inf_point_dict['inf_point'].append(inf_point)
-
     return inf_point_dict
 
+
 def main():
 
-
+    path, sample, output = parse_args()
 
-    mcp_df = pd.read_csv(
-    inf_point_dict = find_mcp_inf_points(mcp_df)
+    mcp_df = pd.read_csv(path, sep="\t", index_col=0)  # Read mcp_df
+    inf_point_dict = find_mcp_inf_points(mcp_df)  # Generate inflection points dict
 
-    if len(inf_point_dict) > 0:
-        inf_point_df = pd.DataFrame.from_dict(
-
+    if len(inf_point_dict) > 0:  # If the inf_point_dict isn't empty..
+        inf_point_df = pd.DataFrame.from_dict(
+            inf_point_dict
+        )  # .. turn it into a dataframe
+        inf_point_df.to_csv(
+            f"{output}/{sample}_inf_points.tsv", sep="\t", index=False
+        )  # ..save it to a .tsv file
 
-    else:
-        fw = open(f
+    else:  # If it is empty..
+        fw = open(f"{output}/{sample}_inf_points.tsv", "w")  # ..make an empty file
         fw.close()
 
 
 if __name__ == "__main__":
-    main()
+    main()
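Note on the reworked find_mcp_inf_points: it negates the MCP proportions, takes the discrete derivative against the start indices, and keeps positions whose slope exceeds the 80th percentile. A minimal sketch of that core step on made-up data (the start indices and proportions below are invented for illustration):

import numpy as np

# Invented MCP proportions for one strand; keys are start indices
start_indices = [5, 10, 15, 20, 25, 30]
props = [0.95, 0.94, 0.93, 0.60, 0.58, 0.57]

neg_props = [-v for v in props]  # Negate so a sharp drop becomes a sharp rise
prop_diff = np.diff(neg_props) / np.diff(start_indices)  # Discrete derivative

# Positions whose slope is above the 80th percentile are candidate inflection points
infl = np.where(prop_diff > np.percentile(prop_diff, 80))[0]
print([start_indices[i] for i in infl])  # [15] -> falls inside the 10..20 acceptance window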
--- a/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py
@@ -20,32 +20,48 @@ import logging
 
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.tax_ranks import
+from mgnify_pipelines_toolkit.constants.tax_ranks import (
+    _SILVA_TAX_RANKS,
+    _PR2_TAX_RANKS,
+)
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("-t", "--taxa", required=True, type=str, help="Path to taxa file")
-    parser.add_argument("-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file")
-    parser.add_argument("-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file")
     parser.add_argument(
-        "-
+        "-t", "--taxa", required=True, type=str, help="Path to taxa file"
+    )
+    parser.add_argument(
+        "-f", "--fwd", required=True, type=str, help="Path to DADA2 forward map file"
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to DADA2 reverse map file"
+    )
+    parser.add_argument(
+        "-a",
+        "--amp",
+        required=True,
+        type=str,
+        help="Path to extracted amp_region reads from inference subworkflow",
+    )
+    parser.add_argument(
+        "-hd", "--headers", required=True, type=str, help="Path to fastq headers"
     )
-    parser.add_argument("-hd", "--headers", required=True, type=str, help="Path to fastq headers")
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
 
     args = parser.parse_args()
-
-    _TAXA = args.taxa
-    _FWD = args.fwd
-    _REV = args.rev
-    _AMP = args.amp
-    _HEADERS = args.headers
-    _SAMPLE = args.sample
 
-
+    taxa = args.taxa
+    fwd = args.fwd
+    rev = args.rev
+    amp = args.amp
+    headers = args.headers
+    sample = args.sample
+
+    return taxa, fwd, rev, amp, headers, sample
 
 
 def order_df(taxa_df):
@@ -59,6 +75,7 @@ def order_df(taxa_df):
 
     return taxa_df
 
+
 def make_tax_assignment_dict_silva(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -93,7 +110,7 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             k = "_".join(k.split(" "))
             tax_assignment += f"\t{k}"
         elif sk != "0":
-            tax_assignment +=
+            tax_assignment += "\tk__"
         else:
             break
 
@@ -136,9 +153,10 @@ def make_tax_assignment_dict_silva(taxa_df, asv_dict):
             continue
 
         tax_assignment_dict[tax_assignment] += asv_count
-
+
     return tax_assignment_dict
 
+
 def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
     tax_assignment_dict = defaultdict(int)
 
@@ -223,26 +241,45 @@ def make_tax_assignment_dict_pr2(taxa_df, asv_dict):
 
     return tax_assignment_dict
 
+
+def generate_asv_count_dict(asv_dict):
+
+    res_dict = defaultdict(list)
+
+    for asv_id, count in asv_dict.items():
+
+        if count == 0:
+            continue
+
+        res_dict["asv"].append(asv_id)
+        res_dict["count"].append(count)
+
+    res_df = pd.DataFrame.from_dict(res_dict)
+    res_df = res_df.sort_values(by="asv", ascending=True)
+    res_df = res_df.sort_values(by="count", ascending=False)
+
+    return res_df
+
+
 def main():
-
+    taxa, fwd, rev, amp, headers, sample = parse_args()
 
-    fwd_fr = open(
+    fwd_fr = open(fwd, "r")
     paired_end = True
 
-    if
+    if rev is None:
         paired_end = False
         rev_fr = [True]
     else:
-        rev_fr = open(
+        rev_fr = open(rev, "r")
 
-    taxa_df = pd.read_csv(
+    taxa_df = pd.read_csv(taxa, sep="\t", dtype=str)
     taxa_df = taxa_df.fillna("0")
     taxa_df = order_df(taxa_df)
 
-    amp_reads = [read.strip() for read in list(open(
-    headers = [read.split(" ")[0][1:] for read in
-
-    amp_region = ".".join(_AMP.split(".")[1:3])
+    amp_reads = [read.strip() for read in list(open(amp, "r"))]
+    headers = [read.split(" ")[0][1:] for read in list(open(headers, "r"))]
+    amp_region = ".".join(amp.split(".")[1:3])
 
     asv_dict = defaultdict(int)
 
@@ -250,23 +287,12 @@ def main():
     for line_fwd in fwd_fr:
         counter += 1
         line_fwd = line_fwd.strip()
-        fwd_asvs = line_fwd.split(",")
-
-        if paired_end:
-            line_rev = next(rev_fr).strip()
-            rev_asvs = line_rev.split(",")
-            asv_intersection = list(set(fwd_asvs).intersection(rev_asvs))
-
-            if len(asv_intersection) == 0:
-                continue
 
-
-
-        else:
-            asv_intersection = fwd_asvs
+        if line_fwd == "0":
+            continue
 
         if headers[counter] in amp_reads:
-            asv_dict[f"seq_{
+            asv_dict[f"seq_{line_fwd}"] += 1
 
     fwd_fr.close()
     if paired_end:
@@ -281,10 +307,15 @@ def main():
         tax_assignment_dict = make_tax_assignment_dict_pr2(taxa_df, asv_dict)
         ref_db = "pr2"
 
-    with open(f"./{
+    with open(f"./{sample}_{amp_region}_{ref_db}_asv_krona_counts.txt", "w") as fw:
         for tax_assignment, count in tax_assignment_dict.items():
             fw.write(f"{count}\t{tax_assignment}\n")
 
+    asv_count_df = generate_asv_count_dict(asv_dict)
+    asv_count_df.to_csv(
+        f"./{sample}_{amp_region}_asv_read_counts.tsv", sep="\t", index=False
+    )
+
 
 if __name__ == "__main__":
     main()
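A note on the new generate_asv_count_dict: the two consecutive sort_values calls are meant to give counts in descending order with ASV IDs breaking ties, but chained sorts only guarantee that ordering with a stable sort kind. A small sketch with invented IDs and counts showing the equivalent single-call form:

import pandas as pd

# Invented ASV counts for illustration
df = pd.DataFrame({"asv": ["seq_3", "seq_1", "seq_2"], "count": [5, 9, 5]})

# Count descending, ASV ID ascending as the tiebreaker; a single sort avoids
# relying on sort stability (chained sorts would need kind="stable")
df = df.sort_values(by=["count", "asv"], ascending=[False, True])
print(df.to_string(index=False))  # seq_1 9, then seq_2 5, then seq_3 5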
--- a/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py
@@ -22,9 +22,12 @@ import pandas as pd
 
 logging.basicConfig(level=logging.DEBUG)
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
+    parser.add_argument(
+        "-i", "--input", required=True, type=str, help="Input from MAPseq output"
+    )
     parser.add_argument(
         "-l",
         "--label",
@@ -37,18 +40,48 @@ def parse_args():
 
     args = parser.parse_args()
 
-
-
-
+    input = args.input
+    label = args.label
+    sample = args.sample
+
+    return input, label, sample
 
-    return _INPUT, _LABEL, _SAMPLE
 
 def parse_label(label):
     silva_short_ranks = ["sk__", "k__", "p__", "c__", "o__", "f__", "g__", "s__"]
-    pr2_short_ranks = [
-
-
-
+    pr2_short_ranks = [
+        "d__",
+        "sg__",
+        "dv__",
+        "sdv__",
+        "c__",
+        "o__",
+        "f__",
+        "g__",
+        "s__",
+    ]
+
+    silva_long_ranks = [
+        "Superkingdom",
+        "Kingdom",
+        "Phylum",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
+    pr2_long_ranks = [
+        "Domain",
+        "Supergroup",
+        "Division",
+        "Subdivision",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
 
     chosen_short_ranks = ""
     chosen_long_ranks = ""
@@ -65,6 +98,7 @@ def parse_label(label):
 
     return chosen_short_ranks, chosen_long_ranks
 
+
 def parse_mapseq(mseq_df, short_ranks, long_ranks):
     res_dict = defaultdict(list)
 
@@ -91,7 +125,8 @@ def parse_mapseq(mseq_df, short_ranks, long_ranks):
             res_dict[curr_rank].append(curr_tax)
     res_df = pd.DataFrame.from_dict(res_dict)
 
-    return
+    return res_df
+
 
 def process_blank_tax_ends(res_df, ranks):
     # Necessary function as we want to replace consecutive blank assignments that start at the last rank as NAs
@@ -105,7 +140,9 @@ def process_blank_tax_ends(res_df, ranks):
         ):  # Parse an assignment backwards, from Species all the way to Superkingdom/Domain
             curr_rank = res_df.iloc[i, j + 1]
             if curr_rank in ranks:
-                if
+                if (
+                    last_empty_rank == ""
+                ):  # Last rank is empty, start window of consecutive blanks
                     last_empty_rank = j + 1
                     currently_empty = True
             elif (
@@ -124,16 +161,17 @@ def process_blank_tax_ends(res_df, ranks):
     return res_df
 
 
-def main():
-
+def main():
+    input, label, sample = parse_args()
 
-    mseq_df = pd.read_csv(
+    mseq_df = pd.read_csv(input, header=0, delim_whitespace=True, usecols=[0, 12])
 
-    short_ranks, long_ranks = parse_label(
+    short_ranks, long_ranks = parse_label(label)
     res_df = parse_mapseq(mseq_df, short_ranks, long_ranks)
     final_res_df = process_blank_tax_ends(res_df, short_ranks)
 
-    final_res_df.to_csv(f"./{
+    final_res_df.to_csv(f"./{sample}_{label}_asv_taxa.tsv", sep="\t", index=False)
+
 
 if __name__ == "__main__":
     main()
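For reference, parse_label chooses between the SILVA and PR2 rank vocabularies listed above; pairing the short prefixes with the long names is a plain zip. A sketch using the PR2 lists from this diff:

# Short PR2 rank prefixes and their long names, as listed in parse_label
pr2_short_ranks = ["d__", "sg__", "dv__", "sdv__", "c__", "o__", "f__", "g__", "s__"]
pr2_long_ranks = [
    "Domain", "Supergroup", "Division", "Subdivision",
    "Class", "Order", "Family", "Genus", "Species",
]

rank_map = dict(zip(pr2_short_ranks, pr2_long_ranks))
print(rank_map["sdv__"])  # Subdivision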
--- a/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
@@ -21,24 +21,41 @@ import re
 from Bio import SeqIO
 import pandas as pd
 
-from mgnify_pipelines_toolkit.constants.var_region_coordinates import
+from mgnify_pipelines_toolkit.constants.var_region_coordinates import (
+    REGIONS_16S_BACTERIA,
+    REGIONS_16S_ARCHAEA,
+    REGIONS_18S,
+)
 
 STRAND_FWD = "fwd"
 STRAND_REV = "rev"
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to cmsearch_deoverlap_tblout file",
+    )
+    parser.add_argument(
+        "-f",
+        "--fasta",
+        required=True,
+        type=str,
+        help="Path to concatenated primers fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
 
-
-
-
+    input = args.input
+    fasta = args.fasta
+    sample = args.sample
 
-    return
+    return input, fasta, sample
 
 
 def get_amp_region(beg, strand, model):
@@ -62,11 +79,11 @@ def get_amp_region(beg, strand, model):
 
 
 def main():
-
+    input, fasta, sample = parse_args()
     res_dict = defaultdict(list)
-    fasta_dict = SeqIO.to_dict(SeqIO.parse(
+    fasta_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
 
-    with open(
+    with open(input, "r") as fr:
         for line in fr:
             line = line.strip()
             line = re.sub("[ \t]+", "\t", line)
@@ -88,7 +105,7 @@ def main():
             else:
                 continue
 
-            res_dict["Run"].append(
+            res_dict["Run"].append(sample)
             res_dict["AssertionEvidence"].append("ECO_0000363")
             res_dict["AssertionMethod"].append("automatic assertion")
 
@@ -109,7 +126,7 @@ def main():
             res_dict["PrimerSeq"].append(primer_seq)
 
     res_df = pd.DataFrame.from_dict(res_dict)
-    res_df.to_csv(f"./{
+    res_df.to_csv(f"./{sample}_primer_validation.tsv", sep="\t", index=False)
 
 
 if __name__ == "__main__":
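Context for the re.sub call above: cmsearch tblout output is aligned with variable runs of spaces rather than tabs, so the loop collapses any run of spaces/tabs into a single tab before splitting into fields. A minimal sketch (the tblout-style line below is fabricated):

import re

# Fabricated, space-aligned tblout-style line
line = "read_1   -   SSU_rRNA_bacteria   RF00177   hmm   1   90"
fields = re.sub("[ \t]+", "\t", line.strip()).split("\t")
print(fields[2])  # SSU_rRNA_bacteria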
--- a/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
@@ -21,39 +21,48 @@ import gzip
 
 from Bio import SeqIO, bgzf
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-
+    parser.add_argument(
+        "-f",
+        "--fwd",
+        required=True,
+        type=str,
+        help="Path to forward (or single-end) fastq file",
+    )
+    parser.add_argument(
+        "-r", "--rev", required=False, type=str, help="Path to reverse fastq file"
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     args = parser.parse_args()
-
-    _FWD = args.fwd
-    _REV = args.rev
-    _SAMPLE = args.sample
 
-
+    fwd = args.fwd
+    rev = args.rev
+    sample = args.sample
+
+    return fwd, rev, sample
 
 
 def main():
 
-
+    fwd, rev, sample = parse_args()
 
-    fwd_handle = gzip.open(
+    fwd_handle = gzip.open(fwd, "rt")
     fwd_reads = SeqIO.to_dict(SeqIO.parse(fwd_handle, "fastq"))
     fwd_handle.close()
 
     paired_end = True
 
-    if
+    if rev is None:
         paired_end = False
     else:
-        rev_handle = gzip.open(
+        rev_handle = gzip.open(rev, "rt")
         rev_reads = SeqIO.to_dict(SeqIO.parse(rev_handle, "fastq"))
         rev_handle.close()
-
+
     remove_set = set()
 
     for read_id in fwd_reads.keys():
@@ -78,23 +87,24 @@ def main():
             remove_set.add(read_id)
             continue
 
-    [
+    [fwd_reads.pop(read_id) for read_id in remove_set]
     if paired_end:
-        [
+        [rev_reads.pop(read_id) for read_id in remove_set]
 
     if paired_end:
-        fwd_handle = bgzf.BgzfWriter(f"./{
-        rev_handle = bgzf.BgzfWriter(f"./{
-
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig_1.fastq.gz", "wb")
+        rev_handle = bgzf.BgzfWriter(f"./{sample}_noambig_2.fastq.gz", "wb")
+
         SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
         SeqIO.write(sequences=rev_reads.values(), handle=rev_handle, format="fastq")
 
         fwd_handle.close()
         rev_handle.close()
     else:
-        fwd_handle = bgzf.BgzfWriter(f"./{
+        fwd_handle = bgzf.BgzfWriter(f"./{sample}_noambig.fastq.gz", "wb")
        SeqIO.write(sequences=fwd_reads.values(), handle=fwd_handle, format="fastq")
        fwd_handle.close()
 
+
 if __name__ == "__main__":
-    main()
+    main()
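The ambiguity test itself sits outside this hunk; given that the package ships constants/regex_ambiguous_bases.py, the check is presumably a regex over each read's sequence. A hypothetical sketch of such a filter (this pattern is my assumption, not the packaged constant):

import re

# Assumed pattern: anything outside the unambiguous DNA alphabet counts as ambiguous
AMBIGUOUS_BASES_RE = re.compile("[^ACGTacgt]")

def is_ambiguous(seq: str) -> bool:
    # True if the read contains an ambiguous base call (N, R, Y, ...)
    return AMBIGUOUS_BASES_RE.search(seq) is not None

print(is_ambiguous("ACGTACGT"))  # False
print(is_ambiguous("ACGTNCGT"))  # True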
--- a/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
+++ b/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
@@ -16,29 +16,37 @@
 
 import argparse
 
-from Bio import
+from Bio import SeqIO
+
 
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to finalised primer list fasta file",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args()
-
-    _INPUT = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    input = args.input
+    sample = args.sample
+    output = args.output
+
+    return input, sample, output
+
 
 def main():
-
-    _INPUT, _SAMPLE, _OUTPUT = parse_args()
 
-
-
+    input, sample, output = parse_args()
+
+    primers_dict = SeqIO.to_dict(SeqIO.parse(input, "fasta"))
+
     for primer_key in primers_dict.keys():
 
         primer = primers_dict[primer_key]
@@ -47,8 +55,10 @@ def main():
         if "R" in primer_name:
             primers_dict[primer_key].seq = primer.seq.reverse_complement()
 
-    SeqIO.write(
+    SeqIO.write(
+        primers_dict.values(), f"{output}/{sample}_rev_comp_se_primers.fasta", "fasta"
+    )
 
 
 if __name__ == "__main__":
-    main()
+    main()
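The core operation here, reverse-complementing primers whose names contain "R", goes through Biopython's Seq API, which handles IUPAC ambiguity codes. A toy demonstration (the primer name and sequence are illustrative, not taken from this package):

from Bio.Seq import Seq

primer_name = "806R"  # Illustrative reverse-primer name
primer_seq = Seq("GGACTACNVGGGTWTCTAAT")

if "R" in primer_name:  # Same rule the script applies to primer names
    primer_seq = primer_seq.reverse_complement()

print(primer_seq)  # ATTAGAWACCCBNGTAGTCC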