mgnify-pipelines-toolkit 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Potentially problematic release: this version of mgnify-pipelines-toolkit has been flagged as possibly problematic.
- mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +74 -54
- mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +69 -42
- mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +120 -66
- mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +74 -45
- mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +277 -148
- mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +45 -28
- mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +53 -32
- mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +54 -16
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +29 -12
- mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +29 -19
- mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +23 -13
- mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +127 -89
- mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +138 -0
- mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +55 -26
- mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +19 -13
- mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +66 -0
- mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +2 -2
- mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +3 -5
- mgnify_pipelines_toolkit/constants/regex_fasta_header.py +20 -0
- mgnify_pipelines_toolkit/constants/tax_ranks.py +21 -2
- mgnify_pipelines_toolkit/constants/thresholds.py +4 -1
- mgnify_pipelines_toolkit/constants/var_region_coordinates.py +4 -4
- mgnify_pipelines_toolkit/utils/__init__.py +0 -0
- mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +144 -0
- mgnify_pipelines_toolkit/utils/get_mpt_version.py +26 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/METADATA +18 -1
- mgnify_pipelines_toolkit-0.1.6.dist-info/RECORD +34 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/WHEEL +1 -1
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/entry_points.txt +4 -0
- mgnify_pipelines_toolkit-0.1.4.dist-info/RECORD +0 -28
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/LICENSE +0 -0
- {mgnify_pipelines_toolkit-0.1.4.dist-info → mgnify_pipelines_toolkit-0.1.6.dist-info}/top_level.txt +0 -0
mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py

@@ -20,28 +20,43 @@ from Bio.Seq import Seq
 import numpy as np
 import pandas as pd
 
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import …
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    get_read_count,
+    build_cons_seq,
+    build_mcp_cons_dict_list,
+    fetch_mcp,
+)
 from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(…
-        …
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to fastq file to choose inflection point",
+    )
+    parser.add_argument(
+        "-p", "--points", required=True, type=str, help="Path to inflection points file"
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _POINTS = args.points
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
 
-
+    path = args.input
+    points = args.points
+    sample = args.sample
+    output = args.output
+
+    return path, points, sample, output
 
-def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
+
+def assess_inflection_point_mcp_for_sample(path, inf_point_list, rev=False):
     """
     Assess inflection point list, selecting one for automatic primer trimming.
 
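The refactor above replaces the module-level _PATH-style assignments with a parse_args() that returns its values, so the script can be imported and unit-tested without touching global state. A minimal sketch of exercising the new contract (the file paths and sample name are hypothetical, chosen only for illustration):

    import sys

    from mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp import (
        parse_args,
    )

    # Emulate a command line, then unpack the tuple parse_args() now returns.
    sys.argv = [
        "assess_inflection_point_mcp",
        "-i", "reads.fastq",   # hypothetical fastq path
        "-p", "points.tsv",    # hypothetical inflection points file
        "-s", "SAMPLE1",
        "-o", "out_dir",
    ]
    path, points, sample, output = parse_args()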
@@ -54,17 +69,19 @@ def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
 
     Returns the cutoff point and the consensus sequence 'forming' the automatically predicted primer
     """
-
+
     # TODO error handle for empty inflection point list
 
-    start_confs = []
-    end_confs = []
-    start_cons_lens = []
-    cons_seq_list = []
+    start_confs = []  # pre-inf point conservations
+    end_confs = []  # post-inf point conservations
+    start_cons_lens = []  # list for storing lengths of pre-inflection point sequences
+    cons_seq_list = []  # list for storing consensus sequences pre-inflection points
 
-    do_not_include_list = […
+    do_not_include_list = [
+        i + 5 for i in inf_point_list
+    ]  # ignore conservation of inflection point in calculation
 
-    read_count = get_read_count(…
+    read_count = get_read_count(path)  # get readcount from fastq
 
     max_line_count = None
     if read_count > MCP_MAX_LINE_COUNT:
@@ -72,89 +89,126 @@ def assess_inflection_point_mcp_for_sample(_PATH, inf_point_list, rev=False):
 
     n_prop = 0.8
 
-    for start in inf_point_list:
-        mcp_len = start + 4
-
-        mcp_count_dict = fetch_mcp(…
-        …
-        …
-        …
+    for start in inf_point_list:  # Looping through the pre-inflection point mcps
+        mcp_len = start + 4  # length of pre-inf mcps is inflection point + 4
+
+        mcp_count_dict = fetch_mcp(
+            path, mcp_len, rev=rev, max_line_count=max_line_count
+        )  # get MCP count dict
+        mcp_cons_list = build_mcp_cons_dict_list(
+            mcp_count_dict, mcp_len
+        )  # list of base conservation dicts for mcps
+        cons_seq, cons_confs = build_cons_seq(
+            mcp_cons_list,
+            read_count,
+            n_prop,
+            do_not_include_list,
+            max_line_count=max_line_count,
+        )  # get list of max base conservations for each index
+        # also get consensus sequence
         cons_seq_list.append(cons_seq)
         start_confs.append(np.mean(cons_confs))
         start_cons_lens.append(len(cons_seq))
 
-    for i, end in enumerate(…
-        …
-        …
-        …
+    for i, end in enumerate(
+        inf_point_list
+    ):  # Looping through the post-inflection point mcps
+        mcp_len = end + 5  # length of pre-inf mcps is inflection point + 5
+        subs_len = start_cons_lens[i]  # length of respective pre-inf point sequence
+        l = mcp_len + subs_len - 1  # final index of MCP
 
-        mcp_count_dict = fetch_mcp(…
+        mcp_count_dict = fetch_mcp(
+            path, l, mcp_len, rev=rev, max_line_count=max_line_count
+        )
         mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, subs_len)
-        cons_seq, cons_confs = build_cons_seq(…
+        cons_seq, cons_confs = build_cons_seq(
+            mcp_cons_list,
+            read_count,
+            n_prop,
+            do_not_include_list,
+            subs_len,
+            max_line_count=max_line_count,
+        )
 
         end_confs.append(np.mean(cons_confs))
 
-    diff_res = […
-    …
+    diff_res = [
+        start_confs[i] - end_confs[i] for i in range(len(start_confs))
+    ]  # get differences between pre- and -post avg conservation values
+    diff_res_sorted = sorted(
+        diff_res, reverse=True
+    )  # sort differences from highest to lowest
 
-    ini_max_res = diff_res_sorted[0]
-    curr_max_index = diff_res.index(ini_max_res)
+    ini_max_res = diff_res_sorted[0]  # maximum differences
+    curr_max_index = diff_res.index(ini_max_res)  # index of maximum differences
 
-    for res in diff_res_sorted[1:]:
-        curr_res_index = np.where(diff_res == res)[0][0]
+    for res in diff_res_sorted[1:]:  # Loop through the rest of the differences
+        curr_res_index = np.where(diff_res == res)[0][0]
 
         index_diff = inf_point_list[curr_max_index] - inf_point_list[curr_res_index]
 
         # if difference between the max and the current is negligible and the index of the current is earlier then..
-        if ini_max_res - res < 0.05 and (…
-            curr_max_index = …
-
-    …
-    …
+        if ini_max_res - res < 0.05 and (index_diff <= 3 and index_diff > 0):
+            curr_max_index = (
+                curr_res_index  # replace the selected index with the current one
+            )
+
+    cutoff = (
+        inf_point_list[curr_max_index] + 5
+    )  # cutoff is the inflection point index + 5
+    primer = cons_seq_list[
+        curr_max_index
+    ]  # grab the correct consensus sequence as primer
 
     # if the requested strand is reverse..
     if rev:
-        primer = str(…
+        primer = str(
+            Seq(primer).complement()
+        )  # ..get the complement of consensus sequence
 
     return cutoff, primer
 
+
 def main():
 
-
-    inf_df = pd.read_csv(…
+    path, points, sample, output = parse_args()
+    inf_df = pd.read_csv(points, sep="\t")
 
-    f_slice = inf_df[inf_df.strand == …
-    r_slice = inf_df[inf_df.strand == …
+    f_slice = inf_df[inf_df.strand == "F"]  # get forward inflection points
+    r_slice = inf_df[inf_df.strand == "R"]  # get reverse inflection points
     r_slice = r_slice.reset_index(drop=True)
 
-    f_cutoff = …
-    r_cutoff = …
-    f_primer = …
-    r_primer = …
+    f_cutoff = ""
+    r_cutoff = ""
+    f_primer = ""
+    r_primer = ""
 
-    if not f_slice.empty:
+    if not f_slice.empty:  # if there is a forward inflection point..
         inf_list = f_slice.inf_point.tolist()
-        f_cutoff, f_primer = assess_inflection_point_mcp_for_sample(…
+        f_cutoff, f_primer = assess_inflection_point_mcp_for_sample(
+            path, inf_list
+        )  # .. assess and select
 
-    if not r_slice.empty:
+    if not r_slice.empty:  # if there is a reverse inflection point..
         inf_list = r_slice.inf_point.tolist()
-        r_cutoff, r_primer = assess_inflection_point_mcp_for_sample(…
+        r_cutoff, r_primer = assess_inflection_point_mcp_for_sample(
+            path, inf_list, rev=True
+        )  # .. assess and select
 
     # Output cutoff point(s) to .txt file
-    with open(f…
-        if f_cutoff != …
-            fw.write(f…
-        if r_cutoff != …
-            fw.write(f…
+    with open(f"{output}/{sample}_cutoff.txt", "w") as fw:
+        if f_cutoff != "":
+            fw.write(f"F: {f_cutoff}\n")
+        if r_cutoff != "":
+            fw.write(f"R: {r_cutoff}\n")
 
     # Output consensus primer sequence(s) to .fasta file
-    with open(f…
-        if f_cutoff != …
-            fw.write(f…
-        if r_cutoff != …
-            fw.write(f…
-
+    with open(f"{output}/{sample}_auto_primers.fasta", "w") as fw:
+        if f_cutoff != "":
+            fw.write(f">F_auto\n{f_primer}\n")
+        if r_cutoff != "":
+            fw.write(f">R_auto\n{r_primer}\n")
 
 
 if __name__ == "__main__":
-    main()
+    main()
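Unwrapped from Black's formatting, the selection heuristic in assess_inflection_point_mcp_for_sample reads: compute, for each candidate inflection point, the drop between pre- and post-point average conservation; take the point with the largest drop, but let a point sitting 1 to 3 positions earlier win whenever its drop is within 0.05 of the maximum. A self-contained sketch of just that rule, with toy values rather than package code:

    def pick_cutoff(inf_points, start_confs, end_confs):
        # drop in average conservation across each candidate inflection point
        diff_res = [s - e for s, e in zip(start_confs, end_confs)]
        ini_max_res = max(diff_res)
        curr_max_index = diff_res.index(ini_max_res)

        for res in sorted(diff_res, reverse=True)[1:]:
            curr_res_index = diff_res.index(res)
            index_diff = inf_points[curr_max_index] - inf_points[curr_res_index]
            # a near-equal drop (within 0.05) at a slightly earlier point wins
            if ini_max_res - res < 0.05 and 0 < index_diff <= 3:
                curr_max_index = curr_res_index

        return inf_points[curr_max_index] + 5  # cutoff = chosen point + 5

    # The earlier point (12) displaces the marginally better one (15): prints 17
    print(pick_cutoff([15, 12], [0.90, 0.88], [0.40, 0.42]))

The chosen cutoffs are then written as F:/R: lines to {sample}_cutoff.txt, and the matching consensus sequences to {sample}_auto_primers.fasta under >F_auto/>R_auto headers.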
mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py

@@ -16,33 +16,51 @@
 
 import argparse
 from collections import defaultdict
-import subprocess
 
 import pandas as pd
 import numpy as np
 
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import …
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    get_read_count,
+    build_cons_seq,
+    build_mcp_cons_dict_list,
+    fetch_mcp,
+)
 from mgnify_pipelines_toolkit.constants.thresholds import MCP_MAX_LINE_COUNT
 
+
 def parse_args():
 
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(…
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to fastq file to assess mcps",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
-    parser.add_argument(…
+    parser.add_argument(
+        "-st",
+        "--strand",
+        required=True,
+        choices=["FR", "F", "R"],
+        help="F: Forward, R: Reverse",
+    )
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
 
     args = parser.parse_args()
-
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _STRAND = args.strand
-    _OUTPUT = args.output
 
-
+    path = args.input
+    sample = args.sample
+    strand = args.strand
+    output = args.output
+
+    return path, sample, strand, output
+
 
-def find_mcp_props_for_sample(…
+def find_mcp_props_for_sample(path, rev=False):
     """
     Generate mcp proportions in a stepwise and windowed manner for a fastq file.
 
@@ -50,38 +68,47 @@ def find_mcp_props_for_sample(_PATH, rev=False):
     Calculate the average conservation of the most common base at each index of a window.
     The resulting list of mcp conservations can be considered a conservation curve and used to
     identify inflection points where the conservation suddenly changes.
-
+
     Output a dictionary where:
     key -> an index starting point e.g. base 10
     val -> the average conservation of the most common base for the mcp window goign from base 10 to 15 (inclusive)
     """
 
     res_dict = defaultdict(float)
-    start_range = range(2, 25, 1)
-
-    print(f…
+    start_range = range(2, 25, 1)  # Range of starting indices
+
+    print(f"Processing {path}")
 
-    mcp_len = 5
+    mcp_len = 5  # length of generated mcps
 
     for start in start_range:
 
-        end = …
+        end = (
+            start + mcp_len - 1
+        )  # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
+
+        read_count = get_read_count(path, type="fastq")  # get read count for fastq file
 
-        read_count = get_read_count(_PATH, type='fastq') # get read count for fastq file
-
         max_line_count = None
         if read_count > MCP_MAX_LINE_COUNT:
             max_line_count = MCP_MAX_LINE_COUNT
 
-        mcp_count_dict = fetch_mcp(…
-        …
-        …
-        …
-        …
+        mcp_count_dict = fetch_mcp(
+            path, end, start, rev, max_line_count
+        )  # get MCP count dict
+        mcp_cons_list = build_mcp_cons_dict_list(
+            mcp_count_dict, mcp_len
+        )  # list of base conservation dicts for mcps
+        cons_seq, cons_conf = build_cons_seq(
+            mcp_cons_list, read_count, max_line_count=max_line_count
+        )  # get list of max base conservations for each index
+
+        res_dict[start] = np.mean(cons_conf)  # compute the mean
 
     return res_dict
 
-def concat_out(fwd_out='', rev_out=''):
+
+def concat_out(fwd_out="", rev_out=""):
     """
     Generate Pandas dataframe out of mcp dictionary.
 
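For orientation: find_mcp_props_for_sample slides a 5-base window across start positions 2-24 and records, per window, the mean frequency of the most common base at each position (delegating to fetch_mcp, build_mcp_cons_dict_list and build_cons_seq from amplicon_utils). A simplified, self-contained illustration of that windowed-conservation idea, not the package implementation:

    from collections import Counter

    def window_conservation(reads, start, mcp_len=5):
        # Mean frequency of the most common base at each position of the
        # 1-based window [start, start + mcp_len - 1], across all reads.
        windows = [read[start - 1 : start - 1 + mcp_len] for read in reads]
        per_pos = []
        for i in range(mcp_len):
            counts = Counter(w[i] for w in windows if len(w) > i)
            per_pos.append(counts.most_common(1)[0][1] / sum(counts.values()))
        return sum(per_pos) / len(per_pos)

    reads = ["ACGTACGTAC", "ACGTTCGTAC", "ACGTACGAAC"]
    print(round(window_conservation(reads, start=2), 3))  # prints 0.933

Highly conserved windows (values near 1.0) followed by a sharp fall in this curve are what the companion inflection-point scripts treat as likely primer sequence.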
@@ -97,16 +124,16 @@ def concat_out(fwd_out='', rev_out=''):
     df_ind = []
 
     # Check if fwd strand was requested
-    if fwd_out != …
-        […
-        df_ind.append(…
+    if fwd_out != "":
+        [total_res_dict[key].append(fwd_out[key]) for key in fwd_out.keys()]
+        df_ind.append("F")
 
     # Check if rev strand was requested
-    if rev_out != …
-        […
-        df_ind.append(…
+    if rev_out != "":
+        [total_res_dict[key].append(rev_out[key]) for key in rev_out.keys()]
+        df_ind.append("R")
 
-    res_df= pd.DataFrame.from_dict(total_res_dict)
+    res_df = pd.DataFrame.from_dict(total_res_dict)
     res_df.index = df_ind
 
     return res_df
@@ -114,31 +141,33 @@ def concat_out(fwd_out='', rev_out=''):
 
 def main():
 
-
+    path, sample, strand, output = parse_args()
 
-    res_df = …
+    res_df = ""
 
     # TODO: match-case statement is python 3.10>. We are currently locking the version
     # at version 3.9. The day we bump the version we should replace these if statements
     # with a match-case block.
 
-    if …
-        fwd_out = find_mcp_props_for_sample(…
-        rev_out = find_mcp_props_for_sample(…
+    if strand == "FR":
+        fwd_out = find_mcp_props_for_sample(path)
+        rev_out = find_mcp_props_for_sample(path, rev=True)
         res_df = concat_out(fwd_out, rev_out)
-    elif …
-        fwd_out = find_mcp_props_for_sample(…
+    elif strand == "F":
+        fwd_out = find_mcp_props_for_sample(path)
         res_df = concat_out(fwd_out)
-    elif …
-        rev_out = find_mcp_props_for_sample(…
+    elif strand == "R":
+        rev_out = find_mcp_props_for_sample(path, rev=True)
         res_df = concat_out(rev_out=rev_out)
     else:
-        print(…
+        print(
+            "Incorrect strand input. Should be F for forward, R for reverse, or FR for both."
+        )
         exit(1)
 
     # Save resulting dataframe to a tsv file
-    res_df.to_csv(f…
+    res_df.to_csv(f"{output}/{sample}_mcp_cons.tsv", sep="\t")
+
 
-
 if __name__ == "__main__":
-    main()
+    main()
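For reference, the {sample}_mcp_cons.tsv written by main() has one column per window start position (2 through 24) and one row per requested strand. With -st FR it looks roughly like this (values invented for illustration):

            2       3       4       ...     24
    F       0.91    0.90    0.88    ...     0.47
    R       0.91    0.89    0.86    ...     0.45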