PyPI - mgnify-pipelines-toolkit - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend - Supply Chain Defender

mgnify-pipelines-toolkit 0.1.3py3-none-any.whl → 0.1.5py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (31)

mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py CHANGED Viewed

@@ -20,71 +20,85 @@ import gzip
 import os
 import subprocess
-from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import _AMBIGUOUS_BASES_DICT, _AMBIGUOUS_BASES_DICT_REV
+from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
+    _AMBIGUOUS_BASES_DICT,
+    _AMBIGUOUS_BASES_DICT_REV,
+)
 logging.basicConfig(level=logging.DEBUG)
-def split_dir_into_sample_paths(_DIR):
-    file_list = os.listdir(_DIR)
-    file_list = [ file for file in file_list if '.fastq' in file and ('_1' in file or '_2' in file) ]
+def split_dir_into_sample_paths(dir):
+    file_list = os.listdir(dir)
+    file_list = [
+        file
+        for file in file_list
+        if ".fastq" in file and ("_1" in file or "_2" in file)
+    ]
     sample_set = set()
-    [ sample_set.add(f"{_DIR}/{file.split('_')[0]}") for file in file_list ]
+    [sample_set.add(f"{dir}/{file.split('_')[0]}") for file in file_list]
     sample_list = sorted(list(sample_set))
     return sample_list
-def get_read_count(read_path, type='fastq'):
+def get_read_count(read_path, type="fastq"):
     cmd = []
-    stdout = ''
-    if type == 'fastq':
-        cmd = [
-            'zcat',
-            read_path
-        ]
-        zcat_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        cmd = [
-            'wc',
-            '-l'
-        ]
-        wc_proc = subprocess.Popen(cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout = ""
+    if type == "fastq":
+        cmd = ["zcat", read_path]
+        zcat_proc = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+        cmd = ["wc", "-l"]
+        wc_proc = subprocess.Popen(
+            cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
         stdout, stderr = wc_proc.communicate()
-    elif type == 'fasta':
-        cmd = [
-            'grep',
-            '-c',
-            '^>',
-            read_path
-        ]
-        grep_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    elif type == "fasta":
+        cmd = ["grep", "-c", "^>", read_path]
+        grep_proc = subprocess.Popen(
+            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
         stdout, stderr = grep_proc.communicate()
-    read_count = stdout.strip() if stdout is not None else ""
+    read_count = stdout.strip() if stdout is not None else ""
     if not read_count.isdigit():
-        logging.error(f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'")
+        logging.error(
+            f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'"
+        )
         exit(1)
     read_count = int(read_count)
-    if type == 'fastq':
+    if type == "fastq":
         read_count /= 4
     return read_count
-def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=None, counter=1, max_line_count=None):
+def build_cons_seq(
+    cons_list,
+    read_count,
+    cons_threshold=0.80,
+    do_not_include=None,
+    counter=1,
+    max_line_count=None,
+):
     """
     Generate consensus sequence using a list of base conservation dictionaries most likely
     generated by the `build_mcp_cons_dict_list()` function.
     Also returns a list containing the conservation value of the most conserved base at every
-    position in the list of base conservation dictionaries.
+    position in the list of base conservation dictionaries.
     """
-    cons_seq = ''
+    cons_seq = ""
     cons_confs = []
     if do_not_include is None:
@@ -96,29 +110,31 @@ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=No
         if counter in do_not_include:
             counter += 1
-            cons_seq += 'N'
-            continue
+            cons_seq += "N"
+            continue
         for base, count in count_dict.items():
-            if base not in ('A', 'T', 'C', 'G'):
+            if base not in ("A", "T", "C", "G"):
                 continue
             if max_line_count is None:
-                cons_dict[base] = count/read_count
+                cons_dict[base] = count / read_count
             else:
-                cons_dict[base] = count/max_line_count
+                cons_dict[base] = count / max_line_count
             if count > max_count:
                 max_count = count
         counter += 1
         try:
-            max_prop = max_count/read_count
+            max_prop = max_count / read_count
             cons_bases = []
             curr_prop = 0.0
-            sorted_cons_dict = dict(sorted(cons_dict.items(), key=lambda x:x[1], reverse=True))
+            sorted_cons_dict = dict(
+                sorted(cons_dict.items(), key=lambda x: x[1], reverse=True)
+            )
             for base, prop in sorted_cons_dict.items():
                 cons_bases.append(base)
@@ -131,18 +147,18 @@ def build_cons_seq(cons_list, read_count, cons_threshold=0.80, do_not_include=No
             if len(cons_bases) == 1:
                 cons_seq += cons_bases[0]
             else:
-                amb_string = ','.join(cons_bases)
+                amb_string = ",".join(cons_bases)
                 amb_base = _AMBIGUOUS_BASES_DICT_REV[amb_string]
                 cons_seq += amb_base
         except ZeroDivisionError:
             max_prop = 0.0
         cons_confs.append(max_prop)
     return cons_seq, cons_confs
 def primer_regex_query_builder(primer):
     """
     Takes an input nucleotide sequence that can contain IUPAC ambiguous codes
@@ -150,10 +166,10 @@ def primer_regex_query_builder(primer):
     potential bases valid at a position with an ambiguity code.
     """
-    query = ''
+    query = ""
     for char in primer:
-        if char in ('A', 'C', 'T', 'G'):
+        if char in ("A", "C", "T", "G"):
             query += char
         else:
             query += str(_AMBIGUOUS_BASES_DICT[char])
@@ -162,6 +178,7 @@ def primer_regex_query_builder(primer):
     return query
 def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
     """
     Generate list of dictionaries of base conservation for mcp output (mcp_cons_list)
@@ -178,9 +195,10 @@ def build_mcp_cons_dict_list(mcp_count_dict, mcp_len):
             base = mcp[i]
             index_base_dict[base] += mcp_count_dict[mcp]
         mcp_cons_list.append(index_base_dict)
     return mcp_cons_list
 def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
     """
     Generates the most common prefix sequences along with their counts in a fastq file.
@@ -194,15 +212,17 @@ def fetch_mcp(fastq, prefix_len, start=1, rev=False, max_line_count=None):
             line = line.strip()
             if i % 4 == 1:
                 if not rev:
-                    selected_lines.append(line[start-1:start+prefix_len-1])
+                    selected_lines.append(line[start - 1 : start + prefix_len - 1])
                 else:
                     rev_line = line[::-1]
-                    selected_lines.append(rev_line[start-1:start+prefix_len-1])
-            if max_line_count != None:
+                    selected_lines.append(rev_line[start - 1 : start + prefix_len - 1])
+            if max_line_count is not None:
                 if len(selected_lines) > max_line_count:
                     break
     sequence_counts = Counter(selected_lines)
-    mcp_count_dict = dict(sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True))
+    mcp_count_dict = dict(
+        sorted(sequence_counts.items(), key=lambda x: x[1], reverse=True)
+    )
     return mcp_count_dict

mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py CHANGED Viewed

@@ -15,33 +15,45 @@
 # limitations under the License.
 import argparse
-from collections import defaultdict
 import numpy as np
-from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import get_read_count, build_cons_seq, build_mcp_cons_dict_list, fetch_mcp
+from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
+    get_read_count,
+    build_cons_seq,
+    build_mcp_cons_dict_list,
+    fetch_mcp,
+)
 def parse_args(argv=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("-i", "--input", required=True, type=str, help="Path to fastq file to check for primers")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        type=str,
+        help="Path to fastq file to check for primers",
+    )
     parser.add_argument("-s", "--sample", required=True, type=str, help="Sample ID")
     parser.add_argument("-o", "--output", required=True, type=str, help="Output path")
     args = parser.parse_args(argv)
-    _PATH = args.input
-    _SAMPLE = args.sample
-    _OUTPUT = args.output
-    return _PATH, _SAMPLE, _OUTPUT
+    path = args.input
+    sample = args.sample
+    output = args.output
+    return path, sample, output
-def are_there_primers_in_this_sample(_PATH, rev=False):
+def are_there_primers_in_this_sample(path, rev=False):
     """
     Predict the presence of primers based on windows of base conservation.
     Takes a fastq file as input. Extracts proportion of most common base for the first 100 bases.
-    Computes a threshold (Q3 - 0.15) based on this proportion and counts the number of bases below
+    Computes a threshold (Q3 - 0.15) based on this proportion and counts the number of bases below
     it in windows of 10 bases.
     If at least one of the first two windows contains at most one such base, then the presence of a primer is flagged as true.
     A primer is also flagged as true if the combined count of bases below Q3 is at most 4.
@@ -51,14 +63,19 @@ def are_there_primers_in_this_sample(_PATH, rev=False):
         False if a primer was not identified
     """
-    read_count = get_read_count(_PATH, 'fastq') # Get read count for fastq file
-    mcp_len = 100 # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
+    read_count = get_read_count(path, "fastq")  # Get read count for fastq file
+    mcp_len = 100  # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
-    mcp_count_dict = fetch_mcp(_PATH, mcp_len, rev=rev) # mcp dict where key is the mcp and value is the count
-    mcp_cons_list = build_mcp_cons_dict_list(mcp_count_dict, mcp_len) # list of base conservation dicts for mcps
-    cons_seq, cons_confs = build_cons_seq(mcp_cons_list, read_count) # get list of max base conservations for each index
+    mcp_count_dict = fetch_mcp(
+        path, mcp_len, rev=rev
+    )  # mcp dict where key is the mcp and value is the count
+    mcp_cons_list = build_mcp_cons_dict_list(
+        mcp_count_dict, mcp_len
+    )  # list of base conservation dicts for mcps
+    cons_seq, cons_confs = build_cons_seq(
+        mcp_cons_list, read_count
+    )  # get list of max base conservations for each index
     window_size = 10
     # Counter that will reset to 0 every 10 bases
     window_count = 0
@@ -66,7 +83,7 @@ def are_there_primers_in_this_sample(_PATH, rev=False):
     window_count_list = []
     # Compute Q3-based threshold
     max_cons = np.quantile(cons_confs, 0.75)
-    threshold = max_cons - 0.15
+    threshold = max_cons - 0.15
     if max_cons < 0.75:
         threshold = 0.75
@@ -76,19 +93,25 @@ def are_there_primers_in_this_sample(_PATH, rev=False):
     # Loop through every base
     for i, val in enumerate(cons_confs):
-        if i%window_size == 0 and i !=0: # After looping through a window..
-            window_count_list.append(window_count) # ..append window count
-            window_count = 0 # ..reset window count
+        if i % window_size == 0 and i != 0:  # After looping through a window..
+            window_count_list.append(window_count)  # ..append window count
+            window_count = 0  # ..reset window count
-        if val < threshold: # If the conservation at i is less than threshold, increment count for the window
+        if (
+            val < threshold
+        ):  # If the conservation at i is less than threshold, increment count for the window
             window_count += 1
-    primer_flag = False # Initialise primer flag as false
+    primer_flag = False  # Initialise primer flag as false
-    if 1 in window_count_list[:2] or 0 in window_count_list[:2]: # If window count is at most 1 of first two windows...
-        primer_flag = True # ..primer flag is true
-    elif sum(window_count_list[:2]) <= 4: # If sum of window counts of the first two windows is at most 4..
-        primer_flag = True # ..primer flag is true
+    if (
+        1 in window_count_list[:2] or 0 in window_count_list[:2]
+    ):  # If window count is at most 1 of first two windows...
+        primer_flag = True  # ..primer flag is true
+    elif (
+        sum(window_count_list[:2]) <= 4
+    ):  # If sum of window counts of the first two windows is at most 4..
+        primer_flag = True  # ..primer flag is true
     return primer_flag
@@ -99,39 +122,43 @@ def save_out(results, sample_id, output):
     1: primer exists
     0: primer doesn't exist
     First line will be the forward strand
     Second line will be the reverse strand
     """
-    with open(f'{output}/{sample_id}_general_primer_out.txt', 'w') as fw:
-        fw.write(f'{results[0]}\n')
-        fw.write(f'{results[1]}\n')
+    with open(f"{output}/{sample_id}_general_primer_out.txt", "w") as fw:
+        fw.write(f"{results[0]}\n")
+        fw.write(f"{results[1]}\n")
 def main(argv=None):
-    _PATH, _SAMPLE, _OUTPUT = parse_args(argv)
+    path, sample, output = parse_args(argv)
-    fwd_primer_flag = are_there_primers_in_this_sample(_PATH) # Check for general primers in fwd
-    rev_primer_flag = are_there_primers_in_this_sample(_PATH, rev=True) # Check for general primers in rev
+    fwd_primer_flag = are_there_primers_in_this_sample(
+        path
+    )  # Check for general primers in fwd
+    rev_primer_flag = are_there_primers_in_this_sample(
+        path, rev=True
+    )  # Check for general primers in rev
-    fwd_status = '0'
-    rev_status = '0'
+    fwd_status = "0"
+    rev_status = "0"
     # Flag for primer presence: 1 for yes 0 for no
     if fwd_primer_flag:
-        print('Forward primer detected!')
+        print("Forward primer detected!")
         fwd_status = 1
     else:
-        print('No forward primer detected')
+        print("No forward primer detected")
     if rev_primer_flag:
-        print('Reverse primer detected!')
+        print("Reverse primer detected!")
         rev_status = 1
     else:
-        print('No reverse primer detected')
+        print("No reverse primer detected")
+    save_out((fwd_status, rev_status), sample, output)  # Save primer flags to .txt file
-    save_out((fwd_status, rev_status), _SAMPLE, _OUTPUT) # Save primer flags to .txt file
 if __name__ == "__main__":
-    main()
+    main()