PyPI - mgnify-pipelines-toolkit - Versions diffs - 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

mgnify-pipelines-toolkit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (16) hide show

mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py CHANGED Viewed

@@ -18,7 +18,7 @@ from collections import defaultdict, Counter
 import logging
 import gzip
 import os
-import subprocess
+import pyfastx
 from mgnify_pipelines_toolkit.constants.regex_ambiguous_bases import (
     _AMBIGUOUS_BASES_DICT,
@@ -29,7 +29,6 @@ logging.basicConfig(level=logging.DEBUG)
 def split_dir_into_sample_paths(dir):
     file_list = os.listdir(dir)
     file_list = [
         file
@@ -43,42 +42,33 @@ def split_dir_into_sample_paths(dir):
     return sample_list
-def get_read_count(read_path, type="fastq"):
-    cmd = []
-    stdout = ""
-    if type == "fastq":
-        cmd = ["zcat", read_path]
-        zcat_proc = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        cmd = ["wc", "-l"]
-        wc_proc = subprocess.Popen(
-            cmd, stdin=zcat_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = wc_proc.communicate()
-    elif type == "fasta":
-        cmd = ["grep", "-c", "^>", read_path]
-        grep_proc = subprocess.Popen(
-            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-        )
-        stdout, stderr = grep_proc.communicate()
-    read_count = stdout.strip() if stdout is not None else ""
-    if not read_count.isdigit():
-        logging.error(
-            f"Read count is not a digit, something is wrong. stdout: '{stdout}', stderr: '{stderr}'"
+def get_read_count(read_path: str, file_type: str = "fastq") -> int:
+    """
+    Get the read count of a FASTQ or FASTA file.
+    :param read_path: The path to the FASTQ or FASTA file.
+    :type read_path: str
+    :param fasta_type: The type of the file, either "fastq" or "fasta". Defaults to "fastq".
+    :type fasta_type: str
+    :return: The number of reads in the file.
+    :rtype: int
+    :raises ValueError: If the file type is not supported or the read count is not a positive integer.
+    """
+    read_count = 0
+    if file_type == "fasta":
+        fasta = pyfastx.Fasta(read_path, build_index=False)
+        read_count = sum(1 for _ in fasta)
+    elif file_type == "fastq":
+        fastq = pyfastx.Fastq(read_path, build_index=False)
+        read_count = sum(1 for _ in fastq)
+    else:
+        raise ValueError(
+            f"Invalid file_type {file_type}, it needs to be either 'fasta' or 'fastq'"
         )
-        exit(1)
-    read_count = int(read_count)
-    if type == "fastq":
-        read_count /= 4
+    if read_count <= 0:
+        raise ValueError(f"Read count is not a positive integer: {read_count}")
     return read_count
@@ -128,7 +118,10 @@ def build_cons_seq(
         counter += 1
         try:
-            max_prop = max_count / read_count
+            if max_line_count is None:
+                max_prop = max_count / read_count
+            else:
+                max_prop = max_count / max_line_count
             cons_bases = []
             curr_prop = 0.0

mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py CHANGED Viewed

@@ -27,7 +27,6 @@ from mgnify_pipelines_toolkit.analysis.amplicon.amplicon_utils import (
 def parse_args(argv=None):
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -63,7 +62,9 @@ def are_there_primers_in_this_sample(path, rev=False):
         False if a primer was not identified
     """
-    read_count = get_read_count(path, "fastq")  # Get read count for fastq file
+    read_count = get_read_count(
+        path, file_type="fastq"
+    )  # Get read count for fastq file
     mcp_len = 100  # Script will look at first 100 base mcps (for rev=True, it will look at first 100 from 3' to 5')
     mcp_count_dict = fetch_mcp(
@@ -133,7 +134,6 @@ def save_out(results, sample_id, output):
 def main(argv=None):
     path, sample, output = parse_args(argv)
     fwd_primer_flag = are_there_primers_in_this_sample(

mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py CHANGED Viewed

@@ -87,7 +87,9 @@ def find_mcp_props_for_sample(path, rev=False):
             start + mcp_len - 1
         )  # compute the final index for the mcp (inclusive). Indices are of base 1 not 0.
-        read_count = get_read_count(path, type="fastq")  # get read count for fastq file
+        read_count = get_read_count(
+            path, file_type="fastq"
+        )  # get read count for fastq file
         max_line_count = None
         if read_count > MCP_MAX_LINE_COUNT:

mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py CHANGED Viewed

@@ -143,7 +143,7 @@ def get_primer_props(std_primer_dict_regex, input_path):
     threshold = 0.60  # Arbitrary threshold for collecting a matched primer
     read_count = get_read_count(
-        input_path, "fastq"
+        input_path, file_type="fastq"
     )  # Get read count of fastq file to calculate proportion with
     res_dict = defaultdict(defaultdict)

mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py CHANGED Viewed

@@ -62,45 +62,13 @@ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
                 protein_rheas.add(rhea)
-def main(input: str, output: Path, proteins: Path, rhea2chebi: Path):
-    logging.info(
-        f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
-    )
-    protein_hashes = {}
-    with open(proteins, "r") as fasta_file:
-        for record in SeqIO.parse(fasta_file, "fasta"):
-            protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
-            protein_hashes[record.id] = protein_hash
-    logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
-    df = pd.read_csv(rhea2chebi, delimiter="\t")
-    rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
-    logging.info(
-        f"Step 3/3: Read DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and write output"
-    )
-    with open(output, "w") as output_handler:
-        if input == "-":
-            process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
-        else:
-            with open(args.input, "r") as input_file:
-                process_lines(
-                    input_file, output_handler, rhea2reaction_dict, protein_hashes
-                )
-    logging.info("Processed successfully. Exiting.")
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(
-        """
-                                    Use diamond output file to create a table with Rhea and CHEBI
-                                    reaction annotation for every protein.
-                                    """
+        "Use diamond output file to create a table with Rhea and CHEBI reaction annotation for every protein."
     )
     parser.add_argument(
-        "-i",
-        "--input",
+        "-d",
+        "--diamond_hits",
         required=True,
         type=str,
         help="DIAMOND results file, use '-' for stdin",
@@ -121,10 +89,45 @@ if __name__ == "__main__":
     )
     parser.add_argument(
         "--rhea2chebi",
-        default=None,
+        required=True,
         type=Path,
         help="File that maps rhea_ids to CHEBI",
     )
     args = parser.parse_args()
-    main(args.input, args.output, args.proteins, args.rhea2chebi)
+    diamond_hits = args.diamond_hits
+    output = args.output
+    proteins = args.proteins
+    rhea2chebi = args.rhea2chebi
+    logging.info(
+        f"Step 1/3: Parse protein fasta and calculating SHA256 hash from {proteins.resolve()}"
+    )
+    protein_hashes = {}
+    with open(proteins, "r") as fasta_file:
+        for record in SeqIO.parse(fasta_file, "fasta"):
+            protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
+            protein_hashes[record.id] = protein_hash
+    logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
+    df = pd.read_csv(rhea2chebi, delimiter="\t")
+    rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
+    logging.info(
+        f"Step 3/3: Read DIAMOND results from {'STDIN' if diamond_hits == '-' else Path(diamond_hits).resolve()} and write output"
+    )
+    with open(output, "w") as output_handler:
+        if diamond_hits == "-":
+            process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
+        else:
+            with open(diamond_hits, "r") as input_file:
+                process_lines(
+                    input_file, output_handler, rhea2reaction_dict, protein_hashes
+                )
+    logging.info("Processed successfully. Exiting.")
+if __name__ == "__main__":
+    main()

mgnify-pipelines-toolkit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

Potentially problematic release.

mgnify-pipelines-toolkit 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl