mgnify-pipelines-toolkit 0.1.8__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of mgnify-pipelines-toolkit has been flagged as a potentially problematic release.

Files changed (48)
  1. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/PKG-INFO +8 -2
  2. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +130 -0
  3. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +178 -0
  4. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/cgc_merge.py +424 -0
  5. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/generate_gaf.py +117 -0
  6. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py +135 -0
  7. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py +181 -0
  8. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +382 -0
  9. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/constants/db_labels.py +21 -0
  10. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/tax_ranks.py +4 -0
  11. mgnify_pipelines_toolkit-0.2.0/mgnify_pipelines_toolkit/schemas/schemas.py +217 -0
  12. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +8 -2
  13. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +9 -0
  14. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +5 -0
  15. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/requires.txt +6 -0
  16. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/pyproject.toml +18 -4
  17. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/LICENSE +0 -0
  18. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/README.md +0 -0
  19. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/__init__.py +0 -0
  20. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  21. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  22. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  23. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  24. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  25. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  26. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  27. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +0 -0
  28. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  29. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  30. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  31. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  32. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  33. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  34. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  35. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  36. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  37. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  38. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  39. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  40. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  41. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  42. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  43. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  44. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  45. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  46. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  47. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  48. {mgnify_pipelines_toolkit-0.1.8 → mgnify_pipelines_toolkit-0.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.8
+ Version: 0.2.0
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -15,6 +15,9 @@ Requires-Dist: biopython==1.82
  Requires-Dist: numpy==1.26.0
  Requires-Dist: pandas==2.0.2
  Requires-Dist: regex==2023.12.25
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: click==8.1.7
+ Requires-Dist: pandera==0.22.1
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -23,6 +26,9 @@ Requires-Dist: biopython==1.82; extra == "tests"
  Requires-Dist: pandas==2.0.2; extra == "tests"
  Requires-Dist: numpy==1.26.0; extra == "tests"
  Requires-Dist: regex==2023.12.25; extra == "tests"
+ Requires-Dist: requests==2.32.3; extra == "tests"
+ Requires-Dist: click==8.1.7; extra == "tests"
+ Requires-Dist: pandera==0.22.1; extra == "tests"
  Provides-Extra: dev
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
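
The metadata changes above add three runtime dependencies (requests, click, pandera), mirrored in the tests extra. For reference, the new pins can be confirmed from an installed copy using only the standard library; a minimal sketch:

    from importlib.metadata import requires

    # Prints the Requires-Dist entries of the installed distribution,
    # including the requests, click and pandera pins added in 0.2.0.
    for dep in requires("mgnify-pipelines-toolkit"):
        print(dep)
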
@@ -0,0 +1,130 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import hashlib
+ import logging
+ import sys
+ from pathlib import Path
+
+ from Bio import SeqIO
+ import pandas as pd
+
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
+     current_protein = None
+     for line in lines:
+         parts = line.strip().split("\t")
+         protein_id = parts[0]
+         if protein_id != current_protein:
+             current_protein = protein_id
+             protein_rheas = set()
+         rhea_list = parts[-1].split("RheaID=")[1].split()
+         # Only the first DIAMOND hit of each protein is flagged as the top hit
+         top_hit = "top hit" if rhea_list and not protein_rheas else ""
+
+         for rhea in rhea_list:
+             if rhea not in protein_rheas:
+                 chebi_reaction, reaction = rhea2reaction_dict[rhea]
+                 contig_id = protein_id.split("_")[0]
+                 protein_hash = protein_hashes[protein_id]
+
+                 print(
+                     contig_id,
+                     protein_id,
+                     protein_hash,
+                     rhea,
+                     chebi_reaction,
+                     reaction,
+                     top_hit,
+                     sep="\t",
+                     file=output_handler,
+                 )
+                 protein_rheas.add(rhea)
+
+
+ def main(input: str, output: Path, proteins: Path, rhea2chebi: Path):
+     logging.info(
+         f"Step 1/3: Parse protein fasta and calculate SHA256 hashes from {proteins.resolve()}"
+     )
+     protein_hashes = {}
+     with open(proteins, "r") as fasta_file:
+         for record in SeqIO.parse(fasta_file, "fasta"):
+             protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
+             protein_hashes[record.id] = protein_hash
+
+     logging.info(f"Step 2/3: Load reactions from provided file {rhea2chebi.resolve()}")
+     df = pd.read_csv(rhea2chebi, delimiter="\t")
+     rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
+
+     logging.info(
+         f"Step 3/3: Read DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and write output"
+     )
+     with open(output, "w") as output_handler:
+         if input == "-":
+             process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
+         else:
+             with open(input, "r") as input_file:
+                 process_lines(
+                     input_file, output_handler, rhea2reaction_dict, protein_hashes
+                 )
+
+     logging.info("Processed successfully. Exiting.")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description=(
+             "Use a DIAMOND output file to create a table with Rhea and CHEBI "
+             "reaction annotations for every protein."
+         )
+     )
+     parser.add_argument(
+         "-i",
+         "--input",
+         required=True,
+         type=str,
+         help="DIAMOND results file, use '-' for stdin",
+     )
+     parser.add_argument(
+         "-o",
+         "--output",
+         required=True,
+         type=Path,
+         help="Output TSV file with columns: contig_id, protein_id, UniRef90 cluster, rhea_ids, CHEBI reaction participants",
+     )
+     parser.add_argument(
+         "-p",
+         "--proteins",
+         required=True,
+         type=Path,
+         help="Protein fasta file used as DIAMOND input",
+     )
+     parser.add_argument(
+         "--rhea2chebi",
+         required=True,
+         type=Path,
+         help="File that maps Rhea IDs to CHEBI",
+     )
+
+     args = parser.parse_args()
+     main(args.input, args.output, args.proteins, args.rhea2chebi)
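
The new add_rhea_chebi_annotation.py keys every protein by the SHA256 of its amino-acid sequence, so identical proteins hash identically across runs. A minimal sketch of that hashing step in isolation (the sequence below is a made-up example):

    import hashlib

    # Hash a protein sequence the way the script does in step 1/3:
    # encode the raw sequence string and take the hex digest.
    seq = "MKTAYIAKQR"  # hypothetical amino-acid sequence
    digest = hashlib.sha256(seq.encode("utf-8")).hexdigest()
    print(digest)

Invocation follows the argparse flags above; passing -i - streams the DIAMOND results from stdin instead of a file.
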
@@ -0,0 +1,178 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import json
+
+ import pandas as pd
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
+     )
+     parser.add_argument(
+         "-o", "--output", required=True, type=str, help="Output GFF3 file name"
+     )
+     parser.add_argument(
+         "--cds_tag",
+         default="ID",
+         type=str,
+         help="Type of CDS ID tag to use in the GFF3 (default: ID)",
+     )  # The CDS identifier changes from tool to tool.
+
+     args = parser.parse_args()
+
+     return args.input, args.output, args.cds_tag
+
+
+ def main():
+     """Transform an antiSMASH JSON into a GFF3 with 'region' features and the CDS within those regions"""
+
+     json_input, output_file, cds_tag = parse_args()
+
+     with open(json_input, "r") as json_data:
+         antismash_analysis = json.load(json_data)
+
+     res_dict = defaultdict(list)
+     attributes_dict = defaultdict(dict)
+
+     antismash_ver = antismash_analysis["version"]
+
+     for record in antismash_analysis["records"]:
+         record_id = record["id"]
+
+         iter_cds = (
+             "antismash.detection.genefunctions" in record["modules"].keys()
+         )  # Flag to iterate over CDS features
+         region_name = None
+
+         for feature in record["features"]:
+             if feature["type"] == "region":
+                 # Annotate region features
+                 region_name = (
+                     f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
+                 )
+                 region_start = int(feature["location"].split(":")[0].split("[")[1])
+                 region_end = int(feature["location"].split(":")[1].split("]")[0])
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("region")
+                 res_dict["start"].append(region_start + 1)
+                 res_dict["end"].append(region_end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(".")
+                 res_dict["phase"].append(".")
+
+                 product = ",".join(feature["qualifiers"].get("product", []))
+
+                 attributes_dict[region_name].update(
+                     {"ID": region_name, "product": product}
+                 )
+
+             if iter_cds and feature["type"] == "CDS":
+                 # Annotate CDS features
+                 start = int(feature["location"].split(":")[0][1:])
+                 end = int(feature["location"].split(":")[1].split("]")[0])
+                 strand = feature["location"].split("(")[1][0]  # + or -
+
+                 # Skip CDS that fall outside the current region
+                 if not region_name or not (region_start <= end and start <= region_end):
+                     continue
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("gene")
+                 res_dict["start"].append(start + 1)  # Correct for 1-based indexing
+                 res_dict["end"].append(end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(strand)
+                 res_dict["phase"].append(".")
+
+                 locus_tag = feature["qualifiers"][cds_tag][0]
+                 attributes_dict[locus_tag].update(
+                     {
+                         "ID": locus_tag,
+                         "as_type": ",".join(
+                             feature["qualifiers"].get("gene_kind", ["other"])
+                         ),
+                         "gene_functions": ",".join(
+                             feature["qualifiers"].get("gene_functions", [])
+                         )
+                         .replace(" ", "_")
+                         .replace(":_", ":")
+                         .replace(";_", "%3B"),
+                         "Parent": region_name,
+                     }
+                 )
+
+         # Extended CDS attributes
+         if "antismash.detection.hmm_detection" in record["modules"].keys():
+             cds_by_protocluster = record["modules"][
+                 "antismash.detection.hmm_detection"
+             ]["rule_results"]["cds_by_protocluster"]
+             if len(cds_by_protocluster) > 0:
+                 for feature in cds_by_protocluster[0][1]:
+                     if "cds_name" in feature.keys():
+                         locus_tag = feature["cds_name"]
+                         as_clusters = ",".join(
+                             list(feature["definition_domains"].keys())
+                         )
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update(
+                                 {"as_gene_clusters": as_clusters}
+                             )
+
+         if "antismash.detection.genefunctions" in record["modules"].keys():
+             for tool in record["modules"]["antismash.detection.genefunctions"]["tools"]:
+                 if tool["tool"] == "smcogs":
+                     for locus_tag in tool["best_hits"]:
+                         hit_id = tool["best_hits"][locus_tag]["hit_id"].split(":")[0]
+                         hit_desc = (
+                             tool["best_hits"][locus_tag]["hit_id"]
+                             .split(":")[1]
+                             .replace(" ", "_")
+                         )
+                         score = tool["best_hits"][locus_tag]["bitscore"]
+                         e_value = tool["best_hits"][locus_tag]["evalue"]
+
+                         smcog_note = f"smCOG:{hit_id}:{hit_desc}(Score:{score}%3BE-value:{e_value})"
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update({"as_notes": smcog_note})
+                     break
+
+     attributes = [
+         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
+         for attrib_data in attributes_dict.values()
+     ]
+     res_dict["attributes"] = attributes
+
+     res_df = pd.DataFrame.from_dict(res_dict)
+
+     # Save data to the GFF3 file with the proper header
+     with open(output_file, "w") as f_out:
+         f_out.write("##gff-version 3\n")
+         res_df.to_csv(f_out, header=False, index=False, sep="\t")
+
+
+ if __name__ == "__main__":
+     main()
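
The coordinate handling in antismash_gff_builder.py is the subtle part: the start + 1 implies that antiSMASH location strings such as [340:1276](+) are 0-based with an exclusive end, while GFF3 coordinates are 1-based and inclusive. A small sketch of the same parsing on a made-up location string:

    # Hypothetical antiSMASH location string, as stored in record features
    location = "[340:1276](+)"

    start = int(location.split(":")[0][1:])          # 0-based start -> 340
    end = int(location.split(":")[1].split("]")[0])  # exclusive end -> 1276
    strand = location.split("(")[1][0]               # "+" or "-"

    # GFF3 is 1-based and inclusive: shift the start, keep the end
    print(start + 1, end, strand)  # 341 1276 +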