PyPI - mgnify-pipelines-toolkit - Versions diffs - 1.0.2__tar.gz → 1.0.3__tar.gz - Mend

mgnify-pipelines-toolkit 1.0.2.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mgnify-pipelines-toolkit might be problematic. Click here for more details.

Files changed (57)

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.0.2
+Version: 1.0.3
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/go_utils.py RENAMED Viewed

@@ -84,52 +84,51 @@ def parse_interproscan_tsv(ips_file: Path, mapped_go_terms: dict = None) -> dict
     previous_protein_acc = None
     go_annotations_single_protein = set()
-    fr = open(ips_file, "r")
     go_pattern = re.compile("GO:\\d+")
-    for line in fr:
-        # IPS files are parsed line by line - the same protein accession will appear multiple lines in a row with different annotation
-        line_counter += 1
-        line = line.strip()
-        chunks = line.split("\t")
-        # Get protein accession
-        current_protein_acc = chunks[0]
-        # TODO: not sure if this line is needed - do we ever have more than one protein in a single line of IPS?
-        # Will keep just in case
-        num_of_proteins = len(current_protein_acc.split("|"))
-        # If we're at a new protein accession in the IPS file then we finally increment
-        # the go2protein_count dictionary for each term that was found in that protein
-        if current_protein_acc != previous_protein_acc:
-            total_num_of_proteins += 1
-            if len(go_annotations_single_protein) > 0:
-                num_of_proteins_with_go += 1
-                go2protein_count = count_and_assign_go_annotations(
-                    go2protein_count,
-                    go_annotations_single_protein,
-                    num_of_proteins,
-                    mapped_go_terms,
-                )
-            # reset GO id set because we hit a new protein accession
-            go_annotations_single_protein = set()
-            previous_protein_acc = current_protein_acc
-        # Parse out GO annotations
-        # GO annotations are associated to InterPro entries (InterPro entries start with 'IPR')
-        # Than use the regex to extract the GO Ids (e.g. GO:0009842)
-        if len(chunks) >= 13 and chunks[11].startswith("IPR"):
-            for go_annotation in go_pattern.findall(line):
-                go_annotations_single_protein.add(go_annotation)
-    # Do final counting for the last protein
-    go2protein_count = count_and_assign_go_annotations(
-        go2protein_count,
-        go_annotations_single_protein,
-        num_of_proteins,
-        mapped_go_terms,
-    )
-    fr.close()
+    with open(ips_file, "r") as fr:
+        for line in fr:
+            # IPS files are parsed line by line - the same protein accession will appear multiple lines in a row with different annotation
+            line_counter += 1
+            line = line.strip()
+            chunks = line.split("\t")
+            # Get protein accession
+            current_protein_acc = chunks[0]
+            # TODO: not sure if this line is needed - do we ever have more than one protein in a single line of IPS?
+            # Will keep just in case
+            num_of_proteins = len(current_protein_acc.split("|"))
+            # If we're at a new protein accession in the IPS file then we finally increment
+            # the go2protein_count dictionary for each term that was found in that protein
+            if current_protein_acc != previous_protein_acc:
+                total_num_of_proteins += 1
+                if len(go_annotations_single_protein) > 0:
+                    num_of_proteins_with_go += 1
+                    go2protein_count = count_and_assign_go_annotations(
+                        go2protein_count,
+                        go_annotations_single_protein,
+                        num_of_proteins,
+                        mapped_go_terms,
+                    )
+                # reset GO id set because we hit a new protein accession
+                go_annotations_single_protein = set()
+                previous_protein_acc = current_protein_acc
+            # Parse out GO annotations
+            # GO annotations are associated to InterPro entries (InterPro entries start with 'IPR')
+            # Than use the regex to extract the GO Ids (e.g. GO:0009842)
+            if len(chunks) >= 13 and chunks[11].startswith("IPR"):
+                for go_annotation in go_pattern.findall(line):
+                    go_annotations_single_protein.add(go_annotation)
+        # Do final counting for the last protein
+        go2protein_count = count_and_assign_go_annotations(
+            go2protein_count,
+            go_annotations_single_protein,
+            num_of_proteins,
+            mapped_go_terms,
+        )
     return go2protein_count

mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py ADDED Viewed

@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright 2025 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from collections import Counter
+import csv
+import logging
+RANK_PREFIXES = {
+    "superkingdom": "sk__",
+    "kingdom": "k__",
+    "phylum": "p__",
+    "class": "c__",
+    "order": "o__",
+    "family": "f__",
+    "genus": "g__",
+    "species": "s__",
+}
+logging.basicConfig(
+    level=logging.INFO, format="[%(asctime)s] - %(levelname)s - %(message)s"
+)
+def import_nodes(nodes_dmp):
+    logging.info(f"Loading file {nodes_dmp}")
+    taxid2rank = {}
+    with open(nodes_dmp) as f1:
+        reader = csv.reader(f1, delimiter="\t")
+        for line in reader:
+            taxid = line[0]
+            rank = line[4]
+            taxid2rank[taxid] = rank
+    return taxid2rank
+def import_names(names_dmp):
+    logging.info(f"Loading file {names_dmp}")
+    taxid2name = {}
+    with open(names_dmp, newline="") as f1:
+        reader = csv.reader(f1, delimiter="\t")
+        for line in reader:
+            if line[6] == "scientific name":
+                taxid = line[0]
+                name = line[2]
+                taxid2name[taxid] = name
+    return taxid2name
+def convert_to_official_names(lineage, taxid2rank, taxid2name):
+    lineage_ranks = [taxid2rank[taxid.rstrip("*")] for taxid in lineage]
+    official_names = list(RANK_PREFIXES.values())
+    lowest_classification_index = -1
+    for i, rank in enumerate(RANK_PREFIXES):
+        if rank in lineage_ranks:
+            index = lineage_ranks.index(rank)
+            taxid = lineage[index].rstrip("*")
+            name = taxid2name[taxid]
+            official_names[i] = official_names[i] + name
+            lowest_classification_index = i
+    return official_names[: lowest_classification_index + 1]
+def main():
+    parser = argparse.ArgumentParser(
+        description="Process TSV classification generated by CAT_pack contigs and write input file for Krona ktImportText"
+    )
+    parser.add_argument(
+        "-i", "--input", help="Path to the input TSV file from CAT_pack contigs"
+    )
+    parser.add_argument("-o", "--output", help="Name of the output Krona TXT file")
+    parser.add_argument(
+        "-n", "--names_dmp", help="Path to the nodes.dmp file from NCBI taxonomy"
+    )
+    parser.add_argument(
+        "-r", "--nodes_dmp", help="Path to the names.dmp file from NCBI taxonomy"
+    )
+    args = parser.parse_args()
+    taxid2rank = import_nodes(args.nodes_dmp)
+    taxid2name = import_names(args.names_dmp)
+    logging.info(f"Begin parsing of CAT_pack classiffication file {args.input}")
+    lineage_counter = Counter()
+    with open(args.input) as infile:
+        reader = csv.reader(infile, delimiter="\t")
+        next(reader)  # Skip the header row
+        for row in reader:
+            if row[1] == "no taxid assigned":
+                lineage = "unclassified"
+            else:
+                taxid_lineage = row[3].split(";")
+                names_lineage = convert_to_official_names(
+                    taxid_lineage, taxid2rank, taxid2name
+                )
+                lineage = "\t".join(names_lineage) if names_lineage else "unclassified"
+            lineage_counter[lineage] += 1
+    logging.info(f"Writting output to {args.output}")
+    with open(args.output, "w") as outfile:
+        for lineage, count in lineage_counter.most_common():
+            outfile.write(f"{count}\t{lineage}\n")
+    logging.info("Done")
+if __name__ == "__main__":
+    main()

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py RENAMED Viewed

@@ -15,9 +15,10 @@
 # limitations under the License.
 import argparse
-from collections import defaultdict
+import csv
 import logging
 import os
+from collections import defaultdict
 from pathlib import Path
 from mgnify_pipelines_toolkit.analysis.assembly.go_utils import parse_interproscan_tsv
@@ -28,7 +29,6 @@ logging.basicConfig(
 def parse_args():
     description = "Go slim pipeline."
     parser = argparse.ArgumentParser(description=description)
     parser.add_argument(
@@ -59,46 +59,56 @@ def parse_args():
 def parse_mapped_gaf_file(gaf_file: Path) -> defaultdict[set]:
     mapped_go_dict = defaultdict(set)
     if os.path.exists(gaf_file):
-        handle = open(gaf_file, "r")
-        for line in handle:
-            if not line.startswith("!"):
-                line = line.strip()
-                splitted_line = line.split("\t")
-                go_id = splitted_line[1]
-                mapped_go_id = splitted_line[4]
-                mapped_go_dict[go_id].add(mapped_go_id)
+        with open(gaf_file, "r") as handle:
+            for line in handle:
+                if not line.startswith("!"):
+                    line = line.strip()
+                    splitted_line = line.split("\t")
+                    go_id = splitted_line[1]
+                    mapped_go_id = splitted_line[4]
+                    mapped_go_dict[go_id].add(mapped_go_id)
     return mapped_go_dict
 def get_go_slim_summary(go_slim_banding_file, goslims2_protein_count):
     summary = []
-    fr = open(go_slim_banding_file, "r")
-    for line in fr:
-        if line.startswith("GO"):
-            line = line.strip()
-            line_chunks = line.split("\t")
-            go_id = line_chunks[0]
-            term = line_chunks[1]
-            category = line_chunks[2]
-            # Default value for the count
-            count = 0
-            if go_id in goslims2_protein_count:
-                count = goslims2_protein_count[go_id]
-            summary.append((go_id, term, category, count))
+    with open(go_slim_banding_file, "r") as fr:
+        for line in fr:
+            if line.startswith("GO"):
+                line = line.strip()
+                line_chunks = line.split("\t")
+                go_id = line_chunks[0]
+                term = line_chunks[1]
+                category = line_chunks[2]
+                # Default value for the count
+                count = 0
+                if go_id in goslims2_protein_count:
+                    count = goslims2_protein_count[go_id]
+                summary.append((go_id, term, category, count))
     return summary
 def write_go_summary_to_file(go_summary, output_file):
-    fw = open(output_file, "w")
-    for go, term, category, count in go_summary:
-        fw.write('","'.join(['"' + go, term, category, str(count) + '"']) + "\n")
-    fw.close()
+    """
+    Write a sorted GO summary to a TSV file.
+    :param go_summary: A list of tuples, where each tuple contains the following
+                       elements:
+                       - go (str): The GO identifier.
+                       - term (str): The GO term description.
+                       - category (str): The category of the GO term.
+                       - count (int): The count associated with the GO term.
+    :param output_file: The path to the output TSV file where the sorted GO
+    """
+    sorted_go_summary = sorted(go_summary, key=lambda x: x[3], reverse=True)
+    with open(output_file, "w", newline="") as fw:
+        tsv_writer = csv.writer(fw, delimiter="\t")
+        tsv_writer.writerow(["go", "term", "category", "count"])
+        for go, term, category, count in sorted_go_summary:
+            tsv_writer.writerow([go, term, category, count])
 def parse_gene_ontology(obo_file):
@@ -108,23 +118,22 @@ def parse_gene_ontology(obo_file):
     :return:
     """
     go_term_tuples = []
-    fr = open(obo_file, "r")
-    id, term, category = "", "", ""
-    for line in fr:
-        line = line.strip()
-        split_line = line.split(": ")
-        if line.startswith("id:"):
-            id = split_line[1]
-        elif line.startswith("name:"):
-            term = split_line[1]
-        elif line.startswith("namespace"):
-            category = split_line[1]
-        else:
-            if id.startswith("GO:") and id and term and category:
-                item = (id, term, category)
-                go_term_tuples.append(item)
-                id, term, category = "", "", ""
-    fr.close()
+    with open(obo_file, "r") as fr:
+        id, term, category = "", "", ""
+        for line in fr:
+            line = line.strip()
+            split_line = line.split(": ")
+            if line.startswith("id:"):
+                id = split_line[1]
+            elif line.startswith("name:"):
+                term = split_line[1]
+            elif line.startswith("namespace"):
+                category = split_line[1]
+            else:
+                if id.startswith("GO:") and id and term and category:
+                    item = (id, term, category)
+                    go_term_tuples.append(item)
+                    id, term, category = "", "", ""
     return go_term_tuples
@@ -132,7 +141,6 @@ def get_full_go_summary(core_gene_ontology, go2protein_count_dict, top_level_go_
     summary = []
     for go_id, term, category in core_gene_ontology:
         if (go_id in go2protein_count_dict) and (
             go_id not in top_level_go_ids
         ):  # make sure that top level terms are not included (they tell you nothing!)
@@ -143,7 +151,6 @@ def get_full_go_summary(core_gene_ontology, go2protein_count_dict, top_level_go_
 def main():
     go_obo, go_banding, gaf_input, ips_input, output = parse_args()
     logging.info("Parsing the InterProScan input: " + ips_input)

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit/constants/ncrna.py RENAMED Viewed

@@ -60,3 +60,25 @@ RFAM_MODELS = {
     LSU_rRNA_bacteria: "RF02541",
     LSU_rRNA_eukarya: "RF02543",
 }
+TRNA = [
+    "Ala",
+    "Gly",
+    "Pro",
+    "Thr",
+    "Val",
+    "Ser",
+    "Arg",
+    "Leu",
+    "Phe",
+    "Asn",
+    "Lys",
+    "Asp",
+    "Glu",
+    "His",
+    "Gln",
+    "Ile",
+    "Tyr",
+    "Cys",
+    "Trp",
+]

mgnify_pipelines_toolkit-1.0.3/mgnify_pipelines_toolkit/utils/__init__.py ADDED Viewed

File without changes

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mgnify_pipelines_toolkit
-Version: 1.0.2
+Version: 1.0.3
 Summary: Collection of scripts and tools for MGnify pipelines
 Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
 License: Apache Software License 2.0

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt RENAMED Viewed

@@ -29,7 +29,9 @@ mgnify_pipelines_toolkit/analysis/assembly/gff_annotation_utils.py
 mgnify_pipelines_toolkit/analysis/assembly/gff_file_utils.py
 mgnify_pipelines_toolkit/analysis/assembly/gff_toolkit.py
 mgnify_pipelines_toolkit/analysis/assembly/go_utils.py
+mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py
 mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py
+mgnify_pipelines_toolkit/analysis/genomes/__init__.py
 mgnify_pipelines_toolkit/analysis/shared/__init__.py
 mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py
 mgnify_pipelines_toolkit/analysis/shared/dwc_summary_generator.py

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/mgnify_pipelines_toolkit.egg-info/entry_points.txt RENAMED Viewed

@@ -1,19 +1,24 @@
 [console_scripts]
 add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
+antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
 are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
 assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
 assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
 classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
 combined_gene_caller_merge = mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main
 convert_cmscan_to_cmsearch_tblout = mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main
-dwc_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.dwc_summary_generator:main
+dwc_summary_generator = mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main
 fasta_to_delimited = mgnify_pipelines_toolkit.utils.fasta_to_delimited:main
 fastq_suffix_header_check = mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main
 find_mcp_inflection_points = mgnify_pipelines_toolkit.analysis.amplicon.find_mcp_inflection_points:main
 generate_gaf = mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main
+genomes_extract_bacterial_rrnas_as_tsv = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main
+genomes_extract_rrnas_as_fasta = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main
+genomes_extract_trnas = mgnify_pipelines_toolkit.analysis.genomes.rna.extract_trnas:main
 get_mpt_version = mgnify_pipelines_toolkit.utils.get_mpt_version:main
 get_subunits = mgnify_pipelines_toolkit.analysis.shared.get_subunits:main
 get_subunits_coords = mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main
+krona_txt_from_cat_classification = mgnify_pipelines_toolkit.analysis.assembly.krona_txt_from_cat_classification:main
 library_strategy_check = mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main
 make_asv_count_table = mgnify_pipelines_toolkit.analysis.amplicon.make_asv_count_table:main
 mapseq2biom = mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main

{mgnify_pipelines_toolkit-1.0.2 → mgnify_pipelines_toolkit-1.0.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "mgnify_pipelines_toolkit"
-version = "1.0.2"
+version = "1.0.3"
 readme = "README.md"
 license = {text = "Apache Software License 2.0"}
 authors = [
@@ -40,10 +40,11 @@ packages = ["mgnify_pipelines_toolkit",
             "mgnify_pipelines_toolkit.analysis.shared",
             "mgnify_pipelines_toolkit.analysis.amplicon",
             "mgnify_pipelines_toolkit.analysis.assembly",
-            ]
+            "mgnify_pipelines_toolkit.analysis.genomes"
+]
 [project.scripts]
-# analysis.shared
+# analysis.shared #
 get_subunits = "mgnify_pipelines_toolkit.analysis.shared.get_subunits:main"
 get_subunits_coords = "mgnify_pipelines_toolkit.analysis.shared.get_subunits_coords:main"
 mapseq2biom = "mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main"
@@ -52,7 +53,8 @@ library_strategy_check = "mgnify_pipelines_toolkit.analysis.shared.library_strat
 study_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli"
 markergene_study_summary = "mgnify_pipelines_toolkit.analysis.shared.markergene_study_summary:main"
 convert_cmscan_to_cmsearch_tblout = "mgnify_pipelines_toolkit.analysis.shared.convert_cmscan_to_cmsearch_tblout:main"
-# analysis.amplicon
+dwc_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.dwc_summary_generator:main"
+# analysis.amplicon #
 are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
 assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
 assess_mcp_proportions = "mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main"
@@ -64,12 +66,18 @@ rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_pr
 standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
 mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
 primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
-# analysis.assembly
+# analysis.assembly #
+krona_txt_from_cat_classification = "mgnify_pipelines_toolkit.analysis.assembly.krona_txt_from_cat_classification:main"
 add_rhea_chebi_annotation = "mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main"
 combined_gene_caller_merge = "mgnify_pipelines_toolkit.analysis.assembly.combined_gene_caller_merge:main"
 generate_gaf = "mgnify_pipelines_toolkit.analysis.assembly.generate_gaf:main"
 summarise_goslims = "mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main"
-dwc_summary_generator = "mgnify_pipelines_toolkit.analysis.assembly.dwc_summary_generator:main"
+antismash_gff_builder = "mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main"
+# genomes #
+genomes_extract_bacterial_rrnas_as_tsv = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_bacterial_rrnas_as_tsv:main"
+genomes_extract_rrnas_as_fasta = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_rrnas_as_fasta:main"
+genomes_extract_trnas = "mgnify_pipelines_toolkit.analysis.genomes.rna.extract_trnas:main"
 # utils
 fasta_to_delimited = "mgnify_pipelines_toolkit.utils.fasta_to_delimited:main"
 get_mpt_version = "mgnify_pipelines_toolkit.utils.get_mpt_version:main"