mgnify-pipelines-toolkit 0.1.7__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

Files changed (44)
  1. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/PKG-INFO +8 -2
  2. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/make_asv_count_table.py +3 -1
  3. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py +130 -0
  4. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py +178 -0
  5. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py +382 -0
  6. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/constants/db_labels.py +21 -0
  7. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/tax_ranks.py +4 -0
  8. mgnify_pipelines_toolkit-0.1.9/mgnify_pipelines_toolkit/schemas/schemas.py +217 -0
  9. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/PKG-INFO +8 -2
  10. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/SOURCES.txt +5 -0
  11. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/entry_points.txt +2 -0
  12. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/requires.txt +6 -0
  13. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/pyproject.toml +15 -4
  14. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/LICENSE +0 -0
  15. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/README.md +0 -0
  16. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/__init__.py +0 -0
  17. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/__init__.py +0 -0
  18. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/amplicon_utils.py +0 -0
  19. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/are_there_primers.py +0 -0
  20. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/assess_inflection_point_mcp.py +0 -0
  21. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/assess_mcp_proportions.py +0 -0
  22. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/classify_var_regions.py +0 -0
  23. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/find_mcp_inflection_points.py +0 -0
  24. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/mapseq_to_asv_table.py +0 -0
  25. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +0 -0
  26. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py +0 -0
  27. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py +0 -0
  28. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py +0 -0
  29. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/__init__.py +0 -0
  30. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py +0 -0
  31. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/get_subunits.py +0 -0
  32. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py +0 -0
  33. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py +0 -0
  34. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py +0 -0
  35. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py +0 -0
  36. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/regex_fasta_header.py +0 -0
  37. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/thresholds.py +0 -0
  38. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/constants/var_region_coordinates.py +0 -0
  39. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/utils/__init__.py +0 -0
  40. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/utils/fasta_to_delimited.py +0 -0
  41. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit/utils/get_mpt_version.py +0 -0
  42. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/dependency_links.txt +0 -0
  43. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/mgnify_pipelines_toolkit.egg-info/top_level.txt +0 -0
  44. {mgnify_pipelines_toolkit-0.1.7 → mgnify_pipelines_toolkit-0.1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -15,6 +15,9 @@ Requires-Dist: biopython==1.82
  Requires-Dist: numpy==1.26.0
  Requires-Dist: pandas==2.0.2
  Requires-Dist: regex==2023.12.25
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: click==8.1.7
+ Requires-Dist: pandera==0.22.1
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -23,6 +26,9 @@ Requires-Dist: biopython==1.82; extra == "tests"
  Requires-Dist: pandas==2.0.2; extra == "tests"
  Requires-Dist: numpy==1.26.0; extra == "tests"
  Requires-Dist: regex==2023.12.25; extra == "tests"
+ Requires-Dist: requests==2.32.3; extra == "tests"
+ Requires-Dist: click==8.1.7; extra == "tests"
+ Requires-Dist: pandera==0.22.1; extra == "tests"
  Provides-Extra: dev
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
@@ -277,6 +277,8 @@ def main():
      taxa_df = taxa_df.fillna("0")
      taxa_df = order_df(taxa_df)

+     asv_list = taxa_df.ASV.to_list()
+
      amp_reads = [read.strip() for read in list(open(amp, "r"))]
      headers = [read.split(" ")[0][1:] for read in list(open(headers, "r"))]
      amp_region = ".".join(amp.split(".")[1:3])
@@ -288,7 +290,7 @@
          counter += 1
          line_fwd = line_fwd.strip()

-         if line_fwd == "0":
+         if line_fwd == "0" or f"seq_{line_fwd}" not in asv_list:
              continue

          if headers[counter] in amp_reads:
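
The new guard skips reads whose DADA2 sequence index was dropped from the filtered taxonomy table. A minimal, self-contained sketch of that check with made-up inputs (map_lines stands in for the DADA2 map file; a set makes the membership test O(1) rather than a scan of the list for every read):

    import pandas as pd

    # Hypothetical inputs: filtered ASV taxonomy table and DADA2 map-file lines
    taxa_df = pd.DataFrame({"ASV": ["seq_1", "seq_3"]})
    map_lines = ["1\n", "0\n", "2\n", "3\n"]

    asv_ids = set(taxa_df.ASV.to_list())

    kept = []
    for line_fwd in map_lines:
        line_fwd = line_fwd.strip()
        # "0" means the read matched no ASV; indices missing from the
        # taxonomy table were filtered out upstream, so skip those too
        if line_fwd == "0" or f"seq_{line_fwd}" not in asv_ids:
            continue
        kept.append(line_fwd)

    print(kept)  # ['1', '3']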
@@ -0,0 +1,130 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import hashlib
+ import logging
+ import sys
+ from pathlib import Path
+
+ from Bio import SeqIO
+ import pandas as pd
+
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(levelname)s - %(message)s",
+     handlers=[logging.StreamHandler()],
+ )
+
+
+ def process_lines(lines, output_handler, rhea2reaction_dict, protein_hashes):
+     current_protein = None
+     for line in lines:
+         parts = line.strip().split("\t")
+         protein_id = parts[0]
+         if protein_id != current_protein:
+             current_protein = protein_id
+             protein_rheas = set()
+         rhea_list = parts[-1].split("RheaID=")[1].split()
+         top_hit = "top hit" if rhea_list and not protein_rheas else ""
+
+         for rhea in rhea_list:
+             if rhea not in protein_rheas:
+                 chebi_reaction, reaction = rhea2reaction_dict[rhea]
+                 contig_id = protein_id.split("_")[0]
+                 protein_hash = protein_hashes[protein_id]
+
+                 print(
+                     contig_id,
+                     protein_id,
+                     protein_hash,
+                     rhea,
+                     chebi_reaction,
+                     reaction,
+                     top_hit,
+                     sep="\t",
+                     file=output_handler,
+                 )
+                 protein_rheas.add(rhea)
+
+
+ def main(input: str, output: Path, proteins: Path, rhea2chebi: Path):
+     logging.info(
+         f"Step 1/3: Parsing protein fasta and calculating SHA256 hashes from {proteins.resolve()}"
+     )
+     protein_hashes = {}
+     with open(proteins, "r") as fasta_file:
+         for record in SeqIO.parse(fasta_file, "fasta"):
+             protein_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
+             protein_hashes[record.id] = protein_hash
+
+     logging.info(f"Step 2/3: Loading reactions from provided file {rhea2chebi.resolve()}")
+     df = pd.read_csv(rhea2chebi, delimiter="\t")
+     rhea2reaction_dict = dict(zip(df["ENTRY"], zip(df["EQUATION"], df["DEFINITION"])))
+
+     logging.info(
+         f"Step 3/3: Reading DIAMOND results from {'STDIN' if input == '-' else Path(input).resolve()} and writing output"
+     )
+     with open(output, "w") as output_handler:
+         if input == "-":
+             process_lines(sys.stdin, output_handler, rhea2reaction_dict, protein_hashes)
+         else:
+             with open(input, "r") as input_file:
+                 process_lines(
+                     input_file, output_handler, rhea2reaction_dict, protein_hashes
+                 )
+
+     logging.info("Processed successfully. Exiting.")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         """
+         Use diamond output file to create a table with Rhea and CHEBI
+         reaction annotation for every protein.
+         """
+     )
+     parser.add_argument(
+         "-i",
+         "--input",
+         required=True,
+         type=str,
+         help="DIAMOND results file, use '-' for stdin",
+     )
+     parser.add_argument(
+         "-o",
+         "--output",
+         required=True,
+         type=Path,
+         help="Output TSV file with columns: contig_id, protein_id, UniRef90 cluster, rhea_ids, CHEBI reaction participants",
+     )
+     parser.add_argument(
+         "-p",
+         "--proteins",
+         required=True,
+         type=Path,
+         help="Protein fasta file used as DIAMOND input",
+     )
+     parser.add_argument(
+         "--rhea2chebi",
+         default=None,
+         type=Path,
+         help="File that maps rhea_ids to CHEBI",
+     )
+
+     args = parser.parse_args()
+     main(args.input, args.output, args.proteins, args.rhea2chebi)
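
Judging from the parsing in process_lines above (this is inferred from the code, not from DIAMOND documentation), each input line carries the protein ID in its first tab-separated field and "RheaID=" followed by space-separated Rhea accessions in its last field. A hypothetical round-trip with made-up lookup tables:

    import io

    # Made-up stand-ins for what main() builds from the rhea2chebi table
    # and the protein fasta
    rhea2reaction = {"RHEA:12345": ("CHEBI:1 + CHEBI:2 = CHEBI:3", "A + B = C")}
    hashes = {"contig1_1": "ab12cd34"}

    line = "contig1_1\tUniRef90_A0A000\tRheaID=RHEA:12345\n"
    out = io.StringIO()
    process_lines([line], out, rhea2reaction, hashes)
    print(out.getvalue(), end="")
    # contig1  contig1_1  ab12cd34  RHEA:12345  CHEBI:1 + CHEBI:2 = CHEBI:3  A + B = C  top hit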
@@ -0,0 +1,178 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the 'License');
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an 'AS IS' BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ from collections import defaultdict
+ import json
+
+ import pandas as pd
+
+
+ def parse_args():
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-i", "--input", required=True, type=str, help="Input JSON from antiSMASH"
+     )
+     parser.add_argument(
+         "-o", "--output", required=True, type=str, help="Output GFF3 file name"
+     )
+     parser.add_argument(
+         "--cds_tag",
+         default="ID",
+         type=str,
+         help="Type of CDS ID tag to use in the GFF3 (default: ID)",
+     )  # The CDS' identifier changes from tool to tool.
+
+     args = parser.parse_args()
+
+     return args.input, args.output, args.cds_tag
+
+
+ def main():
+     """Transform an antiSMASH JSON into a GFF3 with 'regions' and CDS within those regions"""
+
+     json_input, output_file, cds_tag = parse_args()
+
+     with open(json_input, "r") as json_data:
+         antismash_analysis = json.load(json_data)
+
+     res_dict = defaultdict(list)
+     attributes_dict = defaultdict(dict)
+
+     antismash_ver = antismash_analysis["version"]
+
+     for record in antismash_analysis["records"]:
+         record_id = record["id"]
+
+         iter_cds = (
+             "antismash.detection.genefunctions" in record["modules"].keys()
+         )  # Flag to iterate CDS
+         region_name = None
+
+         for feature in record["features"]:
+
+             if feature["type"] == "region":
+                 # Annotate region features
+                 region_name = (
+                     f"{record_id}_region{feature['qualifiers']['region_number'][0]}"
+                 )
+                 region_start = int(feature["location"].split(":")[0].split("[")[1])
+                 region_end = int(feature["location"].split(":")[1].split("]")[0])
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("region")
+                 res_dict["start"].append(region_start + 1)
+                 res_dict["end"].append(region_end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(".")
+                 res_dict["phase"].append(".")
+
+                 product = ",".join(feature["qualifiers"].get("product", []))
+
+                 attributes_dict[region_name].update(
+                     {"ID": region_name, "product": product}
+                 )
+
+             if iter_cds and feature["type"] == "CDS":
+                 # Annotate CDS features
+
+                 start = int(feature["location"].split(":")[0][1:])
+                 end = int(feature["location"].split(":")[1].split("]")[0])
+                 strand = feature["location"].split("(")[1][0]  # + or -
+
+                 if not region_name or not (region_start <= end and start <= region_end):
+                     continue
+
+                 res_dict["contig"].append(record_id)
+                 res_dict["version"].append(f"antiSMASH:{antismash_ver}")
+                 res_dict["type"].append("gene")
+                 res_dict["start"].append(start + 1)  # Correct for 1-based indexing
+                 res_dict["end"].append(end)
+                 res_dict["score"].append(".")
+                 res_dict["strand"].append(strand)
+                 res_dict["phase"].append(".")
+
+                 locus_tag = feature["qualifiers"][cds_tag][0]
+                 attributes_dict[locus_tag].update(
+                     {
+                         "ID": locus_tag,
+                         "as_type": ",".join(
+                             feature["qualifiers"].get("gene_kind", ["other"])
+                         ),
+                         "gene_functions": ",".join(
+                             feature["qualifiers"].get("gene_functions", [])
+                         )
+                         .replace(" ", "_")
+                         .replace(":_", ":")
+                         .replace(";_", "%3B"),
+                         "Parent": region_name,
+                     }
+                 )
+
+         # Extended CDS attributes
+         if "antismash.detection.hmm_detection" in record["modules"].keys():
+             cds_by_protocluster = record["modules"][
+                 "antismash.detection.hmm_detection"
+             ]["rule_results"]["cds_by_protocluster"]
+             if len(cds_by_protocluster) > 0:
+                 for feature in cds_by_protocluster[0][1]:
+                     if "cds_name" in feature.keys():
+                         locus_tag = feature["cds_name"]
+                         as_clusters = ",".join(
+                             list(feature["definition_domains"].keys())
+                         )
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update(
+                                 {"as_gene_clusters": as_clusters}
+                             )
+
+         if "antismash.detection.genefunctions" in record["modules"].keys():
+             for tool in record["modules"]["antismash.detection.genefunctions"]["tools"]:
+                 if tool["tool"] == "smcogs":
+                     for locus_tag in tool["best_hits"]:
+                         hit_id = tool["best_hits"][locus_tag]["hit_id"].split(":")[0]
+                         hit_desc = (
+                             tool["best_hits"][locus_tag]["hit_id"]
+                             .split(":")[1]
+                             .replace(" ", "_")
+                         )
+                         score = tool["best_hits"][locus_tag]["bitscore"]
+                         e_value = tool["best_hits"][locus_tag]["evalue"]
+
+                         smcog_note = f"smCOG:{hit_id}:{hit_desc}(Score:{score}%3BE-value:{e_value})"
+                         if locus_tag in attributes_dict.keys():
+                             attributes_dict[locus_tag].update({"as_notes": smcog_note})
+                     break
+
+     attributes = [
+         ";".join(f"{k}={v}" for k, v in attrib_data.items() if v)
+         for attrib_data in attributes_dict.values()
+     ]
+     res_dict["attributes"] = attributes
+
+     res_df = pd.DataFrame.from_dict(res_dict)
+
+     with open(output_file, "w") as f_out:
+         f_out.write(
+             "##gff-version 3\n"
+         )  # Save data to the GFF3 file with the proper header
+         res_df.to_csv(f_out, header=False, index=False, sep="\t")
+
+
+ if __name__ == "__main__":
+     main()
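
antiSMASH JSON encodes feature locations as strings like "[4160:5312](+)" with 0-based, end-exclusive coordinates, and the builder converts them to GFF3's 1-based inclusive convention. The parsing step, isolated with a hypothetical location value:

    location = "[4160:5312](+)"  # hypothetical antiSMASH feature location

    start = int(location.split(":")[0][1:])          # 4160
    end = int(location.split(":")[1].split("]")[0])  # 5312
    strand = location.split("(")[1][0]               # "+"

    print(start + 1, end, strand)  # GFF3 1-based start -> 4161 5312 +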
@@ -0,0 +1,382 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import click
+ from collections import defaultdict
+ import glob
+ import logging
+ from pathlib import Path
+ from typing import Union, List
+
+ import pandas as pd
+
+ from mgnify_pipelines_toolkit.constants.db_labels import TAXDB_LABELS, ASV_TAXDB_LABELS
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     _SILVA_TAX_RANKS,
+     _PR2_TAX_RANKS,
+ )
+ from mgnify_pipelines_toolkit.schemas.schemas import (
+     AmpliconPassedRunsSchema,
+     AmpliconNonINSDCPassedRunsSchema,
+     TaxonSchema,
+     PR2TaxonSchema,
+ )
+
+ logging.basicConfig(level=logging.DEBUG)
+
+
+ @click.group()
+ def cli():
+     pass
+
+
+ def get_tax_file(
+     run_acc: str, analyses_dir: Path, db_label: str
+ ) -> Union[Path, List[Path]]:
+     """Takes path information for a particular analysis and db_label combo, and returns any existing files.
+
+     :param run_acc: Run accession for the tax file that should be retrieved.
+     :type run_acc: str
+     :param analyses_dir: The path to the directory containing all of the analyses,
+         including the tax file corresponding to :param:`run_acc`.
+     :type analyses_dir: Path
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``TAXDB_LABELS`` and ``ASV_TAXDB_LABELS``.
+     :type db_label: str
+     :return: Either a :class:`Path` object if :param:`db_label` comes from ``TAXDB_LABELS``,
+         or a list of :class:`Path` objects if from ``ASV_TAXDB_LABELS``.
+     :rtype: Union[Path, List[Path]]
+     """
+
+     tax_file = None
+
+     db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")
+
+     if not db_path.exists():
+         logging.debug(
+             f"DB {db_path} doesn't exist for {run_acc}. Skipping"
+         )  # or error?
+         return
+
+     if db_label in TAXDB_LABELS:
+         tax_file = Path(
+             f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
+         )
+         if not tax_file.exists():
+             logging.error(
+                 f"DB path exists but file doesn't - exiting. Path: {tax_file}"
+             )
+             exit(1)
+
+         # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+         # so need to skip those. Should probably fix that at some point
+         file_size = tax_file.stat().st_size
+         if file_size == 0:
+             logging.debug(
+                 f"File {tax_file} exists but is empty, so will be skipping it."
+             )
+             tax_file = None
+     elif db_label in ASV_TAXDB_LABELS:
+         # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
+         # So will need to handle this differently to closed-reference files
+         asv_tax_files = glob.glob(
+             f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt"
+         )
+         asv_tax_files = [
+             Path(file) for file in asv_tax_files if "concat" not in file
+         ]  # Have to filter out concatenated file if it exists
+
+         tax_file = asv_tax_files
+
+     return tax_file
+
+
+ def parse_one_tax_file(
+     run_acc: str, tax_file: Path, long_tax_ranks: list
+ ) -> pd.DataFrame:
+     """Parses a taxonomy file, and returns it as a pandas DataFrame object.
+
+     :param run_acc: Run accession of the taxonomy file that will be parsed.
+     :type run_acc: str
+     :param tax_file: Taxonomy file that will be parsed.
+     :type tax_file: Path
+     :param long_tax_ranks: Either the imported list _SILVA_TAX_RANKS or _PR2_TAX_RANKS
+         to validate the taxonomic ranks of the file.
+     :type long_tax_ranks: list
+     :return: The parsed :param:`tax_file` as a :class:`pd.DataFrame` object
+     :rtype: pd.DataFrame
+     """
+
+     res_df = pd.read_csv(tax_file, sep="\t", names=["Count"] + long_tax_ranks)
+     res_df = res_df.fillna("")
+
+     # Two different schemas used for validation depending on the database
+     # because the PR2 schema has different taxonomic ranks than the standard
+     if len(long_tax_ranks) == 8:
+         TaxonSchema(res_df)
+     elif len(long_tax_ranks) == 9:
+         PR2TaxonSchema(res_df)
+
+     res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
+         lambda x: ";".join(x).strip(";"), axis=1
+     )
+     final_df = res_df.iloc[:, [0, -1]]
+     final_df = final_df.set_index("full_taxon")
+     final_df.columns = [run_acc]
+
+     return final_df
+
+
+ def generate_db_summary(
+     db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
+ ) -> None:
+     """Takes run accessions paired with their taxonomy files in the form of a
+     dictionary, together with the respective db_label, joins them together, and
+     generates a study-wide summary in the form of a .tsv file.
+
+     :param db_label: One of the database labels that results might exist for,
+         values of which come from the imported constants ``TAXDB_LABELS`` and ``ASV_TAXDB_LABELS``.
+     :param tax_dfs: Dictionary where the key is a run accession,
+         and values are either one taxonomy file if the :param:`db_label` comes from ``TAXDB_LABELS``,
+         or a list of at least 1 and at most 2 files if it comes from ``ASV_TAXDB_LABELS``.
+         These files are parsed by :func:`parse_one_tax_file`
+     :type tax_dfs: defaultdict[Path]
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     if db_label in TAXDB_LABELS:
+         df_list = []
+
+         if "PR2" in db_label:
+             long_tax_ranks = _PR2_TAX_RANKS
+         else:
+             long_tax_ranks = _SILVA_TAX_RANKS
+
+         for run_acc, tax_df in tax_dfs.items():
+             res_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
+             df_list.append(res_df)
+
+         res_df = pd.concat(df_list, axis=1).fillna(0)
+         res_df = res_df.sort_index()
+         res_df = res_df.astype(int)
+
+         res_df.to_csv(
+             f"{output_prefix}_{db_label}_study_summary.tsv",
+             sep="\t",
+             index_label="taxonomy",
+         )
+
+     elif db_label in ASV_TAXDB_LABELS:
+
+         if "PR2" in db_label:
+             long_tax_ranks = _PR2_TAX_RANKS
+         else:
+             long_tax_ranks = _SILVA_TAX_RANKS
+
+         amp_region_dict = defaultdict(list)
+
+         # each value will be a list containing at most two files (one for each amp_region)
+         for run_acc, tax_df_asv_lst in tax_dfs.items():
+             for tax_df in tax_df_asv_lst:
+                 # there are a lot of underscores in these names... but it is consistent
+                 # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
+                 amp_region = str(tax_df).split("_")[-5]
+                 amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
+                 amp_region_dict[amp_region].append(amp_region_df)
+
+         for amp_region, amp_region_dfs in amp_region_dict.items():
+             # Need at least two analyses with this amp_region to bother with the summary
+             if len(amp_region_dfs) > 1:
+                 amp_res_df = amp_region_dfs[0]
+                 for amp_df in amp_region_dfs[1:]:
+                     amp_res_df = amp_res_df.join(amp_df, how="outer")
+                 amp_res_df = amp_res_df.fillna(0)
+                 amp_res_df = amp_res_df.astype(int)
+
+                 amp_res_df.to_csv(
+                     f"{output_prefix}_{db_label}_{amp_region}_asv_study_summary.tsv",
+                     sep="\t",
+                     index_label="taxonomy",
+                 )
+
+
+ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List]:
+     """Matches different summary files of the same database label and analysis
+     type (and amplified region for ASVs) into a dictionary to help merge
+     the correct summaries.
+
+     :param all_study_summaries: List of file paths to different summary files
+     :type all_study_summaries: List[str]
+     :return: Organised dictionary where each summary is paired to a specific
+         database label key to be merged together.
+     :rtype: defaultdict[List]
+     """
+     summaries_dict = defaultdict(list)
+
+     for summary in all_study_summaries:
+         summary_path = Path(summary)
+         summary_filename = summary_path.stem
+
+         temp_lst = summary_filename.split("_")
+         if "asv_study_summary" in summary_filename:
+             # For ASVs we need to include the amp_region in the label
+             summary_db_label = "_".join(temp_lst[1:3])
+         else:
+             # For closed reference, just the db_label is needed
+             summary_db_label = temp_lst[1]
+
+         summaries_dict[summary_db_label].append(summary_path)
+
+     return summaries_dict
+
+
+ @cli.command(
+     "summarise",
+     options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
+     short_help="Generate study-level analysis summaries.",
+ )
+ @click.option(
+     "-r",
+     "--runs",
+     required=True,
+     help="CSV file containing successful analyses generated by the pipeline",
+     type=click.Path(exists=True, path_type=Path, dir_okay=False),
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory containing all the individual analyses subdirectories to summarise",
+     type=click.Path(exists=True, path_type=Path, file_okay=False),
+ )
+ @click.option(
+     "-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
+ )
+ @click.option(
+     "--non_insdc",
+     default=False,
+     is_flag=True,
+     help="If run accessions aren't INSDC-formatted",
+ )
+ def summarise_analyses(
+     runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
+ ) -> None:
+     """Function that will take a file of pipeline-successful run accessions
+     that should be used for the generation of the relevant db-specific
+     study-level summary files. For ASV results, these will also be on a
+     per-amplified-region basis.
+     \f
+
+     :param runs: Path to a qc_passed_runs file from the pipeline execution.
+         Contains the accessions of runs that should therefore be included in the generated
+         summaries.
+     :type runs: Path
+     :param analyses_dir: The path to the directory containing all of the analyses.
+     :type analyses_dir: Path
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+     runs_df = pd.read_csv(runs, names=["run", "status"])
+
+     # Run validation on the successful_runs .csv file
+     if not non_insdc:
+         AmpliconPassedRunsSchema(runs_df)
+     else:
+         AmpliconNonINSDCPassedRunsSchema(runs_df)
+
+     all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
+     for db_label in all_db_labels:
+
+         tax_files = defaultdict(Path)
+         for i in range(0, len(runs_df)):
+             run_acc = runs_df.loc[i, "run"]
+             tax_file = get_tax_file(run_acc, analyses_dir, db_label)
+
+             if tax_file:
+                 tax_files[run_acc] = tax_file
+
+         # If at least two analyses have results from the current DB,
+         # generate a study-level summary for it
+         if len(tax_files) > 1:
+             generate_db_summary(db_label, tax_files, output_prefix)
+
+
+ @cli.command(
+     "merge",
+     options_metavar="-a <analyses_dir> -p <output_prefix>",
+     short_help="Merge multiple study-level analysis summaries.",
+ )
+ @click.option(
+     "-a",
+     "--analyses_dir",
+     required=True,
+     help="Input directory containing the study-level summaries to merge",
+     type=click.Path(exists=True, file_okay=False),
+ )
+ @click.option(
+     "-p",
+     "--output_prefix",
+     required=True,
+     help="Prefix to merged summary files",
+     type=str,
+ )
+ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
+     """Function that will take a directory containing study-level
+     summaries that should be merged together on a per-db-per-amplified-region
+     basis.
+     \f
+
+     :param analyses_dir: The filepath to the directory containing all of the analyses.
+     :type analyses_dir: str
+     :param output_prefix: Prefix to be added to the generated summary file.
+     :type output_prefix: str
+     """
+
+     # TODO: The way we grab all the summaries might change depending on how the prefect side does things
+     all_study_summaries = glob.glob(f"{analyses_dir}/*_study_summary.tsv")
+
+     summaries_dict = organise_study_summaries(all_study_summaries)
+
+     for db_label, summaries in summaries_dict.items():
+         if len(summaries) > 1:
+             res_df = pd.read_csv(summaries[0], sep="\t", index_col=0)
+             for summary in summaries[1:]:
+                 curr_df = pd.read_csv(summary, sep="\t", index_col=0)
+                 res_df = res_df.join(curr_df, how="outer")
+             res_df = res_df.fillna(0)
+             res_df = res_df.astype(int)
+
+             res_df = res_df.reindex(sorted(res_df.columns), axis=1)
+             res_df.to_csv(
+                 f"{output_prefix}_{db_label}_study_summary.tsv",
+                 sep="\t",
+                 index_label="taxonomy",
+             )
+
+
+ if __name__ == "__main__":
+     cli()
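
A quick sketch of how organise_study_summaries groups files, using hypothetical file names (ASV summaries keep the amplified region in their key, closed-reference ones use just the db label):

    paths = [
        "PRJX_SILVA-SSU_study_summary.tsv",
        "PRJY_SILVA-SSU_study_summary.tsv",
        "PRJX_DADA2-SILVA_16S-V3-V4_asv_study_summary.tsv",
    ]
    groups = organise_study_summaries(paths)
    print({k: len(v) for k, v in groups.items()})
    # {'SILVA-SSU': 2, 'DADA2-SILVA_16S-V3-V4': 1}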
@@ -0,0 +1,21 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # taxonomy_summary labels for the closed-reference method
+ TAXDB_LABELS = ["SILVA-SSU", "SILVA-LSU", "PR2", "UNITE", "ITSoneDB"]
+
+ # taxonomy_summary labels for the ASV method
+ ASV_TAXDB_LABELS = ["DADA2-SILVA", "DADA2-PR2"]
@@ -35,3 +35,7 @@ _PR2_TAX_RANKS = [
      "Genus",
      "Species",
  ]
+
+ SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]
+
+ SHORT_PR2_TAX_RANKS = ["d", "sg", "dv", "sdv", "c", "o", "f", "g", "s"]
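
These short prefixes are what the new TaxRank model in schemas.py (below) accepts on the left of the double underscore. A tiny illustrative check against a made-up lineage, mirroring the validator's rule:

    SHORT_TAX_RANKS = ["sk", "k", "p", "c", "o", "f", "g", "s"]

    lineage = "sk__Bacteria;k__;p__Bacillota"  # hypothetical lineage string
    for taxrank in lineage.split(";"):
        rank = taxrank.split("__")[0]
        # empty ranks and the literal "Unclassified" are allowed through
        assert rank == "" or rank == "Unclassified" or rank in SHORT_TAX_RANKS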
@@ -0,0 +1,217 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 EMBL - European Bioinformatics Institute
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import re
+
+ from enum import Enum
+ from typing import ClassVar, Optional
+ import pandera as pa
+
+ from pydantic import (
+     Field,
+     BaseModel,
+     field_validator,
+     RootModel,
+ )
+ from pandera.engines.pandas_engine import PydanticModel
+
+ from mgnify_pipelines_toolkit.constants.tax_ranks import (
+     SHORT_TAX_RANKS,
+     SHORT_PR2_TAX_RANKS,
+ )
+
+
+ class INSDCRunAccession(RootModel):
+     """Class for modelling INSDC-specific run accessions.
+     Essentially just a special string with regex-based validation of the accession.
+     """
+
+     # RootModel example:
+     # https://stackoverflow.com/questions/78393675/how-to-make-a-custom-type-inheriting-from-uuid-work-as-a-pydantic-model
+
+     root: str = Field(
+         unique=True,
+         description="The run needs to be a valid ENA accession",
+         examples=["ERR123456", "DRR789012", "SRR345678"],
+     )
+
+     @field_validator("root", mode="after")
+     @classmethod
+     def run_validity_check(cls, run: str) -> str:
+         """Checks that the run string matches the regex of an INSDC run accession.
+         Throws a `ValueError` exception if not, which is what Pydantic prefers for validation errors.
+         """
+
+         run_accession_regex = "(E|D|S)RR[0-9]{6,}"
+         regex_res = re.match(run_accession_regex, run)
+
+         if regex_res is None:
+             raise ValueError(
+                 f"Accession `{run}` does not fit INSDC format [ERR*,SRR*,DRR*]."
+             )
+
+         return run
+
+
+ class AmpliconResultTypes(str, Enum):
+     """Class that models the two allowed statuses for successful amplicon analysis runs.
+     Pydantic validates Enums very simply without needing to declare a new function.
+     """
+
+     all_results = "all_results"
+     no_asvs = "no_asvs"
+
+
+ class AmpliconPassedRunsRecord(BaseModel):
+     """Class defining a Pydantic model for a single "row" of an amplicon passed runs file.
+     Uses the previous two classes.
+     """
+
+     run: INSDCRunAccession
+     status: AmpliconResultTypes
+
+
+ class AmpliconNonINSDCSPassedRunsRecord(BaseModel):
+     """Class modelling a very similar model to the preceding one, but with no INSDC validation.
+     This is achieved by replacing the type of the runs with just a simple string so no validation
+     happens.
+     """
+
+     run: str
+     status: AmpliconResultTypes
+
+
+ # This is the schema for the whole DF
+ class AmpliconPassedRunsSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the AmpliconPassedRunsRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(AmpliconPassedRunsRecord)
+         coerce = True
+
+
+ class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
+     """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
+     Uses the AmpliconNonINSDCSPassedRunsRecord as a dtype to achieve this.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(AmpliconNonINSDCSPassedRunsRecord)
+         coerce = True
+
+
+ class TaxRank(RootModel):
+     """Class for modelling a single Taxonomic Rank.
+     Essentially just a special string with validation of the structure:
+     `${rank}__${taxon}`
+     Where `${rank}` is one of the allowed short ranks defined by the imported
+     `SHORT_TAX_RANKS` and `SHORT_PR2_TAX_RANKS` variables,
+     and `${taxon}` is the actual taxon for that rank (this isn't validated).
+     It will also validate if the whole string is the permitted "Unclassified".
+     """
+
+     valid_tax_ranks: ClassVar = SHORT_TAX_RANKS + SHORT_PR2_TAX_RANKS
+
+     root: str = Field(
+         unique=True,
+         description="A single taxon in a taxonomy record",
+         examples=["sk__Bacteria", "p__Bacillota", "g__Tundrisphaera"],
+     )
+
+     @field_validator("root", mode="after")
+     @classmethod
+     def rank_structure_validity_check(cls, taxrank: str) -> str:
+         taxrank_list = taxrank.split("__")
+         rank = taxrank_list[0]
+         if rank != "" and rank != "Unclassified" and rank not in cls.valid_tax_ranks:
+             raise ValueError(f"Invalid taxonomy rank {rank}.")
+
+         return taxrank
+
+
+ # TODO: see if we can simplify the declaration of two Taxon classes by using one of these solutions
+ # None of the solutions have a model-only way of doing it, but worth considering maybe
+ # https://stackoverflow.com/questions/76537360/initialize-one-of-two-pydantic-models-depending-on-an-init-parameter
+
+
+ class Taxon(BaseModel):
+     """Class for modelling an entire Taxon or taxonomic assignment.
+     All of the ranks are optional, to model for the taxon being "Unclassified".
+     """
+
+     Superkingdom: Optional[TaxRank] = None
+     Kingdom: Optional[TaxRank] = None
+     Phylum: Optional[TaxRank] = None
+     Class: Optional[TaxRank] = None
+     Order: Optional[TaxRank] = None
+     Family: Optional[TaxRank] = None
+     Genus: Optional[TaxRank] = None
+     Species: Optional[TaxRank] = None
+
+
+ class PR2Taxon(Taxon):
+     """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
+
+     Domain: Optional[TaxRank] = None
+     Supergroup: Optional[TaxRank] = None
+     Division: Optional[TaxRank] = None
+     Subdivision: Optional[TaxRank] = None
+
+
+ class TaxonRecord(Taxon):
+     """Class for modelling a single taxon record in a taxonomy file.
+     It inherits the Taxon class, and simply adds a Count field, modelling the read counts
+     for that particular Taxon record.
+     """
+
+     Count: int
+
+
+ class PR2TaxonRecord(PR2Taxon):
+     """Class for modelling the same thing as the preceding class, but for PR2 ranks."""
+
+     Count: int
+
+
+ # This is the schema for the whole DF
+ class TaxonSchema(pa.DataFrameModel):
+     """Class modelling a Pandera dataframe schema that uses the TaxonRecord class as dtype.
+     This is what actually validates the generated dataframe when read by pandas.read_csv.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(TaxonRecord)
+         coerce = True
+
+
+ class PR2TaxonSchema(pa.DataFrameModel):
+     """Class modelling the same dataframe schema as the preceding one, except for the PR2 taxonomy.
+     Uses the PR2TaxonRecord as a dtype to achieve this.
+     """
+
+     class Config:
+         """Config with dataframe-level data type."""
+
+         dtype = PydanticModel(PR2TaxonRecord)
+         coerce = True
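
For reference, this is how the schemas are exercised: calling a pandera DataFrameModel validates the frame, the same way study_summary_generator calls them above. The accessions here are made up:

    import pandas as pd

    runs_df = pd.DataFrame(
        {"run": ["ERR123456", "SRR345678"], "status": ["all_results", "no_asvs"]}
    )
    AmpliconPassedRunsSchema(runs_df)  # validates row by row via the Pydantic model

    bad_df = pd.DataFrame({"run": ["not-an-accession"], "status": ["all_results"]})
    AmpliconPassedRunsSchema(bad_df)   # raises a pandera SchemaError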
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.2
  Name: mgnify_pipelines_toolkit
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Collection of scripts and tools for MGnify pipelines
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
  License: Apache Software License 2.0
@@ -15,6 +15,9 @@ Requires-Dist: biopython==1.82
  Requires-Dist: numpy==1.26.0
  Requires-Dist: pandas==2.0.2
  Requires-Dist: regex==2023.12.25
+ Requires-Dist: requests==2.32.3
+ Requires-Dist: click==8.1.7
+ Requires-Dist: pandera==0.22.1
  Provides-Extra: tests
  Requires-Dist: pytest==7.4.0; extra == "tests"
  Requires-Dist: pytest-md==0.2.0; extra == "tests"
@@ -23,6 +26,9 @@ Requires-Dist: biopython==1.82; extra == "tests"
  Requires-Dist: pandas==2.0.2; extra == "tests"
  Requires-Dist: numpy==1.26.0; extra == "tests"
  Requires-Dist: regex==2023.12.25; extra == "tests"
+ Requires-Dist: requests==2.32.3; extra == "tests"
+ Requires-Dist: click==8.1.7; extra == "tests"
+ Requires-Dist: pandera==0.22.1; extra == "tests"
  Provides-Extra: dev
  Requires-Dist: mgnify_pipelines_toolkit[tests]; extra == "dev"
  Requires-Dist: pre-commit==3.8.0; extra == "dev"
@@ -21,17 +21,22 @@ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py
+ mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py
+ mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py
  mgnify_pipelines_toolkit/analysis/shared/__init__.py
  mgnify_pipelines_toolkit/analysis/shared/fastq_suffix_header_check.py
  mgnify_pipelines_toolkit/analysis/shared/get_subunits.py
  mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py
+ mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py
+ mgnify_pipelines_toolkit/constants/db_labels.py
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py
  mgnify_pipelines_toolkit/constants/regex_fasta_header.py
  mgnify_pipelines_toolkit/constants/tax_ranks.py
  mgnify_pipelines_toolkit/constants/thresholds.py
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py
+ mgnify_pipelines_toolkit/schemas/schemas.py
  mgnify_pipelines_toolkit/utils/__init__.py
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py
  mgnify_pipelines_toolkit/utils/get_mpt_version.py
@@ -1,4 +1,5 @@
  [console_scripts]
+ add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
@@ -17,3 +18,4 @@ primer_val_classification = mgnify_pipelines_toolkit.analysis.amplicon.primer_va
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
+ study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main
@@ -2,6 +2,9 @@ biopython==1.82
  numpy==1.26.0
  pandas==2.0.2
  regex==2023.12.25
+ requests==2.32.3
+ click==8.1.7
+ pandera==0.22.1

  [dev]
  mgnify_pipelines_toolkit[tests]
@@ -18,3 +21,6 @@ biopython==1.82
  pandas==2.0.2
  numpy==1.26.0
  regex==2023.12.25
+ requests==2.32.3
+ click==8.1.7
+ pandera==0.22.1
@@ -1,6 +1,6 @@
  [project]
  name = "mgnify_pipelines_toolkit"
- version = "0.1.7"
+ version = "0.1.9"
  readme = "README.md"
  license = {text = "Apache Software License 2.0"}
  authors = [
@@ -19,7 +19,10 @@ dependencies = [
      "biopython==1.82",
      "numpy==1.26.0",
      "pandas==2.0.2",
-     "regex==2023.12.25"
+     "regex==2023.12.25",
+     "requests==2.32.3",
+     "click==8.1.7",
+     "pandera==0.22.1"
  ]

  [build-system]
@@ -31,8 +34,10 @@ packages = ["mgnify_pipelines_toolkit",
      "mgnify_pipelines_toolkit.analysis",
      "mgnify_pipelines_toolkit.constants",
      "mgnify_pipelines_toolkit.utils",
+     "mgnify_pipelines_toolkit.schemas",
      "mgnify_pipelines_toolkit.analysis.shared",
-     "mgnify_pipelines_toolkit.analysis.amplicon"
+     "mgnify_pipelines_toolkit.analysis.amplicon",
+     "mgnify_pipelines_toolkit.analysis.assembly",
  ]

  [project.scripts]
@@ -42,6 +47,7 @@ get_subunits_coords = "mgnify_pipelines_toolkit.analysis.shared.get_subunits_coo
  mapseq2biom = "mgnify_pipelines_toolkit.analysis.shared.mapseq2biom:main"
  fastq_suffix_header_check = "mgnify_pipelines_toolkit.analysis.shared.fastq_suffix_header_check:main"
  library_strategy_check = "mgnify_pipelines_toolkit.analysis.shared.library_strategy_check:main"
+ study_summary_generator = "mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:main"
  # analysis.amplicon
  are_there_primers = "mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main"
  assess_inflection_point_mcp = "mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main"
@@ -54,6 +60,8 @@ rev_comp_se_primers = "mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_pr
  standard_primer_matching = "mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main"
  mapseq_to_asv_table = "mgnify_pipelines_toolkit.analysis.amplicon.mapseq_to_asv_table:main"
  primer_val_classification = "mgnify_pipelines_toolkit.analysis.amplicon.primer_val_classification:main"
+ # analysis.assembly
+ add_rhea_chebi_annotation = "mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main"
  # utils
  fasta_to_delimited = "mgnify_pipelines_toolkit.utils.fasta_to_delimited:main"
  get_mpt_version = "mgnify_pipelines_toolkit.utils.get_mpt_version:main"
@@ -66,7 +74,10 @@ tests = [
      "biopython==1.82",
      "pandas==2.0.2",
      "numpy==1.26.0",
-     "regex==2023.12.25"
+     "regex==2023.12.25",
+     "requests==2.32.3",
+     "click==8.1.7",
+     "pandera==0.22.1"
  ]

  dev = [