mgnify-pipelines-toolkit 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mgnify-pipelines-toolkit might be problematic.

@@ -257,7 +257,7 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
257
257
  @cli.command(
258
258
  "summarise",
259
259
  options_metavar="-r <runs> -a <analyses_dir> -p <output_prefix>",
260
- short_help="Generate study-level analysis summaries.",
260
+ short_help="Generate study-level summaries of amplicon analysis results.",
261
261
  )
262
262
  @click.option(
263
263
  "-r",
@@ -327,7 +327,7 @@ def summarise_analyses(
327
327
  @cli.command(
328
328
  "merge",
329
329
  options_metavar="-a <analyses_dir> -p <output_prefix>",
330
- short_help="Merge multiple study-level analysis summaries.",
330
+ short_help="Merge multiple study-level summaries of amplicon analysis.",
331
331
  )
332
332
  @click.option(
333
333
  "-a",
@@ -0,0 +1,605 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2025 EMBL - European Bioinformatics Institute
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import click
18
+ from functools import reduce
19
+ import glob
20
+ import logging
21
+ from pathlib import Path
22
+ from typing import Literal
23
+
24
+ import pandas as pd
25
+
26
+ from mgnify_pipelines_toolkit.schemas.schemas import (
27
+ CompletedAnalysisSchema,
28
+ TaxonSchema,
29
+ GOSummarySchema,
30
+ InterProSummarySchema,
31
+ KOSummarySchema,
32
+ SanntisSummarySchema,
33
+ AntismashSummarySchema,
34
+ PFAMSummarySchema,
35
+ KEGGModulesSummarySchema,
36
+ GOStudySummarySchema,
37
+ InterProStudySummarySchema,
38
+ TaxonomyStudySummarySchema,
39
+ KOStudySummarySchema,
40
+ SanntisStudySummarySchema,
41
+ AntismashStudySummarySchema,
42
+ PFAMStudySummarySchema,
43
+ KEGGModulesStudySummarySchema,
44
+ validate_dataframe,
45
+ )
46
+
47
+ logging.basicConfig(
48
+ level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
49
+ )
50
+
51
+ # Keys are the original column names in the input files,
52
+ # values are the standardised column names used in the generated study summary files
53
+ # Note: "Count" or "count" column should be excluded
54
+ GO_COLUMN_NAMES = {
55
+ "go": "GO",
56
+ "term": "description",
57
+ "category": "category",
58
+ }
59
+
60
+ INTERPRO_COLUMN_NAMES = {
61
+ "interpro_accession": "IPR",
62
+ "description": "description",
63
+ }
64
+
65
+ SANNTIS_COLUMN_NAMES = {
66
+ "nearest_mibig": "nearest_mibig",
67
+ "nearest_mibig_class": "nearest_mibig_class",
68
+ "description": "description",
69
+ }
70
+
71
+ ANTISMASH_COLUMN_NAMES = {
72
+ "label": "label",
73
+ "description": "description",
74
+ }
75
+
76
+ KEGG_COLUMN_NAMES = {
77
+ "ko": "KO",
78
+ "description": "description",
79
+ }
80
+
81
+ PFAM_COLUMN_NAMES = {
82
+ "pfam": "PFAM",
83
+ "description": "description",
84
+ }
85
+
86
+ KEGG_MODULES_COLUMN_NAMES = {
87
+ "module_accession": "module_accession",
88
+ "pathway_name": "pathway_name",
89
+ "pathway_class": "pathway_class",
90
+ }
91
+
92
+ # This mapping lets a single 'for' loop later process all summary types in the same way
93
+ SUMMARY_TYPES_MAP = {
94
+ "go": {
95
+ "folder": "functional-annotation/go",
96
+ "column_names": GO_COLUMN_NAMES,
97
+ "schema": GOSummarySchema,
98
+ "study_schema": GOStudySummarySchema,
99
+ },
100
+ "goslim": {
101
+ "folder": "functional-annotation/go",
102
+ "column_names": GO_COLUMN_NAMES,
103
+ "schema": GOSummarySchema,
104
+ "study_schema": GOStudySummarySchema,
105
+ },
106
+ "interpro": {
107
+ "folder": "functional-annotation/interpro",
108
+ "column_names": INTERPRO_COLUMN_NAMES,
109
+ "schema": InterProSummarySchema,
110
+ "study_schema": InterProStudySummarySchema,
111
+ },
112
+ "ko": {
113
+ "folder": "functional-annotation/kegg",
114
+ "column_names": KEGG_COLUMN_NAMES,
115
+ "schema": KOSummarySchema,
116
+ "study_schema": KOStudySummarySchema,
117
+ },
118
+ "sanntis": {
119
+ "folder": "pathways-and-systems/sanntis",
120
+ "column_names": SANNTIS_COLUMN_NAMES,
121
+ "schema": SanntisSummarySchema,
122
+ "study_schema": SanntisStudySummarySchema,
123
+ },
124
+ "antismash": {
125
+ "folder": "pathways-and-systems/antismash",
126
+ "column_names": ANTISMASH_COLUMN_NAMES,
127
+ "schema": AntismashSummarySchema,
128
+ "study_schema": AntismashStudySummarySchema,
129
+ },
130
+ "pfam": {
131
+ "folder": "functional-annotation/pfam",
132
+ "column_names": PFAM_COLUMN_NAMES,
133
+ "schema": PFAMSummarySchema,
134
+ "study_schema": PFAMStudySummarySchema,
135
+ },
136
+ "kegg_modules": {
137
+ "folder": "pathways-and-systems/kegg-modules",
138
+ "column_names": KEGG_MODULES_COLUMN_NAMES,
139
+ "schema": KEGGModulesSummarySchema,
140
+ "study_schema": KEGGModulesStudySummarySchema,
141
+ },
142
+ }
143
+
144
+ # The taxonomy file is a tab-separated file without any header
145
+ # containing the following columns:
146
+ TAXONOMY_COLUMN_NAMES = [
147
+ "Count",
148
+ "Superkingdom",
149
+ "Kingdom",
150
+ "Phylum",
151
+ "Class",
152
+ "Order",
153
+ "Family",
154
+ "Genus",
155
+ "Species",
156
+ ]
157
+
158
+ OUTPUT_SUFFIX = "summary.tsv"
159
+
160
+
161
+ @click.group()
162
+ def cli():
163
+ pass
164
+
165
+
166
+ def check_files_exist(file_list: list[Path]) -> None:
167
+ """
168
+ Check that all files in the given list exist on disk.
169
+
170
+ :param file_list: List of file paths to check.
171
+ :raises FileNotFoundError: If any file does not exist.
172
+ """
173
+ missing_files = [str(path) for path in file_list if not path.is_file()]
174
+ if missing_files:
175
+ raise FileNotFoundError(
176
+ f"The following required files are missing: {', '.join(missing_files)}"
177
+ )
178
+
179
+
180
+ def generate_taxonomy_summary(
181
+ file_dict: dict[str, Path],
182
+ output_file_name: str,
183
+ outdir: Path = None,
184
+ ) -> None:
185
+ """
186
+ Generate a combined study-level taxonomic classification summary from multiple input
187
+ assembly-level summary files.
188
+
189
+ :param file_dict: Dictionary mapping assembly accession to its taxonomy file.
190
+ :param output_file_name: Name of the output summary file.
191
+ :param outdir: Optional output directory for the results.
192
+
193
+ Example of the taxonomy file:
194
+ 23651 sk__Bacteria
195
+ 4985 sk__Archaea k__Thermoproteati p__Nitrososphaerota
196
+ 882 sk__Archaea k__Nanobdellati p__ c__ o__ f__ g__ s__Candidatus Pacearchaeota archaeon
197
+ """
198
+ check_files_exist(list(file_dict.values()))
199
+
200
+ tax_dfs = []
201
+ for assembly_acc, path in file_dict.items():
202
+ df = pd.read_csv(path, sep="\t", names=TAXONOMY_COLUMN_NAMES).fillna("")
203
+
204
+ # Note: schema validation will fail if the taxonomy file is empty
205
+ df = validate_dataframe(df, TaxonSchema, str(path))
206
+
207
+ # Combine all taxonomic ranks in the classification into a single string
208
+ df["full_taxon"] = (
209
+ df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
210
+ )
211
+
212
+ # Create a new DataFrame with taxonomy as index and count as the only column
213
+ result = df[["Count", "full_taxon"]].set_index("full_taxon")
214
+ result.columns = [assembly_acc]
215
+ tax_dfs.append(result)
216
+
217
+ summary_df = pd.concat(tax_dfs, axis=1)
218
+ summary_df = summary_df.fillna(0).astype(int).sort_index()
219
+
220
+ outfile = output_file_name
221
+ if outdir:
222
+ outfile = outdir / output_file_name
223
+
224
+ summary_df.to_csv(outfile, sep="\t", index_label="taxonomy")
225
+
226
+
227
+ def generate_functional_summary(
228
+ file_dict: dict[str, Path],
229
+ column_names: dict[str, str],
230
+ output_prefix: str,
231
+ label: Literal[
232
+ "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
233
+ ],
234
+ outdir: Path = None,
235
+ ) -> None:
236
+ """
237
+ Generate a combined study-level functional annotation summary from multiple input
238
+ assembly-level summary files.
239
+
240
+ :param file_dict: Dictionary mapping assembly accession to its summary file path.
241
+ :param column_names: Dictionary mapping original column names to standard column names.
242
+ :param output_prefix: Prefix for the output summary file.
243
+ :param label: Label for the functional annotation type
244
+ (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
245
+ :param outdir: Optional output directory for the results.
246
+
247
+ In the input files, column orders may vary, but the following columns are expected:
248
+ GO summary input file:
249
+ go term category count
250
+ GO:0016020 membrane cellular_component 30626
251
+ GO:0005524 ATP binding molecular_function 30524
252
+
253
+ InterPro summary input file:
254
+ interpro_accession description count
255
+ IPR036291 NAD(P)-binding domain superfamily 16503
256
+ IPR019734 Tetratricopeptide repeat 14694
257
+
258
+ KEGG summary input file:
259
+ ko description count
260
+ K01552 energy-coupling factor transport system ATP-binding protein [EC:7.-.-.-] 562
261
+ K18889 ATP-binding cassette, subfamily B, multidrug efflux pump 537
262
+ K15497 molybdate/tungstate transport system ATP-binding protein [EC:7.3.2.5 7.3.2.6] 517
263
+
264
+ Sanntis summary input file:
265
+ nearest_mibig nearest_mibig_class description count
266
+ BGC0000787 Saccharide Carbohydrate-based natural products (e.g., aminoglycoside antibiotics) 1
267
+ BGC0000248 Polyketide Built from iterative condensation of acetate units derived from acetyl-CoA 3
268
+ BGC0001327 NRP Polyketide Nonribosomal Peptide Polyketide 2
269
+
270
+ Antismash summary input file:
271
+ label description count
272
+ terpene Terpene 16
273
+ betalactone Beta-lactone containing protease inhibitor 8
274
+ T1PKS Type I PKS (Polyketide synthase) 3
275
+
276
+ PFAM summary input file:
277
+ pfam description count
278
+ PF00265 Thymidine kinase 457
279
+ PF01852 START domain 368
280
+ PF13756 Stimulus-sensing domain 397
281
+
282
+ KEGG modules summary input file:
283
+ module_accession completeness pathway_name pathway_class matching_ko missing_ko
284
+ M00986 100.0 Sulfur reduction, sulfur => sulfide Pathway modules; Energy metabolism; Sulfur metabolism K18367
285
+ M00163 83.33 Photosystem I Pathway modules; Energy metabolism; Photosynthesis K02689,K02690,K02691,K02692,K02694 K02693
286
+ M00615 50.0 Nitrate assimilation Signature modules; Module set; Metabolic capacity K02575 M00531
287
+ """
288
+ check_files_exist(list(file_dict.values()))
289
+
290
+ output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
291
+
292
+ original_col_names = list(column_names.keys())
293
+ renamed_col_names = list(column_names.values())
294
+ value_col_name = "completeness" if label == "kegg_modules" else "count"
295
+
296
+ dfs = []
297
+ for assembly_acc, filepath in file_dict.items():
298
+ try:
299
+ df = pd.read_csv(filepath, sep="\t")
300
+ except pd.errors.EmptyDataError:
301
+ logging.warning(f"File {filepath.resolve()} is empty. Skipping.")
302
+ continue
303
+
304
+ schema = SUMMARY_TYPES_MAP[label]["schema"]
305
+ df = validate_dataframe(df, schema, str(filepath))
306
+
307
+ # Extract only relevant columns
308
+ df = df[original_col_names + [value_col_name]].copy()
309
+
310
+ # Rename columns: metadata columns are renamed according to column_names dict, "count"/"completeness" -> assembly acc
311
+ df.rename(columns={**column_names, value_col_name: assembly_acc}, inplace=True)
312
+ dfs.append(df)
313
+
314
+ if not dfs:
315
+ logging.warning(
316
+ f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
317
+ )
318
+ return
319
+
320
+ # Merge all dataframes on the renamed metadata columns
321
+ merged_df = reduce(
322
+ lambda left, right: pd.merge(left, right, on=renamed_col_names, how="outer"),
323
+ dfs,
324
+ )
325
+
326
+ # Fill missing values appropriately, convert completeness percentages to float, counts to integers
327
+ value_columns = [col for col in merged_df.columns if col not in renamed_col_names]
328
+ fill_value = 0.0 if label == "kegg_modules" else 0
329
+ dtype = float if label == "kegg_modules" else int
330
+ merged_df[value_columns] = merged_df[value_columns].fillna(fill_value).astype(dtype)
331
+
332
+ # Reorder columns: merge keys first, then sorted assembly accessions
333
+ merged_df = merged_df[renamed_col_names + sorted(value_columns)]
334
+
335
+ outfile = output_file_name
336
+ if outdir:
337
+ outfile = outdir / output_file_name
338
+
339
+ merged_df.to_csv(outfile, sep="\t", index=False)
340
+
341
+
342
+ @cli.command(
343
+ "summarise",
344
+ options_metavar="-a <assemblies> -s <study_dir> -p <output_prefix>",
345
+ short_help="Generate study-level summaries for assembly analysis results.",
346
+ )
347
+ @click.option(
348
+ "-a",
349
+ "--assemblies",
350
+ required=True,
351
+ help="CSV file containing successful analyses generated by the pipeline",
352
+ type=click.Path(exists=True, path_type=Path, dir_okay=False),
353
+ )
354
+ @click.option(
355
+ "-s",
356
+ "--study_dir",
357
+ required=True,
358
+ help="Input directory to where all the individual analyses subdirectories for summarising",
359
+ type=click.Path(exists=True, path_type=Path, file_okay=False),
360
+ )
361
+ @click.option(
362
+ "-p",
363
+ "--output_prefix",
364
+ required=True,
365
+ help="Prefix for generated summary files",
366
+ type=str,
367
+ )
368
+ @click.option(
369
+ "-o",
370
+ "--outdir",
371
+ required=False,
372
+ help="Directory for the output files, by default it will use the current working directory.",
373
+ type=click.Path(exists=True, path_type=Path, file_okay=False),
374
+ )
375
+ def summarise_analyses(
376
+ assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
377
+ ) -> None:
378
+ """
379
+ Generate study-level summaries for successfully processed assemblies.
380
+
381
+ :param assemblies: Path to a file listing completed assembly accessions and their status.
382
+ :param study_dir: Path to the directory containing analysis results for each assembly.
383
+ :param output_prefix: Prefix for the generated summary files.
384
+ """
385
+ logging.info(f"Reading assembly list from {assemblies.resolve()}")
386
+ assemblies_df = pd.read_csv(assemblies, names=["assembly", "status"])
387
+ CompletedAnalysisSchema(assemblies_df)
388
+ assembly_list = assemblies_df["assembly"].tolist()
389
+ logging.info("Assembly list was read successfully.")
390
+
391
+ def get_file_paths(subdir: str, filename_template: str) -> dict[str, Path]:
392
+ """
393
+ Construct file paths for each assembly given a subdirectory and filename template.
394
+ Template must contain {acc} as a placeholder.
395
+ """
396
+ return {
397
+ acc: study_dir / acc / subdir / filename_template.format(acc=acc)
398
+ for acc in assembly_list
399
+ }
400
+
401
+ logging.info("Start processing of assembly-level summaries.")
402
+
403
+ logging.info(
404
+ "Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
405
+ )
406
+ generate_taxonomy_summary(
407
+ get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
408
+ f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
409
+ outdir=outdir,
410
+ )
411
+
412
+ for summary_type, config in SUMMARY_TYPES_MAP.items():
413
+ logging.info(
414
+ f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
415
+ )
416
+ generate_functional_summary(
417
+ get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
418
+ config["column_names"],
419
+ output_prefix,
420
+ summary_type,
421
+ outdir=outdir,
422
+ )
423
+ logging.info("Assembly-level summaries were generated successfully.")
424
+ logging.info("Done.")
425
+
426
+
427
+ @cli.command(
428
+ "merge",
429
+ options_metavar="-a <study_dir> -p <output_prefix>",
430
+ short_help="Merge multiple study-level summaries of assembly analysis.",
431
+ )
432
+ @click.option(
433
+ "-s",
434
+ "--study_dir",
435
+ required=True,
436
+ help="Input directory to where all the individual analyses subdirectories for merging",
437
+ type=click.Path(exists=True, file_okay=False),
438
+ )
439
+ @click.option(
440
+ "-p",
441
+ "--output_prefix",
442
+ required=True,
443
+ help="Prefix for generated merged summary files",
444
+ type=str,
445
+ )
446
+ def merge_summaries(study_dir: str, output_prefix: str) -> None:
447
+ """
448
+ Merge multiple study-level summary files into combined summary files.
449
+
450
+ :param study_dir: Path to the directory containing study-level summary files.
451
+ :param output_prefix: Prefix for the output merged summary files.
452
+ """
453
+
454
+ def get_file_paths(summary_type: str) -> list[str]:
455
+ return glob.glob(f"{study_dir}/*_{summary_type}_{OUTPUT_SUFFIX}")
456
+
457
+ logging.info("Generating combined assembly-level summaries")
458
+ logging.info("Parsing summary files for taxonomic classification")
459
+ merge_taxonomy_summaries(
460
+ get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
461
+ )
462
+
463
+ for summary_type, config in SUMMARY_TYPES_MAP.items():
464
+ logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
465
+ column_names = config["column_names"]
466
+ merge_functional_summaries(
467
+ get_file_paths(summary_type),
468
+ list(column_names.values()),
469
+ output_prefix,
470
+ summary_type,
471
+ )
472
+ logging.info("Merged assembly-level summaries were generated successfully.")
473
+ logging.info("Done.")
474
+
475
+
476
+ def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) -> None:
477
+ """
478
+ Merge multiple taxonomy study-level summary files into a single study-level summary.
479
+
480
+ :param summary_files: List of paths to taxonomy summary files, each containing
481
+ taxonomic classifications and counts for an individual analysis.
482
+ :param output_file_name: Output path for the merged taxonomy summary.
483
+
484
+ Example of input taxonomy summary file:
485
+ taxonomy ERZ1049444 ERZ1049446
486
+ sk__Eukaryota;k__Metazoa;p__Chordata 2 10
487
+ sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates 118 94
488
+ """
489
+ if not summary_files:
490
+ raise FileNotFoundError(
491
+ "The required taxonomic classification summary files are missing. Exiting."
492
+ )
493
+
494
+ summary_dfs = []
495
+ for file in summary_files:
496
+ df = pd.read_csv(file, sep="\t", index_col=0)
497
+ df = validate_dataframe(df, TaxonomyStudySummarySchema, file)
498
+ summary_dfs.append(df)
499
+ merged_df = pd.concat(summary_dfs, axis=1)
500
+ merged_df = merged_df.fillna(0).astype(int)
501
+
502
+ # Reorder columns: taxonomy first, then sorted assembly accessions
503
+ merged_df = merged_df[sorted(merged_df.columns)]
504
+ merged_df = merged_df.sort_index()
505
+
506
+ merged_df.to_csv(
507
+ output_file_name,
508
+ sep="\t",
509
+ index_label="taxonomy",
510
+ )
511
+
512
+
513
+ def merge_functional_summaries(
514
+ summary_files: list[str],
515
+ merge_keys: list[str],
516
+ output_prefix: str,
517
+ label: Literal[
518
+ "go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
519
+ ],
520
+ ) -> None:
521
+ """
522
+ Merge multiple functional study-level summary files into a single study-level summary.
523
+
524
+ :param summary_files: List of paths to functional summary files, each containing
525
+ annotation terms and counts for an individual analysis.
526
+ :param merge_keys: List of column names to merge on (e.g. term ID, description).
527
+ :param output_prefix: Prefix for the generated output file.
528
+ :param label: Label describing the functional annotation type
529
+ (expected one of ["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"]).
530
+
531
+ In the input files, column orders may vary, but the following columns are expected:
532
+ GO summary input:
533
+ GO description category ERZ1049444 ERZ1049446
534
+ GO:0016020 membrane cellular_component 30626 673
535
+ GO:0005524 ATP binding molecular_function 30524 2873
536
+
537
+ Example of InterPro summary input:
538
+ IPR description ERZ1049444 ERZ1049446
539
+ IPR036291 NAD(P)-binding domain superfamily 16503 13450
540
+ IPR019734 Tetratricopeptide repeat 14694 11021
541
+
542
+ KEGG summary input:
543
+ GO description category ERZ1049440 ERZ1049443
544
+ GO:0003677 DNA binding molecular_function 6125 16417
545
+ GO:0055085 transmembrane transport biological_process 144 13926
546
+
547
+ Sanntis summary input:
548
+ nearest_mibig nearest_mibig_class description ERZ1049440 ERZ1049443
549
+ BGC0001356 RiPP Ribosomally synthesised and Post-translationally modified Peptide 230 185
550
+ BGC0001432 NRP Polyketide Nonribosomal Peptide Polyketide 0 8
551
+
552
+ Antismash summary input:
553
+ label description ERZ1049440 ERZ1049443
554
+ NRPS Non-ribosomal peptide synthetase 368 0
555
+ arylpolyene Aryl polyene 149 447
556
+
557
+ PFAM summary input:
558
+ PFAM description ERZ1049440 ERZ1049443
559
+ PF24718 HTH-like domain 468 1
560
+ PF06039 Malate:quinone oxidoreductase (Mqo) 490 21
561
+
562
+ KEGG modules summary input:
563
+ module_accession pathway_name pathway_class ERZ1049440 ERZ1049443
564
+ M00109 C21-Steroid hormone biosynthesis, progesterone => cortisol/cortisone Pathway modules; Lipid metabolism; Sterol biosynthesis 38.9 0.0
565
+ M00153 Cytochrome bd ubiquinol oxidase Pathway modules; Energy metabolism; ATP synthesis 44.7 84.4
566
+ """
567
+ output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"
568
+
569
+ if not summary_files:
570
+ logging.warning(
571
+ f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
572
+ )
573
+ return
574
+
575
+ validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
576
+
577
+ dfs = []
578
+ for filepath in summary_files:
579
+ df = pd.read_csv(filepath, sep="\t")
580
+ df = validate_dataframe(df, validation_schema, filepath)
581
+ dfs.append(df)
582
+
583
+ if len(dfs) == 1:
584
+ merged_df = dfs[0]
585
+ else:
586
+ merged_df = reduce(
587
+ lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
588
+ )
589
+
590
+ # Identify non-key columns (i.e. counts)
591
+ value_columns = [col for col in merged_df.columns if col not in merge_keys]
592
+
593
+ # Fill NaNs and set dtype accordingly
594
+ fill_value = 0.0 if label == "kegg_modules" else 0
595
+ dtype = float if label == "kegg_modules" else int
596
+ merged_df[value_columns] = merged_df[value_columns].fillna(fill_value).astype(dtype)
597
+
598
+ # Reorder columns
599
+ merged_df = merged_df[merge_keys + sorted(value_columns)]
600
+
601
+ merged_df.to_csv(output_file_name, sep="\t", index=False)
602
+
603
+
604
+ if __name__ == "__main__":
605
+ cli()
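
The new module above registers a click group with "summarise" and "merge" subcommands. A minimal sketch of driving "summarise" in-process with click's test runner follows; the file and directory names and the output prefix are illustrative assumptions, not values from the package.

from click.testing import CliRunner

from mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator import cli

runner = CliRunner()
# Paths below are hypothetical: -a expects the pipeline's completed-analyses CSV,
# -s the directory of per-assembly result subdirectories, -p the output prefix,
# and -o an (optional, existing) output directory.
result = runner.invoke(
    cli,
    [
        "summarise",
        "-a", "completed_assemblies.csv",
        "-s", "study_dir",
        "-p", "PRJEB00000",
        "-o", "summaries",
    ],
)
print(result.exit_code, result.output)
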
@@ -155,7 +155,7 @@ def parse_args():
155
155
  description = (
156
156
  "antiSMASH output summary generator. "
157
157
  "Script takes regions from GFF and counts its appearance in annotation. "
158
- "Output columns contain classID, descriptions and count. "
158
+ "Output columns contain label, descriptions and count. "
159
159
  f"Descriptions were taken from pre-parsed glossary provided on antiSMASH website. "
160
160
  f"Current script supports antiSMASH results for version {ANTISMASH_VERSION} and older."
161
161
  )
@@ -202,15 +202,15 @@ def main():
202
202
  df = pd.DataFrame(dict_list)
203
203
  df = df[df["product"].notna()]
204
204
  df_grouped = (
205
- df.groupby(["product"]).size().reset_index(name="Count")
206
- ).sort_values(by="Count", ascending=False)
205
+ df.groupby(["product"]).size().reset_index(name="count")
206
+ ).sort_values(by="count", ascending=False)
207
207
 
208
208
  df_grouped = df_grouped.rename(
209
209
  columns={
210
210
  "product": "label",
211
211
  }
212
212
  )
213
- df_grouped["Description"] = df_grouped["label"].apply(
213
+ df_grouped["description"] = df_grouped["label"].apply(
214
214
  lambda x: ",".join(
215
215
  [
216
216
  DESCRIPTIONS.get(cls.strip().lower(), cls.strip())
@@ -218,11 +218,7 @@ def main():
218
218
  ]
219
219
  )
220
220
  )
221
- df_grouped = df_grouped[["label", "Description", "Count"]]
222
- df_grouped = df_grouped.rename(columns={
223
- "Description": "description",
224
- "Count": "count"
225
- })
221
+ df_grouped = df_grouped[["label", "description", "count"]]
226
222
  df_grouped.to_csv(output_filename, sep="\t", index=False)
227
223
 
228
224
 
@@ -75,38 +75,40 @@ def main():
75
75
  entry_dict[key] = value
76
76
  dict_list.append(entry_dict)
77
77
 
78
- # Convert to DataFrame
78
+ # Convert to DataFrame
79
79
  df = pd.DataFrame(dict_list)
80
80
  df = df.rename(
81
81
  columns={
82
- "nearest_MiBIG": "nearest_MIBiG",
83
- "nearest_MiBIG_class": "nearest_MIBiG_class",
82
+ "nearest_MiBIG": "nearest_mibig",
83
+ "nearest_MiBIG_class": "nearest_mibig_class",
84
84
  }
85
85
  )
86
86
  df_grouped = (
87
- df.groupby(["nearest_MIBiG", "nearest_MIBiG_class"])
87
+ df.groupby(["nearest_mibig", "nearest_mibig_class"])
88
88
  .size()
89
- .reset_index(name="Count")
89
+ .reset_index(name="count")
90
90
  )
91
- df_grouped = df_grouped.sort_values(by="Count", ascending=False)
91
+ df_grouped = df_grouped.sort_values(by="count", ascending=False)
92
92
 
93
93
  df_desc = pd.DataFrame(
94
- list(DESCRIPTIONS.items()), columns=["MIBiG_class", "Description"]
94
+ list(DESCRIPTIONS.items()), columns=["mibig_class", "description"]
95
95
  )
96
- df_desc = df_desc.set_index("MIBiG_class")
96
+ df_desc = df_desc.set_index("mibig_class")
97
97
  df_merged = df_grouped.merge(
98
- df_desc, left_on="nearest_MIBiG_class", right_index=True, how="left"
98
+ df_desc, left_on="nearest_mibig_class", right_index=True, how="left"
99
99
  )
100
- df_merged["Description"] = df_merged.apply(
101
- lambda row: row["nearest_MIBiG_class"].replace(
102
- "NRP", df_desc.loc["NRP"]["Description"]
103
- )
104
- if pd.isna(row["Description"]) and "NRP" in row["nearest_MIBiG_class"]
105
- else row["Description"],
100
+ df_merged["description"] = df_merged.apply(
101
+ lambda row: (
102
+ row["nearest_mibig_class"].replace(
103
+ "NRP", df_desc.loc["NRP"]["description"]
104
+ )
105
+ if pd.isna(row["description"]) and "NRP" in row["nearest_mibig_class"]
106
+ else row["description"]
107
+ ),
106
108
  axis=1,
107
109
  )
108
110
  df_merged = df_merged[
109
- ["nearest_MIBiG", "nearest_MIBiG_class", "Description", "Count"]
111
+ ["nearest_mibig", "nearest_mibig_class", "description", "count"]
110
112
  ]
111
113
  df_merged = df_merged.rename(columns={
112
114
  "Description": "description",
@@ -17,10 +17,11 @@ import logging
17
17
  import re
18
18
 
19
19
  from enum import Enum
20
- from typing import ClassVar, Optional, Type
20
+ from typing import ClassVar, Optional, Type, Literal
21
21
 
22
22
  import pandas as pd
23
23
  import pandera as pa
24
+ from pandera.typing import Series
24
25
  from pandera.typing.common import DataFrameBase
25
26
 
26
27
  from pydantic import (
@@ -110,6 +111,354 @@ class AmpliconPassedRunsSchema(pa.DataFrameModel):
110
111
  coerce = True
111
112
 
112
113
 
114
+ class CompletedAnalysisRecord(BaseModel):
115
+ """Class defining a Pydantic model for a single "row" of an successfully analysed assemblies file."""
116
+
117
+ assembly: str = Field(
118
+ ...,
119
+ description="Assembly accession",
120
+ examples=["ERZ789012"],
121
+ pattern=r"ERZ\d{6,}",
122
+ )
123
+ status: Literal["success"] = Field(
124
+ ...,
125
+ description="Pipeline output for whether this assembly's analysis succeeded or not",
126
+ )
127
+
128
+
129
+ class CompletedAnalysisSchema(pa.DataFrameModel):
130
+ """Class modelling a Pandera dataframe schema that uses the CompletedAnalysisSchema class as dtype.
131
+ This is what actually validates the generated dataframe when read by pandas.read_csv.
132
+ """
133
+
134
+ assembly: Series[str]
135
+
136
+ @pa.check("assembly")
137
+ def accessions_unique(self, series: Series[str]) -> Series[bool]:
138
+ return ~series.duplicated()
139
+
140
+ class Config:
141
+ """Config with dataframe-level data type."""
142
+
143
+ dtype = PydanticModel(CompletedAnalysisRecord)
144
+ coerce = True
145
+
146
+
147
+ class InterProSummaryRecord(BaseModel):
148
+ """Model of a row in the InterPro summary file."""
149
+
150
+ count: int = Field(
151
+ ..., ge=0, description="Number of hits for the InterPro accession"
152
+ )
153
+ interpro_accession: str = Field(
154
+ ...,
155
+ description="InterPro accession ID",
156
+ examples=["IPR123456"],
157
+ pattern=r"IPR\d{6}",
158
+ )
159
+ description: str = Field(..., description="Description of the InterPro domain")
160
+
161
+
162
+ class GOSummaryRecord(BaseModel):
163
+ """Model of a row in the GO summary file."""
164
+
165
+ go: str = Field(
166
+ ...,
167
+ description="GO term identifier",
168
+ examples=["GO:1234567"],
169
+ pattern=r"GO:\d{7}",
170
+ )
171
+ term: str = Field(..., description="GO term name")
172
+ category: str = Field(
173
+ ...,
174
+ description="GO category",
175
+ examples=["biological_process", "molecular_function", "cellular_component"],
176
+ )
177
+ count: int = Field(..., ge=0, description="Number of times the GO term is observed")
178
+
179
+
180
+ class BaseSummarySchema(pa.DataFrameModel):
181
+ """Base schema for summary files."""
182
+
183
+ @staticmethod
184
+ def is_unique(series: Series[str]) -> Series[bool]:
185
+ return ~series.duplicated()
186
+
187
+
188
+ class InterProSummarySchema(BaseSummarySchema):
189
+ """Schema for InterPro summary file validation."""
190
+
191
+ interpro_accession: Series[str]
192
+
193
+ @pa.check("interpro_accession")
194
+ def interpro_ids_unique(self, series: Series[str]) -> Series[bool]:
195
+ return self.is_unique(series)
196
+
197
+ class Config:
198
+ dtype = PydanticModel(InterProSummaryRecord)
199
+ coerce = True
200
+
201
+
202
+ class GOSummarySchema(BaseSummarySchema):
203
+ """Schema for GO or GOslim summary file validation."""
204
+
205
+ go: Series[str]
206
+
207
+ @pa.check("go")
208
+ def go_ids_unique(self, series: Series[str]) -> Series[bool]:
209
+ return self.is_unique(series)
210
+
211
+ class Config:
212
+ dtype = PydanticModel(GOSummaryRecord)
213
+ coerce = True
214
+
215
+
216
+ class SanntisSummaryRecord(BaseModel):
217
+ """Model of a row in the Sanntis assembly-level summary file."""
218
+
219
+ nearest_mibig: str = Field(
220
+ ...,
221
+ description="The accession ID of the closest matching biosynthetic gene cluster (BGC) in the MIBiG database",
222
+ examples=["BGC0000073"],
223
+ pattern=r"BGC\d{7}",
224
+ )
225
+ nearest_mibig_class: str = Field(
226
+ ...,
227
+ description="The biosynthetic class of the nearest MIBiG BGC",
228
+ examples=["Polyketide"],
229
+ )
230
+ description: str = Field(
231
+ ...,
232
+ description="A brief summary of the biosynthetic process or type of metabolite associated with the nearest MIBiG cluster",
233
+ )
234
+
235
+ count: int = Field(
236
+ ..., ge=0, description="Number of times the MIBiG entry is observed"
237
+ )
238
+
239
+
240
+ class AntismashSummaryRecord(BaseModel):
241
+ """Model of a row in the Antismash summary file."""
242
+
243
+ label: str = Field(
244
+ ...,
245
+ description="Biosynthetic class or label assigned by Antismash based on sequence similarity to known biosynthetic gene clusters.",
246
+ examples=["RiPP-like", "T1PKS", "terpene"],
247
+ )
248
+ description: str = Field(
249
+ ...,
250
+ description="Brief explanation of the biosynthetic class, often indicating compound type or functional characteristics.",
251
+ examples=["Type I PKS (Polyketide synthase)", "Redox-cofactors such as PQQ"],
252
+ )
253
+ count: int = Field(
254
+ ...,
255
+ ge=0,
256
+ description="Number of BGCs (biosynthetic gene clusters) in the dataset assigned to this label.",
257
+ )
258
+
259
+
260
+ class KOSummaryRecord(BaseModel):
261
+ """Model of a row in the KEGG summary file."""
262
+
263
+ ko: str = Field(
264
+ ...,
265
+ description="KEGG Orthology (KO) identifier representing a functional gene or pathway component.",
266
+ examples=["K07547", "K04874", "K19946"],
267
+ pattern=r"K\d{5,}",
268
+ )
269
+ description: str = Field(
270
+ ...,
271
+ description="Name or function of the KO, sometimes including EC numbers and protein families.",
272
+ examples=["optineurin", "MFS transporter, POT/PTR family"],
273
+ )
274
+ count: int = Field(
275
+ ...,
276
+ ge=0,
277
+ description="Number of times this KO identifier is observed in the dataset.",
278
+ )
279
+
280
+
281
+ class PFAMSummaryRecord(BaseModel):
282
+ """Model of a row in the PFAM summary file."""
283
+
284
+ pfam: str = Field(
285
+ ...,
286
+ description="PFAM accession identifier representing a protein domain or family.",
287
+ examples=["PF00265", "PF01956", "PF00673"],
288
+ pattern=r"PF\d{5}",
289
+ )
290
+ description: str = Field(
291
+ ...,
292
+ description="Description of the protein domain or family associated with the PFAM ID.",
293
+ examples=["Thymidine kinase", "Integral membrane protein EMC3/TMCO1-like"],
294
+ )
295
+ count: int = Field(
296
+ ...,
297
+ ge=0,
298
+ description="Number of times the PFAM domain is observed in the dataset.",
299
+ )
300
+
301
+
302
+ class KEGGModulesSummaryRecord(BaseModel):
303
+ """Model of a row in the KEGG Modules summary file."""
304
+
305
+ module_accession: str = Field(
306
+ ...,
307
+ description="KEGG Module identifier representing a specific metabolic pathway or module.",
308
+ examples=["M00123", "M00234"],
309
+ pattern=r"M\d{5}",
310
+ )
311
+ completeness: float = Field(
312
+ ...,
313
+ ge=0,
314
+ description="Completeness score of the KEGG Module, indicating the extent to which the module is present in the metagenome.",
315
+ )
316
+ pathway_name: str = Field(
317
+ ...,
318
+ description="Name of the metabolic pathway associated with the KEGG Module.",
319
+ examples=["Sulfur reduction, sulfur => sulfide"],
320
+ )
321
+ pathway_class: str = Field(
322
+ ...,
323
+ description="Biosynthetic class or category associated with the KEGG Module, semi colon separated.",
324
+ examples=["Pathway modules; Energy metabolism; Photosynthesis"],
325
+ )
326
+
327
+
328
+ class SanntisSummarySchema(BaseSummarySchema):
329
+ nearest_mibig: Series[str]
330
+
331
+ @pa.check("nearest_mibig")
332
+ def mibig_ids_unique(self, series: Series[str]) -> Series[bool]:
333
+ return self.is_unique(series)
334
+
335
+ class Config:
336
+ dtype = PydanticModel(SanntisSummaryRecord)
337
+ coerce = True
338
+
339
+
340
+ class AntismashSummarySchema(BaseSummarySchema):
341
+ label: Series[str]
342
+
343
+ @pa.check("label")
344
+ def class_names_unique(self, series: Series[str]) -> Series[bool]:
345
+ return self.is_unique(series)
346
+
347
+ class Config:
348
+ dtype = PydanticModel(AntismashSummaryRecord)
349
+ coerce = True
350
+
351
+
352
+ class KOSummarySchema(BaseSummarySchema):
353
+ ko: Series[str]
354
+
355
+ @pa.check("ko")
356
+ def ko_ids_unique(self, series: Series[str]) -> Series[bool]:
357
+ return self.is_unique(series)
358
+
359
+ class Config:
360
+ dtype = PydanticModel(KOSummaryRecord)
361
+ coerce = True
362
+
363
+
364
+ class PFAMSummarySchema(BaseSummarySchema):
365
+ pfam: Series[str]
366
+
367
+ @pa.check("pfam")
368
+ def pfam_ids_unique(self, series: Series[str]) -> Series[bool]:
369
+ return self.is_unique(series)
370
+
371
+ class Config:
372
+ dtype = PydanticModel(PFAMSummaryRecord)
373
+ coerce = True
374
+
375
+
376
+ class KEGGModulesSummarySchema(BaseSummarySchema):
377
+ module_accession: Series[str]
378
+
379
+ @pa.check("module_accession")
380
+ def module_ids_unique(self, series: Series[str]) -> Series[bool]:
381
+ return self.is_unique(series)
382
+
383
+ class Config:
384
+ dtype = PydanticModel(KEGGModulesSummaryRecord)
385
+ coerce = True
386
+
387
+
388
+ class BaseStudySummarySchema(BaseSummarySchema):
389
+ """Base schema for study summary files with ERZ* columns and count checks."""
390
+
391
+ @pa.check(regex=r"^ERZ\d+")
392
+ def count_columns_are_non_negative(self, s: Series[int]) -> Series[bool]:
393
+ return s >= 0
394
+
395
+ class Config:
396
+ strict = False # allow extra ERZ* columns not declared above
397
+
398
+
399
+ class GOStudySummarySchema(BaseStudySummarySchema):
400
+ GO: Series[str] = pa.Field(str_matches=r"^GO:\d{7}$")
401
+ description: Series[str]
402
+ category: Series[str]
403
+
404
+ @pa.check("GO")
405
+ def go_ids_unique(self, series: Series[str]) -> Series[bool]:
406
+ return self.is_unique(series)
407
+
408
+
409
+ class InterProStudySummarySchema(BaseStudySummarySchema):
410
+ IPR: Series[str] = pa.Field(str_matches=r"^IPR\d{6}$")
411
+ description: Series[str]
412
+
413
+ @pa.check("IPR")
414
+ def interpro_ids_unique(self, series: Series[str]) -> Series[bool]:
415
+ return self.is_unique(series)
416
+
417
+
418
+ class AntismashStudySummarySchema(BaseStudySummarySchema):
419
+ label: Series[str]
420
+
421
+ @pa.check("label")
422
+ def class_names_unique(self, series: Series[str]) -> Series[bool]:
423
+ return self.is_unique(series)
424
+
425
+
426
+ class SanntisStudySummarySchema(BaseStudySummarySchema):
427
+ nearest_mibig: Series[str]
428
+
429
+ @pa.check("nearest_mibig")
430
+ def mibig_ids_unique(self, series: Series[str]) -> Series[bool]:
431
+ return self.is_unique(series)
432
+
433
+
434
+ class KOStudySummarySchema(BaseStudySummarySchema):
435
+ KO: Series[str]
436
+
437
+ @pa.check("KO")
438
+ def ko_ids_unique(self, series: Series[str]) -> Series[bool]:
439
+ return self.is_unique(series)
440
+
441
+
442
+ class PFAMStudySummarySchema(BaseStudySummarySchema):
443
+ PFAM: Series[str]
444
+
445
+ @pa.check("PFAM")
446
+ def pfam_ids_unique(self, series: Series[str]) -> Series[bool]:
447
+ return self.is_unique(series)
448
+
449
+
450
+ class KEGGModulesStudySummarySchema(BaseStudySummarySchema):
451
+ module_accession: Series[str]
452
+
453
+ @pa.check("module_accession")
454
+ def module_ids_unique(self, series: Series[str]) -> Series[bool]:
455
+ return self.is_unique(series)
456
+
457
+
458
+ class TaxonomyStudySummarySchema(BaseStudySummarySchema):
459
+ pass
460
+
461
+
113
462
  class AmpliconNonINSDCPassedRunsSchema(pa.DataFrameModel):
114
463
  """Class modelling the same dataframe schema as the preceding one, except with no INSDC validation.
115
464
  Uses the AmpliconNonINSDCSPassedRunsRecord as a dtype to achieve this.
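
The added schemas are consumed through validate_dataframe, as in the study summary generator above. A minimal sketch, assuming a hypothetical source file name and reusing the GO rows from the generator's docstring example:

import pandas as pd

from mgnify_pipelines_toolkit.schemas.schemas import GOSummarySchema, validate_dataframe

# Rows taken from the GO summary example in the generator's docstring.
df = pd.DataFrame(
    {
        "go": ["GO:0016020", "GO:0005524"],
        "term": ["membrane", "ATP binding"],
        "category": ["cellular_component", "molecular_function"],
        "count": [30626, 30524],
    }
)

# "ERZ0000000_go_summary.tsv" is a hypothetical name used only for error reporting;
# the schema enforces the GO id pattern, uniqueness, and non-negative counts.
validated = validate_dataframe(df, GOSummarySchema, "ERZ0000000_go_summary.tsv")
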
@@ -145,7 +494,11 @@ class TaxRank(RootModel):
145
494
  def rank_structure_validity_check(cls, taxrank: str) -> bool:
146
495
  taxrank_list = taxrank.split("__")
147
496
  rank = taxrank_list[0]
148
- if rank != "" and rank != "Unclassified" and rank not in cls.valid_tax_ranks:
497
+ if (
498
+ rank != ""
499
+ and rank.capitalize() != "Unclassified"
500
+ and rank not in cls.valid_tax_ranks
501
+ ):
149
502
  raise ValueError(f"Invalid taxonomy rank {rank}.")
150
503
 
151
504
  return taxrank
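
The change above makes the "Unclassified" prefix check case-insensitive via rank.capitalize(). A standalone sketch of the same logic, with an illustrative (assumed) valid_tax_ranks list rather than the package constant:

# valid_tax_ranks here is an assumption for illustration, not the package constant.
valid_tax_ranks = ["sk", "k", "p", "c", "o", "f", "g", "s"]

def rank_is_acceptable(taxrank: str) -> bool:
    rank = taxrank.split("__")[0]
    return rank == "" or rank.capitalize() == "Unclassified" or rank in valid_tax_ranks

assert rank_is_acceptable("sk__Bacteria")
assert rank_is_acceptable("Unclassified__")
assert rank_is_acceptable("unclassified__")  # accepted after this change
assert not rank_is_acceptable("xyz__Something")
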
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mgnify_pipelines_toolkit
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: Collection of scripts and tools for MGnify pipelines
5
5
  Author-email: MGnify team <metagenomics-help@ebi.ac.uk>
6
6
  License: Apache Software License 2.0
@@ -8,7 +8,7 @@ Keywords: bioinformatics,pipelines,metagenomics
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: Apache Software License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.9
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: biopython>=1.85
@@ -12,6 +12,7 @@ mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py,sha256=B
12
12
  mgnify_pipelines_toolkit/analysis/amplicon/remove_ambiguous_reads.py,sha256=Wu4tRtuRkgd3hoeuwPl_E5ghxIW7e_1vrcvFGWv_U4A,3173
13
13
  mgnify_pipelines_toolkit/analysis/amplicon/rev_comp_se_primers.py,sha256=yLpzkRJXAeXRUNgz60zopEwHcdprM2UDjquE-GkrFys,1722
14
14
  mgnify_pipelines_toolkit/analysis/amplicon/standard_primer_matching.py,sha256=K6gniytuItq5WzHLi1BsaUCOdP4Zm0_ZzW2_ns7-BTI,11114
15
+ mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py,sha256=epVClL10QcllL8yu7YGjx0rXNVHL2GxHi-Ek0MOjsjo,13859
15
16
  mgnify_pipelines_toolkit/analysis/assembly/add_rhea_chebi_annotation.py,sha256=NZSNY2bqs_TQyz8riDqiEFPLKcwTgzh1C7DeVHT6V8Q,4366
16
17
  mgnify_pipelines_toolkit/analysis/assembly/antismash_gff_builder.py,sha256=wXrw1B-z4hOu5oA27Vp1WYxGP2Mk6ZY4i_T5jDZgek0,6954
17
18
  mgnify_pipelines_toolkit/analysis/assembly/combined_gene_caller_merge.py,sha256=Pq-9RSt3RCxzDMQVW1VHlHF4NtpVwCWFbg2CMkvpZZc,19089
@@ -23,9 +24,10 @@ mgnify_pipelines_toolkit/analysis/assembly/go_utils.py,sha256=eay9e3Xdc8XxnlC_4S
23
24
  mgnify_pipelines_toolkit/analysis/assembly/krona_txt_from_cat_classification.py,sha256=uex2T6GagtYFBIc39-Xm4SFHL06KAQ5v0_loOmY_eaw,4289
24
25
  mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_cazys.py,sha256=KaJHOKfbIurbD1iiMssjdAaSAT8Nv-_ZUFwxkLqukAE,7799
25
26
  mgnify_pipelines_toolkit/analysis/assembly/process_dbcan_result_clusters.py,sha256=DYZhChGD49M-zAtGkCmNHXDoVTnd5Qy6amG-oePO8Ek,5981
26
- mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=eRAQ0vFbqnWreiBdtFuwLKve9WwYwv9dYQtD1pumaZs,10776
27
+ mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py,sha256=J4cIWaFyWihqo2JtaOR531aXtVxIfOi_hcwZZw-vP8g,21252
28
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_antismash_bgcs.py,sha256=jUeA7I12YrtIqnm3hUxpdgsWfa2pP1ALGjb9OMKPcgY,10643
27
29
  mgnify_pipelines_toolkit/analysis/assembly/summarise_goslims.py,sha256=TPaKlYkoy37_XgYNOskWCCoXtPNku_k5ygSeK4fT1VQ,6689
28
- mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=65szj-H8Hxy_eXy3TyTs48EhPJbJ2w1skHlVbH2YeVM,4538
30
+ mgnify_pipelines_toolkit/analysis/assembly/summarise_sanntis_bgcs.py,sha256=1wblLbZl521digIUWoqneAu15gErzvN_oC--5T_xUdw,4582
29
31
  mgnify_pipelines_toolkit/analysis/genomes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
32
  mgnify_pipelines_toolkit/analysis/shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
33
  mgnify_pipelines_toolkit/analysis/shared/convert_cmscan_to_cmsearch_tblout.py,sha256=kAGU5kQyj-Hlcdx32i-xOJSuHYYUDj-kqnyYHMohHGc,4477
@@ -36,7 +38,6 @@ mgnify_pipelines_toolkit/analysis/shared/get_subunits_coords.py,sha256=EH5RyzesL
36
38
  mgnify_pipelines_toolkit/analysis/shared/library_strategy_check.py,sha256=6Ck2NhwRWw66GctUtKDdPT5fwJhWFR_YOZq-Vxwoa8A,1996
37
39
  mgnify_pipelines_toolkit/analysis/shared/mapseq2biom.py,sha256=7-U0DN1joVu0ifLOoDUK2Pfqy8rb1RDKT6khVg3jky0,5559
38
40
  mgnify_pipelines_toolkit/analysis/shared/markergene_study_summary.py,sha256=sKAo_rKEyVAZXSaIFMkpSoYZxiWwXMA3XDA6Z-hbHgg,7904
39
- mgnify_pipelines_toolkit/analysis/shared/study_summary_generator.py,sha256=OOqKaQmKGAya6_BZgfcWBZSVlmZ918PQTVMv6KwGIns,13827
40
41
  mgnify_pipelines_toolkit/constants/db_labels.py,sha256=omPINMylAjO2PxeFhSk2MbYNcGZH3P82optSlMey3dw,858
41
42
  mgnify_pipelines_toolkit/constants/ncrna.py,sha256=a_5hWp446S7BhRbe_JcydFgZM7sgPLuMlaiBvKWN_XM,1928
42
43
  mgnify_pipelines_toolkit/constants/regex_ambiguous_bases.py,sha256=7nEOODQq35y9wx9YnvJuo29oBpwTpXg_kIbf_t7N4TQ,1093
@@ -44,13 +45,13 @@ mgnify_pipelines_toolkit/constants/regex_fasta_header.py,sha256=G-xrc9b8zdmPTaOI
44
45
  mgnify_pipelines_toolkit/constants/tax_ranks.py,sha256=kMq__kOJcbiwsgolkdvb-XLo3WMnJdEXgedjUyMOYjI,1081
45
46
  mgnify_pipelines_toolkit/constants/thresholds.py,sha256=V_xDBk0RhS3hHeWqOacKzth2gM6zJABRPgwHy-Ciqfk,1157
46
47
  mgnify_pipelines_toolkit/constants/var_region_coordinates.py,sha256=0bM4MwarFiM5yTcp5AbAmQ0o-q-gWy7kknir9zJ9R0A,1312
47
- mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pnH8LUH8i2ACNvFNWyG-n-eIHZcI5O9UDYulkh43mec,7692
48
+ mgnify_pipelines_toolkit/schemas/schemas.py,sha256=pyDZvCuWbwccQF0D7c5BN1vv36wQdgcAUXU43_zAu74,18164
48
49
  mgnify_pipelines_toolkit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
50
  mgnify_pipelines_toolkit/utils/fasta_to_delimited.py,sha256=lgYIR1S4crURY7C7nFtgE6QMV4u4zCNsUrVkcRnsEEo,3996
50
51
  mgnify_pipelines_toolkit/utils/get_mpt_version.py,sha256=aS9bWrC9CP7tpxoEVg6eEYt18-pmjG7fJl5Mchz4YOU,798
51
- mgnify_pipelines_toolkit-1.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
52
- mgnify_pipelines_toolkit-1.1.0.dist-info/METADATA,sha256=ZBar6psIFlDE7DNfuDFjeX0HLKsgMwFn6ZW_ifMqEww,5810
53
- mgnify_pipelines_toolkit-1.1.0.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
54
- mgnify_pipelines_toolkit-1.1.0.dist-info/entry_points.txt,sha256=T8soGT2to8c_qafw-0itqCn4sjOnxlfaNWHIaHz4H54,3416
55
- mgnify_pipelines_toolkit-1.1.0.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
56
- mgnify_pipelines_toolkit-1.1.0.dist-info/RECORD,,
52
+ mgnify_pipelines_toolkit-1.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
53
+ mgnify_pipelines_toolkit-1.1.1.dist-info/METADATA,sha256=E86Tp9qJuQUrkNIklK4PEATQ4ovZfhRbgMKVTyxGSx0,5811
54
+ mgnify_pipelines_toolkit-1.1.1.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
55
+ mgnify_pipelines_toolkit-1.1.1.dist-info/entry_points.txt,sha256=JSjuxAr71MTeSUPPpno22wmZYgVO-gbsXfDkgWKkF7A,3533
56
+ mgnify_pipelines_toolkit-1.1.1.dist-info/top_level.txt,sha256=xA_wC7C01V3VwuDnqwRM2QYeJJ45WtvF6LVav4tYxuE,25
57
+ mgnify_pipelines_toolkit-1.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.0.0)
2
+ Generator: setuptools (80.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,7 +1,9 @@
1
1
  [console_scripts]
2
2
  add_rhea_chebi_annotation = mgnify_pipelines_toolkit.analysis.assembly.add_rhea_chebi_annotation:main
3
+ amplicon_study_summary_generator = mgnify_pipelines_toolkit.analysis.amplicon.study_summary_generator:cli
3
4
  antismash_gff_builder = mgnify_pipelines_toolkit.analysis.assembly.antismash_gff_builder:main
4
5
  are_there_primers = mgnify_pipelines_toolkit.analysis.amplicon.are_there_primers:main
6
+ assembly_study_summary_generator = mgnify_pipelines_toolkit.analysis.assembly.study_summary_generator:cli
5
7
  assess_inflection_point_mcp = mgnify_pipelines_toolkit.analysis.amplicon.assess_inflection_point_mcp:main
6
8
  assess_mcp_proportions = mgnify_pipelines_toolkit.analysis.amplicon.assess_mcp_proportions:main
7
9
  classify_var_regions = mgnify_pipelines_toolkit.analysis.amplicon.classify_var_regions:main
@@ -31,7 +33,6 @@ process_dbcan_clusters = mgnify_pipelines_toolkit.analysis.assembly.process_dbca
31
33
  remove_ambiguous_reads = mgnify_pipelines_toolkit.analysis.amplicon.remove_ambiguous_reads:main
32
34
  rev_comp_se_primers = mgnify_pipelines_toolkit.analysis.amplicon.rev_comp_se_primers:main
33
35
  standard_primer_matching = mgnify_pipelines_toolkit.analysis.amplicon.standard_primer_matching:main
34
- study_summary_generator = mgnify_pipelines_toolkit.analysis.shared.study_summary_generator:cli
35
36
  summarise_antismash_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_antismash_bgcs:main
36
37
  summarise_goslims = mgnify_pipelines_toolkit.analysis.assembly.summarise_goslims:main
37
38
  summarise_sanntis_bgcs = mgnify_pipelines_toolkit.analysis.assembly.summarise_sanntis_bgcs:main