mgnify-pipelines-toolkit 1.2.10__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
This version of mgnify-pipelines-toolkit is flagged as a potentially problematic release.
- mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py +2 -1
- mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py +30 -69
- mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py +29 -60
- mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py +33 -81
- mgnify_pipelines_toolkit/schemas/dataframes.py +325 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/METADATA +25 -12
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/RECORD +11 -11
- mgnify_pipelines_toolkit/schemas/schemas.py +0 -738
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/WHEEL +0 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/entry_points.txt +0 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {mgnify_pipelines_toolkit-1.2.10.dist-info → mgnify_pipelines_toolkit-1.3.0.dist-info}/top_level.txt +0 -0
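Most of the churn in the three study summary generators below is mechanical: the dataframe schemas now live in the new mgnify_pipelines_toolkit/schemas/dataframes.py (the old schemas/schemas.py is deleted), imports are regrouped and sorted, and many statements that were previously wrapped across several lines are collapsed onto one. As a representative example, the schema import in the amplicon generator ends up as the block below (taken from the diff further down; the old module path is not fully captured there, but the deleted file suggests it was mgnify_pipelines_toolkit.schemas.schemas):

    from mgnify_pipelines_toolkit.schemas.dataframes import (
        AmpliconNonINSDCPassedRunsSchema,
        AmpliconPassedRunsSchema,
        PR2TaxonSchema,
        TaxonSchema,
        validate_dataframe,
    )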
mgnify_pipelines_toolkit/analysis/amplicon/primer_val_classification.py

@@ -167,7 +167,8 @@ def main():
 matched_primers_list.append(cleaned_primer_name)

 res_df = pd.DataFrame.from_dict(res_dict)
-
+res_tsv_name = f"./{sample}_primer_validation.tsv"
+res_df.to_csv(res_tsv_name, sep="\t", index=False) if not res_df.empty else open(res_tsv_name, "w").close()

 fwd_primers_fw.close()
 rev_primers_fw.close()
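The two added lines change what happens when no primers were classified: the TSV is written only if res_df has rows, and otherwise an empty file is created at the same path. A minimal standalone sketch of that pattern (the sample name and columns here are placeholders, not taken from the pipeline):

    import pandas as pd

    sample = "ERR0000001"  # placeholder accession
    res_df = pd.DataFrame(columns=["primer_name", "strand"])  # may or may not be empty

    res_tsv_name = f"./{sample}_primer_validation.tsv"
    if not res_df.empty:
        res_df.to_csv(res_tsv_name, sep="\t", index=False)
    else:
        open(res_tsv_name, "w").close()  # create an empty file in place of the table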
mgnify_pipelines_toolkit/analysis/amplicon/study_summary_generator.py

@@ -1,7 +1,5 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
-import shutil
-from shutil import SameFileError

 # Copyright 2024-2025 EMBL - European Bioinformatics Institute
 #
@@ -16,25 +14,27 @@ from shutil import SameFileError
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import click
-from collections import defaultdict
 import glob
 import logging
+import shutil
+from collections import defaultdict
 from pathlib import Path
-from
+from shutil import SameFileError
+from typing import List, Union

+import click
 import pandas as pd

-from mgnify_pipelines_toolkit.constants.db_labels import
+from mgnify_pipelines_toolkit.constants.db_labels import ASV_TAXDB_LABELS, TAXDB_LABELS
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
-_SILVA_TAX_RANKS,
 _PR2_TAX_RANKS,
+_SILVA_TAX_RANKS,
 )
-from mgnify_pipelines_toolkit.schemas.
-AmpliconPassedRunsSchema,
+from mgnify_pipelines_toolkit.schemas.dataframes import (
 AmpliconNonINSDCPassedRunsSchema,
-
+AmpliconPassedRunsSchema,
 PR2TaxonSchema,
+TaxonSchema,
 validate_dataframe,
 )

@@ -46,9 +46,7 @@ def cli():
 pass


-def get_tax_file(
-run_acc: str, analyses_dir: Path, db_label: str
-) -> Union[Path, List[Path]]:
+def get_tax_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path]]:
 """Takes path information for a particular analysis and db_label combo, and returns any existing files.

 :param run_acc: Run accession for the tax file that should be retrieved.
@@ -69,48 +67,32 @@ def get_tax_file(
 db_path = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}")

 if not db_path.exists():
-logging.debug(
-f"DB {db_path} doesn't exist for {run_acc}. Skipping"
-) # or error?
+logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
 return

 if db_label in TAXDB_LABELS:
-tax_file = Path(
-f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt"
-)
+tax_file = Path(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/{run_acc}_{db_label}.txt")
 if not tax_file.exists():
-logging.error(
-f"DB path exists but file doesn't - exiting. Path: {tax_file}"
-)
+logging.error(f"DB path exists but file doesn't - exiting. Path: {tax_file}")
 exit(1)

 file_size = tax_file.stat().st_size
-if (
-file_size == 0
-): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
 # so need to skip those. Should probably fix that at some point
-logging.debug(
-f"File {tax_file} exists but is empty, so will be skipping it."
-)
+logging.debug(f"File {tax_file} exists but is empty, so will be skipping it.")
 tax_file = None
 elif db_label in ASV_TAXDB_LABELS:
 # ASV tax files could have up to two files, one for each amplified region (maximum two from the pipeline).
 # So will need to handle this differently to closed-reference files
-asv_tax_files = glob.glob(
-
-)
-asv_tax_files = [
-Path(file) for file in asv_tax_files if "concat" not in file
-] # Have to filter out concatenated file if it exists
+asv_tax_files = glob.glob(f"{analyses_dir}/{run_acc}/taxonomy-summary/{db_label}/*.txt")
+asv_tax_files = [Path(file) for file in asv_tax_files if "concat" not in file] # Have to filter out concatenated file if it exists

 tax_file = asv_tax_files

 return tax_file


-def parse_one_tax_file(
-run_acc: str, tax_file: Path, long_tax_ranks: list
-) -> pd.DataFrame:
+def parse_one_tax_file(run_acc: str, tax_file: Path, long_tax_ranks: list) -> pd.DataFrame:
 """Parses a taxonomy file, and returns it as a pandas DataFrame object.

 :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -134,9 +116,7 @@ def parse_one_tax_file(
 elif len(long_tax_ranks) == 9:
 validate_dataframe(res_df, PR2TaxonSchema, str(tax_file))

-res_df["full_taxon"] = res_df.iloc[:, 1:].apply(
-lambda x: ";".join(x).strip(";"), axis=1
-)
+res_df["full_taxon"] = res_df.iloc[:, 1:].apply(lambda x: ";".join(x).strip(";"), axis=1)
 final_df = res_df.iloc[:, [0, -1]]
 final_df = final_df.set_index("full_taxon")
 final_df.columns = [run_acc]
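The collapsed line above builds the full_taxon string by joining every column after the first with ";" and trimming any trailing separators. A small self-contained illustration of that idiom on made-up data (the column names are illustrative, not the pipeline's):

    import pandas as pd

    res_df = pd.DataFrame(
        {
            "Count": [10, 3],
            "Kingdom": ["Bacteria", "Eukaryota"],
            "Phylum": ["Bacteroidota", ""],
            "Class": ["Bacteroidia", ""],
        }
    )
    # Join all rank columns into one ;-separated lineage, dropping trailing ;
    res_df["full_taxon"] = res_df.iloc[:, 1:].apply(lambda x: ";".join(x).strip(";"), axis=1)
    print(res_df["full_taxon"].tolist())
    # ['Bacteria;Bacteroidota;Bacteroidia', 'Eukaryota']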
@@ -144,9 +124,7 @@ def parse_one_tax_file(
 return final_df


-def generate_db_summary(
-db_label: str, tax_dfs: defaultdict[Path], output_prefix: str
-) -> None:
+def generate_db_summary(db_label: str, tax_dfs: defaultdict[Path], output_prefix: str) -> None:
 """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
 and respective db_label, joins them together, and generates a study-wide summary
 in the form of a .tsv file.
@@ -185,7 +163,6 @@ def generate_db_summary(
 )

 elif db_label in ASV_TAXDB_LABELS:
-
 if "PR2" in db_label:
 long_tax_ranks = _PR2_TAX_RANKS
 else:
@@ -196,13 +173,9 @@ def generate_db_summary(
 for (
 run_acc,
 tax_df_asv_lst,
-) in (
-tax_dfs.items()
-): # each `tax_file` will be a list containing at most two files (one for each amp_region)
+) in tax_dfs.items(): # each `tax_file` will be a list containing at most two files (one for each amp_region)
 for tax_df in tax_df_asv_lst:
-amp_region = str(tax_df).split("_")[
--5
-] # there are a lot of underscores in these names... but it is consistent
+amp_region = str(tax_df).split("_")[-5] # there are a lot of underscores in these names... but it is consistent
 # e.g. ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt
 amp_region_df = parse_one_tax_file(run_acc, tax_df, long_tax_ranks)
 amp_region_dict[amp_region].append(amp_region_df)
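The split("_")[-5] indexing relies on the fixed shape of the ASV Krona file names, as the in-code comment notes. Checking that indexing against the example name given in the comment:

    name = "ERR4334351_16S-V3-V4_DADA2-SILVA_asv_krona_counts.txt"
    parts = name.split("_")
    # ['ERR4334351', '16S-V3-V4', 'DADA2-SILVA', 'asv', 'krona', 'counts.txt']
    amp_region = parts[-5]
    print(amp_region)  # 16S-V3-V4

In the function itself the split runs on the whole file path, so counting from the end of the name keeps the index stable even if parent directories contain underscores.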
@@ -241,13 +214,9 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List

 temp_lst = summary_filename.split("_")
 if "asv_study_summary" in summary_filename:
-summary_db_label = "_".join(
-temp_lst[1:3]
-) # For ASVs we need to include the amp_region in the label
+summary_db_label = "_".join(temp_lst[1:3]) # For ASVs we need to include the amp_region in the label
 else:
-summary_db_label = temp_lst[
-1
-] # For closed reference, just the db_label is needed
+summary_db_label = temp_lst[1] # For closed reference, just the db_label is needed

 summaries_dict[summary_db_label].append(summary_path)

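The summary filenames themselves carry the database label (and, for ASVs, the amplified region). Purely for illustration, assuming per-database summary names shaped like PREFIX_SILVA-SSU_study_summary.tsv and PREFIX_DADA2-SILVA_16S-V3-V4_asv_study_summary.tsv (hypothetical layouts; the real names come from generate_db_summary and are not captured here), the label derivation works like this:

    for summary_filename in [
        "PREFIX_SILVA-SSU_study_summary.tsv",                   # hypothetical closed-reference summary
        "PREFIX_DADA2-SILVA_16S-V3-V4_asv_study_summary.tsv",   # hypothetical ASV summary
    ]:
        temp_lst = summary_filename.split("_")
        if "asv_study_summary" in summary_filename:
            summary_db_label = "_".join(temp_lst[1:3])  # here: db label plus amplified region
        else:
            summary_db_label = temp_lst[1]
        print(summary_db_label)
    # SILVA-SSU
    # DADA2-SILVA_16S-V3-V4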
@@ -273,18 +242,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[List
 help="Input directory to where all the individual analyses subdirectories for summarising",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
-@click.option(
-"-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
-)
+@click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
 @click.option(
 "--non_insdc",
 default=False,
 is_flag=True,
 help="If run accessions aren't INSDC-formatted",
 )
-def summarise_analyses(
-runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
-) -> None:
+def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
 """Function that will take a file of pipeline-successful run accessions
 that should be used for the generation of the relevant db-specific
 study-level summary files. For ASV results, these will also be on a
@@ -302,16 +267,14 @@ def summarise_analyses(
 """
 runs_df = pd.read_csv(runs, names=["run", "status"])

+# Run validation on the successful_runs .csv file
 if not non_insdc:
-AmpliconPassedRunsSchema(
-runs_df
-) # Run validation on the successful_runs .csv file
+AmpliconPassedRunsSchema(runs_df)
 else:
 AmpliconNonINSDCPassedRunsSchema(runs_df)

 all_db_labels = TAXDB_LABELS + ASV_TAXDB_LABELS
 for db_label in all_db_labels:
-
 tax_files = defaultdict(Path)
 for i in range(0, len(runs_df)):
 run_acc = runs_df.loc[i, "run"]
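The --non_insdc flag only switches which schema class validates the two-column run list (run, status); the schemas themselves are defined in the new schemas/dataframes.py, which is not shown in this diff. Purely as an illustration of the kind of check the INSDC variant might perform, here is a standalone sketch that flags run accessions not matching the common INSDC run-accession pattern (the regex is an assumption for this example, not the package's actual rule):

    import pandas as pd

    # Made-up run list with one deliberately malformed accession
    runs_df = pd.DataFrame({"run": ["ERR0000001", "not-an-accession"], "status": ["ok", "ok"]})

    bad = runs_df.loc[~runs_df["run"].astype(str).str.match(r"^[EDS]RR\d+$"), "run"]
    if not bad.empty:
        raise ValueError(f"Non-INSDC run accessions found: {', '.join(bad.astype(str))}")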
@@ -376,9 +339,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 index_label="taxonomy",
 )
 elif len(summaries) == 1:
-logging.info(
-f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
-)
+logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries[0], merged_summary_name)
 except SameFileError:
mgnify_pipelines_toolkit/analysis/assembly/study_summary_generator.py

@@ -14,39 +14,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import click
-from functools import reduce
 import glob
 import logging
+from functools import reduce
 from pathlib import Path
 from typing import Literal

+import click
 import pandas as pd

-from mgnify_pipelines_toolkit.schemas.
+from mgnify_pipelines_toolkit.schemas.dataframes import (
+AntismashStudySummarySchema,
+AntismashSummarySchema,
 CompletedAnalysisSchema,
-
+GOStudySummarySchema,
 GOSummarySchema,
+InterProStudySummarySchema,
 InterProSummarySchema,
-
-SanntisSummarySchema,
-AntismashSummarySchema,
-PFAMSummarySchema,
+KEGGModulesStudySummarySchema,
 KEGGModulesSummarySchema,
-GOStudySummarySchema,
-InterProStudySummarySchema,
-TaxonomyStudySummarySchema,
 KOStudySummarySchema,
-
-AntismashStudySummarySchema,
+KOSummarySchema,
 PFAMStudySummarySchema,
-
+PFAMSummarySchema,
+SanntisStudySummarySchema,
+SanntisSummarySchema,
+TaxonomyStudySummarySchema,
+TaxonSchema,
 validate_dataframe,
 )

-logging.basicConfig(
-level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
-)
+logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

 # Keys are the original column names in the input files,
 # values are the standardised column names used in the generated study summary files
@@ -173,9 +171,7 @@ def check_files_exist(file_list: list[Path]) -> None:
 """
 missing_files = [str(path) for path in file_list if not path.is_file()]
 if missing_files:
-raise FileNotFoundError(
-f"The following required files are missing: {', '.join(missing_files)}"
-)
+raise FileNotFoundError(f"The following required files are missing: {', '.join(missing_files)}")


 def generate_taxonomy_summary(
@@ -206,9 +202,7 @@ def generate_taxonomy_summary(
 df = validate_dataframe(df, TaxonSchema, str(path))

 # Combine all taxonomic ranks in the classification into a single string
-df["full_taxon"] = (
-df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")
-)
+df["full_taxon"] = df[TAXONOMY_COLUMN_NAMES[1:]].agg(";".join, axis=1).str.strip(";")

 # Create a new DataFrame with taxonomy as index and count as the only column
 result = df[["Count", "full_taxon"]].set_index("full_taxon")
@@ -229,9 +223,7 @@ def generate_functional_summary(
 file_dict: dict[str, Path],
 column_names: dict[str, str],
 output_prefix: str,
-label: Literal[
-"go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
-],
+label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
 outdir: Path = None,
 allow_missing: bool = False,
 ) -> None:
@@ -292,9 +284,7 @@ def generate_functional_summary(
 check_files_exist(list(file_dict.values()))
 except FileNotFoundError as e:
 if allow_missing:
-logging.warning(
-f"One of the expected files is missing, but this is allowed for {label}."
-)
+logging.warning(f"One of the expected files is missing, but this is allowed for {label}.")
 logging.warning(e)
 return
 raise
@@ -324,9 +314,7 @@ def generate_functional_summary(
 dfs.append(df)

 if not dfs:
-logging.warning(
-f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}."
-)
+logging.warning(f"No valid files with functional annotation summary were found. Skipping creation of {output_file_name}.")
 return

 # Merge all dataframes on the renamed metadata columns
@@ -384,9 +372,7 @@ def generate_functional_summary(
 help="Directory for the output files, by default it will use the current working directory.",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
-def summarise_analyses(
-assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path
-) -> None:
+def summarise_analyses(assemblies: Path, study_dir: Path, output_prefix: str, outdir: Path) -> None:
 """
 Generate study-level summaries for successfully proccessed assemblies.

@@ -405,16 +391,11 @@ def summarise_analyses(
 Construct file paths for each assembly given a subdirectory and filename template.
 Template must contain {acc} as a placeholder.
 """
-return {
-acc: study_dir / acc / subdir / filename_template.format(acc=acc)
-for acc in assembly_list
-}
+return {acc: study_dir / acc / subdir / filename_template.format(acc=acc) for acc in assembly_list}

 logging.info("Start processing of assembly-level summaries.")

-logging.info(
-"Generating taxonomy summary from assembly-level summaries <accession>.krona.txt"
-)
+logging.info("Generating taxonomy summary from assembly-level summaries <accession>.krona.txt")
 generate_taxonomy_summary(
 get_file_paths("taxonomy", "{acc}.krona.txt.gz"),
 f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}",
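The collapsed get_file_paths helper is a plain dict comprehension mapping each assembly accession to the expected file path under its analysis subdirectory. A standalone sketch with placeholder accessions:

    from pathlib import Path

    study_dir = Path("study_dir")                  # placeholder
    assembly_list = ["ERZ100001", "ERZ100002"]     # placeholder accessions
    subdir, filename_template = "taxonomy", "{acc}.krona.txt.gz"

    file_paths = {
        acc: study_dir / acc / subdir / filename_template.format(acc=acc)
        for acc in assembly_list
    }
    # {'ERZ100001': PosixPath('study_dir/ERZ100001/taxonomy/ERZ100001.krona.txt.gz'), ...}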
@@ -422,9 +403,7 @@ def summarise_analyses(
 )

 for summary_type, config in SUMMARY_TYPES_MAP.items():
-logging.info(
-f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz"
-)
+logging.info(f"Generating study-level {summary_type.capitalize()} summary from file <accession>_{summary_type}_summary.tsv.gz")
 generate_functional_summary(
 get_file_paths(config["folder"], f"{{acc}}_{summary_type}_summary.tsv.gz"),
 config["column_names"],
@@ -469,9 +448,7 @@ def merge_summaries(study_dir: str, output_prefix: str) -> None:

 logging.info("Generating combined assembly-level summaries")
 logging.info("Parsing summary files for taxonomic classification")
-merge_taxonomy_summaries(
-get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}"
-)
+merge_taxonomy_summaries(get_file_paths("taxonomy"), f"{output_prefix}_taxonomy_{OUTPUT_SUFFIX}")

 for summary_type, config in SUMMARY_TYPES_MAP.items():
 logging.info(f"Parsing summary files for {summary_type.capitalize()}.")
@@ -500,9 +477,7 @@ def merge_taxonomy_summaries(summary_files: list[str], output_file_name: str) ->
 sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia;o__Primates 118 94
 """
 if not summary_files:
-raise FileNotFoundError(
-"The required taxonomic classification summary files are missing. Exiting."
-)
+raise FileNotFoundError("The required taxonomic classification summary files are missing. Exiting.")

 summary_dfs = []
 for file in summary_files:
@@ -527,9 +502,7 @@ def merge_functional_summaries(
 summary_files: list[str],
 merge_keys: list[str],
 output_prefix: str,
-label: Literal[
-"go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"
-],
+label: Literal["go", "goslim", "interpro", "ko", "sanntis", "antismash", "pfam", "kegg_modules"],
 ) -> None:
 """
 Merge multiple functional study-level summary files into a single study-level summary.
@@ -580,9 +553,7 @@ def merge_functional_summaries(
 output_file_name = f"{output_prefix}_{label}_{OUTPUT_SUFFIX}"

 if not summary_files:
-logging.warning(
-f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation."
-)
+logging.warning(f"Skipping creation of {output_file_name} because no summaries were found for this type of functional annotation.")
 return

 validation_schema = SUMMARY_TYPES_MAP[label]["study_schema"]
@@ -596,9 +567,7 @@
 if len(dfs) == 1:
 merged_df = dfs[0]
 else:
-merged_df = reduce(
-lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs
-)
+merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)

 # Identify non-key columns (i.e. counts)
 value_columns = [col for col in merged_df.columns if col not in merge_keys]
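The single-line reduce above outer-merges the loaded summary tables pairwise on the shared key columns, so annotation entries present in only some tables are kept rather than dropped. A small self-contained example of the same idiom (toy GO-style keys and made-up counts; the real merge keys are passed in by the caller):

    from functools import reduce

    import pandas as pd

    merge_keys = ["GO", "description"]
    dfs = [
        pd.DataFrame({"GO": ["GO:0008150"], "description": ["biological_process"], "ERZ100001": [5]}),
        pd.DataFrame(
            {
                "GO": ["GO:0008150", "GO:0003674"],
                "description": ["biological_process", "molecular_function"],
                "ERZ100002": [2, 7],
            }
        ),
    ]
    merged_df = reduce(lambda left, right: pd.merge(left, right, on=merge_keys, how="outer"), dfs)
    # GO:0003674 appears only in the second frame, so its ERZ100001 value is NaN after the merge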
mgnify_pipelines_toolkit/analysis/rawreads/study_summary_generator.py

@@ -14,32 +14,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import shutil
-from shutil import SameFileError
-
-import click
-from collections import defaultdict
 import glob
 import logging
+import shutil
+from collections import defaultdict
 from pathlib import Path
-from
+from shutil import SameFileError
+from typing import List, Union

+import click
 import pandas as pd

 from mgnify_pipelines_toolkit.constants.db_labels import (
-RRAP_TAXDB_LABELS,
 RRAP_FUNCDB_LABELS,
+RRAP_TAXDB_LABELS,
 )
 from mgnify_pipelines_toolkit.constants.tax_ranks import (
-_SILVA_TAX_RANKS,
 _MOTUS_TAX_RANKS,
+_SILVA_TAX_RANKS,
 )
-from mgnify_pipelines_toolkit.schemas.
-
+from mgnify_pipelines_toolkit.schemas.dataframes import (
+FunctionProfileSchema,
+MotusTaxonSchema,
 RawReadsNonINSDCPassedRunsSchema,
+RawReadsPassedRunsSchema,
 TaxonSchema,
-MotusTaxonSchema,
-FunctionProfileSchema,
 validate_dataframe,
 )

@@ -51,9 +50,7 @@ def cli():
 pass


-def get_file(
-run_acc: str, analyses_dir: Path, db_label: str
-) -> Union[Path, List[Path], None]:
+def get_file(run_acc: str, analyses_dir: Path, db_label: str) -> Union[Path, List[Path], None]:
 """Takes path information for a particular analysis and db_label combo, and returns any existing files.

 :param run_acc: Run accession for the tax file that should be retrieved.
@@ -78,28 +75,18 @@ def get_file(
 db_path = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}")

 if not db_path.exists():
-logging.debug(
-f"DB {db_path} doesn't exist for {run_acc}. Skipping"
-) # or error?
+logging.debug(f"DB {db_path} doesn't exist for {run_acc}. Skipping") # or error?
 return

-analysis_file = Path(
-f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz"
-)
+analysis_file = Path(f"{analyses_dir}/{run_acc}/{db_dir}/{db_label}/{run_acc}_{db_label}.txt.gz")
 if not analysis_file.exists():
-logging.error(
-f"DB path exists but file doesn't - exiting. Path: {analysis_file}"
-)
+logging.error(f"DB path exists but file doesn't - exiting. Path: {analysis_file}")
 exit(1)

 file_size = analysis_file.stat().st_size
-if (
-file_size == 0
-): # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
+if file_size == 0: # Pipeline can generate files that are empty for ITS DBs (UNITE and ITSoneDB),
 # so need to skip those. Should probably fix that at some point
-logging.debug(
-f"File {analysis_file} exists but is empty, so will be skipping it."
-)
+logging.debug(f"File {analysis_file} exists but is empty, so will be skipping it.")
 analysis_file = None

 return analysis_file
@@ -130,21 +117,13 @@ def parse_one_tax_file(run_acc: str, tax_file: Path, db_label: str) -> pd.DataFr
 str(tax_file),
 )

-res_df["full_taxon"] = [
-
-]
-final_df = (
-res_df[["Count", "full_taxon"]]
-.set_index("full_taxon")
-.rename(columns={"Count": run_acc})
-)
+res_df["full_taxon"] = [";".join(r[tax_ranks]).strip(";") for _, r in res_df.iterrows()]
+final_df = res_df[["Count", "full_taxon"]].set_index("full_taxon").rename(columns={"Count": run_acc})

 return final_df


-def parse_one_func_file(
-run_acc: str, func_file: Path, db_label: str
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+def parse_one_func_file(run_acc: str, func_file: Path, db_label: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 """Parses a functional profile file, and returns it as a pandas DataFrame object.

 :param run_acc: Run accession of the taxonomy file that will be parsed.
@@ -170,24 +149,16 @@ def parse_one_func_file(
 if res_df.shape[0] > 0:
 validate_dataframe(res_df, FunctionProfileSchema, str(func_file))

-count_df = pd.DataFrame(res_df[["read_count"]]).rename(
-columns={"read_count": run_acc}
-)
+count_df = pd.DataFrame(res_df[["read_count"]]).rename(columns={"read_count": run_acc})

-depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(
-columns={"coverage_depth": run_acc}
-)
+depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(columns={"coverage_depth": run_acc})

-breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(
-columns={"coverage_breadth": run_acc}
-)
+breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(columns={"coverage_breadth": run_acc})

 return count_df, depth_df, breadth_df


-def generate_db_summary(
-db_label: str, analysis_dfs: dict[str, Path], output_prefix: str
-) -> None:
+def generate_db_summary(db_label: str, analysis_dfs: dict[str, Path], output_prefix: str) -> None:
 """Takes paired run accessions taxonomy dataframes in the form of a dictionary,
 and respective db_label, joins them together, and generates a study-wide summary
 in the form of a .tsv file.
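Each collapsed line above slices one metric column out of the per-run functional profile and renames it to the run accession, producing three single-column frames (read counts, coverage depth, coverage breadth) that can later be joined across runs. A standalone sketch on made-up data:

    import pandas as pd

    run_acc = "ERR0000001"  # placeholder
    res_df = pd.DataFrame(
        {"read_count": [12, 4], "coverage_depth": [1.5, 0.2], "coverage_breadth": [0.8, 0.1]},
        index=["K00001", "K00002"],  # hypothetical function identifiers
    )

    count_df = pd.DataFrame(res_df[["read_count"]]).rename(columns={"read_count": run_acc})
    depth_df = pd.DataFrame(res_df[["coverage_depth"]]).rename(columns={"coverage_depth": run_acc})
    breadth_df = pd.DataFrame(res_df[["coverage_breadth"]]).rename(columns={"coverage_breadth": run_acc})
    # Each frame now has a single column named after the run, indexed by function identifier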
@@ -225,9 +196,7 @@ def generate_db_summary(
 breadth_df_list = []

 for run_acc, analysis_df in analysis_dfs.items():
-count_df, depth_df, breadth_df = parse_one_func_file(
-run_acc, analysis_df, db_label
-)
+count_df, depth_df, breadth_df = parse_one_func_file(run_acc, analysis_df, db_label)
 count_df_list.append(count_df)
 depth_df_list.append(depth_df)
 breadth_df_list.append(breadth_df)
@@ -308,18 +277,14 @@ def organise_study_summaries(all_study_summaries: List[str]) -> defaultdict[str,
 help="Input directory to where all the individual analyses subdirectories for summarising",
 type=click.Path(exists=True, path_type=Path, file_okay=False),
 )
-@click.option(
-"-p", "--output_prefix", required=True, help="Prefix to summary files", type=str
-)
+@click.option("-p", "--output_prefix", required=True, help="Prefix to summary files", type=str)
 @click.option(
 "--non_insdc",
 default=False,
 is_flag=True,
 help="If run accessions aren't INSDC-formatted",
 )
-def summarise_analyses(
-runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool
-) -> None:
+def summarise_analyses(runs: Path, analyses_dir: Path, output_prefix: str, non_insdc: bool) -> None:
 """Function that will take a file of pipeline-successful run accessions
 that should be used for the generation of the relevant db-specific
 study-level summary files.
@@ -337,15 +302,12 @@ def summarise_analyses(
 runs_df = pd.read_csv(runs, names=["run", "status"])

 if not non_insdc:
-RawReadsPassedRunsSchema(
-runs_df
-) # Run validation on the successful_runs .csv file
+RawReadsPassedRunsSchema(runs_df) # Run validation on the successful_runs .csv file
 else:
 RawReadsNonINSDCPassedRunsSchema(runs_df)

 all_db_labels = RRAP_TAXDB_LABELS + RRAP_FUNCDB_LABELS
 for db_label in all_db_labels:
-
 analysis_files = {}
 for run_acc in runs_df["run"]:
 analysis_file = get_file(run_acc, analyses_dir, db_label)
@@ -410,9 +372,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 index_label="taxonomy",
 )
 elif len(summaries) == 1:
-logging.info(
-f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}"
-)
+logging.info(f"Only one summary ({summaries[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries[0], merged_summary_name)
 except SameFileError:
@@ -420,21 +380,15 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:

 if db_label in RRAP_FUNCDB_LABELS:
 for table_type in ["read-count", "coverage-depth", "coverage-breadth"]:
-merged_summary_name =
-
-)
-summaries_ = [
-v for v in summaries if Path(v).stem.split("_")[2] == table_type
-]
+merged_summary_name = f"{output_prefix}_{db_label}_{table_type}_study_summary.tsv"
+summaries_ = [v for v in summaries if Path(v).stem.split("_")[2] == table_type]
 if len(summaries_) > 1:
 res_df = pd.read_csv(summaries_[0], sep="\t", index_col=0)
 for summary in summaries_[1:]:
 curr_df = pd.read_csv(summary, sep="\t", index_col=0)
 res_df = res_df.join(curr_df, how="outer")
 res_df = res_df.fillna(0)
-res_df = res_df.astype(
-int if table_type == "read-count" else float
-)
+res_df = res_df.astype(int if table_type == "read-count" else float)

 res_df = res_df.reindex(sorted(res_df.columns), axis=1)
 res_df.to_csv(
@@ -444,9 +398,7 @@ def merge_summaries(analyses_dir: str, output_prefix: str) -> None:
 float_format="%.6g",
 )
 elif len(summaries_) == 1:
-logging.info(
-f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}"
-)
+logging.info(f"Only one summary ({summaries_[0]}) so will use that as {merged_summary_name}")
 try:
 shutil.copyfile(summaries_[0], merged_summary_name)
 except SameFileError:
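For the functional databases, merge_summaries combines several study summaries of the same table type by outer-joining them on the index, filling gaps with 0 and casting read counts back to int (the coverage tables stay float). A minimal sketch of that merge step with two made-up read-count summaries:

    import pandas as pd

    table_type = "read-count"
    a = pd.DataFrame({"ERR0000001": [3, 0]}, index=["K00001", "K00002"])
    b = pd.DataFrame({"ERR0000002": [5]}, index=["K00002"])

    res_df = a.join(b, how="outer")
    res_df = res_df.fillna(0)
    res_df = res_df.astype(int if table_type == "read-count" else float)
    res_df = res_df.reindex(sorted(res_df.columns), axis=1)
    # K00001 gets 0 for ERR0000002, and every value is an int because this is a read-count table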