pheval 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (33) hide show
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +28 -25
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +235 -0
  13. pheval/post_processing/post_processing.py +183 -303
  14. pheval/post_processing/validate_result_format.py +92 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
  19. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
  20. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/WHEEL +1 -1
  21. pheval/analyse/analysis.py +0 -104
  22. pheval/analyse/assess_prioritisation_base.py +0 -108
  23. pheval/analyse/benchmark_generator.py +0 -126
  24. pheval/analyse/benchmarking_data.py +0 -25
  25. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  26. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  27. pheval/analyse/generate_summary_outputs.py +0 -105
  28. pheval/analyse/parse_benchmark_summary.py +0 -81
  29. pheval/analyse/parse_corpus.py +0 -219
  30. pheval/analyse/prioritisation_result_types.py +0 -52
  31. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  32. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
  33. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -2,7 +2,9 @@ from pathlib import Path
2
2
  from typing import List, Optional
3
3
 
4
4
  import yaml
5
- from pydantic import BaseModel, root_validator
5
+ from pydantic import BaseModel, field_validator
6
+
7
+ from pheval.utils.logger import get_logger
6
8
 
7
9
 
8
10
  class RunConfig(BaseModel):
@@ -12,10 +14,10 @@ class RunConfig(BaseModel):
12
14
  Attributes:
13
15
  run_identifier (str): The run identifier.
14
16
  phenopacket_dir (str): The path to the phenopacket directory used for generating the results.
15
- results_dir (str): The path to the results directory.
16
- gene_analysis (bool): Whether or not to benchmark gene analysis results.
17
- variant_analysis (bool): Whether or not to benchmark variant analysis results.
18
- disease_analysis (bool): Whether or not to benchmark disease analysis results.
17
+ results_dir (str): The path to the result directory.
18
+ gene_analysis (bool): Whether to benchmark gene analysis results.
19
+ variant_analysis (bool): Whether to benchmark variant analysis results.
20
+ disease_analysis (bool): Whether to benchmark disease analysis results.
19
21
  threshold (Optional[float]): The threshold to consider for benchmarking.
20
22
  score_order (Optional[str]): The order of scores to consider for benchmarking, either ascending or descending.
21
23
  """
@@ -29,25 +31,15 @@ class RunConfig(BaseModel):
29
31
  threshold: Optional[float]
30
32
  score_order: Optional[str]
31
33
 
32
- @root_validator(pre=True)
33
- def handle_blank_fields(cls, values: dict) -> dict: # noqa: N805
34
- """
35
- Root validator to handle fields that may be explicitly set to None.
36
-
37
- This method checks if 'threshold' and 'score_order' are None and assigns default values if so.
38
-
39
- Args:
40
- values (dict): The input values provided to the model.
34
@field_validator("threshold", mode="before")
@classmethod
def set_threshold(cls, threshold):
    """
    Normalise a blank ``threshold`` field to ``None``.

    A missing/blank YAML field arrives here as ``None`` and is passed
    through unchanged. The previous ``threshold or None`` form also
    coerced an explicit threshold of ``0``/``0.0`` to ``None``, silently
    discarding a legitimate value — only genuinely missing values are
    treated as unset now.
    """
    return None if threshold is None else threshold
41
38
 
42
- Returns:
43
- dict: The updated values with defaults applied where necessary.
44
- """
45
- if values.get("threshold") is None:
46
- values["threshold"] = 0
47
- print("setting default threshold")
48
- if values.get("score_order") is None:
49
- values["score_order"] = "descending"
50
- return values
39
@field_validator("score_order", mode="before")
@classmethod
def set_score_order(cls, score_order):
    """Default a blank/missing ``score_order`` field to ``"descending"``."""
    if score_order:
        return score_order
    return "descending"
51
43
 
52
44
 
53
45
  class SinglePlotCustomisation(BaseModel):
@@ -66,22 +58,10 @@ class SinglePlotCustomisation(BaseModel):
66
58
  roc_curve_title: Optional[str]
67
59
  precision_recall_title: Optional[str]
68
60
 
69
- @root_validator(pre=True)
70
- def handle_blank_fields(cls, values: dict) -> dict: # noqa: N805
71
- """
72
- Root validator to handle fields that may be explicitly set to None.
73
-
74
- This method checks if 'plot_type' is None and assigns default value if so.
75
-
76
- Args:
77
- values (dict): The input values provided to the model.
78
-
79
- Returns:
80
- dict: The updated values with defaults applied where necessary.
81
- """
82
- if values.get("plot_type") is None:
83
- values["plot_type"] = "bar_cumulative"
84
- return values
61
@field_validator("plot_type", mode="before")
@classmethod
def set_plot_type(cls, plot_type):
    """Default a blank/missing ``plot_type`` field to ``"bar_cumulative"``."""
    return plot_type if plot_type else "bar_cumulative"
85
65
 
86
66
 
87
67
  class PlotCustomisation(BaseModel):
@@ -118,6 +98,8 @@ def parse_run_config(run_config: Path) -> Config:
118
98
  Returns:
119
99
  Config: The parsed run configurations.
120
100
  """
101
+ logger = get_logger()
102
+ logger.info(f"Loading benchmark configuration from {run_config}")
121
103
  with open(run_config, "r") as f:
122
104
  config_data = yaml.safe_load(f)
123
105
  f.close()
pheval/cli.py CHANGED
@@ -1,14 +1,16 @@
1
- """PhEval CLI Module """
1
+ """PhEval CLI Module"""
2
2
 
3
3
  import logging
4
4
 
5
5
  import click
6
6
 
7
+ from pheval.utils.logger import get_logger, initialise_context
8
+
7
9
  from .cli_pheval import run
8
10
  from .cli_pheval_utils import (
11
+ benchmark,
9
12
  create_spiked_vcfs_command,
10
- generate_benchmark_stats,
11
- generate_stats_plot,
13
+ generate_plots,
12
14
  prepare_corpus_command,
13
15
  scramble_phenopackets_command,
14
16
  semsim_scramble_command,
@@ -16,50 +18,51 @@ from .cli_pheval_utils import (
16
18
  update_phenopackets_command,
17
19
  )
18
20
 
19
- info_log = logging.getLogger("info")
21
+ logger = get_logger()
20
22
 
21
23
 
22
24
  @click.group()
23
25
  @click.option("-v", "--verbose", count=True)
24
- @click.option("-q", "--quiet")
25
- def main(verbose=1, quiet=False) -> None:
26
- """main CLI method for PhEval
27
-
28
- Args:
29
- verbose (int, optional): Verbose flag.
30
- quiet (bool, optional): Queit Flag.
31
- """
26
+ @click.option("-q", "--quiet", is_flag=True)
27
+ @click.pass_context
28
+ def main(ctx, verbose=1, quiet=False):
29
+ """Main CLI method for PhEval."""
30
+ initialise_context(ctx)
31
+
32
32
  if verbose >= 2:
33
- info_log.setLevel(level=logging.DEBUG)
33
+ logger.setLevel(logging.DEBUG)
34
34
  elif verbose == 1:
35
- info_log.setLevel(level=logging.INFO)
35
+ logger.setLevel(logging.INFO)
36
36
  else:
37
- info_log.setLevel(level=logging.WARNING)
37
+ logger.setLevel(logging.WARNING)
38
38
  if quiet:
39
- info_log.setLevel(level=logging.ERROR)
39
+ logger.setLevel(logging.ERROR)
40
40
 
41
41
 
42
- @click.group()
43
- def pheval():
42
+ @main.group()
43
+ @click.pass_context
44
+ def pheval(ctx):
44
45
  """pheval"""
46
+ initialise_context(ctx)
45
47
 
46
48
 
47
- pheval.add_command(run)
48
-
49
-
50
- @click.group()
51
- def pheval_utils():
49
+ @main.group()
50
+ @click.pass_context
51
+ def pheval_utils(ctx):
52
52
  """pheval_utils"""
53
+ initialise_context(ctx)
53
54
 
54
55
 
56
+ pheval.add_command(run)
57
+
55
58
  pheval_utils.add_command(semsim_scramble_command)
56
59
  pheval_utils.add_command(scramble_phenopackets_command)
57
60
  pheval_utils.add_command(update_phenopackets_command)
58
61
  pheval_utils.add_command(create_spiked_vcfs_command)
59
- pheval_utils.add_command(generate_benchmark_stats)
62
+ pheval_utils.add_command(benchmark)
60
63
  pheval_utils.add_command(semsim_to_exomiserdb_command)
61
- pheval_utils.add_command(generate_stats_plot)
62
64
  pheval_utils.add_command(prepare_corpus_command)
65
+ pheval_utils.add_command(generate_plots)
63
66
 
64
67
  if __name__ == "__main__":
65
68
  main()
@@ -5,9 +5,8 @@ from typing import List
5
5
 
6
6
  import click
7
7
 
8
- from pheval.analyse.analysis import benchmark_run_comparisons
9
- from pheval.analyse.generate_plots import generate_plots_from_benchmark_summary_db
10
- from pheval.analyse.run_data_parser import parse_run_config
8
+ from pheval.analyse.benchmark import benchmark_runs
9
+ from pheval.analyse.generate_plots import generate_plots_from_db
11
10
  from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets
12
11
  from pheval.prepare.create_spiked_vcf import spike_vcfs
13
12
  from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
@@ -353,12 +352,12 @@ def create_spiked_vcfs_command(
353
352
  help="Path to yaml configuration file for benchmarking.",
354
353
  type=Path,
355
354
  )
356
- def generate_benchmark_stats(
355
+ def benchmark(
357
356
  run_yaml: Path,
358
357
  ):
359
358
  """Benchmark the gene/variant/disease prioritisation performance for runs."""
360
- benchmark_run_comparisons(
361
- parse_run_config(run_yaml),
359
+ benchmark_runs(
360
+ run_yaml,
362
361
  )
363
362
 
364
363
 
@@ -426,12 +425,12 @@ def semsim_to_exomiserdb_command(
426
425
  help="Path to yaml configuration file for benchmarking.",
427
426
  type=Path,
428
427
  )
429
- def generate_stats_plot(
428
+ def generate_plots(
430
429
  benchmark_db: Path,
431
430
  run_data: Path,
432
431
  ):
433
432
  """Generate bar plot from benchmark db."""
434
- generate_plots_from_benchmark_summary_db(benchmark_db, run_data)
433
+ generate_plots_from_db(benchmark_db, run_data)
435
434
 
436
435
 
437
436
  @click.command("prepare-corpus")
@@ -0,0 +1,235 @@
1
+ from pathlib import Path
2
+ from typing import List
3
+
4
+ import polars as pl
5
+
6
+ from pheval.utils.phenopacket_utils import (
7
+ GenomicVariant,
8
+ PhenopacketUtil,
9
+ ProbandCausativeGene,
10
+ ProbandDisease,
11
+ phenopacket_reader,
12
+ )
13
+
14
+
15
class PhenopacketTruthSet:
    """Extract the known causative gene/variant/disease entities from a corpus of
    phenopackets and merge them with ranked prioritisation results for benchmarking."""

    def __init__(self, phenopacket_dir: Path):
        """
        Initialise the truth set.
        Args:
            phenopacket_dir (Path): Directory containing the phenopacket JSON files.
        """
        self.phenopacket_dir = phenopacket_dir

    def _get_phenopacket_path(self, phenopacket_name: str) -> Path:
        """
        Get the phenopacket path for a given phenopacket name.
        Args:
            phenopacket_name (str): Name of the phenopacket.
        Returns:
            Path: Path to the phenopacket file.
        Raises:
            FileNotFoundError: If no ``<name>.json`` file exists in the corpus directory.
        """
        phenopacket_path = self.phenopacket_dir.joinpath(f"{phenopacket_name}.json")
        if not phenopacket_path.exists():
            raise FileNotFoundError(f"{phenopacket_name} not found in corpus!")
        return phenopacket_path

    def _get_phenopacket_util(self, phenopacket_name: str) -> PhenopacketUtil:
        """
        Get the phenopacket util for a given phenopacket name.
        Args:
            phenopacket_name (str): Name of the phenopacket.
        Returns:
            PhenopacketUtil: PhenopacketUtil object wrapping the parsed phenopacket.
        """
        phenopacket_path = self._get_phenopacket_path(phenopacket_name)
        return PhenopacketUtil(phenopacket_reader(phenopacket_path))

    def _get_causative_genes(self, phenopacket_name: str) -> List[ProbandCausativeGene]:
        """
        Get the causative genes for a given phenopacket.
        Args:
            phenopacket_name (str): Name of the phenopacket.
        Returns:
            List[ProbandCausativeGene]: Diagnosed genes for the proband.
        """
        return self._get_phenopacket_util(phenopacket_name).diagnosed_genes()

    def _get_causative_variants(self, phenopacket_name: str) -> List[GenomicVariant]:
        """
        Get the causative variants for a given phenopacket.
        Args:
            phenopacket_name (str): Name of the phenopacket.
        Returns:
            List[GenomicVariant]: Diagnosed variants for the proband.
        """
        return self._get_phenopacket_util(phenopacket_name).diagnosed_variants()

    def _get_causative_diseases(self, phenopacket_name: str) -> List[ProbandDisease]:
        """
        Get the diagnosed diseases for a given phenopacket.
        Args:
            phenopacket_name (str): Name of the phenopacket.
        Returns:
            List[ProbandDisease]: Diagnosed diseases for the proband.
        """
        return self._get_phenopacket_util(phenopacket_name).diagnoses()

    @staticmethod
    def _with_truth_columns(entries: pl.DataFrame) -> pl.DataFrame:
        """
        Append the default columns shared by all truth-set entries.

        Truth entries carry a score of 0.0 and rank 0 (i.e. unranked by any
        tool) and are always flagged as true positives.
        Args:
            entries (pl.DataFrame): Identifier columns for the truth entities.
        Returns:
            pl.DataFrame: Input with score/rank/true_positive columns appended.
        """
        return entries.with_columns(
            [
                pl.lit(0.0).cast(pl.Float64).alias("score"),
                pl.lit(0).cast(pl.Int64).alias("rank"),
                pl.lit(True).alias("true_positive"),
            ]
        )

    def classified_gene(self, result_name: str) -> pl.DataFrame:
        """
        Build the gene truth entries for a given phenopacket.
        Args:
            result_name (str): Name of the result file (matches the phenopacket name).
        Returns:
            pl.DataFrame: Causative genes with default score/rank/true_positive columns.
        """
        causative_genes = self._get_causative_genes(result_name)
        truth = pl.DataFrame(
            {
                "gene_symbol": [gene.gene_symbol for gene in causative_genes],
                "gene_identifier": [gene.gene_identifier for gene in causative_genes],
            }
        )
        return self._with_truth_columns(truth)

    @staticmethod
    def merge_gene_results(ranked_results: pl.DataFrame, output_file: Path) -> pl.DataFrame:
        """
        Merge ranked gene results with the classified (truth) genes.

        Each ranked row is flagged ``true_positive`` when its gene symbol OR
        gene identifier appears in the truth set; truth genes absent from the
        ranked results are appended as unranked rows.
        Args:
            ranked_results (pl.DataFrame): Ranked gene results.
            output_file (Path): Parquet file containing the classified truth genes.
        Returns:
            pl.DataFrame: Merged ranked gene results.
        """
        classified_results = pl.read_parquet(output_file)
        flagged = ranked_results.with_columns(
            (
                pl.col("gene_symbol").is_in(classified_results["gene_symbol"])
                | pl.col("gene_identifier").is_in(classified_results["gene_identifier"])
            ).alias("true_positive")
        ).with_columns(pl.col("rank").cast(pl.Int64))
        # NOTE(review): true positives match on symbol OR identifier, but the
        # unranked remainder below is filtered on gene_symbol only — a truth
        # gene matched solely by identifier would also be appended here.
        # Confirm this asymmetry is intended.
        missing = classified_results.filter(
            ~pl.col("gene_symbol").is_in(ranked_results["gene_symbol"])
        )
        return flagged.select(classified_results.columns).vstack(missing)

    def classified_variant(self, result_name: str) -> pl.DataFrame:
        """
        Build the variant truth entries for a given phenopacket.
        Args:
            result_name (str): Name of the result file (matches the phenopacket name).
        Returns:
            pl.DataFrame: Causative variants with a derived ``variant_id`` and
            default score/rank/true_positive columns.
        """
        variants = self._get_causative_variants(result_name)
        truth = pl.DataFrame(
            {
                "chrom": [variant.chrom for variant in variants],
                "pos": [variant.pos for variant in variants],
                "ref": [variant.ref for variant in variants],
                "alt": [variant.alt for variant in variants],
            }
        ).with_columns(
            # "chrom-pos-ref-alt" composite key, e.g. "1-12345-A-T".
            pl.concat_str(["chrom", "pos", "ref", "alt"], separator="-").alias("variant_id")
        )
        return self._with_truth_columns(truth)

    @staticmethod
    def merge_variant_results(ranked_results: pl.DataFrame, output_file: Path) -> pl.DataFrame:
        """
        Merge ranked variant results with the classified (truth) variants.

        Variants are matched on the full (chrom, pos, ref, alt) tuple; truth
        variants absent from the ranked results are appended as unranked rows.
        Args:
            ranked_results (pl.DataFrame): Ranked variant results.
            output_file (Path): Parquet file containing the classified truth variants.
        Returns:
            pl.DataFrame: Merged ranked variant results.
        """
        classified_results = pl.read_parquet(output_file)
        # Polars expressions are immutable, so the composite key is safely reused.
        variant_key = pl.struct(["chrom", "pos", "ref", "alt"])
        flagged = ranked_results.with_columns(
            variant_key.is_in(
                classified_results.select(variant_key).to_series()
            ).alias("true_positive")
        ).with_columns(pl.col("rank").cast(pl.Int64))
        missing = classified_results.filter(
            ~variant_key.is_in(ranked_results.select(variant_key).to_series())
        )
        return flagged.select(classified_results.columns).vstack(missing)

    def classified_disease(self, result_name: str) -> pl.DataFrame:
        """
        Build the disease truth entries for a given phenopacket.
        Args:
            result_name (str): Name of the result file (matches the phenopacket name).
        Returns:
            pl.DataFrame: Unique causative disease identifiers with default
            score/rank/true_positive columns.
        """
        diseases = self._get_causative_diseases(result_name)
        # dict.fromkeys de-duplicates while preserving first-seen order, making
        # the output deterministic (a plain set() gives arbitrary ordering).
        disease_identifiers = list(
            dict.fromkeys(disease.disease_identifier for disease in diseases)
        )
        return self._with_truth_columns(
            pl.DataFrame({"disease_identifier": disease_identifiers})
        )

    @staticmethod
    def merge_disease_results(ranked_results: pl.DataFrame, output_file: Path) -> pl.DataFrame:
        """
        Merge ranked disease results with the classified (truth) diseases.

        Each ranked row is flagged ``true_positive`` when its disease identifier
        appears in the truth set; truth diseases absent from the ranked results
        are appended as unranked rows.
        Args:
            ranked_results (pl.DataFrame): Ranked disease results.
            output_file (Path): Parquet file containing the classified truth diseases.
        Returns:
            pl.DataFrame: Merged ranked disease results.
        """
        classified_results = pl.read_parquet(output_file)
        flagged = ranked_results.with_columns(
            pl.col("disease_identifier")
            .is_in(classified_results["disease_identifier"])
            .alias("true_positive")
        ).with_columns(pl.col("rank").cast(pl.Int64))
        missing = classified_results.filter(
            ~pl.col("disease_identifier").is_in(ranked_results["disease_identifier"])
        )
        return flagged.select(classified_results.columns).vstack(missing)