pheval 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pheval might be problematic.
Files changed (33)
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +27 -24
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +250 -0
  13. pheval/post_processing/post_processing.py +179 -345
  14. pheval/post_processing/validate_result_format.py +91 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/METADATA +4 -4
  19. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/RECORD +22 -26
  20. pheval/analyse/analysis.py +0 -104
  21. pheval/analyse/assess_prioritisation_base.py +0 -108
  22. pheval/analyse/benchmark_generator.py +0 -126
  23. pheval/analyse/benchmarking_data.py +0 -25
  24. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  25. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  26. pheval/analyse/generate_summary_outputs.py +0 -105
  27. pheval/analyse/parse_benchmark_summary.py +0 -81
  28. pheval/analyse/parse_corpus.py +0 -219
  29. pheval/analyse/prioritisation_result_types.py +0 -52
  30. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  31. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/LICENSE +0 -0
  32. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/WHEEL +0 -0
  33. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/entry_points.txt +0 -0
pheval/analyse/benchmark.py
@@ -0,0 +1,156 @@
+ import time
+ from pathlib import Path
+ from typing import List, Tuple
+
+ import duckdb
+ import polars as pl
+
+ from pheval.analyse.benchmark_db_manager import write_table
+ from pheval.analyse.benchmark_output_type import BenchmarkOutputType, BenchmarkOutputTypeEnum
+ from pheval.analyse.binary_classification_curves import compute_curves
+ from pheval.analyse.binary_classification_stats import compute_confusion_matrix
+ from pheval.analyse.generate_plots import generate_plots
+ from pheval.analyse.generate_rank_comparisons import calculate_rank_changes
+ from pheval.analyse.rank_stats import compute_rank_stats
+ from pheval.analyse.run_data_parser import Config, RunConfig, parse_run_config
+ from pheval.utils.logger import get_logger
+
+
+ def scan_directory(run: RunConfig, benchmark_type: BenchmarkOutputType) -> pl.LazyFrame:
+     """
+     Scan a results directory containing pheval parquet standardised results and return a LazyFrame object.
+     Args:
+         run (RunConfig): RunConfig object.
+         benchmark_type (BenchmarkOutputType): Benchmark output type.
+     Returns:
+         pl.LazyFrame: LazyFrame object containing all the results in the directory.
+     """
+     logger = get_logger()
+     logger.info(f"Analysing results in {run.results_dir.joinpath(benchmark_type.result_directory)}")
+     return (
+         pl.scan_parquet(
+             run.results_dir.joinpath(benchmark_type.result_directory),
+             include_file_paths="file_path",
+         ).with_columns(
+             pl.col("rank").cast(pl.Int64),
+             pl.col("file_path").str.extract(r"([^/\\]+)$").alias("result_file"),
+             pl.col("true_positive").fill_null(False),
+         )
+     ).filter(
+         (
+             pl.col("score") >= run.threshold
+             if run.score_order.lower() == "descending"
+             else pl.col("score") <= run.threshold
+         )
+         if run.threshold is not None
+         else True
+     )
+
+
+ def process_stats(
+     runs: List[RunConfig], benchmark_type: BenchmarkOutputType
+ ) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
+     """
+     Processes stats outputs for specified runs to compare.
+     Args:
+         runs (List[RunConfig]): List of runs to benchmark.
+         benchmark_type (BenchmarkOutputType): Benchmark output type.
+     Returns:
+         Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]: The stats for all runs.
+     """
+     stats, curve_results, true_positive_cases = [], [], []
+     for run in runs:
+         result_scan = scan_directory(run, benchmark_type)
+         stats.append(
+             compute_rank_stats(run.run_identifier, result_scan).join(
+                 compute_confusion_matrix(run.run_identifier, result_scan), on="run_identifier"
+             )
+         )
+         curve_results.append(compute_curves(run.run_identifier, result_scan))
+         true_positive_cases.append(
+             result_scan.filter(pl.col("true_positive")).select(
+                 ["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)]
+             )
+         )
+     return (
+         pl.concat(stats, how="vertical").collect(),
+         pl.concat(curve_results, how="vertical").collect(),
+         pl.concat(true_positive_cases, how="align_inner").collect(),
+     )
+
+
+ def benchmark(config: Config, benchmark_type: BenchmarkOutputType) -> None:
+     """
+     Benchmark results for specified runs for a specified prioritisation type for comparison.
+     Args:
+         config (Config): Configuration for benchmarking.
+         benchmark_type (BenchmarkOutputType): Benchmark output type.
+     """
+     conn = duckdb.connect(f"{config.benchmark_name}.duckdb")
+     stats, curve_results, true_positive_cases = process_stats(config.runs, benchmark_type)
+     write_table(
+         conn, stats, f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_summary"
+     )
+     write_table(
+         conn,
+         curve_results,
+         f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_binary_classification_curves",
+     )
+     calculate_rank_changes(
+         conn, [run.run_identifier for run in config.runs], true_positive_cases, benchmark_type
+     )
+     generate_plots(
+         config.benchmark_name, stats, curve_results, benchmark_type, config.plot_customisation
+     )
+     conn.close()
+
+
+ def benchmark_runs(benchmark_config_file: Path) -> None:
+     """
+     Benchmark results for specified runs for comparison.
+     Args:
+         benchmark_config_file (Path): Path to benchmark config file.
+     """
+     logger = get_logger()
+     start_time = time.perf_counter()
+     logger.info("Initiated benchmarking process.")
+     config = parse_run_config(benchmark_config_file)
+     gene_analysis_runs = [run for run in config.runs if run.gene_analysis]
+     variant_analysis_runs = [run for run in config.runs if run.variant_analysis]
+     disease_analysis_runs = [run for run in config.runs if run.disease_analysis]
+     if gene_analysis_runs:
+         logger.info("Initiating benchmarking for gene results.")
+         benchmark(
+             Config(
+                 benchmark_name=config.benchmark_name,
+                 runs=gene_analysis_runs,
+                 plot_customisation=config.plot_customisation,
+             ),
+             BenchmarkOutputTypeEnum.GENE.value,
+         )
+         logger.info("Finished benchmarking for gene results.")
+     if variant_analysis_runs:
+         logger.info("Initiating benchmarking for variant results.")
+         benchmark(
+             Config(
+                 benchmark_name=config.benchmark_name,
+                 runs=variant_analysis_runs,
+                 plot_customisation=config.plot_customisation,
+             ),
+             BenchmarkOutputTypeEnum.VARIANT.value,
+         )
+         logger.info("Finished benchmarking for variant results.")
+     if disease_analysis_runs:
+         logger.info("Initiating benchmarking for disease results.")
+         benchmark(
+             Config(
+                 benchmark_name=config.benchmark_name,
+                 runs=disease_analysis_runs,
+                 plot_customisation=config.plot_customisation,
+             ),
+             BenchmarkOutputTypeEnum.DISEASE.value,
+         )
+         logger.info("Finished benchmarking for disease results.")
+     logger.info(
+         f"Finished benchmarking! Total time: {time.perf_counter() - start_time:.2f} seconds."
+     )
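For orientation, a minimal driver sketch (not part of the diff) showing how the new benchmark_runs entry point might be invoked directly; the file name "benchmark_config.yaml" is a placeholder, and the exact config schema is whatever parse_run_config in run_data_parser.py accepts.

# Hypothetical driver script; the config path is a placeholder and its schema
# is defined by pheval.analyse.run_data_parser.parse_run_config.
from pathlib import Path

from pheval.analyse.benchmark import benchmark_runs

if __name__ == "__main__":
    # Parses the config, splits runs by their gene/variant/disease analysis flags,
    # and writes the benchmark results into a <benchmark_name>.duckdb database.
    benchmark_runs(Path("benchmark_config.yaml"))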
pheval/analyse/benchmark_db_manager.py
@@ -1,141 +1,23 @@
- import ast
- import re
- from typing import List, Type, Union
-
- import duckdb
+ import polars as pl
  from duckdb import DuckDBPyConnection

- from pheval.post_processing.post_processing import (
-     RankedPhEvalDiseaseResult,
-     RankedPhEvalGeneResult,
-     RankedPhEvalVariantResult,
- )
-
-
- class BenchmarkDBManager:
-     """
-     Class to connect to database.
-     """
-
-     def __init__(self, benchmark_name: str):
-         """Initialise the BenchmarkDBManager class."""
-         self.conn = self.get_connection(
-             f"{benchmark_name}" if str(benchmark_name).endswith(".db") else f"{benchmark_name}.db"
-         )
-
-     def initialise(self):
-         """Initialise the duckdb connection."""
-         self.add_contains_function()
+ from pheval.utils.logger import get_logger

-     @staticmethod
-     def get_connection(db_name: str) -> DuckDBPyConnection:
-         """
-         Get a connection to the database.
-         Returns:
-             DuckDBPyConnection: Connection to the database.
-         """
-         conn = duckdb.connect(db_name)
-         return conn
+ logger = get_logger()

-     def add_column_integer_default(self, table_name: str, column: str, default: int = 0) -> None:
-         """
-         Add a column to an existing table with an integer default value.
-         Args:
-             table_name (str): Name of the table.
-             column (str): Name of the column to add.
-             default (int): Default integer value to add.
-         """
-         try:
-             self.conn.execute(
-                 f'ALTER TABLE "{table_name}" ADD COLUMN "{column}" INTEGER DEFAULT {default}'
-             )
-             self.conn.execute(f'UPDATE "{table_name}" SET "{column}" = ?', (default,))
-             self.conn.commit()
-         except duckdb.CatalogException:
-             pass

-     def drop_table(self, table_name: str) -> None:
-         """
-         Drop a table from the database.
-         Args:
-             table_name: Name of the table to drop from the database
-         """
-         self.conn.execute(f"""DROP TABLE IF EXISTS "{table_name}";""")
+ def load_table_lazy(table_name: str, conn: DuckDBPyConnection) -> pl.LazyFrame:
+     logger.info(f"Loading table {table_name}")
+     return pl.from_arrow(conn.execute(f"SELECT * FROM {table_name}").fetch_arrow_table()).lazy()

-     @staticmethod
-     def contains_entity_function(entity: str, known_causative_entity: str) -> bool:
-         """
-         Determines if a known causative entity is present within an entity or list of entities.
-         Args:
-             entity (str): The entity to be checked. It can be a single entity or a string representation of a list.
-             known_causative_entity (str): The entity to search for within the `entity`.

-         Returns:
-             bool: `True` if `known_causative_entity` is found in `entity` (or its list representation),
-             `False` otherwise.
-         """
-         list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*]$")
-         entity = entity.replace("nan", "None").replace("NaN", "None")
-         if list_pattern.match(str(entity)):
-             list_representation = ast.literal_eval(entity)
-             if isinstance(list_representation, list):
-                 return known_causative_entity in list_representation
-         return known_causative_entity == entity
-
-     def add_contains_function(self) -> None:
-         """
-         Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist.
-         """
-         result = self.conn.execute(
-             "SELECT * FROM duckdb_functions() WHERE function_name = ?", ["contains_entity_function"]
-         ).fetchall()
-         if not result:
-             self.conn.create_function("contains_entity_function", self.contains_entity_function)
-
-     def parse_table_into_dataclass(
-         self,
-         table_name: str,
-         dataclass: Union[
-             Type[RankedPhEvalGeneResult],
-             Type[RankedPhEvalVariantResult],
-             Type[RankedPhEvalDiseaseResult],
-         ],
-     ) -> Union[
-         List[RankedPhEvalGeneResult],
-         List[RankedPhEvalVariantResult],
-         List[RankedPhEvalDiseaseResult],
-     ]:
-         """
-         Parses a DuckDB table into a list of dataclass instances.
-         Args:
-             table_name (str): The name of the DuckDB table to be parsed.
-             dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult],
-                 Type[RankedPhEvalDiseaseResult]]):
-                 The dataclass type to which each row in the table should be mapped.
-
-         Returns:
-             List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table.
-         """
-         result = (
-             self.conn.execute(f"SELECT * FROM '{table_name}'").fetchdf().to_dict(orient="records")
-         )
-         return [dataclass(**row) for row in result]
-
-     def check_table_exists(self, table_name: str) -> bool:
-         """
-         Check if a table exists in the connected DuckDB database.
-         Args:
-             table_name (str): The name of the table to check for existence.
-         Returns:
-             bool: Returns `True` if the table exists in the database, `False` otherwise.
-         """
-         result = self.conn.execute(
-             f"SELECT * FROM information_schema.tables WHERE table_name = '{table_name}'"
-         ).fetchall()
-         if result:
-             return True
-         return False
-
-     def close(self):
-         """Close the connection to the database."""
-         self.conn.close()
+ def write_table(conn: DuckDBPyConnection, df: pl.DataFrame, table_name: str) -> None:
+     """
+     Write a table to the DuckDB database.
+     Args:
+         conn (DuckDBPyConnection): DuckDB connection.
+         df (pl.DataFrame): Polars DataFrame to write.
+         table_name (str): Table name.
+     """
+     logger.info(f"Storing results in {table_name}.")
+     conn.execute(f"""CREATE TABLE "{table_name}" AS SELECT * FROM df""")
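The BenchmarkDBManager class is replaced by two module-level helpers. A minimal round-trip sketch (illustrative, not part of the diff); the database name, table name, and data are invented.

import duckdb
import polars as pl

from pheval.analyse.benchmark_db_manager import load_table_lazy, write_table

# Invented database/table names, purely for illustration.
conn = duckdb.connect("example_benchmark.duckdb")
stats = pl.DataFrame({"run_identifier": ["run_1"], "top1": [42]})
write_table(conn, stats, "example_gene_summary")          # CREATE TABLE ... AS SELECT * FROM df
summary = load_table_lazy("example_gene_summary", conn)   # returned as a Polars LazyFrame
print(summary.collect())
conn.close()

write_table relies on DuckDB's replacement scan: the identifier df in the CREATE TABLE statement resolves to the local Polars DataFrame passed into the function.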
pheval/analyse/benchmark_output_type.py
@@ -0,0 +1,43 @@
+ from enum import Enum
+ from typing import List, NamedTuple
+
+
+ class BenchmarkOutputType(NamedTuple):
+     """
+     Represents the structure of benchmark output types.
+
+     Attributes:
+         prioritisation_type_string (str): The type of prioritisation being performed.
+         y_label (str): The label for the y-axis in performance evaluation plots.
+         columns (List[str]): The list of column names relevant to the benchmark output.
+         result_directory (str): The directory where benchmark results are stored.
+     """
+
+     prioritisation_type_string: str
+     y_label: str
+     columns: List[str]
+     result_directory: str
+
+
+ class BenchmarkOutputTypeEnum(Enum):
+     """
+     Enumeration of benchmark output types, representing different entities.
+
+     Attributes:
+         GENE (BenchmarkOutputType): Benchmark output type for gene prioritisation.
+         VARIANT (BenchmarkOutputType): Benchmark output type for variant prioritisation.
+         DISEASE (BenchmarkOutputType): Benchmark output type for disease prioritisation.
+     """
+
+     GENE = BenchmarkOutputType(
+         "gene",
+         "Disease-causing genes (%)",
+         ["gene_identifier", "gene_symbol"],
+         "pheval_gene_results",
+     )
+     VARIANT = BenchmarkOutputType(
+         "variant", "Disease-causing variants (%)", ["variant_id"], "pheval_variant_results"
+     )
+     DISEASE = BenchmarkOutputType(
+         "disease", "Known diseases (%)", ["disease_identifier"], "pheval_disease_results"
+     )
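Illustrative only (not part of the diff): how callers such as benchmark.py unpack these enum members; the printed values come straight from the definitions above.

from pheval.analyse.benchmark_output_type import BenchmarkOutputTypeEnum

gene_output = BenchmarkOutputTypeEnum.GENE.value   # a BenchmarkOutputType NamedTuple
print(gene_output.prioritisation_type_string)      # "gene"
print(gene_output.columns)                         # ["gene_identifier", "gene_symbol"]
print(gene_output.result_directory)                # "pheval_gene_results"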
pheval/analyse/binary_classification_curves.py
@@ -0,0 +1,132 @@
+ from typing import Tuple
+
+ import numpy as np
+ import polars as pl
+ from sklearn.metrics import precision_recall_curve, roc_curve
+
+ from pheval.utils.logger import get_logger
+
+
+ class BinaryClassificationCurves:
+     """Class for computing and storing ROC & Precision-Recall curves in Polars."""
+
+     @staticmethod
+     def _compute_finite_bounds(result_scan: pl.LazyFrame) -> Tuple[float, float]:
+         """
+         Compute min and max finite values in the 'score' column to handle NaN and Inf values.
+         Args:
+             result_scan (pl.LazyFrame): The LazyFrame containing the results for the directory.
+
+         Returns:
+             Tuple[float, float]: The (max_finite, min_finite) values for normalising scores.
+         """
+         return (
+             result_scan.select(
+                 [
+                     pl.col("score").filter(pl.col("score").is_finite()).max().alias("max_finite"),
+                     pl.col("score").filter(pl.col("score").is_finite()).min().alias("min_finite"),
+                 ]
+             )
+             .collect()
+             .row(0)
+         )
+
+     @staticmethod
+     def _clean_and_extract_data(
+         result_scan: pl.LazyFrame, max_finite: float, min_finite: float
+     ) -> pl.LazyFrame:
+         """
+         Normalise the 'score' column (handling NaNs and Inf values) and extract 'true_positive' labels.
+
+         Args:
+             result_scan (pl.LazyFrame): The LazyFrame containing the results for the directory.
+             max_finite (float): The maximum finite score value.
+             min_finite (float): The minimum finite score value.
+
+         Returns:
+             pl.LazyFrame: A LazyFrame with cleaned 'score' and binary 'true_positive' columns.
+         """
+         return result_scan.with_columns(
+             [
+                 pl.when(pl.col("score").is_nan())
+                 .then(0.0)
+                 .when(pl.col("score").is_infinite() & (pl.col("score") > 0))
+                 .then(max_finite)
+                 .when(pl.col("score").is_infinite() & (pl.col("score") < 0))
+                 .then(min_finite)
+                 .otherwise(pl.col("score"))
+                 .alias("score"),
+                 pl.when(pl.col("true_positive").is_null())
+                 .then(0)
+                 .otherwise(pl.col("true_positive").cast(pl.Int8))
+                 .alias("true_positive"),
+             ]
+         )
+
+     @staticmethod
+     def _compute_roc_pr_curves(
+         run_identifier: str, labels: np.ndarray, scores: np.ndarray
+     ) -> pl.LazyFrame:
+         """
+         Compute ROC and Precision-Recall curves.
+
+         Args:
+             labels (np.ndarray): Binary ground truth labels (0 or 1).
+             scores (np.ndarray): Prediction scores.
+
+         Returns:
+             pl.LazyFrame: A LazyFrame containing the computed FPR, TPR, Precision, Recall, and Thresholds.
+         """
+         fpr, tpr, roc_thresholds = roc_curve(labels, scores, pos_label=1)
+         precision, recall, pr_thresholds = precision_recall_curve(labels, scores, pos_label=1)
+
+         return pl.LazyFrame(
+             {
+                 "run_identifier": [run_identifier],
+                 "fpr": [fpr.tolist()],
+                 "tpr": [tpr.tolist()],
+                 "threshold_roc": [roc_thresholds.tolist()],
+                 "precision": [precision.tolist()],
+                 "recall": [recall.tolist()],
+                 "threshold_pr": [pr_thresholds.tolist()],
+             }
+         )
+
+     @classmethod
+     def process(cls, result_scan: pl.LazyFrame, run_identifier: str) -> pl.LazyFrame:
+         """
+         Process scores, extract true labels, compute ROC and Precision-Recall curves,
+         and store results in a Polars LazyFrame with NumPy arrays.
+
+         Args:
+             result_scan (pl.LazyFrame): The LazyFrame containing the results for the directory.
+             run_identifier (str): Identifier for this run.
+
+         Returns:
+             pl.LazyFrame: A LazyFrame containing ROC & PR curve data with NumPy arrays.
+         """
+         max_finite, min_finite = cls._compute_finite_bounds(result_scan)
+         cleaned_data = (
+             cls._clean_and_extract_data(result_scan, max_finite, min_finite)
+             .select(["true_positive", "score"])
+             .collect()
+         )
+         return cls._compute_roc_pr_curves(
+             run_identifier,
+             cleaned_data["true_positive"].to_numpy().flatten(),
+             cleaned_data["score"].to_numpy().flatten(),
+         )
+
+
+ def compute_curves(run_identifier: str, result_scan: pl.LazyFrame) -> pl.LazyFrame:
+     """
+     Compute ROC and Precision-Recall curves.
+     Args:
+         result_scan (pl.LazyFrame): The LazyFrame containing the results for the directory.
+         run_identifier (str): Identifier for this run.
+     Returns:
+         pl.LazyFrame: LazyFrame containing the ROC & Precision-Recall curve data with NumPy arrays.
+     """
+     logger = get_logger()
+     logger.info("Calculating ROC and Precision-Recall metrics")
+     return BinaryClassificationCurves.process(result_scan, run_identifier)
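A toy sketch (not part of the diff) of compute_curves on a hand-built LazyFrame; the scores and labels are invented solely to show the expected input columns (score, true_positive) and the single-row, list-valued output.

import polars as pl

from pheval.analyse.binary_classification_curves import compute_curves

# Invented scores and labels; the infinite score and null label exercise the cleaning step.
toy_results = pl.LazyFrame(
    {
        "score": [0.9, 0.75, float("inf"), 0.1],
        "true_positive": [True, False, True, None],
    }
)
curves = compute_curves("toy_run", toy_results).collect()
# One row per run, with list columns fpr/tpr/threshold_roc and precision/recall/threshold_pr.
print(curves.columns)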