PyPI - pheval - Versions diffs - 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

pheval-0.4.7-py3-none-any.whl → pheval-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (33)

pheval/analyse/benchmark.py +156 -0
pheval/analyse/benchmark_db_manager.py +16 -134
pheval/analyse/benchmark_output_type.py +43 -0
pheval/analyse/binary_classification_curves.py +132 -0
pheval/analyse/binary_classification_stats.py +164 -307
pheval/analyse/generate_plots.py +210 -395
pheval/analyse/generate_rank_comparisons.py +44 -0
pheval/analyse/rank_stats.py +190 -382
pheval/analyse/run_data_parser.py +21 -39
pheval/cli.py +27 -24
pheval/cli_pheval_utils.py +7 -8
pheval/post_processing/phenopacket_truth_set.py +235 -0
pheval/post_processing/post_processing.py +185 -337
pheval/post_processing/validate_result_format.py +92 -0
pheval/prepare/update_phenopacket.py +11 -9
pheval/utils/logger.py +35 -0
pheval/utils/phenopacket_utils.py +85 -91
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
pheval/analyse/analysis.py +0 -104
pheval/analyse/assess_prioritisation_base.py +0 -108
pheval/analyse/benchmark_generator.py +0 -126
pheval/analyse/benchmarking_data.py +0 -25
pheval/analyse/disease_prioritisation_analysis.py +0 -152
pheval/analyse/gene_prioritisation_analysis.py +0 -147
pheval/analyse/generate_summary_outputs.py +0 -105
pheval/analyse/parse_benchmark_summary.py +0 -81
pheval/analyse/parse_corpus.py +0 -219
pheval/analyse/prioritisation_result_types.py +0 -52
pheval/analyse/variant_prioritisation_analysis.py +0 -159
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/WHEEL +0 -0
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0

pheval/post_processing/validate_result_format.py ADDED Viewed

@@ -0,0 +1,92 @@
+from enum import Enum
+from functools import wraps
+from typing import Callable
+import polars as pl
+class ResultSchema(Enum):  # each member's value is a pl.Schema; validate() reads it back through self.value
+    """
+    Enum for different result schema formats.
+    Attributes:
+        GENE_RESULT_SCHEMA (pl.Schema): Schema for gene-based results.
+        VARIANT_RESULT_SCHEMA (pl.Schema): Schema for variant-based results.
+        DISEASE_RESULT_SCHEMA (pl.Schema): Schema for disease-based results.
+    """
+    GENE_RESULT_SCHEMA = pl.Schema(
+        {
+            "gene_symbol": pl.String,
+            "gene_identifier": pl.String,
+            "score": pl.Float64,
+            "grouping_id": pl.Utf8,  # optional column: validate() skips it when absent, rejects nulls when present
+        }
+    )
+    VARIANT_RESULT_SCHEMA = pl.Schema(
+        {
+            "chrom": pl.String,
+            "start": pl.Int64,
+            "end": pl.Int64,
+            "ref": pl.String,
+            "alt": pl.String,
+            "score": pl.Float64,
+            "grouping_id": pl.Utf8,  # NOTE(review): pl.Utf8 here vs pl.String elsewhere — presumably the same dtype; confirm and pick one spelling
+        }
+    )
+    DISEASE_RESULT_SCHEMA = pl.Schema(
+        {
+            "disease_name": pl.String,
+            "disease_identifier": pl.String,
+            "score": pl.Float64,
+            "grouping_id": pl.Utf8,  # optional column, same handling as in the other two schemas
+        }
+    )
+    def validate(self, df: pl.DataFrame) -> bool:
+        """
+        Validate that a DataFrame follows the expected schema.
+        Args:
+            df (pl.DataFrame): The DataFrame to validate.
+        Raises:
+            ValueError: If a required column is missing or the grouping_id column contains a null value.
+            TypeError: If a column exists but has an incorrect data type.
+        Returns:
+            bool: True if the DataFrame is valid according to the schema.
+        """
+        expected_schema = self.value  # the pl.Schema attached to this enum member
+        if "grouping_id" in df.columns and df["grouping_id"].null_count() > 0:  # null check runs first, before any column/dtype check
+            raise ValueError("'grouping_id' column should not contain null values if provided.")
+        for col_name, expected_type in expected_schema.items():  # only schema columns are visited; extra columns in df are never inspected
+            if col_name not in df.schema:
+                if col_name == "grouping_id":
+                    continue  # grouping_id is the only column allowed to be missing
+                raise ValueError(f"Missing required column: {col_name}")
+            if df.schema[col_name] != expected_type:  # dtype is compared as-is; no casting is attempted
+                raise TypeError(
+                    f"Column '{col_name}' has type {df.schema[col_name]}, expected {expected_type}"
+                )
+        return True  # reached only when none of the checks above raised
+def validate_dataframe(schema: ResultSchema) -> Callable:
+    """
+    Decorator to validate DataFrame input based on a ResultSchema.
+    Args:
+        schema (ResultSchema): The expected schema from the `ResultSchema` enum.
+    Returns:
+        Callable: A wrapped function that validates the DataFrame before execution.
+    """
+    def decorator(func: Callable) -> Callable:
+        @wraps(func)  # keep the decorated function's name and docstring on the wrapper
+        def wrapper(df: pl.DataFrame, *args, **kwargs):  # the DataFrame must be the first positional argument
+            schema.validate(df)  # raises ValueError/TypeError on a bad frame; its True return value is ignored
+            return func(df, *args, **kwargs)  # df is passed through unchanged
+        return wrapper
+    return decorator

pheval/prepare/update_phenopacket.py CHANGED Viewed

@@ -1,7 +1,7 @@
-from collections import defaultdict
 from pathlib import Path
 from typing import Union
+import polars as pl
 from phenopackets import Family, Phenopacket
 from pheval.utils.file_utils import all_files
@@ -9,14 +9,14 @@ from pheval.utils.phenopacket_utils import (
     GeneIdentifierUpdater,
     PhenopacketRebuilder,
     PhenopacketUtil,
-    create_hgnc_dict,
+    create_gene_identifier_map,
     phenopacket_reader,
     write_phenopacket,
 )
 def update_outdated_gene_context(
-    phenopacket_path: Path, gene_identifier: str, hgnc_data: defaultdict
+    phenopacket_path: Path, gene_identifier: str, identifier_map: pl.DataFrame
 ) -> Union[Phenopacket, Family]:
     """
     Update the gene context of the Phenopacket.
@@ -24,7 +24,7 @@ def update_outdated_gene_context(
     Args:
         phenopacket_path (Path): The path to the Phenopacket file.
         gene_identifier (str): Identifier to update the gene context.
-        hgnc_data (defaultdict): The HGNC data used for updating.
+        identifier_map (pl.DataFrame): The gene identifier map used for updating.
     Returns:
         Union[Phenopacket, Family]: The updated Phenopacket or Family.
@@ -37,7 +37,7 @@ def update_outdated_gene_context(
     phenopacket = phenopacket_reader(phenopacket_path)
     interpretations = PhenopacketUtil(phenopacket).interpretations()
     updated_interpretations = GeneIdentifierUpdater(
-        hgnc_data=hgnc_data, gene_identifier=gene_identifier
+        identifier_map=identifier_map, gene_identifier=gene_identifier
     ).update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)
     return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)
@@ -57,8 +57,10 @@ def create_updated_phenopacket(
         to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace
         to describe the gene identifiers.
     """
-    hgnc_data = create_hgnc_dict()
-    updated_phenopacket = update_outdated_gene_context(phenopacket_path, gene_identifier, hgnc_data)
+    identifier_map = create_gene_identifier_map()
+    updated_phenopacket = update_outdated_gene_context(
+        phenopacket_path, gene_identifier, identifier_map
+    )
     write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
@@ -78,10 +80,10 @@ def create_updated_phenopackets(
         to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace
         to describe the gene identifiers.
     """
-    hgnc_data = create_hgnc_dict()
+    identifier_map = create_gene_identifier_map()
     for phenopacket_path in all_files(phenopacket_dir):
         updated_phenopacket = update_outdated_gene_context(
-            phenopacket_path, gene_identifier, hgnc_data
+            phenopacket_path, gene_identifier, identifier_map
         )
         write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))

pheval/utils/logger.py ADDED Viewed

@@ -0,0 +1,35 @@
+import logging
+logging.basicConfig(  # module-level call: executes when this module is imported
+    level=logging.INFO,
+    format="[%(asctime)s] [%(levelname)s] [%(filename)s:%(lineno)d] - %(message)s",  # timestamp, level, source file:line, then the message
+    datefmt="%Y-%m-%d %H:%M:%S",  # format applied to %(asctime)s
+)
+def get_logger(name="PHEVAL"):  # name: logger name to look up; defaults to "PHEVAL"
+    return logging.getLogger(name)  # thin wrapper around logging.getLogger; adds no handlers or levels of its own
+def print_ascii_banner():
+    """Prints ASCII banner only once when the script starts."""
+    if not getattr(logging, "_ascii_printed", False):  # the "already printed" flag is stored as an attribute on the logging module itself
+        logging._ascii_printed = True  # set before printing, so later calls in the same process skip the banner
+        pheval_banner = """
+        Welcome to:
+        ██████╗ ██╗  ██╗███████╗██╗   ██╗ █████╗ ██╗
+        ██╔══██╗██║  ██║██╔════╝██║   ██║██╔══██╗██║
+        ██████╔╝███████║█████╗  ██║   ██║███████║██║
+        ██╔═══╝ ██╔══██║██╔══╝  ╚██╗ ██╔╝██╔══██║██║
+        ██║     ██║  ██║███████╗ ╚████╔╝ ██║  ██║███████╗
+        ╚═╝     ╚═╝  ╚═╝╚══════╝  ╚═══╝  ╚═╝  ╚═╝╚══════╝
+        A framework for the empirical evaluation of phenotype-driven prioritisation tools.
+        """
+        print(pheval_banner)  # written with print(), not through a logger
+def initialise_context(ctx):  # NOTE(review): ctx looks like a click.Context (it must provide ensure_object) — confirm against callers
+    ctx.ensure_object(dict)
+    if not getattr(ctx, "ascii_printed", False):  # per-context guard, in addition to the module-level guard inside print_ascii_banner
+        ctx.ascii_printed = True  # flag is set on the ctx object itself
+        print_ascii_banner()

pheval/utils/phenopacket_utils.py CHANGED Viewed

@@ -1,13 +1,12 @@
 import json
 import logging
 import os
-from collections import defaultdict
 from copy import copy
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Union
-import pandas as pd
+import polars as pl
 from google.protobuf.json_format import MessageToJson, Parse
 from phenopackets import (
     Disease,
@@ -122,79 +121,65 @@ class ProbandDisease:
     disease_identifier: str
-def read_hgnc_data() -> pd.DataFrame:
+def parse_hgnc_data() -> pl.DataFrame:
     """
-    Read HGNC data from a file and return it as a Pandas DataFrame.
+    Read HGNC data from a file and return it as a Polars DataFrame.
     Returns:
-        pd.DataFrame: DataFrame containing the HGNC data.
+        pl.DataFrame: DataFrame containing the HGNC data.
     """
-    return pd.read_csv(
-        os.path.dirname(__file__).replace("utils", "resources/hgnc_complete_set.txt"),
-        delimiter="\t",
-        dtype=str,
+    return (
+        pl.read_csv(
+            os.path.dirname(__file__).replace("utils", "resources/hgnc_complete_set.txt"),
+            separator="\t",
+            infer_schema=10000000000,
+            dtypes={"omim_id": pl.Utf8},
+        )
+        .select(
+            [
+                pl.col("hgnc_id").alias("hgnc_id"),
+                pl.col("symbol").alias("gene_symbol"),
+                pl.col("ensembl_gene_id").alias("ensembl_id"),
+                pl.col("entrez_id").alias("entrez_id"),
+                pl.col("refseq_accession").alias("refseq_accession"),
+                pl.col("prev_symbol").alias("previous_symbol_raw"),
+            ]
+        )
+        .with_columns(
+            pl.col("previous_symbol_raw")
+            .str.split("|")
+            .list.eval(pl.element().str.strip_chars('"'))
+            .alias("prev_symbols")
+        )
     )
-def create_hgnc_dict() -> defaultdict:
-    """
-    Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data.
-    Returns:
-        defaultdict: A dictionary containing gene symbols as keys and their associated gene information.
-    Notes:
-        The dictionary structure:
-        {
-            'gene_symbol': {
-                'ensembl_id': str,
-                'hgnc_id': str,
-                'entrez_id': str,
-                'refseq_accession': str,
-                'previous_symbol': [str, ...]
-            },
-            ...
-        }
-    """
-    hgnc_df = read_hgnc_data()
-    hgnc_data = defaultdict(dict)
-    for _index, row in hgnc_df.iterrows():
-        previous_names = []
-        hgnc_data[row["symbol"]]["ensembl_id"] = row["ensembl_gene_id"]
-        hgnc_data[row["symbol"]]["hgnc_id"] = row["hgnc_id"]
-        hgnc_data[row["symbol"]]["entrez_id"] = row["entrez_id"]
-        hgnc_data[row["symbol"]]["refseq_accession"] = row["refseq_accession"]
-        previous = str(row["prev_symbol"]).split("|")
-        for p in previous:
-            previous_names.append(p.strip('"'))
-        hgnc_data[row["symbol"]]["previous_symbol"] = previous_names
-    return hgnc_data
-def create_gene_identifier_map() -> dict:
+def create_gene_identifier_map() -> pl.DataFrame:
     """
     Create a mapping of gene identifiers to gene symbols using HGNC data.
     Returns:
-        dict: A mapping of gene identifiers to gene symbols.
-    Notes:
-        The dictionary structure:
-        {
-            'identifier': 'gene_symbol',
-            ...
-        }
+        pl.DataFrame: A mapping of gene identifiers to gene symbols.
     """
-    hgnc_df = read_hgnc_data()
-    identifier_map = {}
-    for _index, row in hgnc_df.iterrows():
-        identifier_map[row["ensembl_gene_id"]] = row["symbol"]
-        identifier_map[row["hgnc_id"]] = row["symbol"]
-        identifier_map[row["entrez_id"]] = row["symbol"]
-        identifier_map[row["refseq_accession"]] = row["symbol"]
-    return identifier_map
+    hgnc_df = parse_hgnc_data()
+    return hgnc_df.melt(
+        id_vars=["gene_symbol", "prev_symbols"],
+        value_vars=["ensembl_id", "hgnc_id", "entrez_id", "refseq_accession"],
+        variable_name="identifier_type",
+        value_name="identifier",
+    ).with_columns(
+        pl.col("identifier_type")
+        .replace(
+            {
+                "ensembl_id": "ensembl:",
+                "hgnc_id": "",
+                "entrez_id": "ncbigene:",
+                "refseq_accession": "",
+            },
+            default="",
+        )
+        .alias("prefix")
+    )
 def phenopacket_reader(file: Path) -> Union[Phenopacket, Family]:
@@ -651,17 +636,19 @@ def write_phenopacket(phenopacket: Union[Phenopacket, Family], output_file: Path
 class GeneIdentifierUpdater:
     """Class for updating gene identifiers within genomic interpretations."""
-    def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
+    def __init__(
+        self,
+        gene_identifier: str,
+        identifier_map: pl.DataFrame = None,
+    ):
         """
         Initialise the GeneIdentifierUpdater.
         Args:
             gene_identifier (str): The gene identifier to update to.
-            hgnc_data (dict): A dictionary containing HGNC data (default: None).
-            identifier_map (dict): A dictionary mapping gene identifiers (default: None).
+            identifier_map (pl.DataFrame): A Polars DataFrame mapping gene identifiers (default: None).
         """
-        self.hgnc_data = hgnc_data
         self.gene_identifier = gene_identifier
         self.identifier_map = identifier_map
@@ -675,13 +662,20 @@ class GeneIdentifierUpdater:
         Returns:
             str: The identified gene identifier.
         """
-        if gene_symbol in self.hgnc_data.keys():
-            return self.hgnc_data[gene_symbol][self.gene_identifier]
-        else:
-            for _symbol, data in self.hgnc_data.items():
-                for prev_symbol in data["previous_symbol"]:
-                    if prev_symbol == gene_symbol:
-                        return data[self.gene_identifier]
+        matches = self.identifier_map.filter(
+            (pl.col("gene_symbol") == gene_symbol)
+            & (pl.col("identifier_type") == self.gene_identifier)
+        )
+        if matches.height > 0:
+            return matches["identifier"][0]
+        prev_symbol_matches = self.identifier_map.filter(
+            (pl.col("identifier_type") == self.gene_identifier)
+            & (pl.col("prev_symbols").list.contains(gene_symbol))
+        )
+        if prev_symbol_matches.height > 0:
+            return prev_symbol_matches["identifier"][0]
+        return None
     def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
         """
@@ -693,7 +687,9 @@ class GeneIdentifierUpdater:
         Returns:
             str: The gene symbol corresponding to the identifier.
         """
-        return self.identifier_map[query_gene_identifier]
+        return self.identifier_map.filter(pl.col("identifier") == query_gene_identifier)[
+            "gene_symbol"
+        ][0]
     def _find_alternate_ids(self, gene_symbol: str) -> List[str]:
         """
@@ -705,23 +701,20 @@ class GeneIdentifierUpdater:
         Returns:
             List[str]: List of alternate IDs for the gene symbol.
         """
-        if gene_symbol in self.hgnc_data.keys():
-            return [
-                self.hgnc_data[gene_symbol]["hgnc_id"],
-                "ncbigene:" + self.hgnc_data[gene_symbol]["entrez_id"],
-                "ensembl:" + self.hgnc_data[gene_symbol]["ensembl_id"],
-                "symbol:" + gene_symbol,
+        matches = self.identifier_map.filter((pl.col("gene_symbol") == gene_symbol))
+        if matches.height > 0:
+            return [f"{row['prefix']}{row['identifier']}" for row in matches.rows(named=True)] + [
+                f"symbol:{gene_symbol}"
             ]
-        else:
-            for symbol, data in self.hgnc_data.items():
-                for prev_symbol in data["previous_symbol"]:
-                    if prev_symbol == gene_symbol:
-                        return [
-                            data["hgnc_id"],
-                            "ncbigene:" + data["entrez_id"],
-                            "ensembl:" + data["ensembl_id"],
-                            "symbol:" + symbol,
-                        ]
+        prev_symbol_matches = self.identifier_map.filter(
+            (pl.col("prev_symbols").list.contains(gene_symbol))
+        )
+        if prev_symbol_matches.height > 0:
+            return [
+                f"{row['prefix']}{row['identifier']}"
+                for row in prev_symbol_matches.rows(named=True)
+            ] + [f"symbol:{gene_symbol}"]
+        return None
     def update_genomic_interpretations_gene_identifier(
         self, interpretations: List[Interpretation], phenopacket_path: Path
@@ -731,6 +724,7 @@ class GeneIdentifierUpdater:
         Args:
             interpretations (List[Interpretation]): List of Interpretation objects.
+            phenopacket_path (Path): The Path to the Phenopacket.
         Returns:
             List[Interpretation]: Updated list of Interpretation objects.

{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/METADATA RENAMED Viewed

@@ -1,12 +1,11 @@
 Metadata-Version: 2.3
 Name: pheval
-Version: 0.4.7
+Version: 0.5.0
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk
-Requires-Python: >=3.9,<4.0.0
+Requires-Python: >=3.10,<4.0.0
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
@@ -22,8 +21,9 @@ Requires-Dist: oaklib (>=0.5.6)
 Requires-Dist: pandas (>=1.5.1)
 Requires-Dist: phenopackets (>=2.0.2,<3.0.0)
 Requires-Dist: plotly (>=5.13.0,<6.0.0)
-Requires-Dist: polars (>=0.19.15,<0.20.0)
+Requires-Dist: polars (>=1.23,<2.0)
 Requires-Dist: pyaml (>=21.10.1,<22.0.0)
+Requires-Dist: pyarrow (>=19.0.1,<20.0.0)
 Requires-Dist: pyserde (>=0.9.8,<0.10.0)
 Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
 Requires-Dist: seaborn (>=0.12.2,<0.13.0)

{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/RECORD RENAMED Viewed

@@ -1,36 +1,31 @@
 pheval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pheval/analyse/analysis.py,sha256=Yt2xH0WS_2NO13-wYvywzmCRCj8RinQ1MeozJQuGe3o,4009
-pheval/analyse/assess_prioritisation_base.py,sha256=znBscRTqIKxxZMHR-H6KrjFJ6Uv5P5HzwTQUWS6Eoos,3434
-pheval/analyse/benchmark_db_manager.py,sha256=IRqu5fUpLBboHpS4lx0AaAkor_W7whuvUSKiMr2-GhM,5185
-pheval/analyse/benchmark_generator.py,sha256=-LljszuKAT3oJfGQn7JHAILCGg5QXYny4nPPf273g_E,5896
-pheval/analyse/benchmarking_data.py,sha256=aRvDmwqjFGKvWDRGjMwaQxfDZscptRBwI-rcSqY-X5s,913
-pheval/analyse/binary_classification_stats.py,sha256=E35YjvGM-zFnuEt8M3pgN03vBab4MH6ih726QKvuogg,12519
-pheval/analyse/disease_prioritisation_analysis.py,sha256=t__1lhyw1PnbDBbXDxDaFgLFbdRz20D1s-8tfLQJjLs,6186
-pheval/analyse/gene_prioritisation_analysis.py,sha256=Bapg0VcHOz5vp1dI4bDba04SpX6UdDu35VGTI6sZyOk,6026
-pheval/analyse/generate_plots.py,sha256=5oxsdnAbbVgQj8ZrWTLs12rSM24EXp-IdLCjy5QB1_g,21992
-pheval/analyse/generate_summary_outputs.py,sha256=nKqwbpA-9bbL5mCySiuyV_AUDIokmCg3vD8_JAsg1ls,4157
-pheval/analyse/parse_benchmark_summary.py,sha256=vyAOIdIWF4rZjGTPFE69ajhEC9AkkN3QBVqSe_uYZsg,2946
-pheval/analyse/parse_corpus.py,sha256=pxhoKTgd-DnwAMP081UMG-NKbj89qAYBQhHve8aphfI,8698
-pheval/analyse/prioritisation_result_types.py,sha256=qJoB6O-lFYmzAMcTQeDJZQNLJ6hleoKDYATTkhvFF98,1228
-pheval/analyse/rank_stats.py,sha256=vNLVuG_NzhKDXxKmklYNPz44MczlyKUqcuHqbiuOXwI,17993
-pheval/analyse/run_data_parser.py,sha256=VQBUoOIRYRWc5uqURUvaWdaW3E3C7Su0JvLavQLHQaY,4105
-pheval/analyse/variant_prioritisation_analysis.py,sha256=HhDeczF7wmJjXt0ejAtF0qdczyMe25glqiS6uX_TFl8,6408
-pheval/cli.py,sha256=SPB8-BCIRt1fUaAalhZ5Y6JUlnJX6Cj2S52QXCovJR8,1526
+pheval/analyse/benchmark.py,sha256=1ysz1peGb21DhgNpEam9NgUOS5eGv7K0CI3RNjy0crQ,6275
+pheval/analyse/benchmark_db_manager.py,sha256=zS1TI76YuV2_YXLipHLSyh-XDR5kTxyOwhRhHRFHfjQ,764
+pheval/analyse/benchmark_output_type.py,sha256=bh-qQvV4AF7BHQyr_bdY8HTTzYZVe7KvoIoUF0D9k-g,1468
+pheval/analyse/binary_classification_curves.py,sha256=Crb45rJWc5rxDdx82sgoHRvYHE2D5pus91fgl39FyRw,5007
+pheval/analyse/binary_classification_stats.py,sha256=sOuEp6IxZ6SVp-KC6MJkZNTkZucZTNK25xApP5tU6Mk,6944
+pheval/analyse/generate_plots.py,sha256=g98DxhTw1dPRfRRYoKBmt51XfIa2KzlL_Z7weFSoBUg,14550
+pheval/analyse/generate_rank_comparisons.py,sha256=KcQJ9rm1nvvTcqLNuxAkXRXuV18vEsiP0giQ-ryHyYc,1684
+pheval/analyse/rank_stats.py,sha256=qHrqlIsZVSV2ASc5cZ6TsmKaMq3bZtCzS1ZURjL8mks,9211
+pheval/analyse/run_data_parser.py,sha256=Lr0ao_Mlp8EYLaM4XmiEjo7P7jt_rCBR2y2hb_D3c70,3366
+pheval/cli.py,sha256=rpvTTCKAvH75XkZUh0xaKv7Ftl9zIt2RncsMGIlrq9U,1556
 pheval/cli_pheval.py,sha256=fWbKUcPTZZSa1EJEtH_lNn1XE6qRApRHihqUZS5owrA,2424
-pheval/cli_pheval_utils.py,sha256=O6tWnE85QQHGNcP08OwJGANMfXJPsZtFEu-D6ATld00,16700
+pheval/cli_pheval_utils.py,sha256=sh6kx36jYfuSIWBMlrdW3g-LPftxBy-xw4b7hg8bdj4,16545
 pheval/config_parser.py,sha256=lh-Dy_FflXJUnRC3HYaEdSvPAsNZWQZlEr1hHQigrTM,1227
 pheval/implementations/__init__.py,sha256=BMUTotjTdgy5j5xubWCIQgRXrSQ1ZIcjooer7r299Zo,1228
 pheval/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/infra/exomiserdb.py,sha256=pM9-TfjrgurtH4OtM1Enk5oVhIxGQN3rKRlrxHuObTM,5080
 pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pheval/post_processing/post_processing.py,sha256=MdacHoVjmwvmWBnHCSSKBboCgMW4MRGP-d_7-t1zZew,14808
+pheval/post_processing/phenopacket_truth_set.py,sha256=ue3pNeg_GZiGyuKrm6_4MsJWpW0LWtfG9wja2Cc8SLg,8873
+pheval/post_processing/post_processing.py,sha256=4xP-gjZ3VoXydU9ClPvmRtuDaSMUeJImgLugurOS5_k,9480
+pheval/post_processing/validate_result_format.py,sha256=4U6AfHt01EexwU_OnpmytQAhGVS6ZWF1S-5NVBx1oaM,2916
 pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/prepare/create_noisy_phenopackets.py,sha256=ydhA4mpqKTDc4hBu8YfvNW2nMubHK3dbO-cv0lA4JFQ,11504
 pheval/prepare/create_spiked_vcf.py,sha256=90A-Mi8QKhvN036vtFEVWAHgzHO37itiLYrqYlG4LiA,23953
 pheval/prepare/custom_exceptions.py,sha256=_G3_95dPtHIs1SviYBV1j7cYc-hxlhuw8hhnYdzByYY,1719
 pheval/prepare/prepare_corpus.py,sha256=YFnklpeVXeqeme9DVmd_jfsK04ytIe9cH5uXYcgK5cY,4650
-pheval/prepare/update_phenopacket.py,sha256=21fzUPbwKN6Ey5TSh9PFzjT2x86U19RAE6WmkjG8u28,4770
+pheval/prepare/update_phenopacket.py,sha256=Bjru0ptNKyzLaYElouKZe2GYRQbETTC0FMiMojrP8Lg,4850
 pheval/resources/alternate_ouputs/CADA_results.txt,sha256=Rinn2TtfwFNsx0aEWegKJOkjKnBm-Mf54gdaT3bWP0k,547
 pheval/resources/alternate_ouputs/DeepPVP_results.txt,sha256=MF9MZJYa4r4PEvFzALpi-lNGLxjENOnq_YgrgFMn-oQ,1508
 pheval/resources/alternate_ouputs/OVA_results.txt,sha256=_5XFCR4W04D-W7DObpALLsa0-693g2kiIUB_uo79aHk,9845
@@ -47,11 +42,12 @@ pheval/utils/docs_gen.py,sha256=6FGtHicBC0rZKi0tdL3Epsg8d4osE44I9f1Ga0j4JLA,3193
 pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
 pheval/utils/exomiser.py,sha256=m2u0PH2z9lFPaB3LVkZCmPmH5e55q1NoTzNl46zRRP8,683
 pheval/utils/file_utils.py,sha256=m21cz-qjDYqnI8ClUv3J9fKizex98a-9bSEerQ75i_c,3576
-pheval/utils/phenopacket_utils.py,sha256=6xQ8WCLdR1VhiU3nCDzaqEVKjGvDWrzvPA50_6ZAHXM,27310
+pheval/utils/logger.py,sha256=5DZl5uMltUDQorhkvg_B7_ZhFwApAmEkWneFIOKfRGQ,1566
+pheval/utils/phenopacket_utils.py,sha256=AfV_mWac6n5HCc5zjfH6CGP8T0qI0LR0VBrooaKmgdY,26978
 pheval/utils/semsim_utils.py,sha256=s7ZCR2VfPYnOh7ApX6rv66eGoVSm9QJaVYOWBEhlXpo,6151
 pheval/utils/utils.py,sha256=9V6vCT8l1g4O2-ZATYqsVyd7AYZdWGd-Ksy7_oIC3eE,2343
-pheval-0.4.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-pheval-0.4.7.dist-info/METADATA,sha256=JfraeowwRp8eQjKiFBBrIVUoA0fchchznGj4t8sXgFE,6469
-pheval-0.4.7.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-pheval-0.4.7.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
-pheval-0.4.7.dist-info/RECORD,,
+pheval-0.5.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+pheval-0.5.0.dist-info/METADATA,sha256=v7UNSBKUzJQAs8oBSq8XScwKnDiNXlzWZV0A70xR3M8,6456
+pheval-0.5.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+pheval-0.5.0.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
+pheval-0.5.0.dist-info/RECORD,,

pheval/analyse/analysis.py DELETED Viewed

@@ -1,104 +0,0 @@
-from pheval.analyse.benchmark_generator import (
-    BenchmarkRunOutputGenerator,
-    DiseaseBenchmarkRunOutputGenerator,
-    GeneBenchmarkRunOutputGenerator,
-    VariantBenchmarkRunOutputGenerator,
-)
-from pheval.analyse.generate_summary_outputs import generate_benchmark_comparison_output
-from pheval.analyse.parse_corpus import CorpusParser
-from pheval.analyse.rank_stats import RankStatsWriter
-from pheval.analyse.run_data_parser import Config
-def _run_benchmark_comparison(
-    run_config: Config,  # NOTE(review): the docstring below says List[TrackInputOutputDirectories], but the body uses Config attributes (.benchmark_name, .runs)
-    benchmark_generator: BenchmarkRunOutputGenerator,
-) -> None:
-    """
-    Run a benchmark on several result directories.
-    Args:
-        run_config (List[TrackInputOutputDirectories]): List of input and output directories
-            for tracking results across multiple directories.
-        benchmark_generator (BenchmarkRunOutputGenerator): Generator for benchmark run output.
-    """
-    stats_writer = RankStatsWriter(
-        run_config.benchmark_name, benchmark_generator.stats_comparison_file
-    )
-    unique_test_corpora_directories = set([result.phenopacket_dir for result in run_config.runs])  # de-duplicate so each corpus directory is handled once
-    [  # list comprehension used only for its side effect; the resulting list is discarded
-        CorpusParser(run_config.benchmark_name, test_corpora_directory).parse_corpus(
-            benchmark_generator
-        )
-        for test_corpora_directory in unique_test_corpora_directories
-    ]
-    benchmarking_results = []
-    for run in run_config.runs:  # one benchmark result and one statistics entry per run
-        benchmark_result = benchmark_generator.generate_benchmark_run_results(
-            run_config.benchmark_name, run, run.score_order, run.threshold
-        )
-        stats_writer.add_statistics_entry(
-            run.run_identifier,
-            benchmark_result.rank_stats,
-            benchmark_result.binary_classification_stats,
-        )
-        benchmarking_results.append(benchmark_result)
-    run_identifiers = [run.run_identifier for run in run_config.runs]
-    [  # again evaluated for side effects only; every call receives the results of all runs
-        generate_benchmark_comparison_output(
-            run_config.benchmark_name,
-            benchmarking_results,
-            run_identifiers,
-            benchmark_generator,
-            f"{unique_test_corpora_directory.parents[0].name}_"  # prefix: name of the corpus directory's parent, then the prioritisation type string
-            f"{benchmark_generator.prioritisation_type_string}",
-        )
-        for unique_test_corpora_directory in unique_test_corpora_directories
-    ]
-def benchmark_run_comparisons(
-    run_config: Config,
-) -> None:
-    """
-    Benchmark prioritisation performance for several runs.
-    Args:
-        run_config (Config): Run configurations.
-    """
-    gene_analysis_runs = Config(  # same benchmark name and plot settings, restricted to runs with gene_analysis set
-        benchmark_name=run_config.benchmark_name,
-        runs=[run for run in run_config.runs if run.gene_analysis],
-        plot_customisation=run_config.plot_customisation,
-    )
-    variant_analysis_runs = Config(  # the three filters are independent, so one run can appear in several of these configs
-        benchmark_name=run_config.benchmark_name,
-        runs=[run for run in run_config.runs if run.variant_analysis],
-        plot_customisation=run_config.plot_customisation,
-    )
-    disease_analysis_runs = Config(
-        benchmark_name=run_config.benchmark_name,
-        runs=[run for run in run_config.runs if run.disease_analysis],
-        plot_customisation=run_config.plot_customisation,
-    )
-    if gene_analysis_runs.runs:  # skip the comparison entirely when no run requested this analysis type
-        _run_benchmark_comparison(
-            run_config=gene_analysis_runs,
-            benchmark_generator=GeneBenchmarkRunOutputGenerator(
-                plot_customisation=gene_analysis_runs.plot_customisation.gene_plots
-            ),
-        )
-    if variant_analysis_runs.runs:
-        _run_benchmark_comparison(
-            run_config=variant_analysis_runs,
-            benchmark_generator=VariantBenchmarkRunOutputGenerator(
-                plot_customisation=variant_analysis_runs.plot_customisation.variant_plots
-            ),
-        )
-    if disease_analysis_runs.runs:
-        _run_benchmark_comparison(
-            run_config=disease_analysis_runs,
-            benchmark_generator=DiseaseBenchmarkRunOutputGenerator(
-                plot_customisation=disease_analysis_runs.plot_customisation.disease_plots
-            ),
-        )

pheval 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl

Potentially problematic release.

pheval-0.4.7-py3-none-any.whl → pheval-0.5.0-py3-none-any.whl