PyPI - pheval - Versions diffs - 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (26) hide show

pheval/analyse/analysis.py +61 -150
pheval/analyse/assess_prioritisation_base.py +108 -0
pheval/analyse/benchmark_db_manager.py +140 -0
pheval/analyse/benchmark_generator.py +47 -50
pheval/analyse/benchmarking_data.py +3 -2
pheval/analyse/disease_prioritisation_analysis.py +70 -219
pheval/analyse/gene_prioritisation_analysis.py +66 -242
pheval/analyse/generate_plots.py +81 -79
pheval/analyse/generate_summary_outputs.py +64 -134
pheval/analyse/parse_benchmark_summary.py +50 -37
pheval/analyse/parse_corpus.py +219 -0
pheval/analyse/rank_stats.py +177 -144
pheval/analyse/run_data_parser.py +108 -27
pheval/analyse/variant_prioritisation_analysis.py +78 -212
pheval/cli.py +2 -4
pheval/cli_pheval_utils.py +34 -245
pheval/prepare/create_noisy_phenopackets.py +78 -67
pheval-0.4.1.dist-info/METADATA +113 -0
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/RECORD +22 -22
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/WHEEL +1 -1
pheval/analyse/parse_pheval_result.py +0 -43
pheval/analyse/prioritisation_rank_recorder.py +0 -83
pheval/constants.py +0 -8
pheval-0.3.9.dist-info/METADATA +0 -35
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/LICENSE +0 -0
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/entry_points.txt +0 -0

pheval/analyse/parse_benchmark_summary.py CHANGED Viewed

@@ -1,55 +1,34 @@
+from dataclasses import dataclass
 from pathlib import Path
 from typing import List
 import pandas as pd
+from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
 from pheval.analyse.benchmarking_data import BenchmarkRunResults
 from pheval.analyse.binary_classification_stats import BinaryClassificationStats
 from pheval.analyse.rank_stats import RankStats
-def read_benchmark_tsv_result_summary(benchmarking_tsv: Path) -> pd.DataFrame:
-    """
-    Read the summary benchmark TSV output generated from the benchmark-comparison command.
+@dataclass
+class BenchmarkSummaryResults:
+    gene_results: List[BenchmarkRunResults]
+    disease_results: List[BenchmarkRunResults]
+    variant_results: List[BenchmarkRunResults]
-    Args:
-        benchmarking_tsv (Path): Path to the summary benchmark TSV output file.
-    Returns:
-        pd.DataFrame: A pandas DataFrame containing specific columns from the TSV file, including:
-                      'results_directory_path', 'top', 'top3', 'top5', 'top10', 'found',
-                      'total', 'mean_reciprocal_rank'.
+def parse_benchmark_results(benchmark_summary_table: pd.DataFrame) -> List[BenchmarkRunResults]:
     """
-    return pd.read_csv(
-        benchmarking_tsv,
-        delimiter="\t",
-        usecols=[
-            "results_directory_path",
-            "top",
-            "top3",
-            "top5",
-            "top10",
-            "found",
-            "total",
-            "mean_reciprocal_rank",
-        ],
-    )
-def parse_benchmark_result_summary(benchmarking_df: pd.DataFrame) -> List[BenchmarkRunResults]:
-    """
-    Parse the summary benchmark DataFrame into a list of BenchmarkRunResults.
+    Parse benchmark results from a DataFrame.
     Args:
-        benchmarking_df (pd.DataFrame): Summary benchmark DataFrame containing columns such as
-                                        'results_directory_path', 'top', 'top3', 'top5', 'top10',
-                                        'found', 'total', 'mean_reciprocal_rank'.
+        benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results.
     Returns:
-        List[BenchmarkRunResults]: A list of BenchmarkRunResults instances generated from the DataFrame.
+        List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame.
     """
-    benchmarking_results = []
-    for _, row in benchmarking_df.iterrows():
+    results = []
+    for _, row in benchmark_summary_table.iterrows():
         benchmarking_result = BenchmarkRunResults(
             rank_stats=RankStats(
                 top=row["top"],
@@ -60,9 +39,43 @@ def parse_benchmark_result_summary(benchmarking_df: pd.DataFrame) -> List[Benchm
                 total=row["total"],
                 mrr=row["mean_reciprocal_rank"],
             ),
-            ranks={},
             benchmark_name=row["results_directory_path"],
             binary_classification_stats=BinaryClassificationStats(),
         )
-        benchmarking_results.append(benchmarking_result)
-    return benchmarking_results
+        results.append(benchmarking_result)
+    return results
+def parse_benchmark_db(benchmarking_db: Path) -> BenchmarkSummaryResults:
+    """
+    Read the summary benchmark TSV output generated from the benchmark-comparison command.
+    Args:
+        benchmarking_db (Path): Path to the benchmark db.
+    Returns:
+        BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db.
+    """
+    db_connector = BenchmarkDBManager(benchmarking_db)
+    gene_benchmarking_results, disease_benchmarking_results, variant_benchmarking_results = (
+        None,
+        None,
+        None,
+    )
+    if db_connector.check_table_exists("gene_summary"):
+        gene_benchmarking_results = parse_benchmark_results(
+            db_connector.conn.execute("SELECT * FROM gene_summary").fetchdf()
+        )
+    if db_connector.check_table_exists("disease_summary"):
+        disease_benchmarking_results = parse_benchmark_results(
+            db_connector.conn.execute("SELECT * FROM disease_summary").fetchdf()
+        )
+    if db_connector.check_table_exists("variant_summary"):
+        variant_benchmarking_results = parse_benchmark_results(
+            db_connector.conn.execute("SELECT * FROM variant_summary").fetchdf()
+        )
+    return BenchmarkSummaryResults(
+        gene_results=gene_benchmarking_results,
+        disease_results=disease_benchmarking_results,
+        variant_results=variant_benchmarking_results,
+    )

pheval/analyse/parse_corpus.py ADDED Viewed

@@ -0,0 +1,219 @@
+from pathlib import Path
+from typing import List
+from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
+from pheval.analyse.benchmark_generator import (
+    BenchmarkRunOutputGenerator,
+    DiseaseBenchmarkRunOutputGenerator,
+    GeneBenchmarkRunOutputGenerator,
+    VariantBenchmarkRunOutputGenerator,
+)
+from pheval.utils.file_utils import all_files
+from pheval.utils.phenopacket_utils import (
+    GenomicVariant,
+    PhenopacketUtil,
+    ProbandCausativeGene,
+    ProbandDisease,
+    phenopacket_reader,
+)
+def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
+    """
+    Obtain known diseases from a Phenopacket.
+    Args:
+       phenopacket_path (Path): Path to the Phenopacket file.
+    Returns:
+       List[ProbandDisease]: A list of known diseases associated with the proband,
+       extracted from the Phenopacket.
+    """
+    phenopacket = phenopacket_reader(phenopacket_path)
+    phenopacket_util = PhenopacketUtil(phenopacket)
+    return phenopacket_util.diagnoses()
+def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
+    """
+    Obtain known variants from a Phenopacket.
+    Args:
+       phenopacket_path (Path): Path to the Phenopacket file.
+    Returns:
+       List[GenomicVariant]: A list of known variants associated with the proband,
+       extracted from the Phenopacket.
+    """
+    phenopacket = phenopacket_reader(phenopacket_path)
+    phenopacket_util = PhenopacketUtil(phenopacket)
+    return phenopacket_util.diagnosed_variants()
+def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
+    """
+    Obtain known genes from a Phenopacket.
+    Args:
+       phenopacket_path (Path): Path to the Phenopacket file.
+    Returns:
+       List[ProbandCausativeGene]: A list of known genes associated with the proband,
+       extracted from the Phenopacket.
+    """
+    phenopacket = phenopacket_reader(phenopacket_path)
+    phenopacket_util = PhenopacketUtil(phenopacket)
+    return phenopacket_util.diagnosed_genes()
+class CorpusParser:
+    """Class for parsing phenopacket corpus and retrieving known variants/genes/diseases."""
+    def __init__(self, benchmark_name: str, phenopacket_dir: Path) -> None:
+        """
+        Initialise the CorpusParser class.
+        Args:
+            phenopacket_dir (Path): Path to the Phenopacket directory.
+        """
+        self.phenopacket_dir = phenopacket_dir
+        self.conn = BenchmarkDBManager(benchmark_name).conn
+        self.table_name = phenopacket_dir.parents[0].name
+    def _create_gene_table(self) -> None:
+        """
+        Create the Gene benchmarking table if it doesn't already exist.
+        """
+        self.conn.execute(
+            f"""
+                    CREATE TABLE IF NOT EXISTS {self.table_name}_gene (
+                        identifier VARCHAR(255) PRIMARY KEY,
+                        phenopacket VARCHAR,
+                        gene_symbol VARCHAR,
+                        gene_identifier VARCHAR
+                    )
+                    """
+        )
+    def _create_variant_table(self) -> None:
+        """
+        Create the Variant benchmarking table if it doesn't already exist.
+        """
+        self.conn.execute(
+            f"""
+                    CREATE TABLE IF NOT EXISTS {self.table_name}_variant (
+                        identifier VARCHAR(255) PRIMARY KEY,
+                        phenopacket VARCHAR,
+                        chrom VARCHAR,
+                        pos INTEGER,
+                        "ref" VARCHAR,
+                        alt VARCHAR
+                    )
+                    """
+        )
+    def _create_disease_table(self):
+        """
+        Create the Disease benchmarking table if it doesn't already exist.
+        """
+        self.conn.execute(
+            f"""
+                    CREATE TABLE IF NOT EXISTS {self.table_name}_disease (
+                        identifier VARCHAR(255) PRIMARY KEY,
+                        phenopacket VARCHAR,
+                        disease_identifier VARCHAR,
+                        disease_name VARCHAR
+                    )
+                    """
+        )
+    def _create_tables(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
+        """
+        Create tables based on the benchmarking analysis specified.
+        Args:
+            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type.
+        """
+        if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
+            self._create_gene_table()
+        if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
+            self._create_variant_table()
+        if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
+            self._create_disease_table()
+    def _insert_genes(self, phenopacket_path: Path, genes: List[ProbandCausativeGene]) -> None:
+        """
+        Insert known disease-causing genes into the Gene benchmarking table.
+        Args:
+            phenopacket_path(Path): Path to the Phenopacket file.
+            genes(List[ProbandCausativeGene]): List of known genes associated with the proband.
+        """
+        for gene in genes:
+            identifier = f"{phenopacket_path.name}-{gene.gene_symbol}"
+            self.conn.execute(
+                f"""
+                INSERT OR IGNORE INTO {self.table_name}_gene (identifier, phenopacket, gene_symbol, gene_identifier)
+                VALUES (?, ?, ?, ?)
+                """,
+                (identifier, phenopacket_path.name, gene.gene_symbol, gene.gene_identifier),
+            )
+    def _insert_variants(self, phenopacket_path: Path, variants: List[GenomicVariant]) -> None:
+        """
+        Insert known variants into the Variant benchmarking table.
+        Args:
+            phenopacket_path (Path): Path to the Phenopacket file.:
+            variants (List[GenomicVariant]): List of known variants associated with the proband.
+        """
+        for variant in variants:
+            identifier = (
+                f"{phenopacket_path.name}-{variant.chrom}-{variant.pos}-{variant.ref}-{variant.alt}"
+            )
+            self.conn.execute(
+                f"""
+                INSERT OR IGNORE INTO {self.table_name}_variant (identifier, phenopacket, chrom, pos, "ref", alt)
+                VALUES (?, ?, ?, ?, ?, ?)
+                """,
+                (
+                    identifier,
+                    phenopacket_path.name,
+                    variant.chrom,
+                    variant.pos,
+                    variant.ref,
+                    variant.alt,
+                ),
+            )
+    def _insert_diseases(self, phenopacket_path: Path, diseases: List[ProbandDisease]) -> None:
+        """
+        Insert known diseases into the Disease benchmarking table.
+        Args:
+            phenopacket_path (Path): Path to the Phenopacket file.:
+            diseases (List[ProbandDisease]): List of known diseases associated with the proband.
+        """
+        for disease in diseases:
+            identifier = f"{phenopacket_path.name}-{disease.disease_identifier}"
+            self.conn.execute(
+                f"INSERT OR IGNORE INTO {self.table_name}_disease "
+                f"(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)",
+                (
+                    identifier,
+                    phenopacket_path.name,
+                    disease.disease_identifier,
+                    disease.disease_name,
+                ),
+            )
+    def parse_corpus(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
+        """
+        Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables.
+        Args:
+            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type.
+        """
+        self._create_tables(benchmark_generator)
+        for phenopacket_path in all_files(self.phenopacket_dir):
+            if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
+                genes = _obtain_causative_genes(phenopacket_path)
+                self._insert_genes(phenopacket_path, genes)
+            if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
+                variants = _obtain_causative_variants(phenopacket_path)
+                self._insert_variants(phenopacket_path, variants)
+            if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
+                diseases = _obtain_causative_diseases(phenopacket_path)
+                self._insert_diseases(phenopacket_path, diseases)
+        self.conn.close()

pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

Potentially problematic release.

pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl