PyPI - pheval - Versions diffs - 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

pheval 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (33)

pheval/analyse/benchmark.py +156 -0
pheval/analyse/benchmark_db_manager.py +16 -134
pheval/analyse/benchmark_output_type.py +43 -0
pheval/analyse/binary_classification_curves.py +132 -0
pheval/analyse/binary_classification_stats.py +164 -307
pheval/analyse/generate_plots.py +210 -395
pheval/analyse/generate_rank_comparisons.py +44 -0
pheval/analyse/rank_stats.py +190 -382
pheval/analyse/run_data_parser.py +21 -39
pheval/cli.py +27 -24
pheval/cli_pheval_utils.py +7 -8
pheval/post_processing/phenopacket_truth_set.py +235 -0
pheval/post_processing/post_processing.py +185 -337
pheval/post_processing/validate_result_format.py +92 -0
pheval/prepare/update_phenopacket.py +11 -9
pheval/utils/logger.py +35 -0
pheval/utils/phenopacket_utils.py +85 -91
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
pheval/analyse/analysis.py +0 -104
pheval/analyse/assess_prioritisation_base.py +0 -108
pheval/analyse/benchmark_generator.py +0 -126
pheval/analyse/benchmarking_data.py +0 -25
pheval/analyse/disease_prioritisation_analysis.py +0 -152
pheval/analyse/gene_prioritisation_analysis.py +0 -147
pheval/analyse/generate_summary_outputs.py +0 -105
pheval/analyse/parse_benchmark_summary.py +0 -81
pheval/analyse/parse_corpus.py +0 -219
pheval/analyse/prioritisation_result_types.py +0 -52
pheval/analyse/variant_prioritisation_analysis.py +0 -159
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/WHEEL +0 -0
{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0

pheval/analyse/parse_benchmark_summary.py DELETED Viewed

@@ -1,81 +0,0 @@
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List
-import pandas as pd
-from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
-from pheval.analyse.benchmarking_data import BenchmarkRunResults
-from pheval.analyse.binary_classification_stats import BinaryClassificationStats
-from pheval.analyse.rank_stats import RankStats
-@dataclass
-class BenchmarkSummaryResults:
-    gene_results: List[BenchmarkRunResults]
-    disease_results: List[BenchmarkRunResults]
-    variant_results: List[BenchmarkRunResults]
-def parse_benchmark_results(benchmark_summary_table: pd.DataFrame) -> List[BenchmarkRunResults]:
-    """
-    Parse benchmark results from a DataFrame.
-    Args:
-        benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results.
-    Returns:
-        List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame.
-    """
-    results = []
-    for _, row in benchmark_summary_table.iterrows():
-        benchmarking_result = BenchmarkRunResults(
-            rank_stats=RankStats(
-                top=row["top"],
-                top3=row["top3"],
-                top5=row["top5"],
-                top10=row["top10"],
-                found=row["found"],
-                total=row["total"],
-                mrr=row["mean_reciprocal_rank"],
-            ),
-            benchmark_name=row["results_directory_path"],
-            binary_classification_stats=BinaryClassificationStats(),
-        )
-        results.append(benchmarking_result)
-    return results
-def parse_benchmark_db(benchmarking_db: Path) -> BenchmarkSummaryResults:
-    """
-    Read the summary benchmark TSV output generated from the benchmark-comparison command.
-    Args:
-        benchmarking_db (Path): Path to the benchmark db.
-    Returns:
-        BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db.
-    """
-    db_connector = BenchmarkDBManager(benchmarking_db)
-    gene_benchmarking_results, disease_benchmarking_results, variant_benchmarking_results = (
-        None,
-        None,
-        None,
-    )
-    if db_connector.check_table_exists("gene_summary"):
-        gene_benchmarking_results = parse_benchmark_results(
-            db_connector.conn.execute("SELECT * FROM gene_summary").fetchdf()
-        )
-    if db_connector.check_table_exists("disease_summary"):
-        disease_benchmarking_results = parse_benchmark_results(
-            db_connector.conn.execute("SELECT * FROM disease_summary").fetchdf()
-        )
-    if db_connector.check_table_exists("variant_summary"):
-        variant_benchmarking_results = parse_benchmark_results(
-            db_connector.conn.execute("SELECT * FROM variant_summary").fetchdf()
-        )
-    return BenchmarkSummaryResults(
-        gene_results=gene_benchmarking_results,
-        disease_results=disease_benchmarking_results,
-        variant_results=variant_benchmarking_results,
-    )

pheval/analyse/parse_corpus.py DELETED Viewed

@@ -1,219 +0,0 @@
-from pathlib import Path
-from typing import List
-from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
-from pheval.analyse.benchmark_generator import (
-    BenchmarkRunOutputGenerator,
-    DiseaseBenchmarkRunOutputGenerator,
-    GeneBenchmarkRunOutputGenerator,
-    VariantBenchmarkRunOutputGenerator,
-)
-from pheval.utils.file_utils import all_files
-from pheval.utils.phenopacket_utils import (
-    GenomicVariant,
-    PhenopacketUtil,
-    ProbandCausativeGene,
-    ProbandDisease,
-    phenopacket_reader,
-)
-def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
-    """
-    Obtain known diseases from a Phenopacket.
-    Args:
-       phenopacket_path (Path): Path to the Phenopacket file.
-    Returns:
-       List[ProbandDisease]: A list of known diseases associated with the proband,
-       extracted from the Phenopacket.
-    """
-    phenopacket = phenopacket_reader(phenopacket_path)
-    phenopacket_util = PhenopacketUtil(phenopacket)
-    return phenopacket_util.diagnoses()
-def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
-    """
-    Obtain known variants from a Phenopacket.
-    Args:
-       phenopacket_path (Path): Path to the Phenopacket file.
-    Returns:
-       List[GenomicVariant]: A list of known variants associated with the proband,
-       extracted from the Phenopacket.
-    """
-    phenopacket = phenopacket_reader(phenopacket_path)
-    phenopacket_util = PhenopacketUtil(phenopacket)
-    return phenopacket_util.diagnosed_variants()
-def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
-    """
-    Obtain known genes from a Phenopacket.
-    Args:
-       phenopacket_path (Path): Path to the Phenopacket file.
-    Returns:
-       List[ProbandCausativeGene]: A list of known genes associated with the proband,
-       extracted from the Phenopacket.
-    """
-    phenopacket = phenopacket_reader(phenopacket_path)
-    phenopacket_util = PhenopacketUtil(phenopacket)
-    return phenopacket_util.diagnosed_genes()
-class CorpusParser:
-    """Class for parsing phenopacket corpus and retrieving known variants/genes/diseases."""
-    def __init__(self, benchmark_name: str, phenopacket_dir: Path) -> None:
-        """
-        Initialise the CorpusParser class.
-        Args:
-            phenopacket_dir (Path): Path to the Phenopacket directory.
-        """
-        self.phenopacket_dir = phenopacket_dir
-        self.conn = BenchmarkDBManager(benchmark_name).conn
-        self.table_name = phenopacket_dir.parents[0].name
-    def _create_gene_table(self) -> None:
-        """
-        Create the Gene benchmarking table if it doesn't already exist.
-        """
-        self.conn.execute(
-            f"""
-                    CREATE TABLE IF NOT EXISTS "{self.table_name}_gene" (
-                        identifier VARCHAR(255) PRIMARY KEY,
-                        phenopacket VARCHAR,
-                        gene_symbol VARCHAR,
-                        gene_identifier VARCHAR
-                    )
-                    """
-        )
-    def _create_variant_table(self) -> None:
-        """
-        Create the Variant benchmarking table if it doesn't already exist.
-        """
-        self.conn.execute(
-            f"""
-                    CREATE TABLE IF NOT EXISTS "{self.table_name}_variant" (
-                        identifier VARCHAR(255) PRIMARY KEY,
-                        phenopacket VARCHAR,
-                        chrom VARCHAR,
-                        pos INTEGER,
-                        "ref" VARCHAR,
-                        alt VARCHAR
-                    )
-                    """
-        )
-    def _create_disease_table(self):
-        """
-        Create the Disease benchmarking table if it doesn't already exist.
-        """
-        self.conn.execute(
-            f"""
-                    CREATE TABLE IF NOT EXISTS "{self.table_name}_disease" (
-                        identifier VARCHAR(255) PRIMARY KEY,
-                        phenopacket VARCHAR,
-                        disease_identifier VARCHAR,
-                        disease_name VARCHAR
-                    )
-                    """
-        )
-    def _create_tables(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
-        """
-        Create tables based on the benchmarking analysis specified.
-        Args:
-            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type.
-        """
-        if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
-            self._create_gene_table()
-        if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
-            self._create_variant_table()
-        if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
-            self._create_disease_table()
-    def _insert_genes(self, phenopacket_path: Path, genes: List[ProbandCausativeGene]) -> None:
-        """
-        Insert known disease-causing genes into the Gene benchmarking table.
-        Args:
-            phenopacket_path(Path): Path to the Phenopacket file.
-            genes(List[ProbandCausativeGene]): List of known genes associated with the proband.
-        """
-        for gene in genes:
-            identifier = f"{phenopacket_path.name}-{gene.gene_symbol}"
-            self.conn.execute(
-                f"""
-                INSERT OR IGNORE INTO "{self.table_name}_gene" (identifier, phenopacket, gene_symbol, gene_identifier)
-                VALUES (?, ?, ?, ?)
-                """,
-                (identifier, phenopacket_path.name, gene.gene_symbol, gene.gene_identifier),
-            )
-    def _insert_variants(self, phenopacket_path: Path, variants: List[GenomicVariant]) -> None:
-        """
-        Insert known variants into the Variant benchmarking table.
-        Args:
-            phenopacket_path (Path): Path to the Phenopacket file.:
-            variants (List[GenomicVariant]): List of known variants associated with the proband.
-        """
-        for variant in variants:
-            identifier = (
-                f"{phenopacket_path.name}-{variant.chrom}-{variant.pos}-{variant.ref}-{variant.alt}"
-            )
-            self.conn.execute(
-                f"""
-                INSERT OR IGNORE INTO "{self.table_name}_variant" (identifier, phenopacket, chrom, pos, "ref", alt)
-                VALUES (?, ?, ?, ?, ?, ?)
-                """,
-                (
-                    identifier,
-                    phenopacket_path.name,
-                    variant.chrom,
-                    variant.pos,
-                    variant.ref,
-                    variant.alt,
-                ),
-            )
-    def _insert_diseases(self, phenopacket_path: Path, diseases: List[ProbandDisease]) -> None:
-        """
-        Insert known diseases into the Disease benchmarking table.
-        Args:
-            phenopacket_path (Path): Path to the Phenopacket file.:
-            diseases (List[ProbandDisease]): List of known diseases associated with the proband.
-        """
-        for disease in diseases:
-            identifier = f"{phenopacket_path.name}-{disease.disease_identifier}"
-            self.conn.execute(
-                f"""INSERT OR IGNORE INTO "{self.table_name}_disease" """
-                f"""(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)""",
-                (
-                    identifier,
-                    phenopacket_path.name,
-                    disease.disease_identifier,
-                    disease.disease_name,
-                ),
-            )
-    def parse_corpus(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
-        """
-        Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables.
-        Args:
-            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type.
-        """
-        self._create_tables(benchmark_generator)
-        for phenopacket_path in all_files(self.phenopacket_dir):
-            if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
-                genes = _obtain_causative_genes(phenopacket_path)
-                self._insert_genes(phenopacket_path, genes)
-            if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
-                variants = _obtain_causative_variants(phenopacket_path)
-                self._insert_variants(phenopacket_path, variants)
-            if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
-                diseases = _obtain_causative_diseases(phenopacket_path)
-                self._insert_diseases(phenopacket_path, diseases)
-        self.conn.close()

pheval/analyse/prioritisation_result_types.py DELETED Viewed

@@ -1,52 +0,0 @@
-from dataclasses import dataclass
-from pathlib import Path
-from pheval.utils.phenopacket_utils import GenomicVariant, ProbandDisease
-@dataclass
-class GenePrioritisationResult:
-    """
-    Store rank data for causative genes.
-    Attributes:
-        phenopacket_path (Path): Path to the phenopacket.
-        gene (str): The causative gene.
-        rank (int): The assigned rank for the gene. Defaults to 0.
-    """
-    phenopacket_path: Path
-    gene: str
-    rank: int = 0
-@dataclass
-class VariantPrioritisationResult:
-    """
-    Store rank data for variants.
-    Attributes:
-        phenopacket_path (Path): Path to the phenopacket.
-        variant (GenomicVariant): The genomic variant.
-        rank (int): The assigned rank for the variant. Defaults to 0.
-    """
-    phenopacket_path: Path
-    variant: GenomicVariant
-    rank: int = 0
-@dataclass
-class DiseasePrioritisationResult:
-    """
-    Store rank data for known diseases.
-    Attributes:
-        phenopacket_path (Path): Path to the phenopacket.
-        disease (ProbandDisease): The proband disease.
-        rank (int): The assigned rank for the disease. Defaults to 0.
-    """
-    phenopacket_path: Path
-    disease: ProbandDisease
-    rank: int = 0

pheval/analyse/variant_prioritisation_analysis.py DELETED Viewed

@@ -1,159 +0,0 @@
-from pathlib import Path
-from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
-from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
-from pheval.analyse.benchmarking_data import BenchmarkRunResults
-from pheval.analyse.binary_classification_stats import BinaryClassificationStats
-from pheval.analyse.rank_stats import RankStats
-from pheval.analyse.run_data_parser import RunConfig
-from pheval.post_processing.post_processing import RankedPhEvalVariantResult
-from pheval.utils.file_utils import all_files
-from pheval.utils.phenopacket_utils import GenomicVariant
-class AssessVariantPrioritisation(AssessPrioritisationBase):
-    """Class for assessing variant prioritisation based on thresholds and scoring orders."""
-    def assess_variant_prioritisation(
-        self,
-        standardised_variant_result_path: Path,
-        phenopacket_path: Path,
-        binary_classification_stats: BinaryClassificationStats,
-    ) -> None:
-        """
-        Assess variant prioritisation.
-        This method assesses the prioritisation of variants based on the provided criteria
-        and records ranks using a PrioritisationRankRecorder.
-        Args:
-            standardised_variant_result_path (Path): Path to standardised variant TSV result.
-            phenopacket_path (Path): Path to the phenopacket.
-            binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
-        """
-        relevant_ranks = []
-        df = self.conn.execute(
-            f"""SELECT * FROM "{self.table_name}" WHERE phenopacket = '{phenopacket_path.name}'"""
-        ).fetchdf()
-        for _i, row in df.iterrows():
-            causative_variant = GenomicVariant(
-                chrom=row["chrom"],
-                pos=int(row["pos"]),
-                ref=row["ref"],
-                alt=row["alt"],
-            )
-            result = (
-                self.conn.execute(
-                    (
-                        f"SELECT * FROM '{standardised_variant_result_path}' "
-                        f"WHERE "
-                        f"chromosome == '{causative_variant.chrom}' AND "
-                        f"start == {causative_variant.pos} AND "
-                        f"ref == '{causative_variant.ref}' AND "
-                        f"alt == '{causative_variant.alt}'"
-                    )
-                    if standardised_variant_result_path.exists()
-                    else "SELECT NULL WHERE FALSE"
-                )
-                .fetchdf()
-                .to_dict(orient="records")
-            )
-            if len(result) > 0:
-                variant_match = self._record_matched_entity(RankedPhEvalVariantResult(**result[0]))
-                relevant_ranks.append(variant_match)
-                primary_key = (
-                    f"{phenopacket_path.name}-{causative_variant.chrom}-{causative_variant.pos}-"
-                    f"{causative_variant.ref}-{causative_variant.alt}"
-                )
-                self.conn.execute(
-                    f'UPDATE "{self.table_name}" SET "{self.column}" = ? WHERE identifier = ?',
-                    (variant_match, primary_key),
-                )
-            elif len(result) == 0:
-                relevant_ranks.append(0)
-        binary_classification_stats.add_classification(
-            (
-                self.db_connection.parse_table_into_dataclass(
-                    str(standardised_variant_result_path), RankedPhEvalVariantResult
-                )
-                if standardised_variant_result_path.exists()
-                else []
-            ),
-            relevant_ranks,
-        )
-def assess_phenopacket_variant_prioritisation(
-    phenopacket_path: Path,
-    run: RunConfig,
-    variant_binary_classification_stats: BinaryClassificationStats,
-    variant_benchmarker: AssessVariantPrioritisation,
-) -> None:
-    """
-    Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results
-    against the recorded causative variants for a proband in the Phenopacket.
-    Args:
-        phenopacket_path (Path): Path to the Phenopacket.
-        run (RunConfig): Run configuration.
-        variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
-        variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance.
-    """
-    standardised_variant_result_path = run.results_dir.joinpath(
-        f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
-    )
-    variant_benchmarker.assess_variant_prioritisation(
-        standardised_variant_result_path,
-        phenopacket_path,
-        variant_binary_classification_stats,
-    )
-def benchmark_variant_prioritisation(
-    benchmark_name: str,
-    run: RunConfig,
-    score_order: str,
-    threshold: float,
-):
-    """
-    Benchmark a directory based on variant prioritisation results.
-    Args:
-        benchmark_name (str): Name of the benchmark.
-        run (RunConfig): Run configuration.
-        score_order (str): The order in which scores are arranged.
-        threshold (float): Threshold for assessment.
-    Returns:
-        BenchmarkRunResults: An object containing benchmarking results for variant prioritisation,
-        including ranks and rank statistics for the benchmarked directory.
-    """
-    variant_binary_classification_stats = BinaryClassificationStats()
-    db_connection = BenchmarkDBManager(benchmark_name)
-    variant_benchmarker = AssessVariantPrioritisation(
-        db_connection,
-        f"{run.phenopacket_dir.parents[0].name}" f"_variant",
-        run.run_identifier,
-        threshold,
-        score_order,
-    )
-    for phenopacket_path in all_files(run.phenopacket_dir):
-        assess_phenopacket_variant_prioritisation(
-            phenopacket_path,
-            run,
-            variant_binary_classification_stats,
-            variant_benchmarker,
-        )
-    variant_rank_stats = RankStats()
-    variant_rank_stats.add_ranks(
-        benchmark_name=benchmark_name,
-        table_name=f"{run.phenopacket_dir.parents[0].name}_variant",
-        column_name=str(run.run_identifier),
-    )
-    return BenchmarkRunResults(
-        benchmark_name=run.run_identifier,
-        rank_stats=variant_rank_stats,
-        binary_classification_stats=variant_binary_classification_stats,
-        phenopacket_dir=run.phenopacket_dir,
-    )

{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pheval 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl

Potentially problematic release.

pheval 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl