PyPI - pheval - Versions diffs - 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl - Mend

pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (26) [hide | show]

pheval/analyse/analysis.py +61 -150
pheval/analyse/assess_prioritisation_base.py +108 -0
pheval/analyse/benchmark_db_manager.py +140 -0
pheval/analyse/benchmark_generator.py +47 -50
pheval/analyse/benchmarking_data.py +3 -2
pheval/analyse/disease_prioritisation_analysis.py +70 -219
pheval/analyse/gene_prioritisation_analysis.py +66 -242
pheval/analyse/generate_plots.py +81 -79
pheval/analyse/generate_summary_outputs.py +64 -134
pheval/analyse/parse_benchmark_summary.py +50 -37
pheval/analyse/parse_corpus.py +219 -0
pheval/analyse/rank_stats.py +177 -144
pheval/analyse/run_data_parser.py +108 -27
pheval/analyse/variant_prioritisation_analysis.py +78 -212
pheval/cli.py +2 -4
pheval/cli_pheval_utils.py +34 -245
pheval/prepare/create_noisy_phenopackets.py +78 -67
pheval-0.4.1.dist-info/METADATA +113 -0
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/RECORD +22 -22
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/WHEEL +1 -1
pheval/analyse/parse_pheval_result.py +0 -43
pheval/analyse/prioritisation_rank_recorder.py +0 -83
pheval/constants.py +0 -8
pheval-0.3.9.dist-info/METADATA +0 -35
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/LICENSE +0 -0
{pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/entry_points.txt +0 -0

pheval/analyse/gene_prioritisation_analysis.py CHANGED Viewed

@@ -1,169 +1,22 @@
-import ast
-import re
-from collections import defaultdict
 from pathlib import Path
-from typing import List, Union
+from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
+from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
 from pheval.analyse.benchmarking_data import BenchmarkRunResults
 from pheval.analyse.binary_classification_stats import BinaryClassificationStats
-from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
-from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
-from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
 from pheval.analyse.rank_stats import RankStats
-from pheval.analyse.run_data_parser import TrackInputOutputDirectories
+from pheval.analyse.run_data_parser import RunConfig
 from pheval.post_processing.post_processing import RankedPhEvalGeneResult
 from pheval.utils.file_utils import all_files
-from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
-class AssessGenePrioritisation:
+class AssessGenePrioritisation(AssessPrioritisationBase):
     """Class for assessing gene prioritisation based on thresholds and scoring orders."""
-    def __init__(
-        self,
-        phenopacket_path: Path,
-        results_dir: Path,
-        standardised_gene_results: List[RankedPhEvalGeneResult],
-        threshold: float,
-        score_order: str,
-        proband_causative_genes: List[ProbandCausativeGene],
-    ):
-        """
-        Initialise AssessGenePrioritisation class.
-        Args:
-            phenopacket_path (Path): Path to the phenopacket file
-            results_dir (Path): Path to the results directory
-            standardised_gene_results (List[RankedPhEvalGeneResult]): List of ranked PhEval gene results
-            threshold (float): Threshold for scores
-            score_order (str): Score order for results, either ascending or descending
-            proband_causative_genes (List[ProbandCausativeGene]): List of proband causative genes
-        """
-        self.phenopacket_path = phenopacket_path
-        self.results_dir = results_dir
-        self.standardised_gene_results = standardised_gene_results
-        self.threshold = threshold
-        self.score_order = score_order
-        self.proband_causative_genes = proband_causative_genes
-    def _record_gene_prioritisation_match(
-        self,
-        gene: ProbandCausativeGene,
-        result_entry: RankedPhEvalGeneResult,
-        rank_stats: RankStats,
-    ) -> GenePrioritisationResult:
-        """
-        Record the gene prioritisation rank if found within the results
-        Args:
-            gene (ProbandCausativeGene): Diagnosed proband gene
-            result_entry (RankedPhEvalGeneResult): Ranked PhEval gene result entry
-            rank_stats (RankStats): RankStats class instance
-        Returns:
-            GenePrioritisationResult: Recorded correct gene prioritisation rank result
-        """
-        rank = result_entry.rank
-        rank_stats.add_rank(rank)
-        return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)
-    def _assess_gene_with_threshold_ascending_order(
-        self,
-        result_entry: RankedPhEvalGeneResult,
-        gene: ProbandCausativeGene,
-        rank_stats: RankStats,
-    ) -> GenePrioritisationResult:
-        """
-        Record the gene prioritisation rank if it meets the ascending order threshold.
-        This method checks if the gene prioritisation rank meets the ascending order threshold.
-        If the score of the result entry is less than the threshold, it records the gene rank.
-        Args:
-            result_entry (RankedPhEvalGeneResult): Ranked PhEval gene result entry
-            gene (ProbandCausativeGene): Diagnosed proband gene
-            rank_stats (RankStats): RankStats class instance
-        Returns:
-            GenePrioritisationResult: Recorded correct gene prioritisation rank result
-        """
-        if float(self.threshold) > float(result_entry.score):
-            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
-    def _assess_gene_with_threshold(
-        self,
-        result_entry: RankedPhEvalGeneResult,
-        gene: ProbandCausativeGene,
-        rank_stats: RankStats,
-    ) -> GenePrioritisationResult:
-        """
-        Record the gene prioritisation rank if it meets the score threshold.
-        This method checks if the gene prioritisation rank meets the score threshold.
-        If the score of the result entry is greater than the threshold, it records the gene rank.
-        Args:
-            result_entry (RankedPhEvalResult): Ranked PhEval gene result entry
-            gene (ProbandCausativeGene): Diagnosed proband gene
-            rank_stats (RankStats): RankStats class instance
-        Returns:
-            GenePrioritisationResult: Recorded correct gene prioritisation rank result
-        """
-        if float(self.threshold) < float(result_entry.score):
-            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
-    def _record_matched_gene(
-        self,
-        gene: ProbandCausativeGene,
-        rank_stats: RankStats,
-        standardised_gene_result: RankedPhEvalGeneResult,
-    ) -> GenePrioritisationResult:
-        """
-        Return the gene rank result - handling the specification of a threshold.
-        This method determines and returns the gene rank result based on the specified threshold
-        and score order. If the threshold is 0.0, it records the gene rank directly.
-        Otherwise, it assesses the gene with the threshold based on the score order.
-        Args:
-            gene (ProbandCausativeGene): Diagnosed proband gene
-            rank_stats (RankStats): RankStats class instance
-            standardised_gene_result (RankedPhEvalGeneResult): Ranked PhEval gene result entry
-        Returns:
-            GenePrioritisationResult: Recorded correct gene prioritisation rank result
-        """
-        if float(self.threshold) == 0.0:
-            return self._record_gene_prioritisation_match(
-                gene, standardised_gene_result, rank_stats
-            )
-        else:
-            return (
-                self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)
-                if self.score_order != "ascending"
-                else self._assess_gene_with_threshold_ascending_order(
-                    standardised_gene_result, gene, rank_stats
-                )
-            )
-    @staticmethod
-    def _check_string_representation(entity: str) -> Union[List[str], str]:
-        """
-        Check if the input string is a representation of a list and returns the list if true, otherwise the string.
-        Args:
-            entity (str): The input entity to check.
-        Returns:
-            Union[List[str], str]: A list if the input string is a list representation, otherwise
-            the original string.
-        """
-        list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*\]$")
-        if list_pattern.match(str(entity)):
-            return ast.literal_eval(entity)
-        else:
-            return entity
     def assess_gene_prioritisation(
         self,
-        rank_stats: RankStats,
-        rank_records: defaultdict,
+        standardised_gene_result_path: Path,
+        phenopacket_path: Path,
         binary_classification_stats: BinaryClassificationStats,
     ) -> None:
         """
@@ -172,78 +25,47 @@ class AssessGenePrioritisation:
         and records ranks using a PrioritisationRankRecorder.
         Args:
-            rank_stats (RankStats): RankStats class instance
-            rank_records (defaultdict): A defaultdict to store the correct ranked results.
+            standardised_gene_result_path (Path): Path to the standardised gene TSV result.
+            phenopacket_path (Path): Path to the Phenopacket.
             binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
         """
         relevant_ranks = []
-        for gene in self.proband_causative_genes:
-            rank_stats.total += 1
-            gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
-            for standardised_gene_result in self.standardised_gene_results:
-                gene_identifier = self._check_string_representation(
-                    standardised_gene_result.gene_identifier
+        df = self.conn.execute(
+            f"""SELECT * FROM {self.table_name} WHERE phenopacket = '{phenopacket_path.name}'"""
+        ).fetchdf()
+        for _i, row in df.iterrows():
+            result = (
+                self.conn.execute(
+                    f"SELECT * FROM '{standardised_gene_result_path}' "
+                    f"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),"
+                    f" '{row['gene_identifier']}') "
+                    f"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), "
+                    f"'{row['gene_symbol']}')"
                 )
-                gene_symbol = self._check_string_representation(
-                    standardised_gene_result.gene_symbol
+                .fetchdf()
+                .to_dict(orient="records")
+            )
+            if len(result) > 0:
+                gene_match = self._record_matched_entity(RankedPhEvalGeneResult(**result[0]))
+                relevant_ranks.append(gene_match)
+                primary_key = f"{phenopacket_path.name}-{row['gene_symbol']}"
+                self.conn.execute(
+                    f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
+                    (gene_match, primary_key),
                 )
-                if (
-                    isinstance(gene_identifier, list)
-                    and gene.gene_identifier in gene_identifier
-                    or isinstance(gene_identifier, str)
-                    and gene.gene_identifier == str
-                    or isinstance(gene_symbol, list)
-                    and gene.gene_symbol in gene_symbol
-                    or isinstance(gene_symbol, str)
-                    and gene.gene_symbol == gene_symbol
-                ):
-                    gene_match = self._record_matched_gene(
-                        gene, rank_stats, standardised_gene_result
-                    )
-                    (
-                        relevant_ranks.append(gene_match.rank)
-                        if gene_match
-                        else relevant_ranks.append(0)
-                    )
-                    break
-            PrioritisationRankRecorder(
-                rank_stats.total,
-                self.results_dir,
-                (
-                    GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
-                    if gene_match is None
-                    else gene_match
-                ),
-                rank_records,
-            ).record_rank()
-        rank_stats.relevant_result_ranks.append(relevant_ranks)
         binary_classification_stats.add_classification(
-            pheval_results=self.standardised_gene_results, relevant_ranks=relevant_ranks
+            self.db_connection.parse_table_into_dataclass(
+                str(standardised_gene_result_path), RankedPhEvalGeneResult
+            ),
+            relevant_ranks,
         )
-def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
-    """
-    Obtain known genes from a Phenopacket.
-    Args:
-       phenopacket_path (Path): Path to the Phenopacket file.
-    Returns:
-       List[ProbandCausativeGene]: A list of known genes associated with the proband,
-       extracted from the Phenopacket.
-    """
-    phenopacket = phenopacket_reader(phenopacket_path)
-    phenopacket_util = PhenopacketUtil(phenopacket)
-    return phenopacket_util.diagnosed_genes()
 def assess_phenopacket_gene_prioritisation(
     phenopacket_path: Path,
-    score_order: str,
-    results_dir_and_input: TrackInputOutputDirectories,
-    threshold: float,
-    gene_rank_stats: RankStats,
-    gene_rank_comparison: defaultdict,
+    run: RunConfig,
     gene_binary_classification_stats: BinaryClassificationStats,
+    gene_benchmarker: AssessGenePrioritisation,
 ) -> None:
     """
     Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results
@@ -251,62 +73,64 @@ def assess_phenopacket_gene_prioritisation(
     Args:
         phenopacket_path (Path): Path to the Phenopacket.
-        score_order (str): The order in which scores are arranged, either ascending or descending.
-        results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
-        threshold (float): Threshold for assessment.
-        gene_rank_stats (RankStats): RankStats class instance.
-        gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
+        run (RunConfig): Run configuration.
         gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
+        gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance.
     """
-    standardised_gene_result = results_dir_and_input.results_dir.joinpath(
+    standardised_gene_result_path = run.results_dir.joinpath(
         f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
     )
-    pheval_gene_result = read_standardised_result(standardised_gene_result)
-    proband_causative_genes = _obtain_causative_genes(phenopacket_path)
-    AssessGenePrioritisation(
+    gene_benchmarker.assess_gene_prioritisation(
+        standardised_gene_result_path,
         phenopacket_path,
-        results_dir_and_input.results_dir.joinpath("pheval_gene_results/"),
-        parse_pheval_result(RankedPhEvalGeneResult, pheval_gene_result),
-        threshold,
-        score_order,
-        proband_causative_genes,
-    ).assess_gene_prioritisation(
-        gene_rank_stats, gene_rank_comparison, gene_binary_classification_stats
+        gene_binary_classification_stats,
     )
 def benchmark_gene_prioritisation(
-    results_directory_and_input: TrackInputOutputDirectories,
+    benchmark_name: str,
+    run: RunConfig,
     score_order: str,
     threshold: float,
-    gene_rank_comparison: defaultdict,
 ) -> BenchmarkRunResults:
     """
     Benchmark a directory based on gene prioritisation results.
      Args:
-         results_directory_and_input (TrackInputOutputDirectories): Input and output directories.
+         benchmark_name (str): Name of the benchmark.
+         run (RunConfig): Run configuration.
          score_order (str): The order in which scores are arranged.
          threshold (float): Threshold for assessment.
-         gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
      Returns:
          BenchmarkRunResults: An object containing benchmarking results for gene prioritisation,
          including ranks and rank statistics for the benchmarked directory.
     """
-    gene_rank_stats = RankStats()
     gene_binary_classification_stats = BinaryClassificationStats()
-    for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
+    db_connection = BenchmarkDBManager(benchmark_name)
+    db_connection.initialise()
+    gene_benchmarker = AssessGenePrioritisation(
+        db_connection,
+        f"{run.phenopacket_dir.parents[0].name}" f"_gene",
+        run.run_identifier,
+        threshold,
+        score_order,
+    )
+    for phenopacket_path in all_files(run.phenopacket_dir):
         assess_phenopacket_gene_prioritisation(
             phenopacket_path,
-            score_order,
-            results_directory_and_input,
-            threshold,
-            gene_rank_stats,
-            gene_rank_comparison,
+            run,
             gene_binary_classification_stats,
+            gene_benchmarker,
         )
+    db_connection.close()
+    gene_rank_stats = RankStats()
+    gene_rank_stats.add_ranks(
+        benchmark_name=benchmark_name,
+        table_name=f"{run.phenopacket_dir.parents[0].name}_gene",
+        column_name=str(run.run_identifier),
+    )
     return BenchmarkRunResults(
-        results_dir=results_directory_and_input.results_dir,
-        ranks=gene_rank_comparison,
         rank_stats=gene_rank_stats,
+        benchmark_name=run.run_identifier,
         binary_classification_stats=gene_binary_classification_stats,
+        phenopacket_dir=run.phenopacket_dir,
     )

pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

Potentially problematic release.

pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl