pheval 0.4.7__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Files changed (33)
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +27 -24
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +250 -0
  13. pheval/post_processing/post_processing.py +179 -345
  14. pheval/post_processing/validate_result_format.py +91 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/METADATA +4 -4
  19. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/RECORD +22 -26
  20. pheval/analyse/analysis.py +0 -104
  21. pheval/analyse/assess_prioritisation_base.py +0 -108
  22. pheval/analyse/benchmark_generator.py +0 -126
  23. pheval/analyse/benchmarking_data.py +0 -25
  24. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  25. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  26. pheval/analyse/generate_summary_outputs.py +0 -105
  27. pheval/analyse/parse_benchmark_summary.py +0 -81
  28. pheval/analyse/parse_corpus.py +0 -219
  29. pheval/analyse/prioritisation_result_types.py +0 -52
  30. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  31. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/LICENSE +0 -0
  32. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/WHEEL +0 -0
  33. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/entry_points.txt +0 -0
--- a/pheval/analyse/assess_prioritisation_base.py
+++ /dev/null
@@ -1,108 +0,0 @@
- from typing import Union
-
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
- from pheval.post_processing.post_processing import (
-     RankedPhEvalDiseaseResult,
-     RankedPhEvalGeneResult,
-     RankedPhEvalVariantResult,
- )
-
-
- class AssessPrioritisationBase:
-     def __init__(
-         self,
-         db_connection: BenchmarkDBManager,
-         table_name: str,
-         column: str,
-         threshold: float,
-         score_order: str,
-     ):
-         """
-         Initialise AssessPrioritisationBase class
-
-         Args:
-             db_connection (BenchmarkDBManager): DB connection.
-             table_name (str): Table name.
-             column (str): Column name.
-             threshold (float): Threshold for scores
-             score_order (str): Score order for results, either ascending or descending
-
-         """
-         self.threshold = threshold
-         self.score_order = score_order
-         self.db_connection = db_connection
-         self.conn = db_connection.conn
-         self.column = column
-         self.table_name = table_name
-         db_connection.add_column_integer_default(
-             table_name=table_name, column=self.column, default=0
-         )
-
-     def _assess_with_threshold_ascending_order(
-         self,
-         result_entry: Union[
-             RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
-         ],
-     ) -> int:
-         """
-         Record the prioritisation rank if it meets the ascending order threshold.
-
-
-         Args:
-             result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
-                 Ranked PhEval result entry
-
-         Returns:
-             int: Recorded prioritisation rank
-         """
-         if float(self.threshold) > float(result_entry.score):
-             return result_entry.rank
-         else:
-             return 0
-
-     def _assess_with_threshold(
-         self,
-         result_entry: Union[
-             RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
-         ],
-     ) -> int:
-         """
-         Record the prioritisation rank if it meets the score threshold.
-
-         Args:
-             result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
-                 Ranked PhEval result entry
-
-         Returns:
-             int: Recorded prioritisation rank
-         """
-         if float(self.threshold) < float(result_entry.score):
-             return result_entry.rank
-         else:
-             return 0
-
-     def _record_matched_entity(
-         self,
-         standardised_result: Union[
-             RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
-         ],
-     ) -> int:
-         """
-         Return the rank result - handling the specification of a threshold.
-         Args:
-             standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
-                 Ranked PhEval disease result entry
-
-         Returns:
-             int: Recorded entity prioritisation rank
-         """
-         if float(self.threshold) == 0.0:
-             return standardised_result.rank
-         else:
-             return (
-                 self._assess_with_threshold(standardised_result)
-                 if self.score_order != "ascending"
-                 else self._assess_with_threshold_ascending_order(
-                     standardised_result,
-                 )
-             )
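
The threshold handling deleted here is the core decision of whether a ranked hit counts: a threshold of 0.0 disables filtering, and otherwise the comparison direction follows score_order. A minimal, self-contained sketch of that logic, under the same semantics as the removed methods (record_rank and SimpleResult are illustrative names, not part of pheval's API):

from typing import NamedTuple

class SimpleResult(NamedTuple):
    # Illustrative stand-in for the Ranked* result classes: just rank and score.
    rank: int
    score: float

def record_rank(result: SimpleResult, threshold: float, score_order: str) -> int:
    # A threshold of 0.0 disables filtering; otherwise a descending-order score
    # must exceed the threshold, and an ascending-order score must fall below it.
    if threshold == 0.0:
        return result.rank
    if score_order == "ascending":
        return result.rank if threshold > result.score else 0
    return result.rank if threshold < result.score else 0

# A hit ranked 3rd with score 0.8 passes a 0.5 threshold under descending order...
assert record_rank(SimpleResult(rank=3, score=0.8), 0.5, "descending") == 3
# ...but is recorded as unranked (0) under ascending order.
assert record_rank(SimpleResult(rank=3, score=0.8), 0.5, "ascending") == 0
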
--- a/pheval/analyse/benchmark_generator.py
+++ /dev/null
@@ -1,126 +0,0 @@
- from dataclasses import dataclass
- from typing import Callable
-
- from pheval.analyse.benchmarking_data import BenchmarkRunResults
- from pheval.analyse.disease_prioritisation_analysis import benchmark_disease_prioritisation
- from pheval.analyse.gene_prioritisation_analysis import benchmark_gene_prioritisation
- from pheval.analyse.run_data_parser import RunConfig, SinglePlotCustomisation
- from pheval.analyse.variant_prioritisation_analysis import benchmark_variant_prioritisation
-
-
- @dataclass
- class BenchmarkRunOutputGenerator:
-     """Base class for recording data required for generating benchmarking outputs.
-
-     Attributes:
-         plot_customisation (SinglePlotCustomisation): Customisation for plot.
-         prioritisation_type_string (str): Prioritisation type string.
-         y_label (str): Label for the y-axis in benchmarking outputs.
-         generate_benchmark_run_results (Callable): Callable to generate benchmark run results.
-             Takes parameters: input and results directory, score order, threshold, rank comparison,
-             and returns BenchmarkRunResults.
-         stats_comparison_file (str): Suffix for the rank comparison file.
-     """
-
-     plot_customisation: SinglePlotCustomisation
-     prioritisation_type_string: str
-     y_label: str
-     generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults]
-     stats_comparison_file: str
-
-
- @dataclass
- class GeneBenchmarkRunOutputGenerator(BenchmarkRunOutputGenerator):
-     """
-     Subclass of BenchmarkRunOutputGenerator specialised
-     for producing gene prioritisation benchmarking outputs.
-
-     This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes
-     specifically for gene prioritisation benchmarking.
-
-     Attributes:
-         plot_customisation (SinglePlotCustomisation): Customisation for plot.
-         prioritisation_type_string (str): Prioritisation type string.
-             Defaults to GENE_PRIORITISATION_TYPE_STR.
-         y_label (str): Label for the y-axis in gene prioritisation benchmarking outputs.
-             Defaults to GENE_PLOT_Y_LABEL.
-         generate_benchmark_run_results (Callable): Callable to generate gene prioritisation
-             benchmark run results. Defaults to benchmark_gene_prioritisation.
-             Takes parameters: run configuration, score order, threshold, rank comparison,
-             and returns BenchmarkRunResults.
-         stats_comparison_file (str): Suffix for the gene rank comparison file.
-             Defaults to "-gene_summary".
-     """
-
-     plot_customisation: SinglePlotCustomisation = None
-     prioritisation_type_string: str = "gene"
-     y_label: str = "Disease-causing genes (%)"
-     generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults] = (
-         benchmark_gene_prioritisation
-     )
-     stats_comparison_file: str = "gene_summary"
-
-
- @dataclass
- class VariantBenchmarkRunOutputGenerator(BenchmarkRunOutputGenerator):
-     """
-     Subclass of BenchmarkRunOutputGenerator specialised
-     for producing variant prioritisation benchmarking outputs.
-
-     This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes
-     specifically for variant prioritisation benchmarking.
-
-     Attributes:
-         plot_customisation (SinglePlotCustomisation): Customisation for plot.
-         prioritisation_type_string (str): Prioritisation type string.
-             Defaults to VARIANT_PRIORITISATION_TYPE_STR.
-         y_label (str): Label for the y-axis in variant prioritisation benchmarking outputs.
-             Defaults to VARIANT_PLOT_Y_LABEL.
-         generate_benchmark_run_results (Callable): Callable to generate variant prioritisation
-             benchmark run results. Defaults to benchmark_variant_prioritisation.
-             Takes parameters: run configuration, score order, threshold, rank comparison,
-             and returns BenchmarkRunResults.
-         stats_comparison_file (str): Suffix for the variant rank comparison file.
-             Defaults to "-variant_summary".
-
-     """
-
-     plot_customisation: SinglePlotCustomisation = None
-     prioritisation_type_string: str = "variant"
-     y_label: str = "Disease-causing variants (%)"
-     generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults] = (
-         benchmark_variant_prioritisation
-     )
-     stats_comparison_file: str = "variant_summary"
-
-
- @dataclass
- class DiseaseBenchmarkRunOutputGenerator(BenchmarkRunOutputGenerator):
-     """
-     Subclass of BenchmarkRunOutputGenerator specialised
-     for producing disease prioritisation benchmarking outputs.
-
-     This subclass inherits from BenchmarkRunOutputGenerator and specialises its attributes
-     specifically for disease prioritisation benchmarking.
-
-     Attributes:
-         plot_customisation (SinglePlotCustomisation): Customisation for plot.
-         prioritisation_type_string (str): Prioritisation type string.
-             Defaults to DISEASE_PRIORITISATION_TYPE_STR.
-         y_label (str): Label for the y-axis in disease prioritisation benchmarking outputs.
-             Defaults to DISEASE_PLOT_Y_LABEL.
-         generate_benchmark_run_results (Callable): Callable to generate disease prioritisation
-             benchmark run results. Defaults to benchmark_disease_prioritisation.
-             Takes parameters: run configuration, score order, threshold, rank comparison,
-             and returns BenchmarkRunResults.
-         stats_comparison_file (str): Suffix for the disease rank comparison file.
-             Defaults to "-disease_summary".
-     """
-
-     plot_customisation: SinglePlotCustomisation = None
-     prioritisation_type_string: str = "disease"
-     y_label: str = "Known diseases (%)"
-     generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults] = (
-         benchmark_disease_prioritisation
-     )
-     stats_comparison_file: str = "disease_summary"
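
These deleted generator dataclasses existed so callers could dispatch benchmarking generically: each subclass bundles labels with a generate_benchmark_run_results callable, and downstream code never inspects the prioritisation type. A rough standalone sketch of that pattern, with all names illustrative rather than pheval API:

from dataclasses import dataclass
from typing import Callable

def benchmark_genes(run_id: str) -> str:
    # Placeholder for a real benchmarking routine.
    return f"gene results for {run_id}"

@dataclass
class OutputGenerator:
    # Analogue of the deleted base class: labels bundled with a callable.
    type_string: str
    y_label: str
    run_benchmark: Callable[[str], str]

@dataclass
class GeneOutputGenerator(OutputGenerator):
    # Subclasses only supply defaults, mirroring GeneBenchmarkRunOutputGenerator.
    type_string: str = "gene"
    y_label: str = "Disease-causing genes (%)"
    run_benchmark: Callable[[str], str] = benchmark_genes

# Callers hold the base type and never inspect the concrete prioritisation type:
generator: OutputGenerator = GeneOutputGenerator()
print(generator.run_benchmark("run_1"))  # gene results for run_1
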
--- a/pheval/analyse/benchmarking_data.py
+++ /dev/null
@@ -1,25 +0,0 @@
- from dataclasses import dataclass
- from pathlib import Path
-
- from pheval.analyse.binary_classification_stats import BinaryClassificationStats
- from pheval.analyse.rank_stats import RankStats
-
-
- @dataclass
- class BenchmarkRunResults:
-     """
-     Benchmarking results for a run.
-
-     Attributes:
-         rank_stats (RankStats): Statistics related to benchmark.
-         binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark.
-         results_dir (Path, optional): Path to the result directory. Defaults to None.
-         benchmark_name (str, optional): Name of the benchmark run. Defaults to None.
-         phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None.
-     """
-
-     rank_stats: RankStats
-     binary_classification_stats: BinaryClassificationStats
-     results_dir: Path = None
-     benchmark_name: str = None
-     phenopacket_dir: Path = None
--- a/pheval/analyse/disease_prioritisation_analysis.py
+++ /dev/null
@@ -1,152 +0,0 @@
- from pathlib import Path
-
- from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
- from pheval.analyse.benchmarking_data import BenchmarkRunResults
- from pheval.analyse.binary_classification_stats import BinaryClassificationStats
- from pheval.analyse.rank_stats import RankStats
- from pheval.analyse.run_data_parser import RunConfig
- from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
- from pheval.utils.file_utils import all_files
-
-
- class AssessDiseasePrioritisation(AssessPrioritisationBase):
-     """Class for assessing disease prioritisation based on thresholds and scoring orders."""
-
-     def assess_disease_prioritisation(
-         self,
-         standardised_disease_result_path: Path,
-         phenopacket_path: Path,
-         binary_classification_stats: BinaryClassificationStats,
-     ) -> None:
-         """
-         Assess disease prioritisation.
-
-         This method assesses the prioritisation of diseases based on the provided criteria
-         and records ranks using a PrioritisationRankRecorder.
-
-         Args:
-             standardised_disease_result_path (Path): Path to the standardised disease TSV result.
-             phenopacket_path (Path): Path to the phenopacket.
-             binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
-         """
-         relevant_ranks = []
-         df = self.conn.execute(
-             f"SELECT * FROM '{self.table_name}' WHERE phenopacket = ? ",
-             (phenopacket_path.name,),
-         ).fetchdf()
-         for _i, row in df.iterrows():
-             result = (
-                 self.conn.execute(
-                     (
-                         f"SELECT * FROM '{standardised_disease_result_path}' "
-                         f"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),"
-                         f" '{row['disease_identifier']}') "
-                         f"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), "
-                         f"'{row['disease_name']}')"
-                     )
-                     if standardised_disease_result_path.exists()
-                     and standardised_disease_result_path.stat().st_size > 0
-                     else "SELECT NULL WHERE FALSE"
-                 )
-                 .fetchdf()
-                 .to_dict(orient="records")
-             )
-
-             if len(result) > 0:
-                 disease_match = self._record_matched_entity(RankedPhEvalDiseaseResult(**result[0]))
-                 relevant_ranks.append(disease_match)
-                 primary_key = f"{phenopacket_path.name}-{row['disease_identifier']}"
-                 self.conn.execute(
-                     f'UPDATE "{self.table_name}" SET "{self.column}" = ? WHERE identifier = ?',
-                     (disease_match, primary_key),
-                 )
-             elif len(result) == 0:
-                 relevant_ranks.append(0)
-         binary_classification_stats.add_classification(
-             (
-                 self.db_connection.parse_table_into_dataclass(
-                     str(standardised_disease_result_path), RankedPhEvalDiseaseResult
-                 )
-                 if standardised_disease_result_path.exists()
-                 else []
-             ),
-             relevant_ranks,
-         )
-
-
- def assess_phenopacket_disease_prioritisation(
-     phenopacket_path: Path,
-     run: RunConfig,
-     disease_binary_classification_stats: BinaryClassificationStats,
-     disease_benchmarker: AssessDiseasePrioritisation,
- ) -> None:
-     """
-     Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results
-     against the recorded causative diseases for a proband in the Phenopacket.
-
-     Args:
-         phenopacket_path (Path): Path to the Phenopacket.
-         run (RunConfig): Run configuration.
-         disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
-         disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance.
-     """
-     standardised_disease_result_path = run.results_dir.joinpath(
-         f"pheval_disease_results/{phenopacket_path.stem}-pheval_disease_result.tsv"
-     )
-     disease_benchmarker.assess_disease_prioritisation(
-         standardised_disease_result_path,
-         phenopacket_path,
-         disease_binary_classification_stats,
-     )
-
-
- def benchmark_disease_prioritisation(
-     benchmark_name: str,
-     run: RunConfig,
-     score_order: str,
-     threshold: float,
- ):
-     """
-     Benchmark a directory based on disease prioritisation results.
-
-     Args:
-         benchmark_name (str): Name of the benchmark.
-         run (RunConfig): Run configuration.
-         score_order (str): The order in which scores are arranged.
-         threshold (float): Threshold for assessment.
-
-     Returns:
-         BenchmarkRunResults: An object containing benchmarking results for disease prioritisation,
-         including ranks and rank statistics for the benchmarked directory.
-     """
-     disease_binary_classification_stats = BinaryClassificationStats()
-     db_connection = BenchmarkDBManager(benchmark_name)
-     db_connection.initialise()
-     disease_benchmarker = AssessDiseasePrioritisation(
-         db_connection,
-         f"{run.phenopacket_dir.parents[0].name}_disease",
-         run.run_identifier,
-         threshold,
-         score_order,
-     )
-     for phenopacket_path in all_files(run.phenopacket_dir):
-         assess_phenopacket_disease_prioritisation(
-             phenopacket_path,
-             run,
-             disease_binary_classification_stats,
-             disease_benchmarker,
-         )
-     db_connection.close()
-     disease_rank_stats = RankStats()
-     disease_rank_stats.add_ranks(
-         benchmark_name=benchmark_name,
-         table_name=f"{run.phenopacket_dir.parents[0].name}_disease",
-         column_name=str(run.run_identifier),
-     )
-     return BenchmarkRunResults(
-         rank_stats=disease_rank_stats,
-         benchmark_name=run.run_identifier,
-         binary_classification_stats=disease_binary_classification_stats,
-         phenopacket_dir=run.phenopacket_dir,
-     )
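
The deleted assessment loop relies on DuckDB's ability to query a result TSV directly by path, matching the expected disease via contains_entity_function, a custom UDF registered by BenchmarkDBManager. A minimal sketch of the underlying query pattern, substituting an in-memory table and a plain equality match for the UDF (the rows and identifiers are made up for illustration):

import duckdb

# The deleted code queried each standardised result TSV by file path; this
# sketch swaps in an in-memory table with the same column layout and replaces
# the contains_entity_function UDF with a simple equality predicate.
conn = duckdb.connect()
conn.execute(
    """CREATE TABLE results AS SELECT * FROM (VALUES
        (1, 0.9, 'OMIM:101600', 'Pfeiffer syndrome'),
        (2, 0.4, 'OMIM:613603', 'Another disease')
    ) AS t("rank", score, disease_identifier, disease_name)"""
)
match = conn.execute(
    'SELECT "rank", score FROM results WHERE disease_identifier = ?',
    ("OMIM:101600",),
).fetchall()
print(match)  # [(1, 0.9)]
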
--- a/pheval/analyse/gene_prioritisation_analysis.py
+++ /dev/null
@@ -1,147 +0,0 @@
- from pathlib import Path
-
- from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
- from pheval.analyse.benchmarking_data import BenchmarkRunResults
- from pheval.analyse.binary_classification_stats import BinaryClassificationStats
- from pheval.analyse.rank_stats import RankStats
- from pheval.analyse.run_data_parser import RunConfig
- from pheval.post_processing.post_processing import RankedPhEvalGeneResult
- from pheval.utils.file_utils import all_files
-
-
- class AssessGenePrioritisation(AssessPrioritisationBase):
-     """Class for assessing gene prioritisation based on thresholds and scoring orders."""
-
-     def assess_gene_prioritisation(
-         self,
-         standardised_gene_result_path: Path,
-         phenopacket_path: Path,
-         binary_classification_stats: BinaryClassificationStats,
-     ) -> None:
-         """
-         Assess gene prioritisation.
-         This method assesses the prioritisation of genes based on the provided criteria
-         and records ranks using a PrioritisationRankRecorder.
-
-         Args:
-             standardised_gene_result_path (Path): Path to the standardised gene TSV result.
-             phenopacket_path (Path): Path to the Phenopacket.
-             binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
-         """
-         relevant_ranks = []
-         df = self.conn.execute(
-             f"""SELECT * FROM "{self.table_name}" WHERE phenopacket = '{phenopacket_path.name}'"""
-         ).fetchdf()
-         for _i, row in df.iterrows():
-             result = (
-                 self.conn.execute(
-                     (
-                         f"SELECT * FROM '{standardised_gene_result_path}' "
-                         f"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR), "
-                         f"'{row['gene_identifier']}') "
-                         f"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), "
-                         f"'{row['gene_symbol']}')"
-                     )
-                     if standardised_gene_result_path.exists()
-                     and standardised_gene_result_path.stat().st_size > 0
-                     else "SELECT NULL WHERE FALSE"
-                 )
-                 .fetchdf()
-                 .to_dict(orient="records")
-             )
-             if len(result) > 0:
-                 gene_match = self._record_matched_entity(RankedPhEvalGeneResult(**result[0]))
-                 relevant_ranks.append(gene_match)
-                 primary_key = f"{phenopacket_path.name}-{row['gene_symbol']}"
-                 self.conn.execute(
-                     f'UPDATE "{self.table_name}" SET "{self.column}" = ? WHERE identifier = ?',
-                     (gene_match, primary_key),
-                 )
-             if not result:
-                 relevant_ranks.append(0)
-         binary_classification_stats.add_classification(
-             (
-                 self.db_connection.parse_table_into_dataclass(
-                     str(standardised_gene_result_path), RankedPhEvalGeneResult
-                 )
-                 if standardised_gene_result_path.exists()
-                 else []
-             ),
-             relevant_ranks,
-         )
-
-
- def assess_phenopacket_gene_prioritisation(
-     phenopacket_path: Path,
-     run: RunConfig,
-     gene_binary_classification_stats: BinaryClassificationStats,
-     gene_benchmarker: AssessGenePrioritisation,
- ) -> None:
-     """
-     Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results
-     against the recorded causative genes for a proband in the Phenopacket.
-
-     Args:
-         phenopacket_path (Path): Path to the Phenopacket.
-         run (RunConfig): Run configuration.
-         gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
-         gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance.
-     """
-     standardised_gene_result_path = run.results_dir.joinpath(
-         f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
-     )
-     gene_benchmarker.assess_gene_prioritisation(
-         standardised_gene_result_path,
-         phenopacket_path,
-         gene_binary_classification_stats,
-     )
-
-
- def benchmark_gene_prioritisation(
-     benchmark_name: str,
-     run: RunConfig,
-     score_order: str,
-     threshold: float,
- ) -> BenchmarkRunResults:
-     """
-     Benchmark a directory based on gene prioritisation results.
-     Args:
-         benchmark_name (str): Name of the benchmark.
-         run (RunConfig): Run configuration.
-         score_order (str): The order in which scores are arranged.
-         threshold (float): Threshold for assessment.
-     Returns:
-         BenchmarkRunResults: An object containing benchmarking results for gene prioritisation,
-         including ranks and rank statistics for the benchmarked directory.
-     """
-     gene_binary_classification_stats = BinaryClassificationStats()
-     db_connection = BenchmarkDBManager(benchmark_name)
-     db_connection.initialise()
-     gene_benchmarker = AssessGenePrioritisation(
-         db_connection,
-         f"{run.phenopacket_dir.parents[0].name}" f"_gene",
-         run.run_identifier,
-         threshold,
-         score_order,
-     )
-     for phenopacket_path in all_files(run.phenopacket_dir):
-         assess_phenopacket_gene_prioritisation(
-             phenopacket_path,
-             run,
-             gene_binary_classification_stats,
-             gene_benchmarker,
-         )
-     db_connection.close()
-     gene_rank_stats = RankStats()
-     gene_rank_stats.add_ranks(
-         benchmark_name=benchmark_name,
-         table_name=f"{run.phenopacket_dir.parents[0].name}_gene",
-         column_name=str(run.run_identifier),
-     )
-     return BenchmarkRunResults(
-         rank_stats=gene_rank_stats,
-         benchmark_name=run.run_identifier,
-         binary_classification_stats=gene_binary_classification_stats,
-         phenopacket_dir=run.phenopacket_dir,
-     )
--- a/pheval/analyse/generate_summary_outputs.py
+++ /dev/null
@@ -1,105 +0,0 @@
- import itertools
- from typing import List
-
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
- from pheval.analyse.benchmark_generator import BenchmarkRunOutputGenerator
- from pheval.analyse.benchmarking_data import BenchmarkRunResults
- from pheval.analyse.generate_plots import generate_plots
-
-
- def get_new_table_name(run_identifier_1: str, run_identifier_2: str, output_prefix: str) -> str:
-     """
-     Get the new table name for rank comparison tables.
-     Args:
-         run_identifier_1: The first run identifier.
-         run_identifier_2: The second run identifier.
-         output_prefix: The output prefix of the table
-     Returns:
-         The new table name.
-     """
-     return f"{run_identifier_1}_vs_" f"{run_identifier_2}_" f"{output_prefix}_rank_comparison"
-
-
- def create_comparison_table(
-     comparison_table_name: str,
-     connector: BenchmarkDBManager,
-     drop_columns: List[str],
-     run_identifier_1: str,
-     run_identifier_2: str,
-     table_name: str,
- ) -> None:
-     """
-     Create rank comparison tables.
-     Args:
-         comparison_table_name (str): Name of the comparison table to create.
-         connector (BenchmarkDBManager): DBConnector instance.
-         drop_columns (List[str]): List of columns to drop.
-         run_identifier_1 (str): The first run identifier.
-         run_identifier_2 (str): The second run identifier.
-         table_name (str): Name of the table to extract ranks from
-     """
-     connector.drop_table(comparison_table_name)
-     excluded_columns = tuple(drop_columns + ["identifier"]) if drop_columns else ("identifier",)
-     connector.conn.execute(
-         f'CREATE TABLE "{comparison_table_name}" AS SELECT * '
-         f"EXCLUDE {excluded_columns} FROM {table_name}"
-     )
-
-     connector.conn.execute(
-         f"""ALTER TABLE "{comparison_table_name}" ADD COLUMN rank_change VARCHAR;"""
-     )
-     connector.conn.execute(
-         f'UPDATE "{comparison_table_name}" SET rank_change = CASE WHEN "{run_identifier_1}" = 0 '
-         f'AND "{run_identifier_2}" != 0 '
-         f"THEN 'GAINED' WHEN \"{run_identifier_1}\" != 0 AND \"{run_identifier_2}\" = 0 THEN 'LOST' ELSE "
-         f'CAST ("{run_identifier_1}" - "{run_identifier_2}" AS VARCHAR) END;'
-     )
-     connector.conn.commit()
-
-
- def generate_benchmark_comparison_output(
-     benchmark_name: str,
-     benchmarking_results: List[BenchmarkRunResults],
-     run_identifiers: List[str],
-     benchmark_generator: BenchmarkRunOutputGenerator,
-     table_name: str,
- ) -> None:
-     """
-     Generate prioritisation outputs for benchmarking multiple runs.
-
-     This function generates comparison outputs for benchmarking multiple runs. It compares the results
-     between pairs of `BenchmarkRunResults` instances in `benchmarking_results` and generates rank
-     comparison outputs using `RankComparisonGenerator` for each pair.
-
-     Args:
-         benchmark_name (str): Name of the benchmark.
-         benchmarking_results (List[BenchmarkRunResults]): A list containing BenchmarkRunResults instances
-             representing the benchmarking results of multiple runs.
-         run_identifiers (List[str]): A list of run identifiers.
-         benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
-         table_name (str): The name of the table where ranks are stored.
-     """
-     output_prefix = benchmark_generator.prioritisation_type_string
-     connector = BenchmarkDBManager(benchmark_name)
-     for pair in itertools.combinations(
-         [str(result.benchmark_name) for result in benchmarking_results], 2
-     ):
-         run_identifier_1 = pair[0]
-         run_identifier_2 = pair[1]
-         drop_columns = [run for run in run_identifiers if run not in pair]
-         comparison_table_name = get_new_table_name(
-             run_identifier_1, run_identifier_2, output_prefix
-         )
-         create_comparison_table(
-             comparison_table_name,
-             connector,
-             drop_columns,
-             run_identifier_1,
-             run_identifier_2,
-             table_name,
-         )
-     generate_plots(
-         benchmark_name,
-         benchmarking_results,
-         benchmark_generator,
-     )
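
The rank_change column deleted above encodes the pairwise run comparison: a rank of 0 means an entity was not ranked at all, so 0 to non-zero is 'GAINED', non-zero to 0 is 'LOST', and anything else becomes the signed rank difference. A minimal recreation of that CASE expression in DuckDB (run_1 and run_2 stand in for the per-run rank columns; the rows are made up):

import duckdb

# Same classification as the deleted create_comparison_table, on toy data.
conn = duckdb.connect()
conn.execute(
    """CREATE TABLE comparison AS SELECT * FROM (VALUES
        (0, 4), (2, 0), (5, 1)
    ) AS t(run_1, run_2)"""
)
conn.execute("ALTER TABLE comparison ADD COLUMN rank_change VARCHAR;")
conn.execute(
    """UPDATE comparison SET rank_change = CASE
        WHEN run_1 = 0 AND run_2 != 0 THEN 'GAINED'
        WHEN run_1 != 0 AND run_2 = 0 THEN 'LOST'
        ELSE CAST(run_1 - run_2 AS VARCHAR) END;"""
)
print(conn.execute("SELECT * FROM comparison ORDER BY run_1").fetchall())
# [(0, 4, 'GAINED'), (2, 0, 'LOST'), (5, 1, '4')]
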