pheval 0.3.8__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pheval might be problematic.
- pheval/analyse/analysis.py +61 -150
- pheval/analyse/assess_prioritisation_base.py +108 -0
- pheval/analyse/benchmark_db_manager.py +140 -0
- pheval/analyse/benchmark_generator.py +47 -50
- pheval/analyse/benchmarking_data.py +3 -2
- pheval/analyse/disease_prioritisation_analysis.py +70 -219
- pheval/analyse/gene_prioritisation_analysis.py +66 -242
- pheval/analyse/generate_plots.py +81 -79
- pheval/analyse/generate_summary_outputs.py +64 -134
- pheval/analyse/parse_benchmark_summary.py +50 -37
- pheval/analyse/parse_corpus.py +219 -0
- pheval/analyse/rank_stats.py +177 -144
- pheval/analyse/run_data_parser.py +108 -27
- pheval/analyse/variant_prioritisation_analysis.py +78 -212
- pheval/cli.py +2 -4
- pheval/cli_pheval_utils.py +34 -245
- pheval/prepare/create_noisy_phenopackets.py +78 -67
- pheval-0.4.0.dist-info/METADATA +112 -0
- {pheval-0.3.8.dist-info → pheval-0.4.0.dist-info}/RECORD +22 -22
- pheval/analyse/parse_pheval_result.py +0 -43
- pheval/analyse/prioritisation_rank_recorder.py +0 -83
- pheval/constants.py +0 -8
- pheval-0.3.8.dist-info/METADATA +0 -35
- {pheval-0.3.8.dist-info → pheval-0.4.0.dist-info}/LICENSE +0 -0
- {pheval-0.3.8.dist-info → pheval-0.4.0.dist-info}/WHEEL +0 -0
- {pheval-0.3.8.dist-info → pheval-0.4.0.dist-info}/entry_points.txt +0 -0
pheval/analyse/analysis.py
CHANGED
@@ -1,193 +1,104 @@
-from collections import defaultdict
-from pathlib import Path
-from typing import List
-
 from pheval.analyse.benchmark_generator import (
     BenchmarkRunOutputGenerator,
     DiseaseBenchmarkRunOutputGenerator,
     GeneBenchmarkRunOutputGenerator,
     VariantBenchmarkRunOutputGenerator,
 )
-from pheval.analyse.generate_summary_outputs import
-
-    generate_benchmark_output,
-)
+from pheval.analyse.generate_summary_outputs import generate_benchmark_comparison_output
+from pheval.analyse.parse_corpus import CorpusParser
 from pheval.analyse.rank_stats import RankStatsWriter
-from pheval.analyse.run_data_parser import
-
-
-def _run_benchmark(
-    results_dir_and_input: TrackInputOutputDirectories,
-    score_order: str,
-    output_prefix: str,
-    threshold: float,
-    plot_type: str,
-    benchmark_generator: BenchmarkRunOutputGenerator,
-) -> None:
-    """Run a benchmark on a result directory.
-
-    Args:
-        results_dir_and_input (TrackInputOutputDirectories): Input and output directories for tracking results.
-        score_order (str): The order in which scores are arranged, this can be either ascending or descending.
-        output_prefix (str): Prefix for the benchmark output file names.
-        threshold (float): The threshold for benchmark evaluation.
-        plot_type (str): Type of plot for benchmark visualisation.
-        benchmark_generator (BenchmarkRunOutputGenerator): Generator for benchmark run output.
-    """
-    stats_writer = RankStatsWriter(
-        Path(output_prefix + benchmark_generator.stats_comparison_file_suffix)
-    )
-    rank_comparison = defaultdict(dict)
-    benchmark_result = benchmark_generator.generate_benchmark_run_results(
-        results_dir_and_input, score_order, threshold, rank_comparison
-    )
-    stats_writer.write_row(
-        results_dir_and_input.results_dir,
-        benchmark_result.rank_stats,
-        benchmark_result.binary_classification_stats,
-    )
-    generate_benchmark_output(benchmark_result, plot_type, benchmark_generator)
-    stats_writer.close()
-
-
-def benchmark_directory(
-    results_dir_and_input: TrackInputOutputDirectories,
-    score_order: str,
-    output_prefix: str,
-    threshold: float,
-    gene_analysis: bool,
-    variant_analysis: bool,
-    disease_analysis: bool,
-    plot_type: str,
-) -> None:
-    """
-    Benchmark prioritisation performance for a single run.
-
-    Args:
-        results_dir_and_input (TrackInputOutputDirectories): Input and output directories for tracking results.
-        score_order (str): The order in which scores are arranged, this can be either ascending or descending.
-        output_prefix (str): Prefix for the benchmark output file names.
-        threshold (float): The threshold for benchmark evaluation.
-        gene_analysis (bool): Boolean flag indicating whether to benchmark gene results.
-        variant_analysis (bool): Boolean flag indicating whether to benchmark variant results.
-        disease_analysis (bool): Boolean flag indicating whether to benchmark disease results.
-        plot_type (str): Type of plot for benchmark visualisation.
-    """
-    if gene_analysis:
-        _run_benchmark(
-            results_dir_and_input=results_dir_and_input,
-            score_order=score_order,
-            output_prefix=output_prefix,
-            threshold=threshold,
-            plot_type=plot_type,
-            benchmark_generator=GeneBenchmarkRunOutputGenerator(),
-        )
-    if variant_analysis:
-        _run_benchmark(
-            results_dir_and_input=results_dir_and_input,
-            score_order=score_order,
-            output_prefix=output_prefix,
-            threshold=threshold,
-            plot_type=plot_type,
-            benchmark_generator=VariantBenchmarkRunOutputGenerator(),
-        )
-    if disease_analysis:
-        _run_benchmark(
-            results_dir_and_input=results_dir_and_input,
-            score_order=score_order,
-            output_prefix=output_prefix,
-            threshold=threshold,
-            plot_type=plot_type,
-            benchmark_generator=DiseaseBenchmarkRunOutputGenerator(),
-        )
+from pheval.analyse.run_data_parser import Config
 
 
 def _run_benchmark_comparison(
-
-    score_order: str,
-    output_prefix: str,
-    threshold: float,
-    plot_type: str,
+    run_config: Config,
     benchmark_generator: BenchmarkRunOutputGenerator,
 ) -> None:
     """
     Run a benchmark on several result directories.
 
     Args:
-
+        run_config (List[TrackInputOutputDirectories]): List of input and output directories
            for tracking results across multiple directories.
-        score_order (str): The order in which scores are arranged, this can be either ascending or descending.
-        output_prefix (str): Prefix for the benchmark output file names.
-        threshold (float): The threshold for benchmark evaluation.
-        plot_type (str): Type of plot for benchmark visualisation.
         benchmark_generator (BenchmarkRunOutputGenerator): Generator for benchmark run output.
     """
     stats_writer = RankStatsWriter(
-
+        run_config.benchmark_name, benchmark_generator.stats_comparison_file
     )
+    unique_test_corpora_directories = set([result.phenopacket_dir for result in run_config.runs])
+    [
+        CorpusParser(run_config.benchmark_name, test_corpora_directory).parse_corpus(
+            benchmark_generator
+        )
+        for test_corpora_directory in unique_test_corpora_directories
+    ]
     benchmarking_results = []
-    for
-        rank_comparison = defaultdict(dict)
+    for run in run_config.runs:
         benchmark_result = benchmark_generator.generate_benchmark_run_results(
-
+            run_config.benchmark_name, run, run.score_order, run.threshold
         )
-        stats_writer.
-
+        stats_writer.add_statistics_entry(
+            run.run_identifier,
             benchmark_result.rank_stats,
             benchmark_result.binary_classification_stats,
         )
         benchmarking_results.append(benchmark_result)
-
-
+    run_identifiers = [run.run_identifier for run in run_config.runs]
+    [
+        generate_benchmark_comparison_output(
+            run_config.benchmark_name,
+            benchmarking_results,
+            run_identifiers,
+            benchmark_generator,
+            f"{unique_test_corpora_directory.parents[0].name}_"
+            f"{benchmark_generator.prioritisation_type_string}",
+        )
+        for unique_test_corpora_directory in unique_test_corpora_directories
+    ]
 
 
 def benchmark_run_comparisons(
-
-    score_order: str,
-    output_prefix: str,
-    threshold: float,
-    gene_analysis: bool,
-    variant_analysis: bool,
-    disease_analysis: bool,
-    plot_type: str,
+    run_config: Config,
 ) -> None:
     """
     Benchmark prioritisation performance for several runs.
 
     Args:
-
-        score_order (str): The order in which scores are arranged, this can be either ascending or descending.
-        output_prefix (str): Prefix for the benchmark output file names.
-        threshold (float): The threshold for benchmark evaluation.
-        gene_analysis (bool): Boolean flag indicating whether to benchmark gene results.
-        variant_analysis (bool): Boolean flag indicating whether to benchmark variant results.
-        disease_analysis (bool): Boolean flag indicating whether to benchmark disease results.
-        plot_type (str): Type of plot for benchmark visualisation.
+        run_config (Config): Run configurations.
     """
-
+    gene_analysis_runs = Config(
+        benchmark_name=run_config.benchmark_name,
+        runs=[run for run in run_config.runs if run.gene_analysis],
+        plot_customisation=run_config.plot_customisation,
+    )
+    variant_analysis_runs = Config(
+        benchmark_name=run_config.benchmark_name,
+        runs=[run for run in run_config.runs if run.variant_analysis],
+        plot_customisation=run_config.plot_customisation,
+    )
+    disease_analysis_runs = Config(
+        benchmark_name=run_config.benchmark_name,
+        runs=[run for run in run_config.runs if run.disease_analysis],
+        plot_customisation=run_config.plot_customisation,
+    )
+    if gene_analysis_runs.runs:
         _run_benchmark_comparison(
-
-
-
-
-            plot_type=plot_type,
-            benchmark_generator=GeneBenchmarkRunOutputGenerator(),
+            run_config=gene_analysis_runs,
+            benchmark_generator=GeneBenchmarkRunOutputGenerator(
+                plot_customisation=gene_analysis_runs.plot_customisation.gene_plots
+            ),
         )
-    if
+    if variant_analysis_runs.runs:
         _run_benchmark_comparison(
-
-
-
-
-            plot_type=plot_type,
-            benchmark_generator=VariantBenchmarkRunOutputGenerator(),
+            run_config=variant_analysis_runs,
+            benchmark_generator=VariantBenchmarkRunOutputGenerator(
+                plot_customisation=variant_analysis_runs.plot_customisation.variant_plots
+            ),
         )
-    if
+    if disease_analysis_runs.runs:
         _run_benchmark_comparison(
-
-
-
-
-            plot_type=plot_type,
-            benchmark_generator=DiseaseBenchmarkRunOutputGenerator(),
+            run_config=disease_analysis_runs,
+            benchmark_generator=DiseaseBenchmarkRunOutputGenerator(
+                plot_customisation=disease_analysis_runs.plot_customisation.disease_plots
+            ),
         )
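The analysis.py change above replaces the per-flag arguments (score_order, output_prefix, gene_analysis, ...) with a single Config object, and benchmark_run_comparisons now partitions the configured runs into one sub-config per prioritisation type, benchmarking only the non-empty groups. The sketch below illustrates that partitioning logic in isolation; ConfigSketch and RunConfigSketch are hypothetical simplifications of the Config/RunConfig classes in pheval.analyse.run_data_parser, using only the fields visible in this diff.

    from dataclasses import dataclass, field
    from typing import List, Optional


    @dataclass
    class RunConfigSketch:
        """Hypothetical stand-in for pheval.analyse.run_data_parser.RunConfig."""
        run_identifier: str
        gene_analysis: bool = False
        variant_analysis: bool = False
        disease_analysis: bool = False
        score_order: str = "descending"
        threshold: float = 0.0


    @dataclass
    class ConfigSketch:
        """Hypothetical stand-in for pheval.analyse.run_data_parser.Config."""
        benchmark_name: str
        runs: List[RunConfigSketch] = field(default_factory=list)
        plot_customisation: Optional[object] = None


    def split_runs_by_analysis_type(run_config: ConfigSketch) -> dict:
        """Mirror the partitioning done by benchmark_run_comparisons(): one
        sub-config per prioritisation type, keeping only runs that enabled it."""
        return {
            analysis_type: ConfigSketch(
                benchmark_name=run_config.benchmark_name,
                runs=[
                    run
                    for run in run_config.runs
                    if getattr(run, f"{analysis_type}_analysis")
                ],
                plot_customisation=run_config.plot_customisation,
            )
            for analysis_type in ("gene", "variant", "disease")
        }


    config = ConfigSketch(
        benchmark_name="example_benchmark",
        runs=[
            RunConfigSketch("run_1", gene_analysis=True, variant_analysis=True),
            RunConfigSketch("run_2", gene_analysis=True),
        ],
    )
    for analysis_type, sub_config in split_runs_by_analysis_type(config).items():
        if sub_config.runs:  # only non-empty groups are benchmarked, as in the new code
            print(analysis_type, [run.run_identifier for run in sub_config.runs])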
pheval/analyse/assess_prioritisation_base.py
ADDED
@@ -0,0 +1,108 @@
+from typing import Union
+
+from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
+from pheval.post_processing.post_processing import (
+    RankedPhEvalDiseaseResult,
+    RankedPhEvalGeneResult,
+    RankedPhEvalVariantResult,
+)
+
+
+class AssessPrioritisationBase:
+    def __init__(
+        self,
+        db_connection: BenchmarkDBManager,
+        table_name: str,
+        column: str,
+        threshold: float,
+        score_order: str,
+    ):
+        """
+        Initialise AssessPrioritisationBase class
+
+        Args:
+            db_connection (BenchmarkDBManager): DB connection.
+            table_name (str): Table name.
+            column (str): Column name.
+            threshold (float): Threshold for scores
+            score_order (str): Score order for results, either ascending or descending
+
+        """
+        self.threshold = threshold
+        self.score_order = score_order
+        self.db_connection = db_connection
+        self.conn = db_connection.conn
+        self.column = column
+        self.table_name = table_name
+        db_connection.add_column_integer_default(
+            table_name=table_name, column=self.column, default=0
+        )
+
+    def _assess_with_threshold_ascending_order(
+        self,
+        result_entry: Union[
+            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
+        ],
+    ) -> int:
+        """
+        Record the prioritisation rank if it meets the ascending order threshold.
+
+
+        Args:
+            result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
+                Ranked PhEval result entry
+
+        Returns:
+            int: Recorded prioritisation rank
+        """
+        if float(self.threshold) > float(result_entry.score):
+            return result_entry.rank
+        else:
+            return 0
+
+    def _assess_with_threshold(
+        self,
+        result_entry: Union[
+            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
+        ],
+    ) -> int:
+        """
+        Record the prioritisation rank if it meets the score threshold.
+
+        Args:
+            result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
+                Ranked PhEval result entry
+
+        Returns:
+            int: Recorded prioritisation rank
+        """
+        if float(self.threshold) < float(result_entry.score):
+            return result_entry.rank
+        else:
+            return 0
+
+    def _record_matched_entity(
+        self,
+        standardised_result: Union[
+            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
+        ],
+    ) -> int:
+        """
+        Return the rank result - handling the specification of a threshold.
+        Args:
+            standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
+                Ranked PhEval disease result entry
+
+        Returns:
+            int: Recorded entity prioritisation rank
+        """
+        if float(self.threshold) == 0.0:
+            return standardised_result.rank
+        else:
+            return (
+                self._assess_with_threshold(standardised_result)
+                if self.score_order != "ascending"
+                else self._assess_with_threshold_ascending_order(
+                    standardised_result,
+                )
+            )
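The new AssessPrioritisationBase above centralises the threshold handling that gene, variant and disease analyses previously duplicated: with a threshold of 0.0 the rank is always recorded; otherwise the rank is kept only when the score beats the threshold in the configured direction (strictly greater for descending score orders, strictly lower for ascending ones), and 0 is recorded otherwise. A small standalone sketch of that decision logic, using a hypothetical Result tuple in place of the RankedPhEval*Result classes:

    from typing import NamedTuple


    class Result(NamedTuple):
        # Hypothetical stand-in exposing only the two fields the threshold logic reads.
        rank: int
        score: float


    def record_matched_entity(result: Result, threshold: float, score_order: str) -> int:
        """Replicates AssessPrioritisationBase._record_matched_entity():
        return the rank when it passes the threshold check, else 0."""
        if float(threshold) == 0.0:
            return result.rank  # no threshold configured: always record the rank
        if score_order == "ascending":
            # lower scores are better, so the score must fall below the threshold
            return result.rank if float(threshold) > float(result.score) else 0
        # descending: higher scores are better, so the score must exceed the threshold
        return result.rank if float(threshold) < float(result.score) else 0


    assert record_matched_entity(Result(rank=1, score=0.95), 0.0, "descending") == 1
    assert record_matched_entity(Result(rank=1, score=0.40), 0.5, "descending") == 0
    assert record_matched_entity(Result(rank=3, score=0.01), 0.05, "ascending") == 3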
pheval/analyse/benchmark_db_manager.py
ADDED
@@ -0,0 +1,140 @@
+import ast
+import re
+from typing import List, Type, Union
+
+import duckdb
+from duckdb import DuckDBPyConnection
+
+from pheval.post_processing.post_processing import (
+    RankedPhEvalDiseaseResult,
+    RankedPhEvalGeneResult,
+    RankedPhEvalVariantResult,
+)
+
+
+class BenchmarkDBManager:
+    """
+    Class to connect to database.
+    """
+
+    def __init__(self, benchmark_name: str):
+        """Initialise the BenchmarkDBManager class."""
+        self.conn = self.get_connection(
+            f"{benchmark_name}" if str(benchmark_name).endswith(".db") else f"{benchmark_name}.db"
+        )
+
+    def initialise(self):
+        """Initialise the duckdb connection."""
+        self.add_contains_function()
+
+    @staticmethod
+    def get_connection(db_name: str) -> DuckDBPyConnection:
+        """
+        Get a connection to the database.
+        Returns:
+            DuckDBPyConnection: Connection to the database.
+        """
+        conn = duckdb.connect(db_name)
+        return conn
+
+    def add_column_integer_default(self, table_name: str, column: str, default: int = 0) -> None:
+        """
+        Add a column to an existing table with an integer default value.
+        Args:
+            table_name (str): Name of the table.
+            column (str): Name of the column to add.
+            default (int): Default integer value to add.
+        """
+        try:
+            self.conn.execute(
+                f'ALTER TABLE {table_name} ADD COLUMN "{column}" INTEGER DEFAULT {default}'
+            )
+            self.conn.execute(f'UPDATE {table_name} SET "{column}" = ?', (default,))
+            self.conn.commit()
+        except duckdb.CatalogException:
+            pass
+
+    def drop_table(self, table_name: str) -> None:
+        """
+        Drop a table from the database.
+        Args:
+            table_name: Name of the table to drop from the database
+        """
+        self.conn.execute(f"""DROP TABLE IF EXISTS "{table_name}";""")
+
+    @staticmethod
+    def contains_entity_function(entity: str, known_causative_entity: str) -> bool:
+        """
+        Determines if a known causative entity is present within an entity or list of entities.
+        Args:
+            entity (str): The entity to be checked. It can be a single entity or a string representation of a list.
+            known_causative_entity (str): The entity to search for within the `entity`.
+
+        Returns:
+            bool: `True` if `known_causative_entity` is found in `entity` (or its list representation),
+            `False` otherwise.
+        """
+        list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*]$")
+        if list_pattern.match(str(entity)):
+            list_representation = ast.literal_eval(entity)
+            if isinstance(list_representation, list):
+                return known_causative_entity in list_representation
+        return known_causative_entity == entity
+
+    def add_contains_function(self) -> None:
+        """
+        Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist.
+        """
+        result = self.conn.execute(
+            "SELECT * FROM duckdb_functions() WHERE function_name = ?", ["contains_entity_function"]
+        ).fetchall()
+        if not result:
+            self.conn.create_function("contains_entity_function", self.contains_entity_function)
+
+    def parse_table_into_dataclass(
+        self,
+        table_name: str,
+        dataclass: Union[
+            Type[RankedPhEvalGeneResult],
+            Type[RankedPhEvalVariantResult],
+            Type[RankedPhEvalDiseaseResult],
+        ],
+    ) -> Union[
+        List[RankedPhEvalGeneResult],
+        List[RankedPhEvalVariantResult],
+        List[RankedPhEvalDiseaseResult],
+    ]:
+        """
+        Parses a DuckDB table into a list of dataclass instances.
+        Args:
+            table_name (str): The name of the DuckDB table to be parsed.
+            dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult],
+            Type[RankedPhEvalDiseaseResult]]):
+                The dataclass type to which each row in the table should be mapped.
+
+        Returns:
+            List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table.
+        """
+        result = (
+            self.conn.execute(f"SELECT * FROM '{table_name}'").fetchdf().to_dict(orient="records")
+        )
+        return [dataclass(**row) for row in result]
+
+    def check_table_exists(self, table_name: str) -> bool:
+        """
+        Check if a table exists in the connected DuckDB database.
+        Args:
+            table_name (str): The name of the table to check for existence.
+        Returns:
+            bool: Returns `True` if the table exists in the database, `False` otherwise.
+        """
+        result = self.conn.execute(
+            f"SELECT * FROM information_schema.tables WHERE table_name = '{table_name}'"
+        ).fetchall()
+        if result:
+            return True
+        return False
+
+    def close(self):
+        """Close the connection to the database."""
+        self.conn.close()
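BenchmarkDBManager above wraps a DuckDB database named after the benchmark and registers contains_entity_function as a SQL-callable UDF, so queries can match a known causative entity against result cells that hold either a single identifier or a stringified list. A minimal sketch of that pattern against an in-memory database; it assumes a duckdb Python package recent enough to support create_function (roughly 0.8+), and the table name and values are made up for illustration:

    import ast
    import re

    import duckdb


    def contains_entity_function(entity: str, known_causative_entity: str) -> bool:
        # Same matching logic as BenchmarkDBManager.contains_entity_function:
        # treat "[...]"-style strings as lists, otherwise compare directly.
        list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*]$")
        if list_pattern.match(str(entity)):
            parsed = ast.literal_eval(entity)
            if isinstance(parsed, list):
                return known_causative_entity in parsed
        return known_causative_entity == entity


    conn = duckdb.connect(":memory:")  # the real class connects to "<benchmark_name>.db"
    conn.create_function("contains_entity_function", contains_entity_function)

    # Hypothetical results table: one row holds a single gene, one a stringified list.
    conn.execute("CREATE TABLE example_results (gene_symbol VARCHAR, rank INTEGER)")
    conn.execute("INSERT INTO example_results VALUES ('BRCA1', 1), ('[''TP53'', ''BRCA2'']', 2)")

    matches = conn.execute(
        "SELECT rank FROM example_results WHERE contains_entity_function(gene_symbol, ?)",
        ["BRCA2"],
    ).fetchall()
    print(matches)  # [(2,)] - the causative gene is found inside the list-valued cell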
pheval/analyse/benchmark_generator.py
CHANGED
@@ -1,20 +1,11 @@
-from collections import defaultdict
 from dataclasses import dataclass
 from typing import Callable
 
 from pheval.analyse.benchmarking_data import BenchmarkRunResults
 from pheval.analyse.disease_prioritisation_analysis import benchmark_disease_prioritisation
 from pheval.analyse.gene_prioritisation_analysis import benchmark_gene_prioritisation
-from pheval.analyse.run_data_parser import
+from pheval.analyse.run_data_parser import RunConfig, SinglePlotCustomisation
 from pheval.analyse.variant_prioritisation_analysis import benchmark_variant_prioritisation
-from pheval.constants import (
-    DISEASE_PLOT_FILE_PREFIX,
-    DISEASE_PLOT_Y_LABEL,
-    GENE_PLOT_FILE_PREFIX,
-    GENE_PLOT_Y_LABEL,
-    VARIANT_PLOT_FILE_PREFIX,
-    VARIANT_PLOT_Y_LABEL,
-)
 
 
 @dataclass
@@ -22,20 +13,20 @@ class BenchmarkRunOutputGenerator:
     """Base class for recording data required for generating benchmarking outputs.
 
     Attributes:
-
+        plot_customisation (SinglePlotCustomisation): Customisation for plot.
+        prioritisation_type_string (str): Prioritisation type string.
         y_label (str): Label for the y-axis in benchmarking outputs.
         generate_benchmark_run_results (Callable): Callable to generate benchmark run results.
             Takes parameters: input and results directory, score order, threshold, rank comparison,
             and returns BenchmarkRunResults.
-
+        stats_comparison_file (str): Suffix for the rank comparison file.
     """
 
-
+    plot_customisation: SinglePlotCustomisation
+    prioritisation_type_string: str
     y_label: str
-    generate_benchmark_run_results: Callable[
-
-    ]
-    stats_comparison_file_suffix: str
+    generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults]
+    stats_comparison_file: str
 
 
 @dataclass
@@ -48,24 +39,26 @@ class GeneBenchmarkRunOutputGenerator(BenchmarkRunOutputGenerator):
     specifically for gene prioritisation benchmarking.
 
     Attributes:
-
-
+        plot_customisation (SinglePlotCustomisation): Customisation for plot.
+        prioritisation_type_string (str): Prioritisation type string.
+            Defaults to GENE_PRIORITISATION_TYPE_STR.
         y_label (str): Label for the y-axis in gene prioritisation benchmarking outputs.
             Defaults to GENE_PLOT_Y_LABEL.
         generate_benchmark_run_results (Callable): Callable to generate gene prioritisation
             benchmark run results. Defaults to benchmark_gene_prioritisation.
-            Takes parameters:
+            Takes parameters: run configuration, score order, threshold, rank comparison,
            and returns BenchmarkRunResults.
-
-            Defaults to "-gene_summary
+        stats_comparison_file (str): Suffix for the gene rank comparison file.
+            Defaults to "-gene_summary".
     """
 
-
-
-
-
-
-
+    plot_customisation: SinglePlotCustomisation = None
+    prioritisation_type_string: str = "gene"
+    y_label: str = "Disease-causing genes (%)"
+    generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults] = (
+        benchmark_gene_prioritisation
+    )
+    stats_comparison_file: str = "gene_summary"
 
 
 @dataclass
@@ -78,25 +71,27 @@ class VariantBenchmarkRunOutputGenerator(BenchmarkRunOutputGenerator):
     specifically for variant prioritisation benchmarking.
 
     Attributes:
-
-
+        plot_customisation (SinglePlotCustomisation): Customisation for plot.
+        prioritisation_type_string (str): Prioritisation type string.
+            Defaults to VARIANT_PRIORITISATION_TYPE_STR.
         y_label (str): Label for the y-axis in variant prioritisation benchmarking outputs.
            Defaults to VARIANT_PLOT_Y_LABEL.
        generate_benchmark_run_results (Callable): Callable to generate variant prioritisation
            benchmark run results. Defaults to benchmark_variant_prioritisation.
-            Takes parameters:
+            Takes parameters: run configuration, score order, threshold, rank comparison,
            and returns BenchmarkRunResults.
-
-            Defaults to "-variant_summary
+        stats_comparison_file (str): Suffix for the variant rank comparison file.
+            Defaults to "-variant_summary".
 
     """
 
-
-
-
-
-
-
+    plot_customisation: SinglePlotCustomisation = None
+    prioritisation_type_string: str = "variant"
+    y_label: str = "Disease-causing variants (%)"
+    generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults] = (
+        benchmark_variant_prioritisation
+    )
+    stats_comparison_file: str = "variant_summary"
 
 
 @dataclass
@@ -109,21 +104,23 @@ class DiseaseBenchmarkRunOutputGenerator(BenchmarkRunOutputGenerator):
     specifically for disease prioritisation benchmarking.
 
    Attributes:
-
-
+        plot_customisation (SinglePlotCustomisation): Customisation for plot.
+        prioritisation_type_string (str): Prioritisation type string.
+            Defaults to DISEASE_PRIORITISATION_TYPE_STR.
         y_label (str): Label for the y-axis in disease prioritisation benchmarking outputs.
            Defaults to DISEASE_PLOT_Y_LABEL.
        generate_benchmark_run_results (Callable): Callable to generate disease prioritisation
            benchmark run results. Defaults to benchmark_disease_prioritisation.
-            Takes parameters:
+            Takes parameters: run configuration, score order, threshold, rank comparison,
            and returns BenchmarkRunResults.
-
-            Defaults to "-disease_summary
+        stats_comparison_file (str): Suffix for the disease rank comparison file.
+            Defaults to "-disease_summary".
     """
 
-
-
-
-
-
-
+    plot_customisation: SinglePlotCustomisation = None
+    prioritisation_type_string: str = "disease"
+    y_label: str = "Known diseases (%)"
+    generate_benchmark_run_results: Callable[[str, RunConfig, str, float], BenchmarkRunResults] = (
+        benchmark_disease_prioritisation
+    )
+    stats_comparison_file: str = "disease_summary"