pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/analysis.py +61 -150
- pheval/analyse/assess_prioritisation_base.py +108 -0
- pheval/analyse/benchmark_db_manager.py +140 -0
- pheval/analyse/benchmark_generator.py +47 -50
- pheval/analyse/benchmarking_data.py +3 -2
- pheval/analyse/disease_prioritisation_analysis.py +70 -219
- pheval/analyse/gene_prioritisation_analysis.py +66 -242
- pheval/analyse/generate_plots.py +81 -79
- pheval/analyse/generate_summary_outputs.py +64 -134
- pheval/analyse/parse_benchmark_summary.py +50 -37
- pheval/analyse/parse_corpus.py +219 -0
- pheval/analyse/rank_stats.py +177 -144
- pheval/analyse/run_data_parser.py +108 -27
- pheval/analyse/variant_prioritisation_analysis.py +78 -212
- pheval/cli.py +2 -4
- pheval/cli_pheval_utils.py +34 -245
- pheval/prepare/create_noisy_phenopackets.py +78 -67
- pheval-0.4.1.dist-info/METADATA +113 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/RECORD +22 -22
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/WHEEL +1 -1
- pheval/analyse/parse_pheval_result.py +0 -43
- pheval/analyse/prioritisation_rank_recorder.py +0 -83
- pheval/constants.py +0 -8
- pheval-0.3.9.dist-info/METADATA +0 -35
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/LICENSE +0 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/entry_points.txt +0 -0
|
@@ -11,14 +11,15 @@ class BenchmarkRunResults:
|
|
|
11
11
|
Benchmarking results for a run.
|
|
12
12
|
|
|
13
13
|
Attributes:
|
|
14
|
-
ranks (dict): Dictionary containing recorded ranks for samples.
|
|
15
14
|
rank_stats (RankStats): Statistics related to benchmark.
|
|
15
|
+
binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark.
|
|
16
16
|
results_dir (Path, optional): Path to the result directory. Defaults to None.
|
|
17
17
|
benchmark_name (str, optional): Name of the benchmark run. Defaults to None.
|
|
18
|
+
phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None.
|
|
18
19
|
"""
|
|
19
20
|
|
|
20
|
-
ranks: dict
|
|
21
21
|
rank_stats: RankStats
|
|
22
22
|
binary_classification_stats: BinaryClassificationStats
|
|
23
23
|
results_dir: Path = None
|
|
24
24
|
benchmark_name: str = None
|
|
25
|
+
phenopacket_dir: Path = None
|
|
@@ -1,155 +1,22 @@
|
|
|
1
|
-
from collections import defaultdict
|
|
2
1
|
from pathlib import Path
|
|
3
|
-
from typing import List
|
|
4
2
|
|
|
3
|
+
from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
|
|
4
|
+
from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
|
|
5
5
|
from pheval.analyse.benchmarking_data import BenchmarkRunResults
|
|
6
6
|
from pheval.analyse.binary_classification_stats import BinaryClassificationStats
|
|
7
|
-
from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
|
|
8
|
-
from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
|
|
9
|
-
from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResult
|
|
10
7
|
from pheval.analyse.rank_stats import RankStats
|
|
11
|
-
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
|
|
8
|
+
from pheval.analyse.run_data_parser import RunConfig
|
|
12
9
|
from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
|
|
13
10
|
from pheval.utils.file_utils import all_files
|
|
14
|
-
from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
|
|
15
11
|
|
|
16
12
|
|
|
17
|
-
class AssessDiseasePrioritisation:
|
|
13
|
+
class AssessDiseasePrioritisation(AssessPrioritisationBase):
|
|
18
14
|
"""Class for assessing disease prioritisation based on thresholds and scoring orders."""
|
|
19
15
|
|
|
20
|
-
def __init__(
|
|
21
|
-
self,
|
|
22
|
-
phenopacket_path: Path,
|
|
23
|
-
results_dir: Path,
|
|
24
|
-
standardised_disease_results: List[RankedPhEvalDiseaseResult],
|
|
25
|
-
threshold: float,
|
|
26
|
-
score_order: str,
|
|
27
|
-
proband_diseases: List[ProbandDisease],
|
|
28
|
-
):
|
|
29
|
-
"""
|
|
30
|
-
Initialise AssessDiseasePrioritisation class
|
|
31
|
-
|
|
32
|
-
Args:
|
|
33
|
-
phenopacket_path (Path): Path to the phenopacket file
|
|
34
|
-
results_dir (Path): Path to the results directory
|
|
35
|
-
standardised_disease_results (List[RankedPhEvalDiseaseResult]): List of ranked PhEval disease results
|
|
36
|
-
threshold (float): Threshold for scores
|
|
37
|
-
score_order (str): Score order for results, either ascending or descending
|
|
38
|
-
proband_diseases (List[ProbandDisease]): List of proband diseases
|
|
39
|
-
|
|
40
|
-
"""
|
|
41
|
-
self.phenopacket_path = phenopacket_path
|
|
42
|
-
self.results_dir = results_dir
|
|
43
|
-
self.standardised_disease_results = standardised_disease_results
|
|
44
|
-
self.threshold = threshold
|
|
45
|
-
self.score_order = score_order
|
|
46
|
-
self.proband_diseases = proband_diseases
|
|
47
|
-
|
|
48
|
-
def _record_disease_prioritisation_match(
|
|
49
|
-
self,
|
|
50
|
-
disease: ProbandDisease,
|
|
51
|
-
result_entry: RankedPhEvalDiseaseResult,
|
|
52
|
-
rank_stats: RankStats,
|
|
53
|
-
) -> DiseasePrioritisationResult:
|
|
54
|
-
"""
|
|
55
|
-
Record the disease prioritisation rank if found within the results
|
|
56
|
-
Args:
|
|
57
|
-
disease (ProbandDisease): Diagnosed proband disease
|
|
58
|
-
result_entry (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
|
|
59
|
-
rank_stats (RankStats): RankStats class instance
|
|
60
|
-
Returns:
|
|
61
|
-
DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
|
|
62
|
-
"""
|
|
63
|
-
rank = result_entry.rank
|
|
64
|
-
rank_stats.add_rank(rank)
|
|
65
|
-
return DiseasePrioritisationResult(self.phenopacket_path, disease, rank)
|
|
66
|
-
|
|
67
|
-
def _assess_disease_with_threshold_ascending_order(
|
|
68
|
-
self,
|
|
69
|
-
result_entry: RankedPhEvalDiseaseResult,
|
|
70
|
-
disease: ProbandDisease,
|
|
71
|
-
rank_stats: RankStats,
|
|
72
|
-
) -> DiseasePrioritisationResult:
|
|
73
|
-
"""
|
|
74
|
-
Record the disease prioritisation rank if it meets the ascending order threshold.
|
|
75
|
-
|
|
76
|
-
This method checks if the disease prioritisation rank meets the ascending order threshold.
|
|
77
|
-
If the score of the result entry is less than the threshold, it records the disease rank.
|
|
78
|
-
|
|
79
|
-
Args:
|
|
80
|
-
result_entry (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
|
|
81
|
-
disease (ProbandDisease): Diagnosed proband disease
|
|
82
|
-
rank_stats (RankStats): RankStats class instance
|
|
83
|
-
|
|
84
|
-
Returns:
|
|
85
|
-
DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
|
|
86
|
-
"""
|
|
87
|
-
if float(self.threshold) > float(result_entry.score):
|
|
88
|
-
return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)
|
|
89
|
-
|
|
90
|
-
def _assess_disease_with_threshold(
|
|
91
|
-
self,
|
|
92
|
-
result_entry: RankedPhEvalDiseaseResult,
|
|
93
|
-
disease: ProbandDisease,
|
|
94
|
-
rank_stats: RankStats,
|
|
95
|
-
) -> DiseasePrioritisationResult:
|
|
96
|
-
"""
|
|
97
|
-
Record the disease prioritisation rank if it meets the score threshold.
|
|
98
|
-
|
|
99
|
-
This method checks if the disease prioritisation rank meets the score threshold.
|
|
100
|
-
If the score of the result entry is greater than the threshold, it records the disease rank.
|
|
101
|
-
|
|
102
|
-
Args:
|
|
103
|
-
result_entry (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
|
|
104
|
-
disease (ProbandDisease): Diagnosed proband disease
|
|
105
|
-
rank_stats (RankStats): RankStats class instance
|
|
106
|
-
|
|
107
|
-
Returns:
|
|
108
|
-
DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
|
|
109
|
-
"""
|
|
110
|
-
if float(self.threshold) < float(result_entry.score):
|
|
111
|
-
return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)
|
|
112
|
-
|
|
113
|
-
def _record_matched_disease(
|
|
114
|
-
self,
|
|
115
|
-
disease: ProbandDisease,
|
|
116
|
-
rank_stats: RankStats,
|
|
117
|
-
standardised_disease_result: RankedPhEvalDiseaseResult,
|
|
118
|
-
) -> DiseasePrioritisationResult:
|
|
119
|
-
"""
|
|
120
|
-
Return the disease rank result - handling the specification of a threshold.
|
|
121
|
-
|
|
122
|
-
This method determines and returns the disease rank result based on the specified threshold
|
|
123
|
-
and score order. If the threshold is 0.0, it records the disease rank directly.
|
|
124
|
-
Otherwise, it assesses the disease with the threshold based on the score order.
|
|
125
|
-
|
|
126
|
-
Args:
|
|
127
|
-
disease (ProbandDisease): Diagnosed proband disease
|
|
128
|
-
rank_stats (RankStats): RankStats class instance
|
|
129
|
-
standardised_disease_result (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
|
|
130
|
-
|
|
131
|
-
Returns:
|
|
132
|
-
DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
|
|
133
|
-
"""
|
|
134
|
-
if float(self.threshold) == 0.0:
|
|
135
|
-
return self._record_disease_prioritisation_match(
|
|
136
|
-
disease, standardised_disease_result, rank_stats
|
|
137
|
-
)
|
|
138
|
-
else:
|
|
139
|
-
return (
|
|
140
|
-
self._assess_disease_with_threshold(
|
|
141
|
-
standardised_disease_result, disease, rank_stats
|
|
142
|
-
)
|
|
143
|
-
if self.score_order != "ascending"
|
|
144
|
-
else self._assess_disease_with_threshold_ascending_order(
|
|
145
|
-
standardised_disease_result, disease, rank_stats
|
|
146
|
-
)
|
|
147
|
-
)
|
|
148
|
-
|
|
149
16
|
def assess_disease_prioritisation(
|
|
150
17
|
self,
|
|
151
|
-
|
|
152
|
-
|
|
18
|
+
standardised_disease_result_path: Path,
|
|
19
|
+
phenopacket_path: Path,
|
|
153
20
|
binary_classification_stats: BinaryClassificationStats,
|
|
154
21
|
) -> None:
|
|
155
22
|
"""
|
|
@@ -159,67 +26,49 @@ class AssessDiseasePrioritisation:
|
|
|
159
26
|
and records ranks using a PrioritisationRankRecorder.
|
|
160
27
|
|
|
161
28
|
Args:
|
|
162
|
-
|
|
163
|
-
|
|
29
|
+
standardised_disease_result_path (Path): Path to the standardised disease TSV result.
|
|
30
|
+
phenopacket_path (Path): Path to the phenopacket.
|
|
164
31
|
binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
165
32
|
"""
|
|
166
33
|
relevant_ranks = []
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
)
|
|
192
|
-
rank_records,
|
|
193
|
-
).record_rank()
|
|
194
|
-
rank_stats.relevant_result_ranks.append(relevant_ranks)
|
|
34
|
+
df = self.conn.execute(
|
|
35
|
+
f"SELECT * FROM {self.table_name} WHERE phenopacket = ? ",
|
|
36
|
+
(phenopacket_path.name,),
|
|
37
|
+
).fetchdf()
|
|
38
|
+
for _i, row in df.iterrows():
|
|
39
|
+
result = (
|
|
40
|
+
self.conn.execute(
|
|
41
|
+
f"SELECT * FROM '{standardised_disease_result_path}' "
|
|
42
|
+
f"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),"
|
|
43
|
+
f" '{row['disease_identifier']}') "
|
|
44
|
+
f"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), "
|
|
45
|
+
f"'{row['disease_name']}')"
|
|
46
|
+
)
|
|
47
|
+
.fetchdf()
|
|
48
|
+
.to_dict(orient="records")
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
if len(result) > 0:
|
|
52
|
+
disease_match = self._record_matched_entity(RankedPhEvalDiseaseResult(**result[0]))
|
|
53
|
+
relevant_ranks.append(disease_match)
|
|
54
|
+
primary_key = f"{phenopacket_path.name}-{row['disease_identifier']}"
|
|
55
|
+
self.conn.execute(
|
|
56
|
+
f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
|
|
57
|
+
(disease_match, primary_key),
|
|
58
|
+
)
|
|
195
59
|
binary_classification_stats.add_classification(
|
|
196
|
-
self.standardised_disease_results, relevant_ranks
|
|
60
|
+
self.db_connection.parse_table_into_dataclass(
|
|
61
|
+
str(standardised_disease_result_path), RankedPhEvalDiseaseResult
|
|
62
|
+
),
|
|
63
|
+
relevant_ranks,
|
|
197
64
|
)
|
|
198
65
|
|
|
199
66
|
|
|
200
|
-
def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
|
|
201
|
-
"""
|
|
202
|
-
Obtain known diseases from a Phenopacket.
|
|
203
|
-
Args:
|
|
204
|
-
phenopacket_path (Path): Path to the Phenopacket file.
|
|
205
|
-
|
|
206
|
-
Returns:
|
|
207
|
-
List[ProbandDisease]: A list of known diseases associated with the proband,
|
|
208
|
-
extracted from the Phenopacket.
|
|
209
|
-
"""
|
|
210
|
-
phenopacket = phenopacket_reader(phenopacket_path)
|
|
211
|
-
phenopacket_util = PhenopacketUtil(phenopacket)
|
|
212
|
-
return phenopacket_util.diagnoses()
|
|
213
|
-
|
|
214
|
-
|
|
215
67
|
def assess_phenopacket_disease_prioritisation(
|
|
216
68
|
phenopacket_path: Path,
|
|
217
|
-
|
|
218
|
-
results_dir_and_input: TrackInputOutputDirectories,
|
|
219
|
-
threshold: float,
|
|
220
|
-
disease_rank_stats: RankStats,
|
|
221
|
-
disease_rank_comparison: defaultdict,
|
|
69
|
+
run: RunConfig,
|
|
222
70
|
disease_binary_classification_stats: BinaryClassificationStats,
|
|
71
|
+
disease_benchmarker: AssessDiseasePrioritisation,
|
|
223
72
|
) -> None:
|
|
224
73
|
"""
|
|
225
74
|
Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results
|
|
@@ -227,64 +76,66 @@ def assess_phenopacket_disease_prioritisation(
|
|
|
227
76
|
|
|
228
77
|
Args:
|
|
229
78
|
phenopacket_path (Path): Path to the Phenopacket.
|
|
230
|
-
|
|
231
|
-
results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
|
|
232
|
-
threshold (float): Threshold for assessment.
|
|
233
|
-
disease_rank_stats (RankStats): RankStats class instance.
|
|
234
|
-
disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
|
|
79
|
+
run (RunConfig): Run configuration.
|
|
235
80
|
disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
81
|
+
disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance.
|
|
236
82
|
"""
|
|
237
|
-
|
|
83
|
+
standardised_disease_result_path = run.results_dir.joinpath(
|
|
238
84
|
f"pheval_disease_results/{phenopacket_path.stem}-pheval_disease_result.tsv"
|
|
239
85
|
)
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
AssessDiseasePrioritisation(
|
|
86
|
+
disease_benchmarker.assess_disease_prioritisation(
|
|
87
|
+
standardised_disease_result_path,
|
|
243
88
|
phenopacket_path,
|
|
244
|
-
|
|
245
|
-
parse_pheval_result(RankedPhEvalDiseaseResult, pheval_disease_result),
|
|
246
|
-
threshold,
|
|
247
|
-
score_order,
|
|
248
|
-
proband_diseases,
|
|
249
|
-
).assess_disease_prioritisation(
|
|
250
|
-
disease_rank_stats, disease_rank_comparison, disease_binary_classification_stats
|
|
89
|
+
disease_binary_classification_stats,
|
|
251
90
|
)
|
|
252
91
|
|
|
253
92
|
|
|
254
93
|
def benchmark_disease_prioritisation(
|
|
255
|
-
|
|
94
|
+
benchmark_name: str,
|
|
95
|
+
run: RunConfig,
|
|
256
96
|
score_order: str,
|
|
257
97
|
threshold: float,
|
|
258
|
-
disease_rank_comparison: defaultdict,
|
|
259
98
|
):
|
|
260
99
|
"""
|
|
261
100
|
Benchmark a directory based on disease prioritisation results.
|
|
262
101
|
|
|
263
102
|
Args:
|
|
264
|
-
|
|
103
|
+
benchmark_name (str): Name of the benchmark.
|
|
104
|
+
run (RunConfig): Run configuration.
|
|
265
105
|
score_order (str): The order in which scores are arranged.
|
|
266
106
|
threshold (float): Threshold for assessment.
|
|
267
|
-
disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
|
|
268
107
|
|
|
269
108
|
Returns:
|
|
270
109
|
BenchmarkRunResults: An object containing benchmarking results for disease prioritisation,
|
|
271
110
|
including ranks and rank statistics for the benchmarked directory.
|
|
272
111
|
"""
|
|
273
|
-
disease_rank_stats = RankStats()
|
|
274
112
|
disease_binary_classification_stats = BinaryClassificationStats()
|
|
275
|
-
|
|
113
|
+
db_connection = BenchmarkDBManager(benchmark_name)
|
|
114
|
+
db_connection.initialise()
|
|
115
|
+
disease_benchmarker = AssessDiseasePrioritisation(
|
|
116
|
+
db_connection,
|
|
117
|
+
f"{run.phenopacket_dir.parents[0].name}_disease",
|
|
118
|
+
run.run_identifier,
|
|
119
|
+
threshold,
|
|
120
|
+
score_order,
|
|
121
|
+
)
|
|
122
|
+
for phenopacket_path in all_files(run.phenopacket_dir):
|
|
276
123
|
assess_phenopacket_disease_prioritisation(
|
|
277
124
|
phenopacket_path,
|
|
278
|
-
|
|
279
|
-
results_directory_and_input,
|
|
280
|
-
threshold,
|
|
281
|
-
disease_rank_stats,
|
|
282
|
-
disease_rank_comparison,
|
|
125
|
+
run,
|
|
283
126
|
disease_binary_classification_stats,
|
|
127
|
+
disease_benchmarker,
|
|
284
128
|
)
|
|
129
|
+
db_connection.close()
|
|
130
|
+
disease_rank_stats = RankStats()
|
|
131
|
+
disease_rank_stats.add_ranks(
|
|
132
|
+
benchmark_name=benchmark_name,
|
|
133
|
+
table_name=f"{run.phenopacket_dir.parents[0].name}_disease",
|
|
134
|
+
column_name=str(run.run_identifier),
|
|
135
|
+
)
|
|
285
136
|
return BenchmarkRunResults(
|
|
286
|
-
results_dir=results_directory_and_input.results_dir,
|
|
287
|
-
ranks=disease_rank_comparison,
|
|
288
137
|
rank_stats=disease_rank_stats,
|
|
138
|
+
benchmark_name=run.run_identifier,
|
|
289
139
|
binary_classification_stats=disease_binary_classification_stats,
|
|
140
|
+
phenopacket_dir=run.phenopacket_dir,
|
|
290
141
|
)
|