pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/analysis.py +61 -150
- pheval/analyse/assess_prioritisation_base.py +108 -0
- pheval/analyse/benchmark_db_manager.py +140 -0
- pheval/analyse/benchmark_generator.py +47 -50
- pheval/analyse/benchmarking_data.py +3 -2
- pheval/analyse/disease_prioritisation_analysis.py +70 -219
- pheval/analyse/gene_prioritisation_analysis.py +66 -242
- pheval/analyse/generate_plots.py +81 -79
- pheval/analyse/generate_summary_outputs.py +64 -134
- pheval/analyse/parse_benchmark_summary.py +50 -37
- pheval/analyse/parse_corpus.py +219 -0
- pheval/analyse/rank_stats.py +177 -144
- pheval/analyse/run_data_parser.py +108 -27
- pheval/analyse/variant_prioritisation_analysis.py +78 -212
- pheval/cli.py +2 -4
- pheval/cli_pheval_utils.py +34 -245
- pheval/prepare/create_noisy_phenopackets.py +78 -67
- pheval-0.4.1.dist-info/METADATA +113 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/RECORD +22 -22
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/WHEEL +1 -1
- pheval/analyse/parse_pheval_result.py +0 -43
- pheval/analyse/prioritisation_rank_recorder.py +0 -83
- pheval/constants.py +0 -8
- pheval-0.3.9.dist-info/METADATA +0 -35
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/LICENSE +0 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,148 +1,23 @@
|
|
|
1
|
-
from collections import defaultdict
|
|
2
1
|
from pathlib import Path
|
|
3
|
-
from typing import List
|
|
4
2
|
|
|
3
|
+
from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
|
|
4
|
+
from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
|
|
5
5
|
from pheval.analyse.benchmarking_data import BenchmarkRunResults
|
|
6
6
|
from pheval.analyse.binary_classification_stats import BinaryClassificationStats
|
|
7
|
-
from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
|
|
8
|
-
from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
|
|
9
|
-
from pheval.analyse.prioritisation_result_types import VariantPrioritisationResult
|
|
10
7
|
from pheval.analyse.rank_stats import RankStats
|
|
11
|
-
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
|
|
8
|
+
from pheval.analyse.run_data_parser import RunConfig
|
|
12
9
|
from pheval.post_processing.post_processing import RankedPhEvalVariantResult
|
|
13
10
|
from pheval.utils.file_utils import all_files
|
|
14
|
-
from pheval.utils.phenopacket_utils import GenomicVariant
|
|
11
|
+
from pheval.utils.phenopacket_utils import GenomicVariant
|
|
15
12
|
|
|
16
13
|
|
|
17
|
-
class AssessVariantPrioritisation:
|
|
14
|
+
class AssessVariantPrioritisation(AssessPrioritisationBase):
|
|
18
15
|
"""Class for assessing variant prioritisation based on thresholds and scoring orders."""
|
|
19
16
|
|
|
20
|
-
def __init__(
|
|
21
|
-
self,
|
|
22
|
-
phenopacket_path: Path,
|
|
23
|
-
results_dir: Path,
|
|
24
|
-
standardised_variant_results: List[RankedPhEvalVariantResult],
|
|
25
|
-
threshold: float,
|
|
26
|
-
score_order: str,
|
|
27
|
-
proband_causative_variants: List[GenomicVariant],
|
|
28
|
-
):
|
|
29
|
-
"""
|
|
30
|
-
Initialise AssessVariantPrioritisation class
|
|
31
|
-
|
|
32
|
-
Args:
|
|
33
|
-
phenopacket_path (Path): Path to the phenopacket file
|
|
34
|
-
results_dir (Path): Path to the results directory
|
|
35
|
-
standardised_variant_results (List[RankedPhEvalVariantResult]): List of ranked PhEval variant results
|
|
36
|
-
threshold (float): Threshold for scores
|
|
37
|
-
score_order (str): Score order for results, either ascending or descending
|
|
38
|
-
proband_causative_variants (List[GenomicVariant]): List of proband variants
|
|
39
|
-
|
|
40
|
-
"""
|
|
41
|
-
self.phenopacket_path = phenopacket_path
|
|
42
|
-
self.results_dir = results_dir
|
|
43
|
-
self.standardised_variant_results = standardised_variant_results
|
|
44
|
-
self.threshold = threshold
|
|
45
|
-
self.score_order = score_order
|
|
46
|
-
self.proband_causative_variants = proband_causative_variants
|
|
47
|
-
|
|
48
|
-
def _record_variant_prioritisation_match(
|
|
49
|
-
self,
|
|
50
|
-
result_entry: RankedPhEvalVariantResult,
|
|
51
|
-
rank_stats: RankStats,
|
|
52
|
-
) -> VariantPrioritisationResult:
|
|
53
|
-
"""
|
|
54
|
-
Record the variant prioritisation rank if found within the results
|
|
55
|
-
Args:
|
|
56
|
-
result_entry (RankedPhEvalVariantResult): Ranked PhEval variant result entry
|
|
57
|
-
rank_stats (RankStats): RankStats class instance
|
|
58
|
-
Returns:
|
|
59
|
-
VariantPrioritisationResult: Recorded correct variant prioritisation rank result
|
|
60
|
-
"""
|
|
61
|
-
rank = result_entry.rank
|
|
62
|
-
rank_stats.add_rank(rank)
|
|
63
|
-
return VariantPrioritisationResult(
|
|
64
|
-
self.phenopacket_path,
|
|
65
|
-
GenomicVariant(
|
|
66
|
-
chrom=result_entry.chromosome,
|
|
67
|
-
pos=result_entry.start,
|
|
68
|
-
ref=result_entry.ref,
|
|
69
|
-
alt=result_entry.alt,
|
|
70
|
-
),
|
|
71
|
-
rank,
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
def _assess_variant_with_threshold_ascending_order(
|
|
75
|
-
self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
|
|
76
|
-
) -> VariantPrioritisationResult:
|
|
77
|
-
"""
|
|
78
|
-
Record the variant prioritisation rank if it meets the ascending order threshold.
|
|
79
|
-
|
|
80
|
-
This method checks if the variant prioritisation rank meets the ascending order threshold.
|
|
81
|
-
If the score of the result entry is less than the threshold, it records the variant rank.
|
|
82
|
-
|
|
83
|
-
Args:
|
|
84
|
-
result_entry (RankedPhEvalVariantResult): Ranked PhEval variant result entry
|
|
85
|
-
rank_stats (RankStats): RankStats class instance
|
|
86
|
-
|
|
87
|
-
Returns:
|
|
88
|
-
VariantPrioritisationResult: Recorded correct variant prioritisation rank result
|
|
89
|
-
"""
|
|
90
|
-
if float(self.threshold) > float(result_entry.score):
|
|
91
|
-
return self._record_variant_prioritisation_match(result_entry, rank_stats)
|
|
92
|
-
|
|
93
|
-
def _assess_variant_with_threshold(
|
|
94
|
-
self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
|
|
95
|
-
) -> VariantPrioritisationResult:
|
|
96
|
-
"""
|
|
97
|
-
Record the variant prioritisation rank if it meets the score threshold.
|
|
98
|
-
|
|
99
|
-
This method checks if the variant prioritisation rank meets the score threshold.
|
|
100
|
-
If the score of the result entry is greater than the threshold, it records the variant rank.
|
|
101
|
-
|
|
102
|
-
Args:
|
|
103
|
-
result_entry (RankedPhEvalVariantResult): Ranked PhEval variant result entry
|
|
104
|
-
rank_stats (RankStats): RankStats class instance
|
|
105
|
-
|
|
106
|
-
Returns:
|
|
107
|
-
VariantPrioritisationResult: Recorded correct variant prioritisation rank result
|
|
108
|
-
"""
|
|
109
|
-
if float(self.threshold) < float(result_entry.score):
|
|
110
|
-
return self._record_variant_prioritisation_match(result_entry, rank_stats)
|
|
111
|
-
|
|
112
|
-
def _record_matched_variant(
|
|
113
|
-
self, rank_stats: RankStats, standardised_variant_result: RankedPhEvalVariantResult
|
|
114
|
-
) -> VariantPrioritisationResult:
|
|
115
|
-
"""
|
|
116
|
-
Return the variant rank result - handling the specification of a threshold.
|
|
117
|
-
|
|
118
|
-
This method determines and returns the variant rank result based on the specified threshold
|
|
119
|
-
and score order. If the threshold is 0.0, it records the variant rank directly.
|
|
120
|
-
Otherwise, it assesses the variant with the threshold based on the score order.
|
|
121
|
-
|
|
122
|
-
Args:
|
|
123
|
-
rank_stats (RankStats): RankStats class instance
|
|
124
|
-
standardised_variant_result (RankedPhEvalVariantResult): Ranked PhEval variant result entry
|
|
125
|
-
|
|
126
|
-
Returns:
|
|
127
|
-
VariantPrioritisationResult: Recorded correct variant prioritisation rank result
|
|
128
|
-
"""
|
|
129
|
-
if float(self.threshold) == 0.0:
|
|
130
|
-
return self._record_variant_prioritisation_match(
|
|
131
|
-
standardised_variant_result, rank_stats
|
|
132
|
-
)
|
|
133
|
-
else:
|
|
134
|
-
return (
|
|
135
|
-
self._assess_variant_with_threshold(standardised_variant_result, rank_stats)
|
|
136
|
-
if self.score_order != "ascending"
|
|
137
|
-
else self._assess_variant_with_threshold_ascending_order(
|
|
138
|
-
standardised_variant_result, rank_stats
|
|
139
|
-
)
|
|
140
|
-
)
|
|
141
|
-
|
|
142
17
|
def assess_variant_prioritisation(
|
|
143
18
|
self,
|
|
144
|
-
|
|
145
|
-
|
|
19
|
+
standardised_variant_result_path: Path,
|
|
20
|
+
phenopacket_path: Path,
|
|
146
21
|
binary_classification_stats: BinaryClassificationStats,
|
|
147
22
|
) -> None:
|
|
148
23
|
"""
|
|
@@ -152,68 +27,59 @@ class AssessVariantPrioritisation:
|
|
|
152
27
|
and records ranks using a PrioritisationRankRecorder.
|
|
153
28
|
|
|
154
29
|
Args:
|
|
155
|
-
|
|
156
|
-
|
|
30
|
+
standardised_variant_result_path (Path): Path to standardised variant TSV result.
|
|
31
|
+
phenopacket_path (Path): Path to the phenopacket.
|
|
157
32
|
binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
158
33
|
"""
|
|
159
34
|
relevant_ranks = []
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
35
|
+
df = self.conn.execute(
|
|
36
|
+
f"""SELECT * FROM {self.table_name} WHERE phenopacket = '{phenopacket_path.name}'"""
|
|
37
|
+
).fetchdf()
|
|
38
|
+
for _i, row in df.iterrows():
|
|
39
|
+
causative_variant = GenomicVariant(
|
|
40
|
+
chrom=row["chrom"],
|
|
41
|
+
pos=int(row["pos"]),
|
|
42
|
+
ref=row["ref"],
|
|
43
|
+
alt=row["alt"],
|
|
44
|
+
)
|
|
45
|
+
result = (
|
|
46
|
+
self.conn.execute(
|
|
47
|
+
f"SELECT * FROM '{standardised_variant_result_path}' "
|
|
48
|
+
f"WHERE "
|
|
49
|
+
f"chromosome == '{causative_variant.chrom}' AND "
|
|
50
|
+
f"start == {causative_variant.pos} AND "
|
|
51
|
+
f"ref == '{causative_variant.ref}' AND "
|
|
52
|
+
f"alt == '{causative_variant.alt}'"
|
|
169
53
|
)
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
relevant_ranks.append(variant_match.rank)
|
|
174
|
-
if variant_match
|
|
175
|
-
else relevant_ranks.append(0)
|
|
176
|
-
)
|
|
177
|
-
break
|
|
178
|
-
PrioritisationRankRecorder(
|
|
179
|
-
rank_stats.total,
|
|
180
|
-
self.results_dir,
|
|
181
|
-
(
|
|
182
|
-
VariantPrioritisationResult(self.phenopacket_path, variant)
|
|
183
|
-
if variant_match is None
|
|
184
|
-
else variant_match
|
|
185
|
-
),
|
|
186
|
-
rank_records,
|
|
187
|
-
).record_rank()
|
|
188
|
-
rank_stats.relevant_result_ranks.append(relevant_ranks)
|
|
189
|
-
binary_classification_stats.add_classification(
|
|
190
|
-
self.standardised_variant_results, relevant_ranks
|
|
191
|
-
)
|
|
192
|
-
|
|
54
|
+
.fetchdf()
|
|
55
|
+
.to_dict(orient="records")
|
|
56
|
+
)
|
|
193
57
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
58
|
+
if len(result) > 0:
|
|
59
|
+
variant_match = self._record_matched_entity(RankedPhEvalVariantResult(**result[0]))
|
|
60
|
+
relevant_ranks.append(variant_match)
|
|
61
|
+
primary_key = (
|
|
62
|
+
f"{phenopacket_path.name}-{causative_variant.chrom}-{causative_variant.pos}-"
|
|
63
|
+
f"{causative_variant.ref}-{causative_variant.alt}"
|
|
64
|
+
)
|
|
65
|
+
self.conn.execute(
|
|
66
|
+
f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
|
|
67
|
+
(variant_match, primary_key),
|
|
68
|
+
)
|
|
199
69
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
return phenopacket_util.diagnosed_variants()
|
|
70
|
+
binary_classification_stats.add_classification(
|
|
71
|
+
self.db_connection.parse_table_into_dataclass(
|
|
72
|
+
str(standardised_variant_result_path), RankedPhEvalVariantResult
|
|
73
|
+
),
|
|
74
|
+
relevant_ranks,
|
|
75
|
+
)
|
|
207
76
|
|
|
208
77
|
|
|
209
78
|
def assess_phenopacket_variant_prioritisation(
|
|
210
79
|
phenopacket_path: Path,
|
|
211
|
-
|
|
212
|
-
results_dir_and_input: TrackInputOutputDirectories,
|
|
213
|
-
threshold: float,
|
|
214
|
-
variant_rank_stats: RankStats,
|
|
215
|
-
variant_rank_comparison: defaultdict,
|
|
80
|
+
run: RunConfig,
|
|
216
81
|
variant_binary_classification_stats: BinaryClassificationStats,
|
|
82
|
+
variant_benchmarker: AssessVariantPrioritisation,
|
|
217
83
|
) -> None:
|
|
218
84
|
"""
|
|
219
85
|
Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results
|
|
@@ -221,64 +87,64 @@ def assess_phenopacket_variant_prioritisation(
|
|
|
221
87
|
|
|
222
88
|
Args:
|
|
223
89
|
phenopacket_path (Path): Path to the Phenopacket.
|
|
224
|
-
|
|
225
|
-
results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
|
|
226
|
-
threshold (float): Threshold for assessment.
|
|
227
|
-
variant_rank_stats (RankStats): RankStats class instance.
|
|
228
|
-
variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
|
|
90
|
+
run (RunConfig): Run configuration.
|
|
229
91
|
variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
92
|
+
variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance.
|
|
230
93
|
"""
|
|
231
|
-
|
|
232
|
-
standardised_variant_result = results_dir_and_input.results_dir.joinpath(
|
|
94
|
+
standardised_variant_result_path = run.results_dir.joinpath(
|
|
233
95
|
f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
|
|
234
96
|
)
|
|
235
|
-
|
|
236
|
-
|
|
97
|
+
variant_benchmarker.assess_variant_prioritisation(
|
|
98
|
+
standardised_variant_result_path,
|
|
237
99
|
phenopacket_path,
|
|
238
|
-
|
|
239
|
-
parse_pheval_result(RankedPhEvalVariantResult, pheval_variant_result),
|
|
240
|
-
threshold,
|
|
241
|
-
score_order,
|
|
242
|
-
proband_causative_variants,
|
|
243
|
-
).assess_variant_prioritisation(
|
|
244
|
-
variant_rank_stats, variant_rank_comparison, variant_binary_classification_stats
|
|
100
|
+
variant_binary_classification_stats,
|
|
245
101
|
)
|
|
246
102
|
|
|
247
103
|
|
|
248
104
|
def benchmark_variant_prioritisation(
|
|
249
|
-
|
|
105
|
+
benchmark_name: str,
|
|
106
|
+
run: RunConfig,
|
|
250
107
|
score_order: str,
|
|
251
108
|
threshold: float,
|
|
252
|
-
variant_rank_comparison: defaultdict,
|
|
253
109
|
):
|
|
254
110
|
"""
|
|
255
111
|
Benchmark a directory based on variant prioritisation results.
|
|
256
112
|
|
|
257
113
|
Args:
|
|
258
|
-
|
|
114
|
+
benchmark_name (str): Name of the benchmark.
|
|
115
|
+
run (RunConfig): Run configuration.
|
|
259
116
|
score_order (str): The order in which scores are arranged.
|
|
260
117
|
threshold (float): Threshold for assessment.
|
|
261
|
-
variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
|
|
262
118
|
|
|
263
119
|
Returns:
|
|
264
120
|
BenchmarkRunResults: An object containing benchmarking results for variant prioritisation,
|
|
265
121
|
including ranks and rank statistics for the benchmarked directory.
|
|
266
122
|
"""
|
|
267
|
-
variant_rank_stats = RankStats()
|
|
268
123
|
variant_binary_classification_stats = BinaryClassificationStats()
|
|
269
|
-
|
|
124
|
+
db_connection = BenchmarkDBManager(benchmark_name)
|
|
125
|
+
variant_benchmarker = AssessVariantPrioritisation(
|
|
126
|
+
db_connection,
|
|
127
|
+
f"{run.phenopacket_dir.parents[0].name}" f"_variant",
|
|
128
|
+
run.run_identifier,
|
|
129
|
+
threshold,
|
|
130
|
+
score_order,
|
|
131
|
+
)
|
|
132
|
+
for phenopacket_path in all_files(run.phenopacket_dir):
|
|
270
133
|
assess_phenopacket_variant_prioritisation(
|
|
271
134
|
phenopacket_path,
|
|
272
|
-
|
|
273
|
-
results_directory_and_input,
|
|
274
|
-
threshold,
|
|
275
|
-
variant_rank_stats,
|
|
276
|
-
variant_rank_comparison,
|
|
135
|
+
run,
|
|
277
136
|
variant_binary_classification_stats,
|
|
137
|
+
variant_benchmarker,
|
|
278
138
|
)
|
|
139
|
+
variant_rank_stats = RankStats()
|
|
140
|
+
variant_rank_stats.add_ranks(
|
|
141
|
+
benchmark_name=benchmark_name,
|
|
142
|
+
table_name=f"{run.phenopacket_dir.parents[0].name}_variant",
|
|
143
|
+
column_name=str(run.run_identifier),
|
|
144
|
+
)
|
|
279
145
|
return BenchmarkRunResults(
|
|
280
|
-
|
|
281
|
-
ranks=variant_rank_comparison,
|
|
146
|
+
benchmark_name=run.run_identifier,
|
|
282
147
|
rank_stats=variant_rank_stats,
|
|
283
148
|
binary_classification_stats=variant_binary_classification_stats,
|
|
149
|
+
phenopacket_dir=run.phenopacket_dir,
|
|
284
150
|
)
|
pheval/cli.py
CHANGED
|
@@ -6,9 +6,8 @@ import click
|
|
|
6
6
|
|
|
7
7
|
from .cli_pheval import run
|
|
8
8
|
from .cli_pheval_utils import (
|
|
9
|
-
benchmark,
|
|
10
|
-
benchmark_comparison,
|
|
11
9
|
create_spiked_vcfs_command,
|
|
10
|
+
generate_benchmark_stats,
|
|
12
11
|
generate_stats_plot,
|
|
13
12
|
prepare_corpus_command,
|
|
14
13
|
scramble_phenopackets_command,
|
|
@@ -57,8 +56,7 @@ pheval_utils.add_command(semsim_scramble_command)
|
|
|
57
56
|
pheval_utils.add_command(scramble_phenopackets_command)
|
|
58
57
|
pheval_utils.add_command(update_phenopackets_command)
|
|
59
58
|
pheval_utils.add_command(create_spiked_vcfs_command)
|
|
60
|
-
pheval_utils.add_command(benchmark)
|
|
61
|
-
pheval_utils.add_command(benchmark_comparison)
|
|
59
|
+
pheval_utils.add_command(generate_benchmark_stats)
|
|
62
60
|
pheval_utils.add_command(semsim_to_exomiserdb_command)
|
|
63
61
|
pheval_utils.add_command(generate_stats_plot)
|
|
64
62
|
pheval_utils.add_command(prepare_corpus_command)
|