pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/__init__.py +0 -5
- pheval/analyse/__init__.py +0 -0
- pheval/analyse/analysis.py +703 -0
- pheval/analyse/generate_plots.py +312 -0
- pheval/analyse/generate_summary_outputs.py +186 -0
- pheval/analyse/rank_stats.py +61 -0
- pheval/cli.py +22 -7
- pheval/cli_pheval.py +37 -12
- pheval/cli_pheval_utils.py +225 -8
- pheval/config_parser.py +36 -0
- pheval/constants.py +1 -0
- pheval/implementations/__init__.py +1 -3
- pheval/post_processing/__init__.py +0 -0
- pheval/post_processing/post_processing.py +210 -0
- pheval/prepare/__init__.py +0 -0
- pheval/prepare/create_noisy_phenopackets.py +173 -0
- pheval/prepare/create_spiked_vcf.py +366 -0
- pheval/prepare/custom_exceptions.py +47 -0
- pheval/prepare/update_phenopacket.py +53 -0
- pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
- pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
- pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
- pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
- pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
- pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
- pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
- pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
- pheval/run_metadata.py +27 -0
- pheval/runners/runner.py +92 -11
- pheval/utils/__init__.py +0 -0
- pheval/utils/docs_gen.py +105 -0
- pheval/utils/docs_gen.sh +18 -0
- pheval/utils/file_utils.py +88 -0
- pheval/utils/phenopacket_utils.py +356 -0
- pheval/utils/semsim_utils.py +156 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
- pheval-0.2.0.dist-info/RECORD +41 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
- pheval/utils.py +0 -7
- pheval-0.1.0.dist-info/RECORD +0 -13
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
pheval/__init__.py
CHANGED
|
File without changes
|
|
@@ -0,0 +1,703 @@
|
|
|
1
|
+
# #!/usr/bin/python
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from pheval.analyse.generate_plots import (
|
|
10
|
+
TrackGenePrioritisation,
|
|
11
|
+
TrackPrioritisation,
|
|
12
|
+
TrackVariantPrioritisation,
|
|
13
|
+
)
|
|
14
|
+
from pheval.analyse.generate_summary_outputs import (
|
|
15
|
+
RankStatsWriter,
|
|
16
|
+
generate_benchmark_comparison_gene_output,
|
|
17
|
+
generate_benchmark_comparison_variant_output,
|
|
18
|
+
generate_benchmark_gene_output,
|
|
19
|
+
generate_benchmark_variant_output,
|
|
20
|
+
)
|
|
21
|
+
from pheval.analyse.rank_stats import RankStats
|
|
22
|
+
from pheval.post_processing.post_processing import (
|
|
23
|
+
PhEvalGeneResult,
|
|
24
|
+
PhEvalVariantResult,
|
|
25
|
+
RankedPhEvalGeneResult,
|
|
26
|
+
RankedPhEvalVariantResult,
|
|
27
|
+
)
|
|
28
|
+
from pheval.prepare.custom_exceptions import InputError
|
|
29
|
+
from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
|
|
30
|
+
from pheval.utils.phenopacket_utils import (
|
|
31
|
+
GenomicVariant,
|
|
32
|
+
PhenopacketUtil,
|
|
33
|
+
ProbandCausativeGene,
|
|
34
|
+
phenopacket_reader,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _read_standardised_result(standardised_result_path: Path) -> dict:
|
|
39
|
+
"""Read the standardised result output and return a dictionary."""
|
|
40
|
+
return pd.read_csv(standardised_result_path, delimiter="\t")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse_pheval_gene_result(pheval_gene_result: pd.DataFrame) -> [RankedPhEvalGeneResult]:
    """Convert every row of a PhEval gene result table into a RankedPhEvalGeneResult.

    The ``gene_symbol``, ``gene_identifier`` and ``score`` columns populate the
    nested PhEvalGeneResult; the ``rank`` column becomes the rank of the wrapper.
    Row order is preserved.
    """
    return [
        RankedPhEvalGeneResult(
            pheval_gene_result=PhEvalGeneResult(
                gene_symbol=row["gene_symbol"],
                gene_identifier=row["gene_identifier"],
                score=row["score"],
            ),
            rank=row["rank"],
        )
        for _, row in pheval_gene_result.iterrows()
    ]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def parse_pheval_variant_result(pheval_variant_result: pd.DataFrame) -> [RankedPhEvalVariantResult]:
    """Convert every row of a PhEval variant result table into a RankedPhEvalVariantResult.

    The ``chromosome``, ``start``, ``end``, ``ref``, ``alt`` and ``score`` columns
    populate the nested PhEvalVariantResult; the ``rank`` column becomes the rank
    of the wrapper. Row order is preserved.
    """
    return [
        RankedPhEvalVariantResult(
            pheval_variant_result=PhEvalVariantResult(
                chromosome=row["chromosome"],
                start=row["start"],
                end=row["end"],
                ref=row["ref"],
                alt=row["alt"],
                score=row["score"],
            ),
            rank=row["rank"],
        )
        for _, row in pheval_variant_result.iterrows()
    ]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class GenePrioritisationResult:
    """Rank recorded for one causative gene of one phenopacket."""

    # Phenopacket the causative gene was taken from.
    phenopacket_path: Path
    # Symbol of the causative gene.
    gene: str
    # Rank of the gene; stays at the default 0 unless a rank is supplied.
    rank: int = 0
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class VariantPrioritisationResult:
    """Rank recorded for one causative variant of one phenopacket."""

    # Phenopacket the causative variant was taken from.
    phenopacket_path: Path
    # The causative variant itself.
    variant: GenomicVariant
    # Rank of the variant; stays at the default 0 unless a rank is supplied.
    rank: int = 0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class PrioritisationRankRecorder:
    """Write the rank of one causative gene or variant into a shared comparison table.

    ``run_comparison`` maps ``index`` to a dict holding the phenopacket name,
    the gene or variant label, and one rank per results directory.
    """

    # Key of the row in run_comparison that this record belongs to.
    index: int
    # Results directory; used as the key under which the rank is stored.
    directory: Path
    # Quoted union so that both result types are declared (``A or B`` would
    # evaluate to ``A`` alone at class-creation time).
    prioritisation_result: "VariantPrioritisationResult | GenePrioritisationResult"
    # Shared table mutated in place by record_rank().
    run_comparison: defaultdict

    def _record_gene_rank(self) -> None:
        """Store the causative gene symbol under the ``"Gene"`` key."""
        self.run_comparison[self.index]["Gene"] = self.prioritisation_result.gene

    def _record_variant_rank(self) -> None:
        """Store the causative variant as ``chrom_pos_ref_alt`` under the ``"Variant"`` key."""
        variant = self.prioritisation_result.variant
        # str() on chrom guards against a numeric chromosome breaking str.join.
        self.run_comparison[self.index]["Variant"] = "_".join(
            [str(variant.chrom), str(variant.pos), variant.ref, variant.alt]
        )

    def record_rank(self) -> None:
        """Record the phenopacket name, the gene/variant label and the rank for this run."""
        record = self.run_comparison[self.index]
        record["Phenopacket"] = self.prioritisation_result.phenopacket_path.name
        if isinstance(self.prioritisation_result, GenePrioritisationResult):
            self._record_gene_rank()
        else:
            self._record_variant_rank()
        record[self.directory] = self.prioritisation_result.rank
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class TrackInputOutputDirectories:
    """Pair an input phenopacket directory with the results directory of a run."""

    # Directory containing the input phenopackets.
    phenopacket_dir: Path
    # Directory containing the corresponding results.
    results_dir: Path
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _parse_run_data_text_file(run_data_path: Path) -> [TrackInputOutputDirectories]:
    """Read a header-less, tab-separated run data file.

    Each line yields one TrackInputOutputDirectories: the first column is the
    phenopacket directory, the second the results directory.
    """
    table = pd.read_csv(run_data_path, delimiter="\t", header=None)
    return [
        TrackInputOutputDirectories(
            phenopacket_dir=Path(phenopacket_dir), results_dir=Path(results_dir)
        )
        for phenopacket_dir, results_dir in zip(table[0], table[1])
    ]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class AssessGenePrioritisation:
    """Assess how the causative genes of one phenopacket were ranked in one result set.

    Each causative gene is looked up in the ranked gene results. A gene that is
    found (and passes the score threshold, if one is set) has its rank added to
    the rank statistics; otherwise it is recorded with the default rank of
    GenePrioritisationResult.
    """

    def __init__(
        self,
        phenopacket_path: Path,
        results_dir: Path,
        standardised_gene_results: [RankedPhEvalGeneResult],
        threshold: float,
        score_order: str,
        proband_causative_genes: [ProbandCausativeGene],
    ):
        """Store the inputs of the assessment.

        Args:
            phenopacket_path: Phenopacket the causative genes were read from.
            results_dir: Directory used as the run key when recording ranks.
            standardised_gene_results: Ranked gene results to search.
            threshold: Score threshold; ``0.0`` disables thresholding.
            score_order: ``"ascending"`` keeps scores below the threshold,
                any other value keeps scores above it.
            proband_causative_genes: Causative genes to look for.
        """
        self.phenopacket_path = phenopacket_path
        self.results_dir = results_dir
        self.standardised_gene_results = standardised_gene_results
        self.threshold = threshold
        self.score_order = score_order
        self.proband_causative_genes = proband_causative_genes

    def _record_gene_prioritisation_match(
        self,
        gene: ProbandCausativeGene,
        result_entry: RankedPhEvalGeneResult,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """Add the matched rank to ``rank_stats`` and return it as a result object."""
        rank = result_entry.rank
        rank_stats.add_rank(rank)
        return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)

    def _assess_gene_with_threshold_ascending_order(
        self,
        result_entry: RankedPhEvalGeneResult,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
    ) -> "GenePrioritisationResult | None":
        """Record the rank when the score is strictly below the threshold, else return None."""
        if float(self.threshold) > float(result_entry.pheval_gene_result.score):
            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
        return None

    def _assess_gene_with_threshold(
        self,
        result_entry: RankedPhEvalGeneResult,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
    ) -> "GenePrioritisationResult | None":
        """Record the rank when the score is strictly above the threshold, else return None."""
        if float(self.threshold) < float(result_entry.pheval_gene_result.score):
            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
        return None

    def _record_matched_gene(
        self,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
        standardised_gene_result: RankedPhEvalGeneResult,
    ) -> "GenePrioritisationResult | None":
        """Return the rank result for a matched gene, honouring the threshold.

        A threshold of ``0.0`` means "no threshold": the match is always
        recorded. Otherwise the score is checked in the direction given by
        ``score_order`` and None is returned when it does not pass.
        """
        if float(self.threshold) == 0.0:
            return self._record_gene_prioritisation_match(
                gene, standardised_gene_result, rank_stats
            )
        if self.score_order == "ascending":
            return self._assess_gene_with_threshold_ascending_order(
                standardised_gene_result, gene, rank_stats
            )
        return self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)

    @staticmethod
    def _is_causative_gene(gene: ProbandCausativeGene, result: PhEvalGeneResult) -> bool:
        """Tell whether a result entry refers to the given causative gene."""
        return (
            gene.gene_identifier == result.gene_identifier
            # Symbol-to-symbol comparison was missing, so a result carrying only
            # a matching symbol was never found.
            or gene.gene_symbol == result.gene_symbol
            # Kept from the previous behaviour: accept a result whose identifier
            # field holds the gene symbol.
            or gene.gene_symbol == result.gene_identifier
        )

    def assess_gene_prioritisation(self, rank_stats: RankStats, rank_records: defaultdict) -> None:
        """Look up every causative gene in the results and record its rank.

        Args:
            rank_stats: Updated in place; ``total`` is incremented once per
                causative gene and matched ranks are added via ``add_rank``.
            rank_records: Shared table that receives one record per causative
                gene, keyed by the running ``rank_stats.total``.
        """
        for gene in self.proband_causative_genes:
            rank_stats.total += 1
            gene_match = None
            for standardised_gene_result in self.standardised_gene_results:
                if self._is_causative_gene(gene, standardised_gene_result.pheval_gene_result):
                    gene_match = self._record_matched_gene(
                        gene, rank_stats, standardised_gene_result
                    )
                    # Only the first matching entry is considered.
                    break
            if gene_match is None:
                # Not found, or filtered out by the threshold: record the default rank.
                gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
            PrioritisationRankRecorder(
                rank_stats.total,
                self.results_dir,
                gene_match,
                rank_records,
            ).record_rank()
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
class AssessVariantPrioritisation:
    """Assess how the causative variants of one phenopacket were ranked in one result set.

    Each causative variant is looked up in the ranked variant results. A variant
    that is found (and passes the score threshold, if one is set) has its rank
    added to the rank statistics; otherwise it is recorded with the default rank
    of VariantPrioritisationResult.
    """

    def __init__(
        self,
        phenopacket_path: Path,
        results_dir: Path,
        standardised_variant_results: [RankedPhEvalVariantResult],
        threshold: float,
        score_order: str,
        proband_causative_variants: [GenomicVariant],
    ):
        """Store the inputs of the assessment.

        Args:
            phenopacket_path: Phenopacket the causative variants were read from.
            results_dir: Directory used as the run key when recording ranks.
            standardised_variant_results: Ranked variant results to search.
            threshold: Score threshold; ``0.0`` disables thresholding.
            score_order: ``"ascending"`` keeps scores below the threshold,
                any other value keeps scores above it.
            proband_causative_variants: Causative variants to look for.
        """
        self.phenopacket_path = phenopacket_path
        self.results_dir = results_dir
        self.standardised_variant_results = standardised_variant_results
        self.threshold = threshold
        self.score_order = score_order
        self.proband_causative_variants = proband_causative_variants

    @staticmethod
    def _as_genomic_variant(result_entry: RankedPhEvalVariantResult) -> GenomicVariant:
        """Build a GenomicVariant from a ranked result entry.

        The chromosome is converted to ``str``: a results table whose
        chromosomes are all numeric is parsed as integers, which would never
        compare equal to a string chromosome.
        NOTE(review): assumes phenopacket-derived variants carry the chromosome
        as ``str`` - confirm against GenomicVariant.
        """
        variant_result = result_entry.pheval_variant_result
        return GenomicVariant(
            chrom=str(variant_result.chromosome),
            pos=variant_result.start,
            ref=variant_result.ref,
            alt=variant_result.alt,
        )

    def _record_variant_prioritisation_match(
        self,
        result_entry: RankedPhEvalVariantResult,
        rank_stats: RankStats,
    ) -> VariantPrioritisationResult:
        """Add the matched rank to ``rank_stats`` and return it as a result object."""
        rank = result_entry.rank
        rank_stats.add_rank(rank)
        return VariantPrioritisationResult(
            self.phenopacket_path,
            self._as_genomic_variant(result_entry),
            rank,
        )

    def _assess_variant_with_threshold_ascending_order(
        self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
    ) -> "VariantPrioritisationResult | None":
        """Record the rank when the score is strictly below the threshold, else return None."""
        if float(self.threshold) > float(result_entry.pheval_variant_result.score):
            return self._record_variant_prioritisation_match(result_entry, rank_stats)
        return None

    def _assess_variant_with_threshold(
        self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
    ) -> "VariantPrioritisationResult | None":
        """Record the rank when the score is strictly above the threshold, else return None."""
        if float(self.threshold) < float(result_entry.pheval_variant_result.score):
            return self._record_variant_prioritisation_match(result_entry, rank_stats)
        return None

    def _record_matched_variant(
        self, rank_stats: RankStats, standardised_variant_result: RankedPhEvalVariantResult
    ) -> "VariantPrioritisationResult | None":
        """Return the rank result for a matched variant, honouring the threshold.

        A threshold of ``0.0`` means "no threshold": the match is always
        recorded. Otherwise the score is checked in the direction given by
        ``score_order`` and None is returned when it does not pass.
        """
        if float(self.threshold) == 0.0:
            return self._record_variant_prioritisation_match(
                standardised_variant_result, rank_stats
            )
        if self.score_order == "ascending":
            return self._assess_variant_with_threshold_ascending_order(
                standardised_variant_result, rank_stats
            )
        return self._assess_variant_with_threshold(standardised_variant_result, rank_stats)

    def assess_variant_prioritisation(
        self, rank_stats: RankStats, rank_records: defaultdict
    ) -> None:
        """Look up every causative variant in the results and record its rank.

        Args:
            rank_stats: Updated in place; ``total`` is incremented once per
                causative variant and matched ranks are added via ``add_rank``.
            rank_records: Shared table that receives one record per causative
                variant, keyed by the running ``rank_stats.total``.
        """
        for variant in self.proband_causative_variants:
            rank_stats.total += 1
            variant_match = None
            for result in self.standardised_variant_results:
                if variant == self._as_genomic_variant(result):
                    variant_match = self._record_matched_variant(rank_stats, result)
                    # Only the first matching entry is considered.
                    break
            if variant_match is None:
                # Not found, or filtered out by the threshold: record the default rank.
                variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
            PrioritisationRankRecorder(
                rank_stats.total,
                self.results_dir,
                variant_match,
                rank_records,
            ).record_rank()
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _obtain_causative_genes(phenopacket_path: Path) -> [ProbandCausativeGene]:
    """Read the phenopacket at ``phenopacket_path`` and return its diagnosed genes."""
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_genes()
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _obtain_causative_variants(phenopacket_path: Path) -> [GenomicVariant]:
    """Read the phenopacket at ``phenopacket_path`` and return its diagnosed variants."""
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_variants()
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def _assess_phenopacket_gene_prioritisation(
    standardised_gene_result: Path,
    score_order: str,
    results_dir_and_input: TrackInputOutputDirectories,
    threshold: float,
    gene_rank_stats: RankStats,
    gene_rank_comparison: defaultdict,
) -> None:
    """Assess gene prioritisation for the phenopacket behind one result file.

    The phenopacket is selected from the input directory with
    ``obtain_closest_file_name``; ``gene_rank_stats`` and
    ``gene_rank_comparison`` are updated in place by the assessment.
    """
    candidate_phenopackets = all_files(results_dir_and_input.phenopacket_dir)
    phenopacket_path = obtain_closest_file_name(standardised_gene_result, candidate_phenopackets)
    ranked_gene_results = _read_standardised_result(standardised_gene_result)
    causative_genes = _obtain_causative_genes(phenopacket_path)
    assessor = AssessGenePrioritisation(
        phenopacket_path,
        results_dir_and_input.results_dir.joinpath("pheval_gene_results/"),
        parse_pheval_gene_result(ranked_gene_results),
        threshold,
        score_order,
        causative_genes,
    )
    assessor.assess_gene_prioritisation(gene_rank_stats, gene_rank_comparison)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _assess_phenopacket_variant_prioritisation(
    standardised_variant_result: Path,
    score_order: str,
    results_dir_and_input: TrackInputOutputDirectories,
    threshold: float,
    variant_rank_stats: RankStats,
    variant_rank_comparison: defaultdict,
) -> None:
    """Assess variant prioritisation for the phenopacket behind one result file.

    The phenopacket is selected from the input directory with
    ``obtain_closest_file_name``; ``variant_rank_stats`` and
    ``variant_rank_comparison`` are updated in place by the assessment.
    """
    candidate_phenopackets = all_files(results_dir_and_input.phenopacket_dir)
    phenopacket_path = obtain_closest_file_name(
        standardised_variant_result, candidate_phenopackets
    )
    causative_variants = _obtain_causative_variants(phenopacket_path)
    ranked_variant_results = _read_standardised_result(standardised_variant_result)
    assessor = AssessVariantPrioritisation(
        phenopacket_path,
        results_dir_and_input.results_dir.joinpath("pheval_variant_results/"),
        parse_pheval_variant_result(ranked_variant_results),
        threshold,
        score_order,
        causative_variants,
    )
    assessor.assess_variant_prioritisation(variant_rank_stats, variant_rank_comparison)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _assess_prioritisation_for_results_directory(
    results_directory_and_input: TrackInputOutputDirectories,
    score_order: str,
    threshold: float,
    gene_rank_comparison: defaultdict,
    variant_rank_comparison: defaultdict,
    gene_stats_writer: RankStatsWriter,
    variants_stats_writer: RankStatsWriter,
    gene_analysis: bool,
    variant_analysis: bool,
) -> TrackPrioritisation:
    """Assess prioritisation for a single results directory.

    Args:
        results_directory_and_input: Results directory and its input phenopackets.
        score_order: Passed through to the per-phenopacket assessment.
        threshold: Passed through to the per-phenopacket assessment.
        gene_rank_comparison: Updated in place with per-gene rank records.
        variant_rank_comparison: Updated in place with per-variant rank records.
        gene_stats_writer: Receives one summary row; only used when
            ``gene_analysis`` is true (callers may pass None otherwise).
        variants_stats_writer: Receives one summary row; only used when
            ``variant_analysis`` is true (callers may pass None otherwise).
        gene_analysis: Assess every ``.tsv`` under ``pheval_gene_results/``.
        variant_analysis: Assess every ``.tsv`` under ``pheval_variant_results/``.

    Returns:
        The gene and variant rank records and statistics for this directory.
        Statistics of a disabled analysis are a fresh, untouched RankStats.
    """
    gene_rank_stats, variant_rank_stats = RankStats(), RankStats()
    results_dir = results_directory_and_input.results_dir
    if gene_analysis:
        for standardised_result in files_with_suffix(
            results_dir.joinpath("pheval_gene_results/"), ".tsv"
        ):
            _assess_phenopacket_gene_prioritisation(
                standardised_result,
                score_order,
                results_directory_and_input,
                threshold,
                gene_rank_stats,
                gene_rank_comparison,
            )
    if variant_analysis:
        for standardised_result in files_with_suffix(
            results_dir.joinpath("pheval_variant_results/"),
            ".tsv",
        ):
            _assess_phenopacket_variant_prioritisation(
                standardised_result,
                score_order,
                results_directory_and_input,
                threshold,
                variant_rank_stats,
                variant_rank_comparison,
            )
    # Summary rows are written only after both assessments have completed.
    if gene_analysis:
        gene_stats_writer.write_row(results_dir, gene_rank_stats)
    if variant_analysis:
        variants_stats_writer.write_row(results_dir, variant_rank_stats)
    return TrackPrioritisation(
        gene_prioritisation=TrackGenePrioritisation(
            results_dir=results_dir,
            ranks=gene_rank_comparison,
            rank_stats=gene_rank_stats,
        ),
        variant_prioritisation=TrackVariantPrioritisation(
            results_dir=results_dir,
            ranks=variant_rank_comparison,
            rank_stats=variant_rank_stats,
        ),
    )
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def benchmark_directory(
    results_dir_and_input: TrackInputOutputDirectories,
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
) -> None:
    """Benchmark prioritisation performance for a single directory.

    Args:
        results_dir_and_input: Results directory and its input phenopackets.
        score_order: Passed through to the assessment.
        output_prefix: Prefix of the ``-gene_summary.tsv`` and
            ``-variant_summary.tsv`` summary files.
        threshold: Passed through to the assessment.
        gene_analysis: Assess and report gene prioritisation.
        variant_analysis: Assess and report variant prioritisation.
        plot_type: Passed through to the benchmark output generators.
    """
    gene_stats_writer = None
    variants_stats_writer = None
    try:
        if gene_analysis:
            gene_stats_writer = RankStatsWriter(Path(output_prefix + "-gene_summary.tsv"))
        if variant_analysis:
            variants_stats_writer = RankStatsWriter(Path(output_prefix + "-variant_summary.tsv"))
        gene_rank_comparison, variant_rank_comparison = defaultdict(dict), defaultdict(dict)
        prioritisation_data = _assess_prioritisation_for_results_directory(
            results_dir_and_input,
            score_order,
            threshold,
            gene_rank_comparison,
            variant_rank_comparison,
            gene_stats_writer,
            variants_stats_writer,
            gene_analysis,
            variant_analysis,
        )
        if gene_analysis:
            generate_benchmark_gene_output(prioritisation_data, plot_type)
        if variant_analysis:
            generate_benchmark_variant_output(prioritisation_data, plot_type)
    finally:
        # Close the writers even when assessment or plotting raises.
        if gene_stats_writer is not None:
            gene_stats_writer.close()
        if variants_stats_writer is not None:
            variants_stats_writer.close()
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def benchmark_runs(
    results_directories: [TrackInputOutputDirectories],
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
) -> None:
    """Benchmark several result directories and compare them.

    Args:
        results_directories: One entry per run to benchmark.
        score_order: Passed through to the assessment.
        output_prefix: Prefix of the ``-gene_summary.tsv`` and
            ``-variant_summary.tsv`` summary files shared by all runs.
        threshold: Passed through to the assessment.
        gene_analysis: Assess and report gene prioritisation.
        variant_analysis: Assess and report variant prioritisation.
        plot_type: Passed through to the comparison output generators.
    """
    gene_stats_writer = None
    variants_stats_writer = None
    try:
        if gene_analysis:
            gene_stats_writer = RankStatsWriter(Path(output_prefix + "-gene_summary.tsv"))
        if variant_analysis:
            variants_stats_writer = RankStatsWriter(Path(output_prefix + "-variant_summary.tsv"))
        prioritisation_stats_for_runs = []
        for results_dir_and_input in results_directories:
            # Each run gets its own rank tables.
            gene_rank_comparison, variant_rank_comparison = defaultdict(dict), defaultdict(dict)
            prioritisation_stats_for_runs.append(
                _assess_prioritisation_for_results_directory(
                    results_dir_and_input,
                    score_order,
                    threshold,
                    gene_rank_comparison,
                    variant_rank_comparison,
                    gene_stats_writer,
                    variants_stats_writer,
                    gene_analysis,
                    variant_analysis,
                )
            )
        if gene_analysis:
            generate_benchmark_comparison_gene_output(prioritisation_stats_for_runs, plot_type)
        if variant_analysis:
            generate_benchmark_comparison_variant_output(prioritisation_stats_for_runs, plot_type)
    finally:
        # Close the writers even when assessment or plotting raises.
        if gene_stats_writer is not None:
            gene_stats_writer.close()
        if variants_stats_writer is not None:
            variants_stats_writer.close()
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
@click.command()
@click.option(
    "--directory",
    "-d",
    required=True,
    metavar="PATH",
    help="General results directory to be benchmarked, assumes contains subdirectories of pheval_gene_results/"
    "pheval_variant_results and the tool specific results directory. ",
    type=Path,
)
@click.option(
    "--phenopacket-dir",
    "-p",
    required=True,
    metavar="PATH",
    help="Full path to directory containing input phenopackets.",
    type=Path,
)
@click.option(
    "--output-prefix",
    "-o",
    metavar="<str>",
    required=True,
    help=" Output file prefix. ",
)
@click.option(
    "--score-order",
    "-so",
    required=True,
    help="Ordering of results for ranking.",
    type=click.Choice(["ascending", "descending"]),
    default="descending",
    show_default=True,
)
@click.option(
    "--threshold",
    "-t",
    metavar="<float>",
    default=float(0.0),
    required=False,
    help="Score threshold.",
    type=float,
)
@click.option(
    "--gene-analysis/--no-gene-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for gene prioritisation",
)
@click.option(
    "--variant-analysis/--no-variant-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for variant prioritisation",
)
@click.option(
    "--plot-type",
    # "-p" is already the short flag of --phenopacket-dir on this command, so
    # the plot type uses its own short flag to keep both unambiguous.
    "-y",
    default="bar_stacked",
    show_default=True,
    type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
    help="Bar chart type to output.",
)
def benchmark(
    directory: Path,
    phenopacket_dir: Path,
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
):
    """Benchmark the gene/variant prioritisation performance for a single run."""
    # At least one of the two analyses must be requested; otherwise there is
    # nothing to benchmark.
    if not gene_analysis and not variant_analysis:
        raise InputError("Need to specify gene analysis and/or variant analysis.")
    benchmark_directory(
        TrackInputOutputDirectories(results_dir=directory, phenopacket_dir=phenopacket_dir),
        score_order,
        output_prefix,
        threshold,
        gene_analysis,
        variant_analysis,
        plot_type,
    )
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
@click.command()
@click.option(
    "--run-data",
    "-r",
    required=True,
    metavar="PATH",
    # Each fragment ends with a space so the concatenated help text keeps its
    # word boundaries.
    help="Path to .txt file containing testdata directory and corresponding results directory separated by tab. "
    "Each run contained to a new line with the input testdata listed first and on the same line separated by a tab "
    "the results directory.",
    type=Path,
)
@click.option(
    "--output-prefix",
    "-o",
    metavar="<str>",
    required=True,
    help=" Output file prefix. ",
)
@click.option(
    "--score-order",
    "-so",
    required=True,
    help="Ordering of results for ranking.",
    type=click.Choice(["ascending", "descending"]),
    default="descending",
    show_default=True,
)
@click.option(
    "--threshold",
    "-t",
    metavar="<float>",
    default=float(0.0),
    required=False,
    help="Score threshold.",
    type=float,
)
@click.option(
    "--gene-analysis/--no-gene-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for gene prioritisation",
)
@click.option(
    "--variant-analysis/--no-variant-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for variant prioritisation",
)
@click.option(
    "--plot-type",
    "-p",
    default="bar_stacked",
    show_default=True,
    type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
    help="Bar chart type to output.",
)
def benchmark_comparison(
    run_data: Path,
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
):
    """Benchmark the gene/variant prioritisation performance across multiple runs."""
    # At least one of the two analyses must be requested; otherwise there is
    # nothing to benchmark.
    if not gene_analysis and not variant_analysis:
        raise InputError("Need to specify gene analysis and/or variant analysis.")
    # Every line of the run data file becomes one run to benchmark.
    benchmark_runs(
        _parse_run_data_text_file(run_data),
        score_order,
        output_prefix,
        threshold,
        gene_analysis,
        variant_analysis,
        plot_type,
    )
|