pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (42) hide show
  1. pheval/__init__.py +0 -5
  2. pheval/analyse/__init__.py +0 -0
  3. pheval/analyse/analysis.py +703 -0
  4. pheval/analyse/generate_plots.py +312 -0
  5. pheval/analyse/generate_summary_outputs.py +186 -0
  6. pheval/analyse/rank_stats.py +61 -0
  7. pheval/cli.py +22 -7
  8. pheval/cli_pheval.py +37 -12
  9. pheval/cli_pheval_utils.py +225 -8
  10. pheval/config_parser.py +36 -0
  11. pheval/constants.py +1 -0
  12. pheval/implementations/__init__.py +1 -3
  13. pheval/post_processing/__init__.py +0 -0
  14. pheval/post_processing/post_processing.py +210 -0
  15. pheval/prepare/__init__.py +0 -0
  16. pheval/prepare/create_noisy_phenopackets.py +173 -0
  17. pheval/prepare/create_spiked_vcf.py +366 -0
  18. pheval/prepare/custom_exceptions.py +47 -0
  19. pheval/prepare/update_phenopacket.py +53 -0
  20. pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
  21. pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
  22. pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
  23. pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
  24. pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
  25. pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
  26. pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
  27. pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
  28. pheval/run_metadata.py +27 -0
  29. pheval/runners/runner.py +92 -11
  30. pheval/utils/__init__.py +0 -0
  31. pheval/utils/docs_gen.py +105 -0
  32. pheval/utils/docs_gen.sh +18 -0
  33. pheval/utils/file_utils.py +88 -0
  34. pheval/utils/phenopacket_utils.py +356 -0
  35. pheval/utils/semsim_utils.py +156 -0
  36. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
  37. pheval-0.2.0.dist-info/RECORD +41 -0
  38. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
  39. pheval/utils.py +0 -7
  40. pheval-0.1.0.dist-info/RECORD +0 -13
  41. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
  42. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
pheval/__init__.py CHANGED
@@ -1,5 +0,0 @@
1
- """
2
- pheval
3
- ------
4
- """
5
- __version__ = "0.1.0"
File without changes
@@ -0,0 +1,703 @@
1
+ # #!/usr/bin/python
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+
6
+ import click
7
+ import pandas as pd
8
+
9
+ from pheval.analyse.generate_plots import (
10
+ TrackGenePrioritisation,
11
+ TrackPrioritisation,
12
+ TrackVariantPrioritisation,
13
+ )
14
+ from pheval.analyse.generate_summary_outputs import (
15
+ RankStatsWriter,
16
+ generate_benchmark_comparison_gene_output,
17
+ generate_benchmark_comparison_variant_output,
18
+ generate_benchmark_gene_output,
19
+ generate_benchmark_variant_output,
20
+ )
21
+ from pheval.analyse.rank_stats import RankStats
22
+ from pheval.post_processing.post_processing import (
23
+ PhEvalGeneResult,
24
+ PhEvalVariantResult,
25
+ RankedPhEvalGeneResult,
26
+ RankedPhEvalVariantResult,
27
+ )
28
+ from pheval.prepare.custom_exceptions import InputError
29
+ from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
30
+ from pheval.utils.phenopacket_utils import (
31
+ GenomicVariant,
32
+ PhenopacketUtil,
33
+ ProbandCausativeGene,
34
+ phenopacket_reader,
35
+ )
36
+
37
+
38
+ def _read_standardised_result(standardised_result_path: Path) -> dict:
39
+ """Read the standardised result output and return a dictionary."""
40
+ return pd.read_csv(standardised_result_path, delimiter="\t")
41
+
42
+
43
def parse_pheval_gene_result(pheval_gene_result: pd.DataFrame) -> [RankedPhEvalGeneResult]:
    """Turn every row of a gene result table into a RankedPhEvalGeneResult.

    The ``gene_symbol``, ``gene_identifier``, ``score`` and ``rank`` columns
    are read from each row; the output list follows the row order of the table.
    """
    return [
        RankedPhEvalGeneResult(
            pheval_gene_result=PhEvalGeneResult(
                gene_symbol=row["gene_symbol"],
                gene_identifier=row["gene_identifier"],
                score=row["score"],
            ),
            rank=row["rank"],
        )
        for _, row in pheval_gene_result.iterrows()
    ]
58
+
59
+
60
def parse_pheval_variant_result(pheval_variant_result: pd.DataFrame) -> [RankedPhEvalVariantResult]:
    """Turn every row of a variant result table into a RankedPhEvalVariantResult.

    The ``chromosome``, ``start``, ``end``, ``ref``, ``alt``, ``score`` and
    ``rank`` columns are read from each row; the output list follows the row
    order of the table.
    """
    return [
        RankedPhEvalVariantResult(
            pheval_variant_result=PhEvalVariantResult(
                chromosome=row["chromosome"],
                start=row["start"],
                end=row["end"],
                ref=row["ref"],
                alt=row["alt"],
                score=row["score"],
            ),
            rank=row["rank"],
        )
        for _, row in pheval_variant_result.iterrows()
    ]
78
+
79
+
80
@dataclass
class GenePrioritisationResult:
    """Rank recorded for one causative gene of one phenopacket."""

    # Phenopacket the causative gene belongs to.
    phenopacket_path: Path
    # Gene label; callers in this module pass the causative gene's symbol.
    gene: str
    # Rank taken from the matching result entry; callers in this module leave
    # the default of 0 when no result entry matched the gene.
    rank: int = 0
87
+
88
+
89
@dataclass
class VariantPrioritisationResult:
    """Rank recorded for one causative variant of one phenopacket."""

    # Phenopacket the causative variant belongs to.
    phenopacket_path: Path
    # The variant this rank refers to.
    variant: GenomicVariant
    # Rank taken from the matching result entry; callers in this module leave
    # the default of 0 when no result entry matched the variant.
    rank: int = 0
96
+
97
+
98
@dataclass
class PrioritisationRankRecorder:
    """Write one prioritisation result into a shared run-comparison table.

    Attributes:
        index: Key of the row in ``run_comparison`` that is filled in.
        directory: Results directory; used as the column key under which the
            rank is stored, so several runs can share one row.
        prioritisation_result: Gene or variant result whose rank is recorded.
        run_comparison: Mapping of row index to a ``{column: value}`` dict;
            it is mutated in place by ``record_rank``.
    """

    index: int
    directory: Path
    # NOTE: ``A or B`` evaluates to ``A`` at runtime, so this annotation only
    # names VariantPrioritisationResult. A GenePrioritisationResult is an
    # equally valid value (see record_rank).
    prioritisation_result: VariantPrioritisationResult or GenePrioritisationResult
    run_comparison: defaultdict

    def _record_gene_rank(self) -> None:
        """Store the gene label of the result under the "Gene" column."""
        self.run_comparison[self.index]["Gene"] = self.prioritisation_result.gene

    def _record_variant_rank(self) -> None:
        """Store the variant as "chrom_pos_ref_alt" under the "Variant" column."""
        variant = self.prioritisation_result.variant
        self.run_comparison[self.index]["Variant"] = "_".join(
            [variant.chrom, str(variant.pos), variant.ref, variant.alt]
        )

    def record_rank(self) -> None:
        """Record the phenopacket name, the gene/variant label and the rank.

        The rank is stored under the ``directory`` key, so calling this for
        several result directories with the same ``index`` builds up one row
        per case with one rank column per run.
        """
        self.run_comparison[self.index][
            "Phenopacket"
        ] = self.prioritisation_result.phenopacket_path.name
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of GenePrioritisationResult.
        if isinstance(self.prioritisation_result, GenePrioritisationResult):
            self._record_gene_rank()
        else:
            self._record_variant_rank()
        self.run_comparison[self.index][self.directory] = self.prioritisation_result.rank
127
+
128
+
129
@dataclass
class TrackInputOutputDirectories:
    """Pair an input phenopacket directory with the results directory of a run."""

    # Directory holding the input phenopackets (test data).
    phenopacket_dir: Path
    # PhEval output directory produced from that input.
    results_dir: Path
135
+
136
+
137
def _parse_run_data_text_file(run_data_path: Path) -> [TrackInputOutputDirectories]:
    """Read a headerless, tab-separated run file into directory pairs.

    Column 0 of each line is taken as the phenopacket (input) directory and
    column 1 as the results directory; one entry is returned per line, in
    file order.
    """
    run_table = pd.read_csv(run_data_path, delimiter="\t", header=None)
    return [
        TrackInputOutputDirectories(phenopacket_dir=Path(row[0]), results_dir=Path(row[1]))
        for _, row in run_table.iterrows()
    ]
146
+
147
+
148
class AssessGenePrioritisation:
    """Assess where the causative genes of one phenopacket rank in a result set."""

    def __init__(
        self,
        phenopacket_path: Path,
        results_dir: Path,
        standardised_gene_results: [RankedPhEvalGeneResult],
        threshold: float,
        score_order: str,
        proband_causative_genes: [ProbandCausativeGene],
    ):
        """Store the inputs of the assessment.

        Args:
            phenopacket_path: Phenopacket the causative genes were read from.
            results_dir: Directory used as the column key when ranks are recorded.
            standardised_gene_results: Ranked gene results to search.
            threshold: Score threshold; a value of 0.0 disables thresholding.
            score_order: "ascending" selects the ascending-order threshold
                check; any other value selects the descending one.
            proband_causative_genes: Known causative genes to look for.
        """
        self.phenopacket_path = phenopacket_path
        self.results_dir = results_dir
        self.standardised_gene_results = standardised_gene_results
        self.threshold = threshold
        self.score_order = score_order
        self.proband_causative_genes = proband_causative_genes

    def _record_gene_prioritisation_match(
        self,
        gene: ProbandCausativeGene,
        result_entry: RankedPhEvalGeneResult,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """Add the entry's rank to ``rank_stats`` and return it as a gene result."""
        rank = result_entry.rank
        rank_stats.add_rank(rank)
        return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)

    def _assess_gene_with_threshold_ascending_order(
        self,
        result_entry: RankedPhEvalGeneResult,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """Record the rank only if the score is strictly below the threshold.

        Returns None when the score does not pass the threshold.
        """
        if float(self.threshold) > float(result_entry.pheval_gene_result.score):
            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
        return None

    def _assess_gene_with_threshold(
        self,
        result_entry: RankedPhEvalGeneResult,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
    ) -> GenePrioritisationResult:
        """Record the rank only if the score is strictly above the threshold.

        Returns None when the score does not pass the threshold.
        """
        if float(self.threshold) < float(result_entry.pheval_gene_result.score):
            return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
        return None

    def _record_matched_gene(
        self,
        gene: ProbandCausativeGene,
        rank_stats: RankStats,
        standardised_gene_result: RankedPhEvalGeneResult,
    ) -> GenePrioritisationResult:
        """Return the rank result for a matched gene, honouring the threshold.

        A threshold of 0.0 means "no threshold": the rank is always recorded.
        Otherwise the check matching ``score_order`` is applied, and None is
        returned if the score fails it.
        """
        if float(self.threshold) == 0.0:
            return self._record_gene_prioritisation_match(
                gene, standardised_gene_result, rank_stats
            )
        if self.score_order == "ascending":
            return self._assess_gene_with_threshold_ascending_order(
                standardised_gene_result, gene, rank_stats
            )
        return self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)

    def assess_gene_prioritisation(self, rank_stats: RankStats, rank_records: defaultdict) -> None:
        """Look up every causative gene in the results and record its rank.

        For each causative gene ``rank_stats.total`` is incremented, the first
        matching result entry (if any) is evaluated against the threshold, and
        one row is written to ``rank_records``. A gene that is not found, or
        whose score fails the threshold, is recorded with the default rank 0.
        """
        for gene in self.proband_causative_genes:
            rank_stats.total += 1
            gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
            for standardised_gene_result in self.standardised_gene_results:
                result_gene = standardised_gene_result.pheval_gene_result
                if (
                    gene.gene_identifier == result_gene.gene_identifier
                    # Kept from the original: results whose identifier column
                    # holds a gene symbol.
                    or gene.gene_symbol == result_gene.gene_identifier
                    # Fix: the causative symbol was never compared with the
                    # result's own gene_symbol, so symbol-only matches were missed.
                    or gene.gene_symbol == result_gene.gene_symbol
                ):
                    gene_match = self._record_matched_gene(
                        gene, rank_stats, standardised_gene_result
                    )
                    break
            if gene_match is None:
                # The match failed the threshold: fall back to an unranked result.
                gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
            PrioritisationRankRecorder(
                rank_stats.total,
                self.results_dir,
                gene_match,
                rank_records,
            ).record_rank()
239
+
240
+
241
class AssessVariantPrioritisation:
    """Assess where the causative variants of one phenopacket rank in a result set."""

    def __init__(
        self,
        phenopacket_path: Path,
        results_dir: Path,
        standardised_variant_results: [RankedPhEvalVariantResult],
        threshold: float,
        score_order: str,
        proband_causative_variants: [GenomicVariant],
    ):
        """Store the inputs of the assessment.

        Args:
            phenopacket_path: Phenopacket the causative variants were read from.
            results_dir: Directory used as the column key when ranks are recorded.
            standardised_variant_results: Ranked variant results to search.
            threshold: Score threshold; a value of 0.0 disables thresholding.
            score_order: "ascending" selects the ascending-order threshold
                check; any other value selects the descending one.
            proband_causative_variants: Known causative variants to look for.
        """
        self.phenopacket_path = phenopacket_path
        self.results_dir = results_dir
        self.standardised_variant_results = standardised_variant_results
        self.threshold = threshold
        self.score_order = score_order
        self.proband_causative_variants = proband_causative_variants

    def _record_variant_prioritisation_match(
        self,
        result_entry: RankedPhEvalVariantResult,
        rank_stats: RankStats,
    ) -> VariantPrioritisationResult:
        """Add the entry's rank to ``rank_stats`` and return it as a variant result.

        The returned variant is rebuilt from the result entry's chromosome,
        start, ref and alt fields.
        """
        rank = result_entry.rank
        rank_stats.add_rank(rank)
        return VariantPrioritisationResult(
            self.phenopacket_path,
            GenomicVariant(
                chrom=result_entry.pheval_variant_result.chromosome,
                pos=result_entry.pheval_variant_result.start,
                ref=result_entry.pheval_variant_result.ref,
                alt=result_entry.pheval_variant_result.alt,
            ),
            rank,
        )

    def _assess_variant_with_threshold_ascending_order(
        self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
    ) -> VariantPrioritisationResult:
        """Record the rank only if the score is strictly below the threshold.

        Returns None when the score does not pass the threshold.
        """
        if float(self.threshold) > float(result_entry.pheval_variant_result.score):
            return self._record_variant_prioritisation_match(result_entry, rank_stats)
        return None

    def _assess_variant_with_threshold(
        self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
    ) -> VariantPrioritisationResult:
        """Record the rank only if the score is strictly above the threshold.

        Returns None when the score does not pass the threshold.
        """
        if float(self.threshold) < float(result_entry.pheval_variant_result.score):
            return self._record_variant_prioritisation_match(result_entry, rank_stats)
        return None

    def _record_matched_variant(
        self, rank_stats: RankStats, standardised_variant_result: RankedPhEvalVariantResult
    ) -> VariantPrioritisationResult:
        """Return the rank result for a matched variant, honouring the threshold.

        A threshold of 0.0 means "no threshold": the rank is always recorded.
        Otherwise the check matching ``score_order`` is applied, and None is
        returned if the score fails it.
        """
        if float(self.threshold) == 0.0:
            return self._record_variant_prioritisation_match(
                standardised_variant_result, rank_stats
            )
        if self.score_order == "ascending":
            return self._assess_variant_with_threshold_ascending_order(
                standardised_variant_result, rank_stats
            )
        return self._assess_variant_with_threshold(standardised_variant_result, rank_stats)

    def assess_variant_prioritisation(
        self, rank_stats: RankStats, rank_records: defaultdict
    ) -> None:
        """Look up every causative variant in the results and record its rank.

        For each causative variant ``rank_stats.total`` is incremented, the
        first result entry equal to it (compared as a GenomicVariant built from
        chromosome, start, ref and alt) is evaluated against the threshold, and
        one row is written to ``rank_records``. A variant that is not found, or
        whose score fails the threshold, is recorded with the default rank 0.
        """
        for variant in self.proband_causative_variants:
            rank_stats.total += 1
            variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
            for result in self.standardised_variant_results:
                result_variant = GenomicVariant(
                    chrom=result.pheval_variant_result.chromosome,
                    pos=result.pheval_variant_result.start,
                    ref=result.pheval_variant_result.ref,
                    alt=result.pheval_variant_result.alt,
                )
                if variant == result_variant:
                    variant_match = self._record_matched_variant(rank_stats, result)
                    break
            if variant_match is None:
                # The match failed the threshold: fall back to an unranked result.
                variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
            PrioritisationRankRecorder(
                rank_stats.total,
                self.results_dir,
                variant_match,
                rank_records,
            ).record_rank()
335
+
336
+
337
def _obtain_causative_genes(phenopacket_path: Path) -> [ProbandCausativeGene]:
    """Read the phenopacket at *phenopacket_path* and return its diagnosed genes."""
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_genes()
342
+
343
+
344
def _obtain_causative_variants(phenopacket_path: Path) -> [GenomicVariant]:
    """Read the phenopacket at *phenopacket_path* and return its diagnosed variants."""
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_variants()
349
+
350
+
351
def _assess_phenopacket_gene_prioritisation(
    standardised_gene_result: Path,
    score_order: str,
    results_dir_and_input: TrackInputOutputDirectories,
    threshold: float,
    gene_rank_stats: RankStats,
    gene_rank_comparison: defaultdict,
) -> None:
    """Assess gene prioritisation for the phenopacket behind one result file.

    The phenopacket is the file of ``phenopacket_dir`` that
    ``obtain_closest_file_name`` selects for the result file. Ranks are added
    to ``gene_rank_stats`` and rows to ``gene_rank_comparison`` in place.
    """
    candidate_phenopackets = all_files(results_dir_and_input.phenopacket_dir)
    phenopacket_path = obtain_closest_file_name(standardised_gene_result, candidate_phenopackets)
    gene_result_table = _read_standardised_result(standardised_gene_result)
    causative_genes = _obtain_causative_genes(phenopacket_path)
    assessor = AssessGenePrioritisation(
        phenopacket_path=phenopacket_path,
        results_dir=results_dir_and_input.results_dir.joinpath("pheval_gene_results/"),
        standardised_gene_results=parse_pheval_gene_result(gene_result_table),
        threshold=threshold,
        score_order=score_order,
        proband_causative_genes=causative_genes,
    )
    assessor.assess_gene_prioritisation(gene_rank_stats, gene_rank_comparison)
373
+
374
+
375
def _assess_phenopacket_variant_prioritisation(
    standardised_variant_result: Path,
    score_order: str,
    results_dir_and_input: TrackInputOutputDirectories,
    threshold: float,
    variant_rank_stats: RankStats,
    variant_rank_comparison: defaultdict,
) -> None:
    """Assess variant prioritisation for the phenopacket behind one result file.

    The phenopacket is the file of ``phenopacket_dir`` that
    ``obtain_closest_file_name`` selects for the result file. Ranks are added
    to ``variant_rank_stats`` and rows to ``variant_rank_comparison`` in place.
    """
    candidate_phenopackets = all_files(results_dir_and_input.phenopacket_dir)
    phenopacket_path = obtain_closest_file_name(
        standardised_variant_result, candidate_phenopackets
    )
    causative_variants = _obtain_causative_variants(phenopacket_path)
    variant_result_table = _read_standardised_result(standardised_variant_result)
    assessor = AssessVariantPrioritisation(
        phenopacket_path=phenopacket_path,
        results_dir=results_dir_and_input.results_dir.joinpath("pheval_variant_results/"),
        standardised_variant_results=parse_pheval_variant_result(variant_result_table),
        threshold=threshold,
        score_order=score_order,
        proband_causative_variants=causative_variants,
    )
    assessor.assess_variant_prioritisation(variant_rank_stats, variant_rank_comparison)
397
+
398
+
399
def _assess_prioritisation_for_results_directory(
    results_directory_and_input: TrackInputOutputDirectories,
    score_order: str,
    threshold: float,
    gene_rank_comparison: defaultdict,
    variant_rank_comparison: defaultdict,
    gene_stats_writer: RankStatsWriter,
    variants_stats_writer: RankStatsWriter,
    gene_analysis: bool,
    variant_analysis: bool,
) -> TrackPrioritisation:
    """Assess every standardised result file of one results directory.

    Gene results are read from ``<results_dir>/pheval_gene_results/*.tsv`` and
    variant results from ``<results_dir>/pheval_variant_results/*.tsv``, each
    only when the corresponding analysis flag is set. After both passes one
    summary row per enabled analysis is written through the stats writers.

    Returns:
        A TrackPrioritisation bundling, for genes and variants, the results
        directory, the rank comparison mapping and the collected rank stats.
    """
    results_dir = results_directory_and_input.results_dir
    gene_rank_stats = RankStats()
    variant_rank_stats = RankStats()

    if gene_analysis:
        gene_results_dir = results_dir.joinpath("pheval_gene_results/")
        for gene_result_file in files_with_suffix(gene_results_dir, ".tsv"):
            _assess_phenopacket_gene_prioritisation(
                gene_result_file,
                score_order,
                results_directory_and_input,
                threshold,
                gene_rank_stats,
                gene_rank_comparison,
            )

    if variant_analysis:
        variant_results_dir = results_dir.joinpath("pheval_variant_results/")
        for variant_result_file in files_with_suffix(variant_results_dir, ".tsv"):
            _assess_phenopacket_variant_prioritisation(
                variant_result_file,
                score_order,
                results_directory_and_input,
                threshold,
                variant_rank_stats,
                variant_rank_comparison,
            )

    # Summary rows are written only after both passes have finished.
    if gene_analysis:
        gene_stats_writer.write_row(results_dir, gene_rank_stats)
    if variant_analysis:
        variants_stats_writer.write_row(results_dir, variant_rank_stats)

    gene_tracking = TrackGenePrioritisation(
        results_dir=results_dir,
        ranks=gene_rank_comparison,
        rank_stats=gene_rank_stats,
    )
    variant_tracking = TrackVariantPrioritisation(
        results_dir=results_dir,
        ranks=variant_rank_comparison,
        rank_stats=variant_rank_stats,
    )
    return TrackPrioritisation(
        gene_prioritisation=gene_tracking,
        variant_prioritisation=variant_tracking,
    )
455
+
456
+
457
def benchmark_directory(
    results_dir_and_input: TrackInputOutputDirectories,
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
) -> None:
    """Benchmark prioritisation performance for a single results directory.

    Args:
        results_dir_and_input: Results directory and its input phenopacket directory.
        score_order: Passed through to the assessment ("ascending"/"descending").
        output_prefix: Prefix of the summary files
            ``<prefix>-gene_summary.tsv`` and ``<prefix>-variant_summary.tsv``.
        threshold: Score threshold passed through to the assessment.
        gene_analysis: Run and report the gene prioritisation analysis.
        variant_analysis: Run and report the variant prioritisation analysis.
        plot_type: Plot type passed to the output generators.
    """
    gene_stats_writer = None
    variants_stats_writer = None
    try:
        if gene_analysis:
            gene_stats_writer = RankStatsWriter(Path(output_prefix + "-gene_summary.tsv"))
        if variant_analysis:
            variants_stats_writer = RankStatsWriter(Path(output_prefix + "-variant_summary.tsv"))
        gene_rank_comparison, variant_rank_comparison = defaultdict(dict), defaultdict(dict)
        prioritisation_data = _assess_prioritisation_for_results_directory(
            results_dir_and_input,
            score_order,
            threshold,
            gene_rank_comparison,
            variant_rank_comparison,
            gene_stats_writer,
            variants_stats_writer,
            gene_analysis,
            variant_analysis,
        )
        if gene_analysis:
            generate_benchmark_gene_output(prioritisation_data, plot_type)
        if variant_analysis:
            generate_benchmark_variant_output(prioritisation_data, plot_type)
    finally:
        # Close the writers even when assessment or plotting raises, so the
        # summary files are not left open.
        try:
            if gene_stats_writer is not None:
                gene_stats_writer.close()
        finally:
            if variants_stats_writer is not None:
                variants_stats_writer.close()
489
+
490
+
491
def benchmark_runs(
    results_directories: [TrackInputOutputDirectories],
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
) -> None:
    """Benchmark several results directories and compare them.

    Every directory is assessed with its own fresh rank-comparison mappings;
    all runs share the same summary writers, so each run contributes one row
    per enabled analysis to ``<prefix>-gene_summary.tsv`` /
    ``<prefix>-variant_summary.tsv``. The comparison outputs are generated
    from the collected per-run results.

    Args:
        results_directories: One results/input directory pair per run.
        score_order: Passed through to the assessment ("ascending"/"descending").
        output_prefix: Prefix of the summary files.
        threshold: Score threshold passed through to the assessment.
        gene_analysis: Run and report the gene prioritisation analysis.
        variant_analysis: Run and report the variant prioritisation analysis.
        plot_type: Plot type passed to the output generators.
    """
    gene_stats_writer = None
    variants_stats_writer = None
    try:
        if gene_analysis:
            gene_stats_writer = RankStatsWriter(Path(output_prefix + "-gene_summary.tsv"))
        if variant_analysis:
            variants_stats_writer = RankStatsWriter(Path(output_prefix + "-variant_summary.tsv"))
        prioritisation_stats_for_runs = []
        for results_dir_and_input in results_directories:
            gene_rank_comparison, variant_rank_comparison = defaultdict(dict), defaultdict(dict)
            prioritisation_stats = _assess_prioritisation_for_results_directory(
                results_dir_and_input,
                score_order,
                threshold,
                gene_rank_comparison,
                variant_rank_comparison,
                gene_stats_writer,
                variants_stats_writer,
                gene_analysis,
                variant_analysis,
            )
            prioritisation_stats_for_runs.append(prioritisation_stats)
        if gene_analysis:
            generate_benchmark_comparison_gene_output(prioritisation_stats_for_runs, plot_type)
        if variant_analysis:
            generate_benchmark_comparison_variant_output(prioritisation_stats_for_runs, plot_type)
    finally:
        # Close the writers even when a run fails part-way, so the summary
        # files are not left open.
        try:
            if gene_stats_writer is not None:
                gene_stats_writer.close()
        finally:
            if variants_stats_writer is not None:
                variants_stats_writer.close()
530
+
531
+
532
@click.command()
@click.option(
    "--directory",
    "-d",
    required=True,
    metavar="PATH",
    help="General results directory to be benchmarked, assumes contains subdirectories of pheval_gene_results/"
    "pheval_variant_results and the tool specific results directory. ",
    type=Path,
)
# Fix: this option and --plot-type both declared the short flag "-p". The
# later declaration (--plot-type) took the flag, so "-p PATH" never reached
# --phenopacket-dir. The short flag here is now "-P"; "-p" keeps resolving to
# --plot-type, as it effectively did before.
@click.option(
    "--phenopacket-dir",
    "-P",
    required=True,
    metavar="PATH",
    help="Full path to directory containing input phenopackets.",
    type=Path,
)
@click.option(
    "--output-prefix",
    "-o",
    metavar="<str>",
    required=True,
    help=" Output file prefix. ",
)
@click.option(
    "--score-order",
    "-so",
    required=True,
    help="Ordering of results for ranking.",
    type=click.Choice(["ascending", "descending"]),
    default="descending",
    show_default=True,
)
@click.option(
    "--threshold",
    "-t",
    metavar="<float>",
    default=0.0,
    required=False,
    help="Score threshold.",
    type=float,
)
@click.option(
    "--gene-analysis/--no-gene-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for gene prioritisation",
)
@click.option(
    "--variant-analysis/--no-variant-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for variant prioritisation",
)
@click.option(
    "--plot-type",
    "-p",
    default="bar_stacked",
    show_default=True,
    type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
    help="Bar chart type to output.",
)
def benchmark(
    directory: Path,
    phenopacket_dir: Path,
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
):
    """Benchmark the gene/variant prioritisation performance for a single run."""
    # The docstring above is the command's help text, so it is kept to one line.
    # Raises InputError when neither analysis is requested.
    if not gene_analysis and not variant_analysis:
        raise InputError("Need to specify gene analysis and/or variant analysis.")
    benchmark_directory(
        TrackInputOutputDirectories(results_dir=directory, phenopacket_dir=phenopacket_dir),
        score_order,
        output_prefix,
        threshold,
        gene_analysis,
        variant_analysis,
        plot_type,
    )
621
+
622
+
623
@click.command()
# Fix: the help fragments were concatenated without separating spaces
# ("...by tab.Each run..." and "...a tabthe results directory.").
@click.option(
    "--run-data",
    "-r",
    required=True,
    metavar="PATH",
    help="Path to .txt file containing testdata directory and corresponding results directory separated by tab. "
    "Each run is listed on its own line: the input testdata directory first, then a tab, "
    "then the results directory.",
    type=Path,
)
@click.option(
    "--output-prefix",
    "-o",
    metavar="<str>",
    required=True,
    help=" Output file prefix. ",
)
@click.option(
    "--score-order",
    "-so",
    required=True,
    help="Ordering of results for ranking.",
    type=click.Choice(["ascending", "descending"]),
    default="descending",
    show_default=True,
)
@click.option(
    "--threshold",
    "-t",
    metavar="<float>",
    default=0.0,
    required=False,
    help="Score threshold.",
    type=float,
)
@click.option(
    "--gene-analysis/--no-gene-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for gene prioritisation",
)
@click.option(
    "--variant-analysis/--no-variant-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify analysis for variant prioritisation",
)
@click.option(
    "--plot-type",
    "-p",
    default="bar_stacked",
    show_default=True,
    type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
    help="Bar chart type to output.",
)
def benchmark_comparison(
    run_data: Path,
    score_order: str,
    output_prefix: str,
    threshold: float,
    gene_analysis: bool,
    variant_analysis: bool,
    plot_type: str,
):
    """Benchmark the gene/variant prioritisation performance across multiple runs."""
    # The docstring above is the command's help text, so it is kept to one line.
    # Every line of the run-data file becomes one run; nothing limits it to two.
    # Raises InputError when neither analysis is requested.
    if not gene_analysis and not variant_analysis:
        raise InputError("Need to specify gene analysis and/or variant analysis.")
    benchmark_runs(
        _parse_run_data_text_file(run_data),
        score_order,
        output_prefix,
        threshold,
        gene_analysis,
        variant_analysis,
        plot_type,
    )