pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/analysis.py +61 -150
- pheval/analyse/assess_prioritisation_base.py +108 -0
- pheval/analyse/benchmark_db_manager.py +140 -0
- pheval/analyse/benchmark_generator.py +47 -50
- pheval/analyse/benchmarking_data.py +3 -2
- pheval/analyse/disease_prioritisation_analysis.py +70 -219
- pheval/analyse/gene_prioritisation_analysis.py +66 -242
- pheval/analyse/generate_plots.py +81 -79
- pheval/analyse/generate_summary_outputs.py +64 -134
- pheval/analyse/parse_benchmark_summary.py +50 -37
- pheval/analyse/parse_corpus.py +219 -0
- pheval/analyse/rank_stats.py +177 -144
- pheval/analyse/run_data_parser.py +108 -27
- pheval/analyse/variant_prioritisation_analysis.py +78 -212
- pheval/cli.py +2 -4
- pheval/cli_pheval_utils.py +34 -245
- pheval/prepare/create_noisy_phenopackets.py +78 -67
- pheval-0.4.1.dist-info/METADATA +113 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/RECORD +22 -22
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/WHEEL +1 -1
- pheval/analyse/parse_pheval_result.py +0 -43
- pheval/analyse/prioritisation_rank_recorder.py +0 -83
- pheval/constants.py +0 -8
- pheval-0.3.9.dist-info/METADATA +0 -35
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/LICENSE +0 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/entry_points.txt +0 -0
pheval/cli_pheval_utils.py
CHANGED
|
@@ -5,13 +5,9 @@ from typing import List
|
|
|
5
5
|
|
|
6
6
|
import click
|
|
7
7
|
|
|
8
|
-
from pheval.analyse.analysis import
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
benchmark_run_comparisons,
|
|
12
|
-
)
|
|
13
|
-
from pheval.analyse.generate_plots import generate_plots_from_benchmark_summary_tsv
|
|
14
|
-
from pheval.analyse.run_data_parser import parse_run_data_text_file
|
|
8
|
+
from pheval.analyse.analysis import benchmark_run_comparisons
|
|
9
|
+
from pheval.analyse.generate_plots import generate_plots_from_benchmark_summary_db
|
|
10
|
+
from pheval.analyse.run_data_parser import parse_run_config
|
|
15
11
|
from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets
|
|
16
12
|
from pheval.prepare.create_spiked_vcf import spike_vcfs
|
|
17
13
|
from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
|
|
@@ -110,17 +106,29 @@ def semsim_scramble_command(
|
|
|
110
106
|
default="noisy_phenopackets",
|
|
111
107
|
type=Path,
|
|
112
108
|
)
|
|
109
|
+
@click.option(
|
|
110
|
+
"--local-ontology-cache",
|
|
111
|
+
"-l",
|
|
112
|
+
metavar="PATH",
|
|
113
|
+
required=False,
|
|
114
|
+
help="Path to the local ontology cache, e.g., path to the hp.obo.",
|
|
115
|
+
default=None,
|
|
116
|
+
type=Path,
|
|
117
|
+
)
|
|
113
118
|
def scramble_phenopackets_command(
|
|
114
119
|
phenopacket_path: Path,
|
|
115
120
|
phenopacket_dir: Path,
|
|
116
121
|
scramble_factor: float,
|
|
117
122
|
output_dir: Path,
|
|
123
|
+
local_ontology_cache: Path,
|
|
118
124
|
):
|
|
119
125
|
"""Generate noisy phenopackets from existing ones."""
|
|
120
126
|
if phenopacket_path is None and phenopacket_dir is None:
|
|
121
127
|
raise InputError("Either a phenopacket or phenopacket directory must be specified")
|
|
122
128
|
else:
|
|
123
|
-
scramble_phenopackets(
|
|
129
|
+
scramble_phenopackets(
|
|
130
|
+
output_dir, phenopacket_path, phenopacket_dir, scramble_factor, local_ontology_cache
|
|
131
|
+
)
|
|
124
132
|
|
|
125
133
|
|
|
126
134
|
@click.command("semsim-comparison")
|
|
@@ -338,196 +346,19 @@ def create_spiked_vcfs_command(
|
|
|
338
346
|
|
|
339
347
|
@click.command()
|
|
340
348
|
@click.option(
|
|
341
|
-
"--
|
|
342
|
-
"-d",
|
|
343
|
-
required=True,
|
|
344
|
-
metavar="PATH",
|
|
345
|
-
help="General results directory to be benchmarked, assumes contains subdirectories of pheval_gene_results/,"
|
|
346
|
-
"pheval_variant_results/ or pheval_disease_results/. ",
|
|
347
|
-
type=Path,
|
|
348
|
-
)
|
|
349
|
-
@click.option(
|
|
350
|
-
"--phenopacket-dir",
|
|
351
|
-
"-p",
|
|
352
|
-
required=True,
|
|
353
|
-
metavar="PATH",
|
|
354
|
-
help="Full path to directory containing input phenopackets.",
|
|
355
|
-
type=Path,
|
|
356
|
-
)
|
|
357
|
-
@click.option(
|
|
358
|
-
"--output-prefix",
|
|
359
|
-
"-o",
|
|
360
|
-
metavar="<str>",
|
|
361
|
-
required=True,
|
|
362
|
-
help=" Output file prefix. ",
|
|
363
|
-
)
|
|
364
|
-
@click.option(
|
|
365
|
-
"--score-order",
|
|
366
|
-
"-so",
|
|
367
|
-
required=True,
|
|
368
|
-
help="Ordering of results for ranking.",
|
|
369
|
-
type=click.Choice(["ascending", "descending"]),
|
|
370
|
-
default="descending",
|
|
371
|
-
show_default=True,
|
|
372
|
-
)
|
|
373
|
-
@click.option(
|
|
374
|
-
"--threshold",
|
|
375
|
-
"-t",
|
|
376
|
-
metavar="<float>",
|
|
377
|
-
default=float(0.0),
|
|
378
|
-
required=False,
|
|
379
|
-
help="Score threshold.",
|
|
380
|
-
type=float,
|
|
381
|
-
)
|
|
382
|
-
@click.option(
|
|
383
|
-
"--gene-analysis/--no-gene-analysis",
|
|
384
|
-
default=False,
|
|
385
|
-
required=False,
|
|
386
|
-
type=bool,
|
|
387
|
-
show_default=True,
|
|
388
|
-
help="Specify analysis for gene prioritisation",
|
|
389
|
-
)
|
|
390
|
-
@click.option(
|
|
391
|
-
"--variant-analysis/--no-variant-analysis",
|
|
392
|
-
default=False,
|
|
393
|
-
required=False,
|
|
394
|
-
type=bool,
|
|
395
|
-
show_default=True,
|
|
396
|
-
help="Specify analysis for variant prioritisation",
|
|
397
|
-
)
|
|
398
|
-
@click.option(
|
|
399
|
-
"--disease-analysis/--no-disease-analysis",
|
|
400
|
-
default=False,
|
|
401
|
-
required=False,
|
|
402
|
-
type=bool,
|
|
403
|
-
show_default=True,
|
|
404
|
-
help="Specify analysis for disease prioritisation",
|
|
405
|
-
)
|
|
406
|
-
@click.option(
|
|
407
|
-
"--plot-type",
|
|
408
|
-
"-y",
|
|
409
|
-
default="bar_stacked",
|
|
410
|
-
show_default=True,
|
|
411
|
-
type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
|
|
412
|
-
help="Bar chart type to output.",
|
|
413
|
-
)
|
|
414
|
-
def benchmark(
|
|
415
|
-
directory: Path,
|
|
416
|
-
phenopacket_dir: Path,
|
|
417
|
-
score_order: str,
|
|
418
|
-
output_prefix: str,
|
|
419
|
-
threshold: float,
|
|
420
|
-
gene_analysis: bool,
|
|
421
|
-
variant_analysis: bool,
|
|
422
|
-
disease_analysis: bool,
|
|
423
|
-
plot_type: str,
|
|
424
|
-
):
|
|
425
|
-
"""Benchmark the gene/variant/disease prioritisation performance for a single run."""
|
|
426
|
-
if not gene_analysis and not variant_analysis and not disease_analysis:
|
|
427
|
-
raise InputError("Need to specify at least one of gene/variant/disease analysis.")
|
|
428
|
-
benchmark_directory(
|
|
429
|
-
TrackInputOutputDirectories(results_dir=directory, phenopacket_dir=phenopacket_dir),
|
|
430
|
-
score_order,
|
|
431
|
-
output_prefix,
|
|
432
|
-
threshold,
|
|
433
|
-
gene_analysis,
|
|
434
|
-
variant_analysis,
|
|
435
|
-
disease_analysis,
|
|
436
|
-
plot_type,
|
|
437
|
-
)
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
@click.command()
|
|
441
|
-
@click.option(
|
|
442
|
-
"--run-data",
|
|
349
|
+
"--run-yaml",
|
|
443
350
|
"-r",
|
|
444
351
|
required=True,
|
|
445
352
|
metavar="PATH",
|
|
446
|
-
help="Path to
|
|
447
|
-
"and corresponding results directory separated by tab."
|
|
448
|
-
"Each run contained to a new line with the input testdata listed first and on the same line separated by a tab"
|
|
449
|
-
"the results directory.",
|
|
353
|
+
help="Path to yaml configuration file for benchmarking.",
|
|
450
354
|
type=Path,
|
|
451
355
|
)
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
"-o",
|
|
455
|
-
metavar="<str>",
|
|
456
|
-
required=True,
|
|
457
|
-
help=" Output file prefix. ",
|
|
458
|
-
)
|
|
459
|
-
@click.option(
|
|
460
|
-
"--score-order",
|
|
461
|
-
"-so",
|
|
462
|
-
required=True,
|
|
463
|
-
help="Ordering of results for ranking.",
|
|
464
|
-
type=click.Choice(["ascending", "descending"]),
|
|
465
|
-
default="descending",
|
|
466
|
-
show_default=True,
|
|
467
|
-
)
|
|
468
|
-
@click.option(
|
|
469
|
-
"--threshold",
|
|
470
|
-
"-t",
|
|
471
|
-
metavar="<float>",
|
|
472
|
-
default=float(0.0),
|
|
473
|
-
required=False,
|
|
474
|
-
help="Score threshold.",
|
|
475
|
-
type=float,
|
|
476
|
-
)
|
|
477
|
-
@click.option(
|
|
478
|
-
"--gene-analysis/--no-gene-analysis",
|
|
479
|
-
default=False,
|
|
480
|
-
required=False,
|
|
481
|
-
type=bool,
|
|
482
|
-
show_default=True,
|
|
483
|
-
help="Specify analysis for gene prioritisation",
|
|
484
|
-
)
|
|
485
|
-
@click.option(
|
|
486
|
-
"--variant-analysis/--no-variant-analysis",
|
|
487
|
-
default=False,
|
|
488
|
-
required=False,
|
|
489
|
-
type=bool,
|
|
490
|
-
show_default=True,
|
|
491
|
-
help="Specify analysis for variant prioritisation",
|
|
492
|
-
)
|
|
493
|
-
@click.option(
|
|
494
|
-
"--disease-analysis/--no-disease-analysis",
|
|
495
|
-
default=False,
|
|
496
|
-
required=False,
|
|
497
|
-
type=bool,
|
|
498
|
-
show_default=True,
|
|
499
|
-
help="Specify analysis for disease prioritisation",
|
|
500
|
-
)
|
|
501
|
-
@click.option(
|
|
502
|
-
"--plot-type",
|
|
503
|
-
"-y",
|
|
504
|
-
default="bar_cumulative",
|
|
505
|
-
show_default=True,
|
|
506
|
-
type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
|
|
507
|
-
help="Bar chart type to output.",
|
|
508
|
-
)
|
|
509
|
-
def benchmark_comparison(
|
|
510
|
-
run_data: Path,
|
|
511
|
-
score_order: str,
|
|
512
|
-
output_prefix: str,
|
|
513
|
-
threshold: float,
|
|
514
|
-
gene_analysis: bool,
|
|
515
|
-
variant_analysis: bool,
|
|
516
|
-
disease_analysis: bool,
|
|
517
|
-
plot_type: str,
|
|
356
|
+
def generate_benchmark_stats(
|
|
357
|
+
run_yaml: Path,
|
|
518
358
|
):
|
|
519
|
-
"""Benchmark the gene/variant/disease prioritisation performance for
|
|
520
|
-
if not gene_analysis and not variant_analysis and not disease_analysis:
|
|
521
|
-
raise InputError("Need to specify at least one of gene/variant/disease analysis.")
|
|
359
|
+
"""Benchmark the gene/variant/disease prioritisation performance for runs."""
|
|
522
360
|
benchmark_run_comparisons(
|
|
523
|
-
|
|
524
|
-
score_order,
|
|
525
|
-
output_prefix,
|
|
526
|
-
threshold,
|
|
527
|
-
gene_analysis,
|
|
528
|
-
variant_analysis,
|
|
529
|
-
disease_analysis,
|
|
530
|
-
plot_type,
|
|
361
|
+
parse_run_config(run_yaml),
|
|
531
362
|
)
|
|
532
363
|
|
|
533
364
|
|
|
@@ -580,69 +411,27 @@ def semsim_to_exomiserdb_command(
|
|
|
580
411
|
|
|
581
412
|
@click.command()
|
|
582
413
|
@click.option(
|
|
583
|
-
"--
|
|
414
|
+
"--benchmark-db",
|
|
584
415
|
"-b",
|
|
585
416
|
required=True,
|
|
586
417
|
metavar="PATH",
|
|
587
|
-
help="Path to benchmark
|
|
418
|
+
help="Path to benchmark db output by PhEval benchmark commands.",
|
|
588
419
|
type=Path,
|
|
589
420
|
)
|
|
590
421
|
@click.option(
|
|
591
|
-
"--
|
|
592
|
-
|
|
593
|
-
required=
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
cls=MutuallyExclusiveOptionError,
|
|
598
|
-
mutually_exclusive=["variant_analysis", "disease_analysis"],
|
|
599
|
-
)
|
|
600
|
-
@click.option(
|
|
601
|
-
"--variant-analysis/--no-variant-analysis",
|
|
602
|
-
default=False,
|
|
603
|
-
required=False,
|
|
604
|
-
type=bool,
|
|
605
|
-
show_default=True,
|
|
606
|
-
help="Specify analysis for variant prioritisation",
|
|
607
|
-
cls=MutuallyExclusiveOptionError,
|
|
608
|
-
mutually_exclusive=["gene_analysis", "disease_analysis"],
|
|
609
|
-
)
|
|
610
|
-
@click.option(
|
|
611
|
-
"--disease-analysis/--no-disease-analysis",
|
|
612
|
-
default=False,
|
|
613
|
-
required=False,
|
|
614
|
-
type=bool,
|
|
615
|
-
show_default=True,
|
|
616
|
-
help="Specify analysis for disease prioritisation",
|
|
617
|
-
cls=MutuallyExclusiveOptionError,
|
|
618
|
-
mutually_exclusive=["gene_analysis", "variant_analysis"],
|
|
619
|
-
)
|
|
620
|
-
@click.option(
|
|
621
|
-
"--plot-type",
|
|
622
|
-
"-y",
|
|
623
|
-
default="bar_cumulative",
|
|
624
|
-
show_default=True,
|
|
625
|
-
type=click.Choice(["bar_stacked", "bar_cumulative", "bar_non_cumulative"]),
|
|
626
|
-
help="Bar chart type to output.",
|
|
627
|
-
)
|
|
628
|
-
@click.option(
|
|
629
|
-
"--title",
|
|
630
|
-
"-t",
|
|
631
|
-
type=str,
|
|
632
|
-
help='Title for plot, specify the title on the CLI enclosed with ""',
|
|
422
|
+
"--run-data",
|
|
423
|
+
"-r",
|
|
424
|
+
required=True,
|
|
425
|
+
metavar="PATH",
|
|
426
|
+
help="Path to yaml configuration file for benchmarking.",
|
|
427
|
+
type=Path,
|
|
633
428
|
)
|
|
634
429
|
def generate_stats_plot(
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
variant_analysis: bool,
|
|
638
|
-
disease_analysis: bool,
|
|
639
|
-
plot_type: str,
|
|
640
|
-
title: str = None,
|
|
430
|
+
benchmark_db: Path,
|
|
431
|
+
run_data: Path,
|
|
641
432
|
):
|
|
642
|
-
"""Generate bar plot from benchmark
|
|
643
|
-
|
|
644
|
-
benchmarking_tsv, gene_analysis, variant_analysis, disease_analysis, plot_type, title
|
|
645
|
-
)
|
|
433
|
+
"""Generate bar plot from benchmark db."""
|
|
434
|
+
generate_plots_from_benchmark_summary_db(benchmark_db, run_data)
|
|
646
435
|
|
|
647
436
|
|
|
648
437
|
@click.command("prepare-corpus")
|
|
@@ -15,15 +15,20 @@ from pheval.utils.phenopacket_utils import (
|
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
def load_ontology():
|
|
18
|
+
def load_ontology(local_cached_ontology: Path = None) -> ProntoImplementation:
|
|
19
19
|
"""
|
|
20
20
|
Load the Human Phenotype Ontology (HPO).
|
|
21
|
-
|
|
21
|
+
Args:
|
|
22
|
+
local_cached_ontology(Path): Path to the local cached ontology.
|
|
22
23
|
Returns:
|
|
23
24
|
ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO.
|
|
24
25
|
"""
|
|
25
|
-
|
|
26
|
-
|
|
26
|
+
if local_cached_ontology is None:
|
|
27
|
+
resource = OntologyResource(slug="hp.obo", local=False)
|
|
28
|
+
return ProntoImplementation(resource)
|
|
29
|
+
else:
|
|
30
|
+
resource = OntologyResource(slug=local_cached_ontology, local=True)
|
|
31
|
+
return ProntoImplementation(resource)
|
|
27
32
|
|
|
28
33
|
|
|
29
34
|
class HpoRandomiser:
|
|
@@ -181,78 +186,77 @@ class HpoRandomiser:
|
|
|
181
186
|
+ self.create_random_hpo_terms(number_of_scrambled_terms)
|
|
182
187
|
)
|
|
183
188
|
|
|
189
|
+
def add_noise_to_phenotypic_profile(
|
|
190
|
+
self,
|
|
191
|
+
phenopacket: Union[Phenopacket, Family],
|
|
192
|
+
) -> Union[Phenopacket, Family]:
|
|
193
|
+
"""
|
|
194
|
+
Randomise the phenotypic profile of a Phenopacket or Family.
|
|
184
195
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
phenopacket: Union[Phenopacket, Family],
|
|
188
|
-
) -> Union[Phenopacket, Family]:
|
|
189
|
-
"""
|
|
190
|
-
Randomise the phenotypic profile of a Phenopacket or Family.
|
|
191
|
-
|
|
192
|
-
Args:
|
|
193
|
-
hpo_randomiser (HpoRandomiser): An instance of HpoRandomiser used for randomisation.
|
|
194
|
-
phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised.
|
|
195
|
-
|
|
196
|
-
Returns:
|
|
197
|
-
Union[Phenopacket, Family]: The randomised Phenopacket or Family.
|
|
198
|
-
"""
|
|
199
|
-
phenotypic_features = PhenopacketUtil(phenopacket).observed_phenotypic_features()
|
|
200
|
-
random_phenotypes = hpo_randomiser.randomise_hpo_terms(phenotypic_features)
|
|
201
|
-
randomised_phenopacket = PhenopacketRebuilder(phenopacket).add_randomised_hpo(random_phenotypes)
|
|
202
|
-
return randomised_phenopacket
|
|
203
|
-
|
|
196
|
+
Args:
|
|
197
|
+
phenopacket (Union[Phenopacket, Family]): The Phenopacket or Family to be randomised.
|
|
204
198
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
199
|
+
Returns:
|
|
200
|
+
Union[Phenopacket, Family]: The randomised Phenopacket or Family.
|
|
201
|
+
"""
|
|
202
|
+
phenotypic_features = PhenopacketUtil(phenopacket).observed_phenotypic_features()
|
|
203
|
+
random_phenotypes = self.randomise_hpo_terms(phenotypic_features)
|
|
204
|
+
randomised_phenopacket = PhenopacketRebuilder(phenopacket).add_randomised_hpo(
|
|
205
|
+
random_phenotypes
|
|
206
|
+
)
|
|
207
|
+
return randomised_phenopacket
|
|
210
208
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
phenopacket = phenopacket_reader(phenopacket_path)
|
|
219
|
-
created_noisy_phenopacket = add_noise_to_phenotypic_profile(
|
|
220
|
-
hpo_randomiser,
|
|
221
|
-
phenopacket,
|
|
222
|
-
)
|
|
223
|
-
write_phenopacket(
|
|
224
|
-
created_noisy_phenopacket,
|
|
225
|
-
output_dir.joinpath(phenopacket_path.name),
|
|
226
|
-
)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
def create_scrambled_phenopackets(
|
|
230
|
-
output_dir: Path, phenopacket_dir: Path, scramble_factor: float
|
|
231
|
-
) -> None:
|
|
232
|
-
"""
|
|
233
|
-
Create scrambled versions of Phenopackets within a directory.
|
|
209
|
+
def create_scrambled_phenopacket(
|
|
210
|
+
self,
|
|
211
|
+
output_dir: Path,
|
|
212
|
+
phenopacket_path: Path,
|
|
213
|
+
) -> None:
|
|
214
|
+
"""
|
|
215
|
+
Create a scrambled version of a Phenopacket.
|
|
234
216
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
"""
|
|
240
|
-
ontology = load_ontology()
|
|
241
|
-
hpo_randomiser = HpoRandomiser(ontology, scramble_factor)
|
|
242
|
-
phenopacket_files = files_with_suffix(phenopacket_dir, ".json")
|
|
243
|
-
for phenopacket_path in phenopacket_files:
|
|
217
|
+
Args:
|
|
218
|
+
output_dir (Path): The directory to store the output scrambled Phenopacket.
|
|
219
|
+
phenopacket_path (Path): The path to the original Phenopacket file.
|
|
220
|
+
"""
|
|
244
221
|
phenopacket = phenopacket_reader(phenopacket_path)
|
|
245
|
-
created_noisy_phenopacket = add_noise_to_phenotypic_profile(
|
|
222
|
+
created_noisy_phenopacket = self.add_noise_to_phenotypic_profile(
|
|
223
|
+
phenopacket,
|
|
224
|
+
)
|
|
246
225
|
write_phenopacket(
|
|
247
226
|
created_noisy_phenopacket,
|
|
248
|
-
output_dir.joinpath(
|
|
249
|
-
phenopacket_path.name,
|
|
250
|
-
),
|
|
227
|
+
output_dir.joinpath(phenopacket_path.name),
|
|
251
228
|
)
|
|
252
229
|
|
|
230
|
+
def create_scrambled_phenopackets(
|
|
231
|
+
self,
|
|
232
|
+
output_dir: Path,
|
|
233
|
+
phenopacket_dir: Path,
|
|
234
|
+
) -> None:
|
|
235
|
+
"""
|
|
236
|
+
Create scrambled versions of Phenopackets within a directory.
|
|
237
|
+
|
|
238
|
+
Args:
|
|
239
|
+
output_dir (Path): The directory to store the output scrambled Phenopackets.
|
|
240
|
+
phenopacket_dir (Path): The directory containing the original Phenopacket files.
|
|
241
|
+
"""
|
|
242
|
+
phenopacket_files = files_with_suffix(phenopacket_dir, ".json")
|
|
243
|
+
for phenopacket_path in phenopacket_files:
|
|
244
|
+
phenopacket = phenopacket_reader(phenopacket_path)
|
|
245
|
+
created_noisy_phenopacket = self.add_noise_to_phenotypic_profile(phenopacket)
|
|
246
|
+
write_phenopacket(
|
|
247
|
+
created_noisy_phenopacket,
|
|
248
|
+
output_dir.joinpath(
|
|
249
|
+
phenopacket_path.name,
|
|
250
|
+
),
|
|
251
|
+
)
|
|
252
|
+
|
|
253
253
|
|
|
254
254
|
def scramble_phenopackets(
|
|
255
|
-
output_dir: Path,
|
|
255
|
+
output_dir: Path,
|
|
256
|
+
phenopacket_path: Path,
|
|
257
|
+
phenopacket_dir: Path,
|
|
258
|
+
scramble_factor: float,
|
|
259
|
+
local_cached_ontology: Path,
|
|
256
260
|
) -> None:
|
|
257
261
|
"""
|
|
258
262
|
Create scrambled phenopackets from either a single phenopacket or a directory of phenopackets.
|
|
@@ -262,9 +266,16 @@ def scramble_phenopackets(
|
|
|
262
266
|
phenopacket_path (Path): The path to a single Phenopacket file (if applicable).
|
|
263
267
|
phenopacket_dir (Path): The directory containing multiple Phenopacket files (if applicable).
|
|
264
268
|
scramble_factor (float): A factor determining the level of scrambling for phenotypic features.
|
|
269
|
+
local_cached_ontology (Path): The path to the local cached ontology.
|
|
265
270
|
"""
|
|
266
271
|
output_dir.mkdir(exist_ok=True)
|
|
272
|
+
ontology = load_ontology(local_cached_ontology)
|
|
267
273
|
if phenopacket_path is not None:
|
|
268
|
-
|
|
274
|
+
HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopacket(
|
|
275
|
+
output_dir, phenopacket_path
|
|
276
|
+
)
|
|
269
277
|
elif phenopacket_dir is not None:
|
|
270
|
-
|
|
278
|
+
HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopackets(
|
|
279
|
+
output_dir,
|
|
280
|
+
phenopacket_dir,
|
|
281
|
+
)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pheval
|
|
3
|
+
Version: 0.4.1
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Yasemin Bridges
|
|
6
|
+
Author-email: y.bridges@qmul.ac.uk
|
|
7
|
+
Requires-Python: >=3.9,<4.0.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Dist: class-resolver (>=0.4.2)
|
|
15
|
+
Requires-Dist: click (>=8.1.3)
|
|
16
|
+
Requires-Dist: deprecation (>=2.1.0)
|
|
17
|
+
Requires-Dist: duckdb (>=1.0.0,<2.0.0)
|
|
18
|
+
Requires-Dist: google (>=3.0.0,<4.0.0)
|
|
19
|
+
Requires-Dist: jaydebeapi (>=1.2.3)
|
|
20
|
+
Requires-Dist: matplotlib (>=3.7.0,<4.0.0)
|
|
21
|
+
Requires-Dist: oaklib (>=0.5.6)
|
|
22
|
+
Requires-Dist: pandas (>=1.5.1)
|
|
23
|
+
Requires-Dist: phenopackets (>=2.0.2,<3.0.0)
|
|
24
|
+
Requires-Dist: plotly (>=5.13.0,<6.0.0)
|
|
25
|
+
Requires-Dist: polars (>=0.19.15,<0.20.0)
|
|
26
|
+
Requires-Dist: pyaml (>=21.10.1,<22.0.0)
|
|
27
|
+
Requires-Dist: pyserde (>=0.9.8,<0.10.0)
|
|
28
|
+
Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
|
|
29
|
+
Requires-Dist: seaborn (>=0.12.2,<0.13.0)
|
|
30
|
+
Requires-Dist: tqdm (>=4.64.1)
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# PhEval - Phenotypic Inference Evaluation Framework
|
|
34
|
+
|
|
35
|
+

|
|
36
|
+

|
|
37
|
+

|
|
38
|
+

|
|
39
|
+

|
|
40
|
+
|
|
41
|
+
## Overview
|
|
42
|
+
|
|
43
|
+
The absence of standardised benchmarks and data standardisation for Variant and Gene Prioritisation Algorithms (VGPAs) presents a significant challenge in the field of genomic research. To address this, we developed PhEval, a novel framework designed to streamline the evaluation of VGPAs that incorporate phenotypic data. PhEval offers several key benefits:
|
|
44
|
+
|
|
45
|
+
- Automated Processes: Reduces manual effort by automating various evaluation tasks, thus enhancing efficiency.
|
|
46
|
+
- Standardisation: Ensures consistency and comparability in evaluation methodologies, leading to more reliable and standardised assessments.
|
|
47
|
+
- Reproducibility: Facilitates reproducibility in research by providing a standardised platform, allowing for consistent validation of algorithms.
|
|
48
|
+
- Comprehensive Benchmarking: Enables thorough benchmarking of algorithms, providing well-founded comparisons and deeper insights into their performance.
|
|
49
|
+
|
|
50
|
+
PhEval is a valuable tool for researchers looking to improve the accuracy and reliability of VGPA evaluations through a structured and standardised approach.
|
|
51
|
+
|
|
52
|
+
For more information please see the full [documentation](https://monarch-initiative.github.io/pheval/).
|
|
53
|
+
|
|
54
|
+
## Download and Installation
|
|
55
|
+
|
|
56
|
+
1. Ensure you have Python 3.9 or greater installed.
|
|
57
|
+
2. Install with `pip`:
|
|
58
|
+
```bash
|
|
59
|
+
pip install pheval
|
|
60
|
+
```
|
|
61
|
+
3. See list of all PhEval utility commands:
|
|
62
|
+
```bash
|
|
63
|
+
pheval-utils --help
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
The PhEval CLI offers a variety of commands categorised into two main types: **Runner Implementations** and **Utility Commands**. Below is an overview of each category, detailing how they can be utilised to perform various tasks within PhEval.
|
|
69
|
+
|
|
70
|
+
### Runner Implementations
|
|
71
|
+
|
|
72
|
+
The primary command used within PhEval is `pheval run`. This command is responsible for executing concrete VGPA runner implementations, which we sometimes refer to as plugins. By using `pheval run`, users can leverage these runner implementations to execute the VGPA on a set of test corpora, produce tool-specific result outputs, and post-process tool-specific outputs into PhEval standardised TSV outputs.
|
|
73
|
+
|
|
74
|
+
Some concrete PhEval runner implementations include the [Exomiser runner](https://github.com/monarch-initiative/pheval.exomiser) and the [Phen2Gene runner](https://github.com/monarch-initiative/pheval.phen2gene). The full list of currently implemented runners can be found [here](https://monarch-initiative.github.io/pheval/plugins/).
|
|
75
|
+
|
|
76
|
+
Please read the [documentation](https://monarch-initiative.github.io/pheval/developing_a_pheval_plugin/) for a step-by-step guide to creating your own PhEval plugin.
|
|
77
|
+
|
|
78
|
+
### Utility Commands
|
|
79
|
+
|
|
80
|
+
In addition to the main `run` command, PhEval provides a set of utility commands designed to enhance the overall functionality of the CLI. These commands can be used to set up and configure experiments, streamline data preparation, and benchmark the performance of various VGPA runner implementations. By utilising these utilities, users can optimise their experimental workflows, ensure reproducibility, and compare the efficiency and accuracy of different approaches. The utility commands offer a range of options that facilitate the customisation and fine-tuning to suit diverse research objectives.
|
|
81
|
+
|
|
82
|
+
#### Example Usage
|
|
83
|
+
|
|
84
|
+
To add noise to an existing corpus of phenopackets (useful for assessing the robustness of VGPAs when less relevant or unreliable phenotype data is introduced):
|
|
85
|
+
```bash
|
|
86
|
+
pheval-utils scramble-phenopackets --phenopacket-dir /phenopackets --scramble-factor 0.5 --output-dir /scrambled_phenopackets_0.5
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
To update the gene symbols and identifiers to a specific namespace:
|
|
90
|
+
```bash
|
|
91
|
+
pheval-utils update-phenopackets --phenopacket-dir /phenopackets --output-dir /updated_phenopackets --gene-identifier ensembl_id
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
To prepare VCF files for a corpus of phenopackets, spiking in the known causative variants:
|
|
95
|
+
```bash
|
|
96
|
+
pheval-utils create-spiked-vcfs --phenopacket-dir /phenopackets --hg19-template-vcf /template_hg19.vcf --hg38-template-vcf /template_hg38.vcf --output-dir /vcf
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Alternatively, you can wrap all corpus preparatory commands into a single step. Specifying `--variant-analysis`/`--gene-analysis`/`--disease-analysis` will check the phenopackets for complete records documenting the known entities. If template VCF(s) are provided, this will spike VCFs with the known variants for the corpus. If a `--gene-identifier` is specified, then the corpus of phenopackets is updated.
|
|
100
|
+
```bash
|
|
101
|
+
pheval-utils prepare-corpus \
|
|
102
|
+
--phenopacket-dir /phenopackets \
|
|
103
|
+
--variant-analysis \
|
|
104
|
+
--gene-analysis \
|
|
105
|
+
--gene-identifier ensembl_id \
|
|
106
|
+
--hg19-template-vcf /template_hg19.vcf \
|
|
107
|
+
--hg38-template-vcf /template_hg38.vcf \
|
|
108
|
+
--output-dir /vcf
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
See the [documentation](https://monarch-initiative.github.io/pheval/executing_a_benchmark/) for instructions on benchmarking and evaluating the performance of various VGPAs.
|
|
112
|
+
|
|
113
|
+
|