pheval 0.3.9__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval-0.4.0/PKG-INFO +112 -0
- pheval-0.4.0/README.md +80 -0
- {pheval-0.3.9 → pheval-0.4.0}/pyproject.toml +8 -5
- pheval-0.4.0/src/pheval/analyse/analysis.py +104 -0
- pheval-0.4.0/src/pheval/analyse/assess_prioritisation_base.py +108 -0
- pheval-0.4.0/src/pheval/analyse/benchmark_db_manager.py +140 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/benchmark_generator.py +47 -50
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/benchmarking_data.py +3 -2
- pheval-0.4.0/src/pheval/analyse/disease_prioritisation_analysis.py +141 -0
- pheval-0.4.0/src/pheval/analyse/gene_prioritisation_analysis.py +136 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/generate_plots.py +81 -79
- pheval-0.4.0/src/pheval/analyse/generate_summary_outputs.py +105 -0
- pheval-0.4.0/src/pheval/analyse/parse_benchmark_summary.py +81 -0
- pheval-0.4.0/src/pheval/analyse/parse_corpus.py +219 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/rank_stats.py +177 -144
- pheval-0.4.0/src/pheval/analyse/run_data_parser.py +125 -0
- pheval-0.4.0/src/pheval/analyse/variant_prioritisation_analysis.py +150 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/cli.py +2 -4
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/cli_pheval_utils.py +34 -245
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/prepare/create_noisy_phenopackets.py +78 -67
- pheval-0.3.9/PKG-INFO +0 -35
- pheval-0.3.9/README.md +0 -4
- pheval-0.3.9/src/pheval/analyse/analysis.py +0 -193
- pheval-0.3.9/src/pheval/analyse/disease_prioritisation_analysis.py +0 -290
- pheval-0.3.9/src/pheval/analyse/gene_prioritisation_analysis.py +0 -312
- pheval-0.3.9/src/pheval/analyse/generate_summary_outputs.py +0 -175
- pheval-0.3.9/src/pheval/analyse/parse_benchmark_summary.py +0 -68
- pheval-0.3.9/src/pheval/analyse/parse_pheval_result.py +0 -43
- pheval-0.3.9/src/pheval/analyse/prioritisation_rank_recorder.py +0 -83
- pheval-0.3.9/src/pheval/analyse/run_data_parser.py +0 -44
- pheval-0.3.9/src/pheval/analyse/variant_prioritisation_analysis.py +0 -284
- pheval-0.3.9/src/pheval/constants.py +0 -8
- {pheval-0.3.9 → pheval-0.4.0}/LICENSE +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/binary_classification_stats.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/analyse/prioritisation_result_types.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/cli_pheval.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/config_parser.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/implementations/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/infra/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/infra/exomiserdb.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/post_processing/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/post_processing/post_processing.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/prepare/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/prepare/create_spiked_vcf.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/prepare/custom_exceptions.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/prepare/prepare_corpus.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/prepare/update_phenopacket.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/resources/hgnc_complete_set.txt +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/run_metadata.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/runners/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/runners/runner.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/__init__.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/docs_gen.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/docs_gen.sh +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/exomiser.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/file_utils.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/phenopacket_utils.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/semsim_utils.py +0 -0
- {pheval-0.3.9 → pheval-0.4.0}/src/pheval/utils/utils.py +0 -0
pheval-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pheval
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Yasemin Bridges
|
|
6
|
+
Author-email: y.bridges@qmul.ac.uk
|
|
7
|
+
Requires-Python: >=3.9,<4.0.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Dist: class-resolver (>=0.4.2)
|
|
14
|
+
Requires-Dist: click (>=8.1.3)
|
|
15
|
+
Requires-Dist: deprecation (>=2.1.0)
|
|
16
|
+
Requires-Dist: duckdb (>=1.0.0,<2.0.0)
|
|
17
|
+
Requires-Dist: google (>=3.0.0,<4.0.0)
|
|
18
|
+
Requires-Dist: jaydebeapi (>=1.2.3)
|
|
19
|
+
Requires-Dist: matplotlib (>=3.7.0,<4.0.0)
|
|
20
|
+
Requires-Dist: oaklib (>=0.5.6)
|
|
21
|
+
Requires-Dist: pandas (>=1.5.1)
|
|
22
|
+
Requires-Dist: phenopackets (>=2.0.2,<3.0.0)
|
|
23
|
+
Requires-Dist: plotly (>=5.13.0,<6.0.0)
|
|
24
|
+
Requires-Dist: polars (>=0.19.15,<0.20.0)
|
|
25
|
+
Requires-Dist: pyaml (>=21.10.1,<22.0.0)
|
|
26
|
+
Requires-Dist: pyserde (>=0.9.8,<0.10.0)
|
|
27
|
+
Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
|
|
28
|
+
Requires-Dist: seaborn (>=0.12.2,<0.13.0)
|
|
29
|
+
Requires-Dist: tqdm (>=4.64.1)
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# PhEval - Phenotypic Inference Evaluation Framework
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+

|
|
36
|
+

|
|
37
|
+

|
|
38
|
+

|
|
39
|
+
|
|
40
|
+
## Overview
|
|
41
|
+
|
|
42
|
+
The absence of standardised benchmarks and data standardisation for Variant and Gene Prioritisation Algorithms (VGPAs) presents a significant challenge in the field of genomic research. To address this, we developed PhEval, a novel framework designed to streamline the evaluation of VGPAs that incorporate phenotypic data. PhEval offers several key benefits:
|
|
43
|
+
|
|
44
|
+
- Automated Processes: Reduces manual effort by automating various evaluation tasks, thus enhancing efficiency.
|
|
45
|
+
- Standardisation: Ensures consistency and comparability in evaluation methodologies, leading to more reliable and standardised assessments.
|
|
46
|
+
- Reproducibility: Facilitates reproducibility in research by providing a standardised platform, allowing for consistent validation of algorithms.
|
|
47
|
+
- Comprehensive Benchmarking: Enables thorough benchmarking of algorithms, providing well-founded comparisons and deeper insights into their performance.
|
|
48
|
+
|
|
49
|
+
PhEval is a valuable tool for researchers looking to improve the accuracy and reliability of VGPA evaluations through a structured and standardised approach.
|
|
50
|
+
|
|
51
|
+
For more information please see the full [documentation](https://monarch-initiative.github.io/pheval/).
|
|
52
|
+
|
|
53
|
+
## Download and Installation
|
|
54
|
+
|
|
55
|
+
1. Ensure you have Python 3.9 or greater installed.
|
|
56
|
+
2. Install with `pip`:
|
|
57
|
+
```bash
|
|
58
|
+
pip install pheval
|
|
59
|
+
```
|
|
60
|
+
3. See list of all PhEval utility commands:
|
|
61
|
+
```bash
|
|
62
|
+
pheval-utils --help
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
The PhEval CLI offers a variety of commands categorised into two main types: **Runner Implementations** and **Utility Commands**. Below is an overview of each category, detailing how they can be utilised to perform various tasks within PhEval.
|
|
68
|
+
|
|
69
|
+
### Runner Implementations
|
|
70
|
+
|
|
71
|
+
The primary command used within PhEval is `pheval run`. This command is responsible for executing concrete VGPA runner implementations, that we sometimes term as plugins. By using pheval run, users can leverage these runner implementations to: execute the VGPA on a set of test corpora, produce tool-specific result outputs, and post-process tool-specific outputs to PhEval standardised TSV outputs.
|
|
72
|
+
|
|
73
|
+
Some concrete PhEval runner implementations include the [Exomiser runner](https://github.com/monarch-initiative/pheval.exomiser) and the [Phen2Gene runner](https://github.com/monarch-initiative/pheval.phen2gene). The full list of currently implemented runners can be found [here](https://monarch-initiative.github.io/pheval/plugins/)
|
|
74
|
+
|
|
75
|
+
Please read the [documentation](https://monarch-initiative.github.io/pheval/developing_a_pheval_plugin/) for a step-by-step guide to creating your own PhEval plugin.
|
|
76
|
+
|
|
77
|
+
### Utility Commands
|
|
78
|
+
|
|
79
|
+
In addition to the main `run` command, PhEval provides a set of utility commands designed to enhance the overall functionality of the CLI. These commands can be used to set up and configure experiments, streamline data preparation, and benchmark the performance of various VGPA runner implementations. By utilising these utilities, users can optimise their experimental workflows, ensure reproducibility, and compare the efficiency and accuracy of different approaches. The utility commands offer a range of options that facilitate the customisation and fine-tuning to suit diverse research objectives.
|
|
80
|
+
|
|
81
|
+
#### Example Usage
|
|
82
|
+
|
|
83
|
+
To add noise to an existing corpus of phenopackets, this could be used to assess the robustness of VGPAs when less relevant or unreliable phenotype data is introduced:
|
|
84
|
+
```bash
|
|
85
|
+
pheval-utils scramble-phenopackets --phenopacket-dir /phenopackets --scramble-factor 0.5 --output-dir /scrambled_phenopackets_0.5
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
To update the gene symbols and identifiers to a specific namespace:
|
|
89
|
+
```bash
|
|
90
|
+
pheval-utils update-phenopackets --phenopacket-dir /phenopackets --output-dir /updated_phenopackets --gene-identifier ensembl_id
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
To prepare VCF files for a corpus of phenopackets, spiking in the known causative variants:
|
|
94
|
+
```bash
|
|
95
|
+
pheval-utils create-spiked-vcfs --phenopacket-dir /phenopackets --hg19-template-vcf /template_hg19.vcf --hg38-template-vcf /template_hg38.vcf --output-dir /vcf
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Alternatively, you can wrap all corpus preparatory commands into a single step. Specifying `--variant-analysis`/`--gene-analysis`/`--disease-analysis` will check the phenopackets for complete records documenting the known entities. If template vcf(s) are provided this will spike VCFs with the known variant for the corpus. If a `--gene-identifier` is specified then the corpus of phenopackets is updated.
|
|
99
|
+
```bash
|
|
100
|
+
pheval-utils prepare-corpus \
|
|
101
|
+
--phenopacket-dir /phenopackets \
|
|
102
|
+
--variant-analysis \
|
|
103
|
+
--gene-analysis \
|
|
104
|
+
--gene-identifier ensembl_id \
|
|
105
|
+
--hg19-template-vcf /template_hg19.vcf \
|
|
106
|
+
--hg38-template-vcf /template_hg38.vcf \
|
|
107
|
+
--output-dir /vcf
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
See the [documentation](https://monarch-initiative.github.io/pheval/executing_a_benchmark/) for instructions on benchmarking and evaluating the performance of various VGPAs.
|
|
111
|
+
|
|
112
|
+
|
pheval-0.4.0/README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# PhEval - Phenotypic Inference Evaluation Framework
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
The absence of standardised benchmarks and data standardisation for Variant and Gene Prioritisation Algorithms (VGPAs) presents a significant challenge in the field of genomic research. To address this, we developed PhEval, a novel framework designed to streamline the evaluation of VGPAs that incorporate phenotypic data. PhEval offers several key benefits:
|
|
12
|
+
|
|
13
|
+
- Automated Processes: Reduces manual effort by automating various evaluation tasks, thus enhancing efficiency.
|
|
14
|
+
- Standardisation: Ensures consistency and comparability in evaluation methodologies, leading to more reliable and standardised assessments.
|
|
15
|
+
- Reproducibility: Facilitates reproducibility in research by providing a standardised platform, allowing for consistent validation of algorithms.
|
|
16
|
+
- Comprehensive Benchmarking: Enables thorough benchmarking of algorithms, providing well-founded comparisons and deeper insights into their performance.
|
|
17
|
+
|
|
18
|
+
PhEval is a valuable tool for researchers looking to improve the accuracy and reliability of VGPA evaluations through a structured and standardised approach.
|
|
19
|
+
|
|
20
|
+
For more information please see the full [documentation](https://monarch-initiative.github.io/pheval/).
|
|
21
|
+
|
|
22
|
+
## Download and Installation
|
|
23
|
+
|
|
24
|
+
1. Ensure you have Python 3.9 or greater installed.
|
|
25
|
+
2. Install with `pip`:
|
|
26
|
+
```bash
|
|
27
|
+
pip install pheval
|
|
28
|
+
```
|
|
29
|
+
3. See list of all PhEval utility commands:
|
|
30
|
+
```bash
|
|
31
|
+
pheval-utils --help
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
The PhEval CLI offers a variety of commands categorised into two main types: **Runner Implementations** and **Utility Commands**. Below is an overview of each category, detailing how they can be utilised to perform various tasks within PhEval.
|
|
37
|
+
|
|
38
|
+
### Runner Implementations
|
|
39
|
+
|
|
40
|
+
The primary command used within PhEval is `pheval run`. This command is responsible for executing concrete VGPA runner implementations, that we sometimes term as plugins. By using pheval run, users can leverage these runner implementations to: execute the VGPA on a set of test corpora, produce tool-specific result outputs, and post-process tool-specific outputs to PhEval standardised TSV outputs.
|
|
41
|
+
|
|
42
|
+
Some concrete PhEval runner implementations include the [Exomiser runner](https://github.com/monarch-initiative/pheval.exomiser) and the [Phen2Gene runner](https://github.com/monarch-initiative/pheval.phen2gene). The full list of currently implemented runners can be found [here](https://monarch-initiative.github.io/pheval/plugins/)
|
|
43
|
+
|
|
44
|
+
Please read the [documentation](https://monarch-initiative.github.io/pheval/developing_a_pheval_plugin/) for a step-by-step guide to creating your own PhEval plugin.
|
|
45
|
+
|
|
46
|
+
### Utility Commands
|
|
47
|
+
|
|
48
|
+
In addition to the main `run` command, PhEval provides a set of utility commands designed to enhance the overall functionality of the CLI. These commands can be used to set up and configure experiments, streamline data preparation, and benchmark the performance of various VGPA runner implementations. By utilising these utilities, users can optimise their experimental workflows, ensure reproducibility, and compare the efficiency and accuracy of different approaches. The utility commands offer a range of options that facilitate the customisation and fine-tuning to suit diverse research objectives.
|
|
49
|
+
|
|
50
|
+
#### Example Usage
|
|
51
|
+
|
|
52
|
+
To add noise to an existing corpus of phenopackets, this could be used to assess the robustness of VGPAs when less relevant or unreliable phenotype data is introduced:
|
|
53
|
+
```bash
|
|
54
|
+
pheval-utils scramble-phenopackets --phenopacket-dir /phenopackets --scramble-factor 0.5 --output-dir /scrambled_phenopackets_0.5
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
To update the gene symbols and identifiers to a specific namespace:
|
|
58
|
+
```bash
|
|
59
|
+
pheval-utils update-phenopackets --phenopacket-dir /phenopackets --output-dir /updated_phenopackets --gene-identifier ensembl_id
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
To prepare VCF files for a corpus of phenopackets, spiking in the known causative variants:
|
|
63
|
+
```bash
|
|
64
|
+
pheval-utils create-spiked-vcfs --phenopacket-dir /phenopackets --hg19-template-vcf /template_hg19.vcf --hg38-template-vcf /template_hg38.vcf --output-dir /vcf
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Alternatively, you can wrap all corpus preparatory commands into a single step. Specifying `--variant-analysis`/`--gene-analysis`/`--disease-analysis` will check the phenopackets for complete records documenting the known entities. If template vcf(s) are provided this will spike VCFs with the known variant for the corpus. If a `--gene-identifier` is specified then the corpus of phenopackets is updated.
|
|
68
|
+
```bash
|
|
69
|
+
pheval-utils prepare-corpus \
|
|
70
|
+
--phenopacket-dir /phenopackets \
|
|
71
|
+
--variant-analysis \
|
|
72
|
+
--gene-analysis \
|
|
73
|
+
--gene-identifier ensembl_id \
|
|
74
|
+
--hg19-template-vcf /template_hg19.vcf \
|
|
75
|
+
--hg38-template-vcf /template_hg38.vcf \
|
|
76
|
+
--output-dir /vcf
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
See the [documentation](https://monarch-initiative.github.io/pheval/executing_a_benchmark/) for instructions on benchmarking and evaluating the performance of various VGPAs.
|
|
80
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "pheval"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = ""
|
|
5
5
|
authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
|
|
6
6
|
"Julius Jacobsen <j.jacobsen@qmul.ac.uk>",
|
|
@@ -27,10 +27,13 @@ matplotlib = "^3.7.0"
|
|
|
27
27
|
pyserde = "^0.9.8"
|
|
28
28
|
polars = "^0.19.15"
|
|
29
29
|
scikit-learn = "^1.4.0"
|
|
30
|
+
duckdb = "^1.0.0"
|
|
30
31
|
|
|
31
32
|
[tool.poetry.dev-dependencies]
|
|
32
33
|
pytest = "^7.2.0"
|
|
33
34
|
coverage = "^6.5.0"
|
|
35
|
+
pheval-template = "^0.1.2"
|
|
36
|
+
pytest-workflow = "^2.0.1"
|
|
34
37
|
|
|
35
38
|
[tool.poetry.scripts]
|
|
36
39
|
pheval = "pheval.cli:pheval"
|
|
@@ -38,15 +41,15 @@ pheval-utils = "pheval.cli:pheval_utils"
|
|
|
38
41
|
|
|
39
42
|
[tool.poetry.group.docs.dependencies]
|
|
40
43
|
mkdocs = "^1.4.2"
|
|
41
|
-
pymdown-extensions = "^9.9"
|
|
42
44
|
mkdocs-material = "^8.5.11"
|
|
43
|
-
mkdocstrings = "^0.
|
|
44
|
-
mkdocstrings-python = "^
|
|
45
|
+
mkdocstrings = "^0.24.0"
|
|
46
|
+
mkdocstrings-python = "^1.8.0"
|
|
47
|
+
pymdown-extensions = "^9.9"
|
|
45
48
|
mkdocs-include-dir-to-nav = "^1.2.0"
|
|
46
49
|
mkdocs-click = "^0.8.0"
|
|
50
|
+
griffe = "0.38.1"
|
|
47
51
|
[tool.poetry.group.dev.dependencies]
|
|
48
52
|
black = "^22.12.0"
|
|
49
|
-
pytest-workflow = "^2.0.1"
|
|
50
53
|
|
|
51
54
|
[tool.pytest.ini_options]
|
|
52
55
|
pythonpath = [
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from pheval.analyse.benchmark_generator import (
|
|
2
|
+
BenchmarkRunOutputGenerator,
|
|
3
|
+
DiseaseBenchmarkRunOutputGenerator,
|
|
4
|
+
GeneBenchmarkRunOutputGenerator,
|
|
5
|
+
VariantBenchmarkRunOutputGenerator,
|
|
6
|
+
)
|
|
7
|
+
from pheval.analyse.generate_summary_outputs import generate_benchmark_comparison_output
|
|
8
|
+
from pheval.analyse.parse_corpus import CorpusParser
|
|
9
|
+
from pheval.analyse.rank_stats import RankStatsWriter
|
|
10
|
+
from pheval.analyse.run_data_parser import Config
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _run_benchmark_comparison(
    run_config: Config,
    benchmark_generator: BenchmarkRunOutputGenerator,
) -> None:
    """
    Run a benchmark on several result directories.

    Args:
        run_config (Config): Run configuration containing the benchmark name,
            the runs to benchmark, and plot customisation options.
        benchmark_generator (BenchmarkRunOutputGenerator): Generator for benchmark run output.
    """
    stats_writer = RankStatsWriter(
        run_config.benchmark_name, benchmark_generator.stats_comparison_file
    )
    # Each corpus only needs parsing once, even when several runs share it.
    unique_test_corpora_directories = {run.phenopacket_dir for run in run_config.runs}
    for test_corpora_directory in unique_test_corpora_directories:
        CorpusParser(run_config.benchmark_name, test_corpora_directory).parse_corpus(
            benchmark_generator
        )
    benchmarking_results = []
    for run in run_config.runs:
        benchmark_result = benchmark_generator.generate_benchmark_run_results(
            run_config.benchmark_name, run, run.score_order, run.threshold
        )
        stats_writer.add_statistics_entry(
            run.run_identifier,
            benchmark_result.rank_stats,
            benchmark_result.binary_classification_stats,
        )
        benchmarking_results.append(benchmark_result)
    run_identifiers = [run.run_identifier for run in run_config.runs]
    # Write one comparison output per corpus, named after the corpus directory.
    for unique_test_corpora_directory in unique_test_corpora_directories:
        generate_benchmark_comparison_output(
            run_config.benchmark_name,
            benchmarking_results,
            run_identifiers,
            benchmark_generator,
            f"{unique_test_corpora_directory.parents[0].name}_"
            f"{benchmark_generator.prioritisation_type_string}",
        )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def benchmark_run_comparisons(
    run_config: Config,
) -> None:
    """
    Benchmark prioritisation performance for several runs.

    Args:
        run_config (Config): Run configurations.
    """
    # Each analysis type pairs a run-selection predicate with a factory for the
    # matching benchmark output generator; order (gene, variant, disease) preserved.
    analysis_types = [
        (
            lambda run: run.gene_analysis,
            lambda plots: GeneBenchmarkRunOutputGenerator(plot_customisation=plots.gene_plots),
        ),
        (
            lambda run: run.variant_analysis,
            lambda plots: VariantBenchmarkRunOutputGenerator(
                plot_customisation=plots.variant_plots
            ),
        ),
        (
            lambda run: run.disease_analysis,
            lambda plots: DiseaseBenchmarkRunOutputGenerator(
                plot_customisation=plots.disease_plots
            ),
        ),
    ]
    for is_selected, build_generator in analysis_types:
        filtered_config = Config(
            benchmark_name=run_config.benchmark_name,
            runs=[run for run in run_config.runs if is_selected(run)],
            plot_customisation=run_config.plot_customisation,
        )
        # Only benchmark analysis types that have at least one configured run.
        if filtered_config.runs:
            _run_benchmark_comparison(
                run_config=filtered_config,
                benchmark_generator=build_generator(filtered_config.plot_customisation),
            )
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
|
|
3
|
+
from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
|
|
4
|
+
from pheval.post_processing.post_processing import (
|
|
5
|
+
RankedPhEvalDiseaseResult,
|
|
6
|
+
RankedPhEvalGeneResult,
|
|
7
|
+
RankedPhEvalVariantResult,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AssessPrioritisationBase:
    """Base class for recording prioritisation ranks subject to an optional score threshold."""

    def __init__(
        self,
        db_connection: BenchmarkDBManager,
        table_name: str,
        column: str,
        threshold: float,
        score_order: str,
    ):
        """
        Initialise AssessPrioritisationBase class

        Args:
            db_connection (BenchmarkDBManager): DB connection.
            table_name (str): Table name.
            column (str): Column name.
            threshold (float): Threshold for scores
            score_order (str): Score order for results, either ascending or descending
        """
        self.threshold = threshold
        self.score_order = score_order
        self.db_connection = db_connection
        self.conn = db_connection.conn
        self.column = column
        self.table_name = table_name
        # Ensure the rank column exists (default 0) before any ranks are written.
        db_connection.add_column_integer_default(
            table_name=table_name, column=self.column, default=0
        )

    def _assess_with_threshold_ascending_order(
        self,
        result_entry: Union[
            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
        ],
    ) -> int:
        """
        Record the prioritisation rank if it meets the ascending order threshold.

        Args:
            result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
                Ranked PhEval result entry

        Returns:
            int: Recorded prioritisation rank
        """
        # Ascending ordering: the score must fall below the threshold to count.
        meets_cut_off = float(result_entry.score) < float(self.threshold)
        return result_entry.rank if meets_cut_off else 0

    def _assess_with_threshold(
        self,
        result_entry: Union[
            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
        ],
    ) -> int:
        """
        Record the prioritisation rank if it meets the score threshold.

        Args:
            result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
                Ranked PhEval result entry

        Returns:
            int: Recorded prioritisation rank
        """
        # Descending ordering: the score must exceed the threshold to count.
        meets_cut_off = float(result_entry.score) > float(self.threshold)
        return result_entry.rank if meets_cut_off else 0

    def _record_matched_entity(
        self,
        standardised_result: Union[
            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
        ],
    ) -> int:
        """
        Return the rank result - handling the specification of a threshold.

        Args:
            standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
                Ranked PhEval disease result entry

        Returns:
            int: Recorded entity prioritisation rank
        """
        # A threshold of zero means "no threshold": record the rank unconditionally.
        if float(self.threshold) == 0.0:
            return standardised_result.rank
        if self.score_order == "ascending":
            return self._assess_with_threshold_ascending_order(standardised_result)
        return self._assess_with_threshold(standardised_result)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import re
|
|
3
|
+
from typing import List, Type, Union
|
|
4
|
+
|
|
5
|
+
import duckdb
|
|
6
|
+
from duckdb import DuckDBPyConnection
|
|
7
|
+
|
|
8
|
+
from pheval.post_processing.post_processing import (
|
|
9
|
+
RankedPhEvalDiseaseResult,
|
|
10
|
+
RankedPhEvalGeneResult,
|
|
11
|
+
RankedPhEvalVariantResult,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BenchmarkDBManager:
    """
    Class to connect to database.
    """

    def __init__(self, benchmark_name: str):
        """
        Initialise the BenchmarkDBManager class.

        Args:
            benchmark_name (str): Name (or path) of the benchmark database;
                a ``.db`` suffix is appended if not already present.
        """
        self.conn = self.get_connection(
            f"{benchmark_name}" if str(benchmark_name).endswith(".db") else f"{benchmark_name}.db"
        )

    def initialise(self):
        """Initialise the duckdb connection."""
        self.add_contains_function()

    @staticmethod
    def get_connection(db_name: str) -> DuckDBPyConnection:
        """
        Get a connection to the database.

        Args:
            db_name (str): Path of the database file to connect to.

        Returns:
            DuckDBPyConnection: Connection to the database.
        """
        conn = duckdb.connect(db_name)
        return conn

    def add_column_integer_default(self, table_name: str, column: str, default: int = 0) -> None:
        """
        Add a column to an existing table with an integer default value.

        Args:
            table_name (str): Name of the table.
            column (str): Name of the column to add.
            default (int): Default integer value to add.
        """
        try:
            self.conn.execute(
                f'ALTER TABLE {table_name} ADD COLUMN "{column}" INTEGER DEFAULT {default}'
            )
            self.conn.execute(f'UPDATE {table_name} SET "{column}" = ?', (default,))
            self.conn.commit()
        except duckdb.CatalogException:
            # NOTE(review): presumably raised when the column already exists —
            # in that case keep the existing values rather than overwriting them.
            pass

    def drop_table(self, table_name: str) -> None:
        """
        Drop a table from the database.

        Args:
            table_name: Name of the table to drop from the database
        """
        self.conn.execute(f"""DROP TABLE IF EXISTS "{table_name}";""")

    @staticmethod
    def contains_entity_function(entity: str, known_causative_entity: str) -> bool:
        """
        Determines if a known causative entity is present within an entity or list of entities.

        Args:
            entity (str): The entity to be checked. It can be a single entity or a string representation of a list.
            known_causative_entity (str): The entity to search for within the `entity`.

        Returns:
            bool: `True` if `known_causative_entity` is found in `entity` (or its list representation),
            `False` otherwise.
        """
        # Matches a bracketed comma-separated list, e.g. "['A', 'B']".
        list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*]$")
        if list_pattern.match(str(entity)):
            list_representation = ast.literal_eval(entity)
            if isinstance(list_representation, list):
                return known_causative_entity in list_representation
        # Fall back to a direct comparison for single entities.
        return known_causative_entity == entity

    def add_contains_function(self) -> None:
        """
        Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist.
        """
        result = self.conn.execute(
            "SELECT * FROM duckdb_functions() WHERE function_name = ?", ["contains_entity_function"]
        ).fetchall()
        if not result:
            self.conn.create_function("contains_entity_function", self.contains_entity_function)

    def parse_table_into_dataclass(
        self,
        table_name: str,
        dataclass: Union[
            Type[RankedPhEvalGeneResult],
            Type[RankedPhEvalVariantResult],
            Type[RankedPhEvalDiseaseResult],
        ],
    ) -> Union[
        List[RankedPhEvalGeneResult],
        List[RankedPhEvalVariantResult],
        List[RankedPhEvalDiseaseResult],
    ]:
        """
        Parses a DuckDB table into a list of dataclass instances.

        Args:
            table_name (str): The name of the DuckDB table to be parsed.
            dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult],
                Type[RankedPhEvalDiseaseResult]]):
                The dataclass type to which each row in the table should be mapped.

        Returns:
            List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table.
        """
        result = (
            self.conn.execute(f"SELECT * FROM '{table_name}'").fetchdf().to_dict(orient="records")
        )
        return [dataclass(**row) for row in result]

    def check_table_exists(self, table_name: str) -> bool:
        """
        Check if a table exists in the connected DuckDB database.

        Args:
            table_name (str): The name of the table to check for existence.

        Returns:
            bool: Returns `True` if the table exists in the database, `False` otherwise.
        """
        # Parameterised query so table names containing quotes cannot break the SQL.
        result = self.conn.execute(
            "SELECT * FROM information_schema.tables WHERE table_name = ?", [table_name]
        ).fetchall()
        return bool(result)

    def close(self):
        """Close the connection to the database."""
        self.conn.close()
|