pheval 0.3.9__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (68) hide show
  1. pheval-0.4.1/PKG-INFO +113 -0
  2. pheval-0.4.1/README.md +80 -0
  3. {pheval-0.3.9 → pheval-0.4.1}/pyproject.toml +8 -5
  4. pheval-0.4.1/src/pheval/analyse/analysis.py +104 -0
  5. pheval-0.4.1/src/pheval/analyse/assess_prioritisation_base.py +108 -0
  6. pheval-0.4.1/src/pheval/analyse/benchmark_db_manager.py +140 -0
  7. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/benchmark_generator.py +47 -50
  8. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/benchmarking_data.py +3 -2
  9. pheval-0.4.1/src/pheval/analyse/disease_prioritisation_analysis.py +141 -0
  10. pheval-0.4.1/src/pheval/analyse/gene_prioritisation_analysis.py +136 -0
  11. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/generate_plots.py +81 -79
  12. pheval-0.4.1/src/pheval/analyse/generate_summary_outputs.py +105 -0
  13. pheval-0.4.1/src/pheval/analyse/parse_benchmark_summary.py +81 -0
  14. pheval-0.4.1/src/pheval/analyse/parse_corpus.py +219 -0
  15. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/rank_stats.py +177 -144
  16. pheval-0.4.1/src/pheval/analyse/run_data_parser.py +125 -0
  17. pheval-0.4.1/src/pheval/analyse/variant_prioritisation_analysis.py +150 -0
  18. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/cli.py +2 -4
  19. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/cli_pheval_utils.py +34 -245
  20. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/prepare/create_noisy_phenopackets.py +78 -67
  21. pheval-0.3.9/PKG-INFO +0 -35
  22. pheval-0.3.9/README.md +0 -4
  23. pheval-0.3.9/src/pheval/analyse/analysis.py +0 -193
  24. pheval-0.3.9/src/pheval/analyse/disease_prioritisation_analysis.py +0 -290
  25. pheval-0.3.9/src/pheval/analyse/gene_prioritisation_analysis.py +0 -312
  26. pheval-0.3.9/src/pheval/analyse/generate_summary_outputs.py +0 -175
  27. pheval-0.3.9/src/pheval/analyse/parse_benchmark_summary.py +0 -68
  28. pheval-0.3.9/src/pheval/analyse/parse_pheval_result.py +0 -43
  29. pheval-0.3.9/src/pheval/analyse/prioritisation_rank_recorder.py +0 -83
  30. pheval-0.3.9/src/pheval/analyse/run_data_parser.py +0 -44
  31. pheval-0.3.9/src/pheval/analyse/variant_prioritisation_analysis.py +0 -284
  32. pheval-0.3.9/src/pheval/constants.py +0 -8
  33. {pheval-0.3.9 → pheval-0.4.1}/LICENSE +0 -0
  34. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/__init__.py +0 -0
  35. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/__init__.py +0 -0
  36. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/binary_classification_stats.py +0 -0
  37. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/analyse/prioritisation_result_types.py +0 -0
  38. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/cli_pheval.py +0 -0
  39. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/config_parser.py +0 -0
  40. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/implementations/__init__.py +0 -0
  41. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/infra/__init__.py +0 -0
  42. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/infra/exomiserdb.py +0 -0
  43. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/post_processing/__init__.py +0 -0
  44. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/post_processing/post_processing.py +0 -0
  45. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/prepare/__init__.py +0 -0
  46. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/prepare/create_spiked_vcf.py +0 -0
  47. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/prepare/custom_exceptions.py +0 -0
  48. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/prepare/prepare_corpus.py +0 -0
  49. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/prepare/update_phenopacket.py +0 -0
  50. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
  51. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
  52. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
  53. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
  54. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
  55. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
  56. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
  57. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/resources/hgnc_complete_set.txt +0 -0
  58. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/run_metadata.py +0 -0
  59. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/runners/__init__.py +0 -0
  60. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/runners/runner.py +0 -0
  61. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/__init__.py +0 -0
  62. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/docs_gen.py +0 -0
  63. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/docs_gen.sh +0 -0
  64. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/exomiser.py +0 -0
  65. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/file_utils.py +0 -0
  66. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/phenopacket_utils.py +0 -0
  67. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/semsim_utils.py +0 -0
  68. {pheval-0.3.9 → pheval-0.4.1}/src/pheval/utils/utils.py +0 -0
pheval-0.4.1/PKG-INFO ADDED
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.1
2
+ Name: pheval
3
+ Version: 0.4.1
4
+ Summary:
5
+ Author: Yasemin Bridges
6
+ Author-email: y.bridges@qmul.ac.uk
7
+ Requires-Python: >=3.9,<4.0.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: class-resolver (>=0.4.2)
15
+ Requires-Dist: click (>=8.1.3)
16
+ Requires-Dist: deprecation (>=2.1.0)
17
+ Requires-Dist: duckdb (>=1.0.0,<2.0.0)
18
+ Requires-Dist: google (>=3.0.0,<4.0.0)
19
+ Requires-Dist: jaydebeapi (>=1.2.3)
20
+ Requires-Dist: matplotlib (>=3.7.0,<4.0.0)
21
+ Requires-Dist: oaklib (>=0.5.6)
22
+ Requires-Dist: pandas (>=1.5.1)
23
+ Requires-Dist: phenopackets (>=2.0.2,<3.0.0)
24
+ Requires-Dist: plotly (>=5.13.0,<6.0.0)
25
+ Requires-Dist: polars (>=0.19.15,<0.20.0)
26
+ Requires-Dist: pyaml (>=21.10.1,<22.0.0)
27
+ Requires-Dist: pyserde (>=0.9.8,<0.10.0)
28
+ Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
29
+ Requires-Dist: seaborn (>=0.12.2,<0.13.0)
30
+ Requires-Dist: tqdm (>=4.64.1)
31
+ Description-Content-Type: text/markdown
32
+
33
+ # PhEval - Phenotypic Inference Evaluation Framework
34
+
35
+ ![PyPI](https://img.shields.io/pypi/v/pheval)
36
+ ![Build Status](https://img.shields.io/github/actions/workflow/status/monarch-initiative/pheval/pypi-publish.yml?branch=main)
37
+ ![License](https://img.shields.io/github/license/monarch-initiative/pheval)
38
+ ![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)
39
+ ![Issues](https://img.shields.io/github/issues/monarch-initiative/pheval)
40
+
41
+ ## Overview
42
+
43
+ The absence of standardised benchmarks and data standardisation for Variant and Gene Prioritisation Algorithms (VGPAs) presents a significant challenge in the field of genomic research. To address this, we developed PhEval, a novel framework designed to streamline the evaluation of VGPAs that incorporate phenotypic data. PhEval offers several key benefits:
44
+
45
+ - Automated Processes: Reduces manual effort by automating various evaluation tasks, thus enhancing efficiency.
46
+ - Standardisation: Ensures consistency and comparability in evaluation methodologies, leading to more reliable and standardised assessments.
47
+ - Reproducibility: Facilitates reproducibility in research by providing a standardised platform, allowing for consistent validation of algorithms.
48
+ - Comprehensive Benchmarking: Enables thorough benchmarking of algorithms, providing well-founded comparisons and deeper insights into their performance.
49
+
50
+ PhEval is a valuable tool for researchers looking to improve the accuracy and reliability of VGPA evaluations through a structured and standardised approach.
51
+
52
+ For more information please see the full [documentation](https://monarch-initiative.github.io/pheval/).
53
+
54
+ ## Download and Installation
55
+
56
+ 1. Ensure you have Python 3.8 or greater installed.
57
+ 2. Install with `pip`:
58
+ ```bash
59
+ pip install pheval
60
+ ```
61
+ 3. See list of all PhEval utility commands:
62
+ ```bash
63
+ pheval-utils --help
64
+ ```
65
+
66
+ ## Usage
67
+
68
+ The PhEval CLI offers a variety of commands categorised into two main types: **Runner Implementations** and **Utility Commands**. Below is an overview of each category, detailing how they can be utilised to perform various tasks within PhEval.
69
+
70
+ ### Runner Implementations
71
+
72
+ The primary command used within PhEval is `pheval run`. This command is responsible for executing concrete VGPA runner implementations, that we sometimes term as plugins. By using pheval run, users can leverage these runner implementations to: execute the VGPA on a set of test corpora, produce tool-specific result outputs, and post-process tool-specific outputs to PhEval standardised TSV outputs.
73
+
74
+ Some concrete PhEval runner implementations include the [Exomiser runner](https://github.com/monarch-initiative/pheval.exomiser) and the [Phen2Gene runner](https://github.com/monarch-initiative/pheval.phen2gene). The full list of currently implemented runners can be found [here](https://monarch-initiative.github.io/pheval/plugins/)
75
+
76
+ Please read the [documentation](https://monarch-initiative.github.io/pheval/developing_a_pheval_plugin/) for a step-by-step for creating your own PhEval plugin.
77
+
78
+ ### Utility Commands
79
+
80
+ In addition to the main `run` command, PhEval provides a set of utility commands designed to enhance the overall functionality of the CLI. These commands can be used to set up and configure experiments, streamline data preparation, and benchmark the performance of various VGPA runner implementations. By utilising these utilities, users can optimise their experimental workflows, ensure reproducibility, and compare the efficiency and accuracy of different approaches. The utility commands offer a range of options that facilitate the customisation and fine-tuning to suit diverse research objectives.
81
+
82
+ #### Example Usage
83
+
84
+ To add noise to an existing corpus of phenopackets, this could be used to assess the robustness of VGPAs when less relevant or unreliable phenotype data is introduced:
85
+ ```bash
86
+ pheval-utils scramble-phenopackets --phenopacket-dir /phenopackets --scramble-factor 0.5 --output-dir /scrambled_phenopackets_0.5
87
+ ```
88
+
89
+ To update the gene symbols and identifiers to a specific namespace:
90
+ ```bash
91
+ pheval-utils update-phenopackets --phenopacket-dir /phenopackets --output-dir /updated_phenopackets --gene-identifier ensembl_id
92
+ ```
93
+
94
+ To prepare VCF files for a corpus of phenopackets, spiking in the known causative variants:
95
+ ```bash
96
+ pheval-utils create-spiked-vcfs --phenopacket-dir /phenopackets --hg19-template-vcf /template_hg19.vcf --hg38-template-vcf /template_hg38.vcf --output-dir /vcf
97
+ ```
98
+
99
+ Alternatively, you can wrap all corpus preparatory commands into a single step. Specifying `--variant-analysis`/`--gene-analysis`/`--disease-analysis` will check the phenopackets for complete records documenting the known entities. If template vcf(s) are provided this will spike VCFs with the known variant for the corpus. If a `--gene-identifier` is specified then the corpus of phenopackets is updated.
100
+ ```bash
101
+ pheval-utils prepare-corpus \
102
+ --phenopacket-dir /phenopackets \
103
+ --variant-analysis \
104
+ --gene-analysis \
105
+ --gene-identifier ensembl_id \
106
+ --hg19-template-vcf /template_hg19.vcf \
107
+ --hg38-template-vcf /template_hg38.vcf \
108
+ --output-dir /vcf
109
+ ```
110
+
111
+ See the [documentation](https://monarch-initiative.github.io/pheval/executing_a_benchmark/) for instructions on benchmarking and evaluating the performance of various VGPAs.
112
+
113
+
pheval-0.4.1/README.md ADDED
@@ -0,0 +1,80 @@
1
+ # PhEval - Phenotypic Inference Evaluation Framework
2
+
3
+ ![PyPI](https://img.shields.io/pypi/v/pheval)
4
+ ![Build Status](https://img.shields.io/github/actions/workflow/status/monarch-initiative/pheval/pypi-publish.yml?branch=main)
5
+ ![License](https://img.shields.io/github/license/monarch-initiative/pheval)
6
+ ![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)
7
+ ![Issues](https://img.shields.io/github/issues/monarch-initiative/pheval)
8
+
9
+ ## Overview
10
+
11
+ The absence of standardised benchmarks and data standardisation for Variant and Gene Prioritisation Algorithms (VGPAs) presents a significant challenge in the field of genomic research. To address this, we developed PhEval, a novel framework designed to streamline the evaluation of VGPAs that incorporate phenotypic data. PhEval offers several key benefits:
12
+
13
+ - Automated Processes: Reduces manual effort by automating various evaluation tasks, thus enhancing efficiency.
14
+ - Standardisation: Ensures consistency and comparability in evaluation methodologies, leading to more reliable and standardised assessments.
15
+ - Reproducibility: Facilitates reproducibility in research by providing a standardised platform, allowing for consistent validation of algorithms.
16
+ - Comprehensive Benchmarking: Enables thorough benchmarking of algorithms, providing well-founded comparisons and deeper insights into their performance.
17
+
18
+ PhEval is a valuable tool for researchers looking to improve the accuracy and reliability of VGPA evaluations through a structured and standardised approach.
19
+
20
+ For more information please see the full [documentation](https://monarch-initiative.github.io/pheval/).
21
+
22
+ ## Download and Installation
23
+
24
+ 1. Ensure you have Python 3.8 or greater installed.
25
+ 2. Install with `pip`:
26
+ ```bash
27
+ pip install pheval
28
+ ```
29
+ 3. See list of all PhEval utility commands:
30
+ ```bash
31
+ pheval-utils --help
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ The PhEval CLI offers a variety of commands categorised into two main types: **Runner Implementations** and **Utility Commands**. Below is an overview of each category, detailing how they can be utilised to perform various tasks within PhEval.
37
+
38
+ ### Runner Implementations
39
+
40
+ The primary command used within PhEval is `pheval run`. This command is responsible for executing concrete VGPA runner implementations, that we sometimes term as plugins. By using pheval run, users can leverage these runner implementations to: execute the VGPA on a set of test corpora, produce tool-specific result outputs, and post-process tool-specific outputs to PhEval standardised TSV outputs.
41
+
42
+ Some concrete PhEval runner implementations include the [Exomiser runner](https://github.com/monarch-initiative/pheval.exomiser) and the [Phen2Gene runner](https://github.com/monarch-initiative/pheval.phen2gene). The full list of currently implemented runners can be found [here](https://monarch-initiative.github.io/pheval/plugins/)
43
+
44
+ Please read the [documentation](https://monarch-initiative.github.io/pheval/developing_a_pheval_plugin/) for a step-by-step for creating your own PhEval plugin.
45
+
46
+ ### Utility Commands
47
+
48
+ In addition to the main `run` command, PhEval provides a set of utility commands designed to enhance the overall functionality of the CLI. These commands can be used to set up and configure experiments, streamline data preparation, and benchmark the performance of various VGPA runner implementations. By utilising these utilities, users can optimise their experimental workflows, ensure reproducibility, and compare the efficiency and accuracy of different approaches. The utility commands offer a range of options that facilitate the customisation and fine-tuning to suit diverse research objectives.
49
+
50
+ #### Example Usage
51
+
52
+ To add noise to an existing corpus of phenopackets, this could be used to assess the robustness of VGPAs when less relevant or unreliable phenotype data is introduced:
53
+ ```bash
54
+ pheval-utils scramble-phenopackets --phenopacket-dir /phenopackets --scramble-factor 0.5 --output-dir /scrambled_phenopackets_0.5
55
+ ```
56
+
57
+ To update the gene symbols and identifiers to a specific namespace:
58
+ ```bash
59
+ pheval-utils update-phenopackets --phenopacket-dir /phenopackets --output-dir /updated_phenopackets --gene-identifier ensembl_id
60
+ ```
61
+
62
+ To prepare VCF files for a corpus of phenopackets, spiking in the known causative variants:
63
+ ```bash
64
+ pheval-utils create-spiked-vcfs --phenopacket-dir /phenopackets --hg19-template-vcf /template_hg19.vcf --hg38-template-vcf /template_hg38.vcf --output-dir /vcf
65
+ ```
66
+
67
+ Alternatively, you can wrap all corpus preparatory commands into a single step. Specifying `--variant-analysis`/`--gene-analysis`/`--disease-analysis` will check the phenopackets for complete records documenting the known entities. If template vcf(s) are provided this will spike VCFs with the known variant for the corpus. If a `--gene-identifier` is specified then the corpus of phenopackets is updated.
68
+ ```bash
69
+ pheval-utils prepare-corpus \
70
+ --phenopacket-dir /phenopackets \
71
+ --variant-analysis \
72
+ --gene-analysis \
73
+ --gene-identifier ensembl_id \
74
+ --hg19-template-vcf /template_hg19.vcf \
75
+ --hg38-template-vcf /template_hg38.vcf \
76
+ --output-dir /vcf
77
+ ```
78
+
79
+ See the [documentation](https://monarch-initiative.github.io/pheval/executing_a_benchmark/) for instructions on benchmarking and evaluating the performance of various VGPAs.
80
+
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pheval"
3
- version = "0.3.9"
3
+ version = "0.4.1"
4
4
  description = ""
5
5
  authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
6
6
  "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",
@@ -27,10 +27,13 @@ matplotlib = "^3.7.0"
27
27
  pyserde = "^0.9.8"
28
28
  polars = "^0.19.15"
29
29
  scikit-learn = "^1.4.0"
30
+ duckdb = "^1.0.0"
30
31
 
31
32
  [tool.poetry.dev-dependencies]
32
33
  pytest = "^7.2.0"
33
34
  coverage = "^6.5.0"
35
+ pheval-template = "^0.1.2"
36
+ pytest-workflow = "^2.0.1"
34
37
 
35
38
  [tool.poetry.scripts]
36
39
  pheval = "pheval.cli:pheval"
@@ -38,15 +41,15 @@ pheval-utils = "pheval.cli:pheval_utils"
38
41
 
39
42
  [tool.poetry.group.docs.dependencies]
40
43
  mkdocs = "^1.4.2"
41
- pymdown-extensions = "^9.9"
42
44
  mkdocs-material = "^8.5.11"
43
- mkdocstrings = "^0.19.1"
44
- mkdocstrings-python = "^0.8.2"
45
+ mkdocstrings = "^0.24.0"
46
+ mkdocstrings-python = "^1.8.0"
47
+ pymdown-extensions = "^9.9"
45
48
  mkdocs-include-dir-to-nav = "^1.2.0"
46
49
  mkdocs-click = "^0.8.0"
50
+ griffe = "0.38.1"
47
51
  [tool.poetry.group.dev.dependencies]
48
52
  black = "^22.12.0"
49
- pytest-workflow = "^2.0.1"
50
53
 
51
54
  [tool.pytest.ini_options]
52
55
  pythonpath = [
@@ -0,0 +1,104 @@
1
+ from pheval.analyse.benchmark_generator import (
2
+ BenchmarkRunOutputGenerator,
3
+ DiseaseBenchmarkRunOutputGenerator,
4
+ GeneBenchmarkRunOutputGenerator,
5
+ VariantBenchmarkRunOutputGenerator,
6
+ )
7
+ from pheval.analyse.generate_summary_outputs import generate_benchmark_comparison_output
8
+ from pheval.analyse.parse_corpus import CorpusParser
9
+ from pheval.analyse.rank_stats import RankStatsWriter
10
+ from pheval.analyse.run_data_parser import Config
11
+
12
+
13
def _run_benchmark_comparison(
    run_config: Config,
    benchmark_generator: BenchmarkRunOutputGenerator,
) -> None:
    """
    Run a benchmark on several result directories.

    Args:
        run_config (Config): Run configurations, holding the benchmark name and
            the list of runs whose results should be benchmarked together.
        benchmark_generator (BenchmarkRunOutputGenerator): Generator for benchmark run output.
    """
    stats_writer = RankStatsWriter(
        run_config.benchmark_name, benchmark_generator.stats_comparison_file
    )
    # Several runs may share a test corpus; parse each corpus only once.
    unique_test_corpora_directories = {run.phenopacket_dir for run in run_config.runs}
    for test_corpora_directory in unique_test_corpora_directories:
        CorpusParser(run_config.benchmark_name, test_corpora_directory).parse_corpus(
            benchmark_generator
        )
    benchmarking_results = []
    for run in run_config.runs:
        benchmark_result = benchmark_generator.generate_benchmark_run_results(
            run_config.benchmark_name, run, run.score_order, run.threshold
        )
        stats_writer.add_statistics_entry(
            run.run_identifier,
            benchmark_result.rank_stats,
            benchmark_result.binary_classification_stats,
        )
        benchmarking_results.append(benchmark_result)
    run_identifiers = [run.run_identifier for run in run_config.runs]
    # Emit one comparison output per corpus, named after the corpus parent
    # directory and the prioritisation type (gene/variant/disease).
    for unique_test_corpora_directory in unique_test_corpora_directories:
        generate_benchmark_comparison_output(
            run_config.benchmark_name,
            benchmarking_results,
            run_identifiers,
            benchmark_generator,
            f"{unique_test_corpora_directory.parents[0].name}_"
            f"{benchmark_generator.prioritisation_type_string}",
        )
+
59
+
60
+ def benchmark_run_comparisons(
61
+ run_config: Config,
62
+ ) -> None:
63
+ """
64
+ Benchmark prioritisation performance for several runs.
65
+
66
+ Args:
67
+ run_config (Config): Run configurations.
68
+ """
69
+ gene_analysis_runs = Config(
70
+ benchmark_name=run_config.benchmark_name,
71
+ runs=[run for run in run_config.runs if run.gene_analysis],
72
+ plot_customisation=run_config.plot_customisation,
73
+ )
74
+ variant_analysis_runs = Config(
75
+ benchmark_name=run_config.benchmark_name,
76
+ runs=[run for run in run_config.runs if run.variant_analysis],
77
+ plot_customisation=run_config.plot_customisation,
78
+ )
79
+ disease_analysis_runs = Config(
80
+ benchmark_name=run_config.benchmark_name,
81
+ runs=[run for run in run_config.runs if run.disease_analysis],
82
+ plot_customisation=run_config.plot_customisation,
83
+ )
84
+ if gene_analysis_runs.runs:
85
+ _run_benchmark_comparison(
86
+ run_config=gene_analysis_runs,
87
+ benchmark_generator=GeneBenchmarkRunOutputGenerator(
88
+ plot_customisation=gene_analysis_runs.plot_customisation.gene_plots
89
+ ),
90
+ )
91
+ if variant_analysis_runs.runs:
92
+ _run_benchmark_comparison(
93
+ run_config=variant_analysis_runs,
94
+ benchmark_generator=VariantBenchmarkRunOutputGenerator(
95
+ plot_customisation=variant_analysis_runs.plot_customisation.variant_plots
96
+ ),
97
+ )
98
+ if disease_analysis_runs.runs:
99
+ _run_benchmark_comparison(
100
+ run_config=disease_analysis_runs,
101
+ benchmark_generator=DiseaseBenchmarkRunOutputGenerator(
102
+ plot_customisation=disease_analysis_runs.plot_customisation.disease_plots
103
+ ),
104
+ )
@@ -0,0 +1,108 @@
1
+ from typing import Union
2
+
3
+ from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
4
+ from pheval.post_processing.post_processing import (
5
+ RankedPhEvalDiseaseResult,
6
+ RankedPhEvalGeneResult,
7
+ RankedPhEvalVariantResult,
8
+ )
9
+
10
+
11
class AssessPrioritisationBase:
    def __init__(
        self,
        db_connection: BenchmarkDBManager,
        table_name: str,
        column: str,
        threshold: float,
        score_order: str,
    ):
        """
        Initialise AssessPrioritisationBase class

        Args:
            db_connection (BenchmarkDBManager): DB connection.
            table_name (str): Table name.
            column (str): Column name.
            threshold (float): Threshold for scores
            score_order (str): Score order for results, either ascending or descending

        """
        self.db_connection = db_connection
        self.conn = db_connection.conn
        self.table_name = table_name
        self.column = column
        self.threshold = threshold
        self.score_order = score_order
        # Ensure the per-run result column exists, defaulting every row to 0
        # (i.e. "not found within the threshold").
        db_connection.add_column_integer_default(
            table_name=table_name, column=self.column, default=0
        )

    def _assess_with_threshold_ascending_order(
        self,
        result_entry: Union[
            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
        ],
    ) -> int:
        """
        Record the prioritisation rank if it meets the ascending order threshold.


        Args:
            result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
                Ranked PhEval result entry

        Returns:
            int: Recorded prioritisation rank
        """
        # Ascending scores: lower is better, so the score must fall below the threshold.
        meets_threshold = float(result_entry.score) < float(self.threshold)
        return result_entry.rank if meets_threshold else 0

    def _assess_with_threshold(
        self,
        result_entry: Union[
            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
        ],
    ) -> int:
        """
        Record the prioritisation rank if it meets the score threshold.

        Args:
            result_entry (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
                Ranked PhEval result entry

        Returns:
            int: Recorded prioritisation rank
        """
        # Descending scores: higher is better, so the score must exceed the threshold.
        meets_threshold = float(result_entry.score) > float(self.threshold)
        return result_entry.rank if meets_threshold else 0

    def _record_matched_entity(
        self,
        standardised_result: Union[
            RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult
        ],
    ) -> int:
        """
        Return the rank result - handling the specification of a threshold.
        Args:
            standardised_result (Union[RankedPhEvalGeneResult, RankedPhEvalDiseaseResult, RankedPhEvalVariantResult]):
                Ranked PhEval disease result entry

        Returns:
            int: Recorded entity prioritisation rank
        """
        # A threshold of 0.0 means "no threshold": record the rank unconditionally.
        if float(self.threshold) == 0.0:
            return standardised_result.rank
        if self.score_order == "ascending":
            return self._assess_with_threshold_ascending_order(standardised_result)
        return self._assess_with_threshold(standardised_result)
@@ -0,0 +1,140 @@
1
+ import ast
2
+ import re
3
+ from typing import List, Type, Union
4
+
5
+ import duckdb
6
+ from duckdb import DuckDBPyConnection
7
+
8
+ from pheval.post_processing.post_processing import (
9
+ RankedPhEvalDiseaseResult,
10
+ RankedPhEvalGeneResult,
11
+ RankedPhEvalVariantResult,
12
+ )
13
+
14
+
15
class BenchmarkDBManager:
    """
    Class to connect to database.
    """

    def __init__(self, benchmark_name: str):
        """
        Initialise the BenchmarkDBManager class.

        Args:
            benchmark_name (str): Name of the benchmark database; a ".db"
                suffix is appended when not already present.
        """
        db_name = (
            str(benchmark_name)
            if str(benchmark_name).endswith(".db")
            else f"{benchmark_name}.db"
        )
        self.conn = self.get_connection(db_name)

    def initialise(self):
        """Initialise the duckdb connection."""
        self.add_contains_function()

    @staticmethod
    def get_connection(db_name: str) -> DuckDBPyConnection:
        """
        Get a connection to the database.

        Args:
            db_name (str): Path of the DuckDB database file.

        Returns:
            DuckDBPyConnection: Connection to the database.
        """
        return duckdb.connect(db_name)

    def add_column_integer_default(self, table_name: str, column: str, default: int = 0) -> None:
        """
        Add a column to an existing table with an integer default value.

        Args:
            table_name (str): Name of the table.
            column (str): Name of the column to add.
            default (int): Default integer value to add.
        """
        try:
            self.conn.execute(
                f'ALTER TABLE {table_name} ADD COLUMN "{column}" INTEGER DEFAULT {default}'
            )
            # Backfill existing rows so every row carries the default explicitly.
            self.conn.execute(f'UPDATE {table_name} SET "{column}" = ?', (default,))
            self.conn.commit()
        except duckdb.CatalogException:
            # Column already exists - treat as a no-op.
            pass

    def drop_table(self, table_name: str) -> None:
        """
        Drop a table from the database.
        Args:
            table_name: Name of the table to drop from the database
        """
        self.conn.execute(f"""DROP TABLE IF EXISTS "{table_name}";""")

    @staticmethod
    def contains_entity_function(entity: str, known_causative_entity: str) -> bool:
        """
        Determines if a known causative entity is present within an entity or list of entities.
        Args:
            entity (str): The entity to be checked. It can be a single entity or a string representation of a list.
            known_causative_entity (str): The entity to search for within the `entity`.

        Returns:
            bool: `True` if `known_causative_entity` is found in `entity` (or its list representation),
            `False` otherwise.
        """
        # Only strings that look like a Python list literal are parsed; anything
        # else is compared directly, avoiding literal_eval on arbitrary input.
        list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*]$")
        if list_pattern.match(str(entity)):
            list_representation = ast.literal_eval(entity)
            if isinstance(list_representation, list):
                return known_causative_entity in list_representation
        return known_causative_entity == entity

    def add_contains_function(self) -> None:
        """
        Adds a custom `contains_entity_function` to the DuckDB connection if it does not already exist.
        """
        result = self.conn.execute(
            "SELECT * FROM duckdb_functions() WHERE function_name = ?", ["contains_entity_function"]
        ).fetchall()
        if not result:
            self.conn.create_function("contains_entity_function", self.contains_entity_function)

    def parse_table_into_dataclass(
        self,
        table_name: str,
        dataclass: Union[
            Type[RankedPhEvalGeneResult],
            Type[RankedPhEvalVariantResult],
            Type[RankedPhEvalDiseaseResult],
        ],
    ) -> Union[
        List[RankedPhEvalGeneResult],
        List[RankedPhEvalVariantResult],
        List[RankedPhEvalDiseaseResult],
    ]:
        """
        Parses a DuckDB table into a list of dataclass instances.
        Args:
            table_name (str): The name of the DuckDB table to be parsed.
            dataclass (Union[Type[RankedPhEvalGeneResult], Type[RankedPhEvalVariantResult],
                Type[RankedPhEvalDiseaseResult]]):
                The dataclass type to which each row in the table should be mapped.

        Returns:
            List[dataclass]: A list of instances of the provided dataclass, each representing a row from the table.
        """
        rows = (
            self.conn.execute(f"SELECT * FROM '{table_name}'").fetchdf().to_dict(orient="records")
        )
        return [dataclass(**row) for row in rows]

    def check_table_exists(self, table_name: str) -> bool:
        """
        Check if a table exists in the connected DuckDB database.
        Args:
            table_name (str): The name of the table to check for existence.
        Returns:
            bool: Returns `True` if the table exists in the database, `False` otherwise.
        """
        # Bind the table name as a parameter rather than interpolating it into
        # the SQL string.
        result = self.conn.execute(
            "SELECT * FROM information_schema.tables WHERE table_name = ?", [table_name]
        ).fetchall()
        return bool(result)

    def close(self):
        """Close the connection to the database."""
        self.conn.close()