pheval 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (33) hide show
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +27 -24
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +235 -0
  13. pheval/post_processing/post_processing.py +185 -337
  14. pheval/post_processing/validate_result_format.py +92 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
  19. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
  20. pheval/analyse/analysis.py +0 -104
  21. pheval/analyse/assess_prioritisation_base.py +0 -108
  22. pheval/analyse/benchmark_generator.py +0 -126
  23. pheval/analyse/benchmarking_data.py +0 -25
  24. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  25. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  26. pheval/analyse/generate_summary_outputs.py +0 -105
  27. pheval/analyse/parse_benchmark_summary.py +0 -81
  28. pheval/analyse/parse_corpus.py +0 -219
  29. pheval/analyse/prioritisation_result_types.py +0 -52
  30. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  31. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
  32. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/WHEEL +0 -0
  33. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -1,81 +0,0 @@
1
- from dataclasses import dataclass
2
- from pathlib import Path
3
- from typing import List
4
-
5
- import pandas as pd
6
-
7
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
8
- from pheval.analyse.benchmarking_data import BenchmarkRunResults
9
- from pheval.analyse.binary_classification_stats import BinaryClassificationStats
10
- from pheval.analyse.rank_stats import RankStats
11
-
12
-
13
@dataclass
class BenchmarkSummaryResults:
    """
    Bundle the gene, disease and variant benchmark summaries read from a benchmark db.

    Attributes:
        gene_results (List[BenchmarkRunResults]): Results parsed from the gene summary table.
        disease_results (List[BenchmarkRunResults]): Results parsed from the disease summary table.
        variant_results (List[BenchmarkRunResults]): Results parsed from the variant summary table.

    Note:
        parse_benchmark_db passes None for a field whose summary table does not exist,
        so consumers should be prepared for None as well as a list.
    """

    gene_results: List[BenchmarkRunResults]
    disease_results: List[BenchmarkRunResults]
    variant_results: List[BenchmarkRunResults]
18
-
19
-
20
def parse_benchmark_results(benchmark_summary_table: pd.DataFrame) -> List[BenchmarkRunResults]:
    """
    Convert every row of a benchmark summary table into a BenchmarkRunResults object.

    Args:
        benchmark_summary_table (pd.DataFrame): Summary table; each row must provide the columns
            "top", "top3", "top5", "top10", "found", "total", "mean_reciprocal_rank" and
            "results_directory_path".

    Returns:
        List[BenchmarkRunResults]: One BenchmarkRunResults per row, in row order. The binary
            classification statistics of each entry are a freshly constructed, empty instance.
    """
    return [
        BenchmarkRunResults(
            rank_stats=RankStats(
                top=summary_row["top"],
                top3=summary_row["top3"],
                top5=summary_row["top5"],
                top10=summary_row["top10"],
                found=summary_row["found"],
                total=summary_row["total"],
                mrr=summary_row["mean_reciprocal_rank"],
            ),
            benchmark_name=summary_row["results_directory_path"],
            binary_classification_stats=BinaryClassificationStats(),
        )
        for _, summary_row in benchmark_summary_table.iterrows()
    ]
47
-
48
-
49
def parse_benchmark_db(benchmarking_db: Path) -> BenchmarkSummaryResults:
    """
    Read the gene, disease and variant summary tables from a benchmark db.

    Args:
        benchmarking_db (Path): Path to the benchmark db.

    Returns:
        BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the
            db. A field is None when the corresponding summary table does not exist.
    """
    db_connector = BenchmarkDBManager(benchmarking_db)
    summaries = {}
    try:
        for category in ("gene", "disease", "variant"):
            summary_table = f"{category}_summary"
            if db_connector.check_table_exists(summary_table):
                summaries[category] = parse_benchmark_results(
                    db_connector.conn.execute(f"SELECT * FROM {summary_table}").fetchdf()
                )
            else:
                # A missing table is reported as None instead of raising.
                summaries[category] = None
    finally:
        # The fetched DataFrames are already materialised, so the connection can be released
        # here; this also avoids leaking it when a query or row conversion fails.
        db_connector.conn.close()
    return BenchmarkSummaryResults(
        gene_results=summaries["gene"],
        disease_results=summaries["disease"],
        variant_results=summaries["variant"],
    )
@@ -1,219 +0,0 @@
1
- from pathlib import Path
2
- from typing import List
3
-
4
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
5
- from pheval.analyse.benchmark_generator import (
6
- BenchmarkRunOutputGenerator,
7
- DiseaseBenchmarkRunOutputGenerator,
8
- GeneBenchmarkRunOutputGenerator,
9
- VariantBenchmarkRunOutputGenerator,
10
- )
11
- from pheval.utils.file_utils import all_files
12
- from pheval.utils.phenopacket_utils import (
13
- GenomicVariant,
14
- PhenopacketUtil,
15
- ProbandCausativeGene,
16
- ProbandDisease,
17
- phenopacket_reader,
18
- )
19
-
20
-
21
def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
    """
    Read a Phenopacket and return the diagnoses it records.

    Args:
        phenopacket_path (Path): Path to the Phenopacket file.

    Returns:
        List[ProbandDisease]: The value of PhenopacketUtil.diagnoses() for the Phenopacket.
    """
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnoses()
34
-
35
-
36
def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
    """
    Read a Phenopacket and return the diagnosed variants it records.

    Args:
        phenopacket_path (Path): Path to the Phenopacket file.

    Returns:
        List[GenomicVariant]: The value of PhenopacketUtil.diagnosed_variants() for the
            Phenopacket.
    """
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_variants()
49
-
50
-
51
def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
    """
    Read a Phenopacket and return the diagnosed genes it records.

    Args:
        phenopacket_path (Path): Path to the Phenopacket file.

    Returns:
        List[ProbandCausativeGene]: The value of PhenopacketUtil.diagnosed_genes() for the
            Phenopacket.
    """
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_genes()
63
-
64
-
65
class CorpusParser:
    """Class for parsing phenopacket corpus and retrieving known variants/genes/diseases."""

    def __init__(self, benchmark_name: str, phenopacket_dir: Path) -> None:
        """
        Initialise the CorpusParser class.

        Args:
            benchmark_name (str): Value handed to BenchmarkDBManager to obtain the connection.
            phenopacket_dir (Path): Path to the Phenopacket directory.
        """
        self.phenopacket_dir = phenopacket_dir
        self.conn = BenchmarkDBManager(benchmark_name).conn
        # Table names are prefixed with the name of the directory that contains phenopacket_dir.
        self.table_name = phenopacket_dir.parents[0].name

    def _create_gene_table(self) -> None:
        """
        Create the Gene benchmarking table if it doesn't already exist.
        """
        self.conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS "{self.table_name}_gene" (
                identifier VARCHAR(255) PRIMARY KEY,
                phenopacket VARCHAR,
                gene_symbol VARCHAR,
                gene_identifier VARCHAR
            )
            """
        )

    def _create_variant_table(self) -> None:
        """
        Create the Variant benchmarking table if it doesn't already exist.
        """
        self.conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS "{self.table_name}_variant" (
                identifier VARCHAR(255) PRIMARY KEY,
                phenopacket VARCHAR,
                chrom VARCHAR,
                pos INTEGER,
                "ref" VARCHAR,
                alt VARCHAR
            )
            """
        )

    def _create_disease_table(self) -> None:
        """
        Create the Disease benchmarking table if it doesn't already exist.
        """
        self.conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS "{self.table_name}_disease" (
                identifier VARCHAR(255) PRIMARY KEY,
                phenopacket VARCHAR,
                disease_identifier VARCHAR,
                disease_name VARCHAR
            )
            """
        )

    def _create_tables(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
        """
        Create tables based on the benchmarking analysis specified.

        Args:
            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark
                generator type.
        """
        # Independent checks (not elif): each generator type that matches gets its table.
        if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
            self._create_gene_table()
        if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
            self._create_variant_table()
        if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
            self._create_disease_table()

    def _insert_genes(self, phenopacket_path: Path, genes: List[ProbandCausativeGene]) -> None:
        """
        Insert known disease-causing genes into the Gene benchmarking table.

        Args:
            phenopacket_path (Path): Path to the Phenopacket file.
            genes (List[ProbandCausativeGene]): List of known genes associated with the proband.
        """
        for gene in genes:
            # Primary key: file name plus gene symbol; an existing key is left untouched.
            identifier = f"{phenopacket_path.name}-{gene.gene_symbol}"
            self.conn.execute(
                f"""
                INSERT OR IGNORE INTO "{self.table_name}_gene" (identifier, phenopacket, gene_symbol, gene_identifier)
                VALUES (?, ?, ?, ?)
                """,
                (identifier, phenopacket_path.name, gene.gene_symbol, gene.gene_identifier),
            )

    def _insert_variants(self, phenopacket_path: Path, variants: List[GenomicVariant]) -> None:
        """
        Insert known variants into the Variant benchmarking table.

        Args:
            phenopacket_path (Path): Path to the Phenopacket file.
            variants (List[GenomicVariant]): List of known variants associated with the proband.
        """
        for variant in variants:
            # Primary key: file name plus chrom-pos-ref-alt; an existing key is left untouched.
            identifier = (
                f"{phenopacket_path.name}-{variant.chrom}-{variant.pos}-{variant.ref}-{variant.alt}"
            )
            self.conn.execute(
                f"""
                INSERT OR IGNORE INTO "{self.table_name}_variant" (identifier, phenopacket, chrom, pos, "ref", alt)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (
                    identifier,
                    phenopacket_path.name,
                    variant.chrom,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                ),
            )

    def _insert_diseases(self, phenopacket_path: Path, diseases: List[ProbandDisease]) -> None:
        """
        Insert known diseases into the Disease benchmarking table.

        Args:
            phenopacket_path (Path): Path to the Phenopacket file.
            diseases (List[ProbandDisease]): List of known diseases associated with the proband.
        """
        for disease in diseases:
            # Primary key: file name plus disease identifier; an existing key is left untouched.
            identifier = f"{phenopacket_path.name}-{disease.disease_identifier}"
            self.conn.execute(
                f"""INSERT OR IGNORE INTO "{self.table_name}_disease" """
                f"""(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)""",
                (
                    identifier,
                    phenopacket_path.name,
                    disease.disease_identifier,
                    disease.disease_name,
                ),
            )

    def parse_corpus(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
        """
        Parse the phenopacket corpus and add known genes/variants/diseases to relevant
        benchmarking tables.

        The connection held by this instance is closed when the method finishes, whether it
        succeeds or raises, so the instance cannot be reused afterwards.

        Args:
            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark
                generator type.
        """
        try:
            self._create_tables(benchmark_generator)
            for phenopacket_path in all_files(self.phenopacket_dir):
                if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
                    self._insert_genes(phenopacket_path, _obtain_causative_genes(phenopacket_path))
                if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
                    self._insert_variants(
                        phenopacket_path, _obtain_causative_variants(phenopacket_path)
                    )
                if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
                    self._insert_diseases(
                        phenopacket_path, _obtain_causative_diseases(phenopacket_path)
                    )
        finally:
            # Previously close() ran only on the success path, leaking the connection whenever
            # a phenopacket failed to parse or an insert raised.
            self.conn.close()
@@ -1,52 +0,0 @@
1
- from dataclasses import dataclass
2
- from pathlib import Path
3
-
4
- from pheval.utils.phenopacket_utils import GenomicVariant, ProbandDisease
5
-
6
-
7
@dataclass
class GenePrioritisationResult:
    """
    Rank record for one causative gene of one phenopacket.

    Attributes:
        phenopacket_path (Path): Path to the phenopacket.
        gene (str): The causative gene.
        rank (int): Rank assigned to the gene; 0 when not supplied.
    """

    phenopacket_path: Path
    gene: str
    rank: int = 0
21
-
22
-
23
@dataclass
class VariantPrioritisationResult:
    """
    Rank record for one variant of one phenopacket.

    Attributes:
        phenopacket_path (Path): Path to the phenopacket.
        variant (GenomicVariant): The genomic variant.
        rank (int): Rank assigned to the variant; 0 when not supplied.
    """

    phenopacket_path: Path
    variant: GenomicVariant
    rank: int = 0
37
-
38
-
39
@dataclass
class DiseasePrioritisationResult:
    """
    Rank record for one known disease of one phenopacket.

    Attributes:
        phenopacket_path (Path): Path to the phenopacket.
        disease (ProbandDisease): The proband disease.
        rank (int): Rank assigned to the disease; 0 when not supplied.
    """

    phenopacket_path: Path
    disease: ProbandDisease
    rank: int = 0
@@ -1,159 +0,0 @@
1
- from pathlib import Path
2
-
3
- from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
4
- from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
5
- from pheval.analyse.benchmarking_data import BenchmarkRunResults
6
- from pheval.analyse.binary_classification_stats import BinaryClassificationStats
7
- from pheval.analyse.rank_stats import RankStats
8
- from pheval.analyse.run_data_parser import RunConfig
9
- from pheval.post_processing.post_processing import RankedPhEvalVariantResult
10
- from pheval.utils.file_utils import all_files
11
- from pheval.utils.phenopacket_utils import GenomicVariant
12
-
13
-
14
class AssessVariantPrioritisation(AssessPrioritisationBase):
    """Class for assessing variant prioritisation based on thresholds and scoring orders."""

    def assess_variant_prioritisation(
        self,
        standardised_variant_result_path: Path,
        phenopacket_path: Path,
        binary_classification_stats: BinaryClassificationStats,
    ) -> None:
        """
        Assess variant prioritisation for a single phenopacket.

        For every causative variant stored for the phenopacket in the benchmarking table, the
        standardised result file is searched for the same chromosome/start/ref/alt. The value
        returned by _record_matched_entity for a match is written to this run's column of the
        benchmarking table; a variant without a match contributes 0. The collected values are
        then added to the binary classification statistics together with the parsed results.

        Args:
            standardised_variant_result_path (Path): Path to standardised variant TSV result.
            phenopacket_path (Path): Path to the phenopacket.
            binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats
                class instance.
        """
        relevant_ranks = []
        result_file_exists = standardised_variant_result_path.exists()
        # Bind the file name as a parameter: interpolating it into a quoted literal broke the
        # query for any phenopacket whose name contains a single quote.
        known_variants = self.conn.execute(
            f'SELECT * FROM "{self.table_name}" WHERE phenopacket = ?',
            (phenopacket_path.name,),
        ).fetchdf()
        for _i, row in known_variants.iterrows():
            causative_variant = GenomicVariant(
                chrom=row["chrom"],
                pos=int(row["pos"]),
                ref=row["ref"],
                alt=row["alt"],
            )
            # NOTE(review): the values below are still interpolated. Binding them would change
            # how they are typed against the columns inferred from the TSV - confirm before
            # parameterising.
            result = (
                self.conn.execute(
                    (
                        f"SELECT * FROM '{standardised_variant_result_path}' "
                        f"WHERE "
                        f"chromosome == '{causative_variant.chrom}' AND "
                        f"start == {causative_variant.pos} AND "
                        f"ref == '{causative_variant.ref}' AND "
                        f"alt == '{causative_variant.alt}'"
                    )
                    if result_file_exists
                    # No result file: run a query that yields no rows so the flow stays uniform.
                    else "SELECT NULL WHERE FALSE"
                )
                .fetchdf()
                .to_dict(orient="records")
            )

            if result:
                # Only the first matching row is used.
                variant_match = self._record_matched_entity(RankedPhEvalVariantResult(**result[0]))
                relevant_ranks.append(variant_match)
                primary_key = (
                    f"{phenopacket_path.name}-{causative_variant.chrom}-{causative_variant.pos}-"
                    f"{causative_variant.ref}-{causative_variant.alt}"
                )
                self.conn.execute(
                    f'UPDATE "{self.table_name}" SET "{self.column}" = ? WHERE identifier = ?',
                    (variant_match, primary_key),
                )
            else:
                relevant_ranks.append(0)
        binary_classification_stats.add_classification(
            (
                self.db_connection.parse_table_into_dataclass(
                    str(standardised_variant_result_path), RankedPhEvalVariantResult
                )
                if result_file_exists
                else []
            ),
            relevant_ranks,
        )
85
-
86
-
87
def assess_phenopacket_variant_prioritisation(
    phenopacket_path: Path,
    run: RunConfig,
    variant_binary_classification_stats: BinaryClassificationStats,
    variant_benchmarker: AssessVariantPrioritisation,
) -> None:
    """
    Locate the standardised variant result for a Phenopacket and hand it to the benchmarker.

    The result file is expected under the run's results directory, in the
    "pheval_variant_results" sub-directory, named after the Phenopacket's stem.

    Args:
        phenopacket_path (Path): Path to the Phenopacket.
        run (RunConfig): Run configuration.
        variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats
            class instance.
        variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class
            instance.
    """
    result_file = f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
    variant_benchmarker.assess_variant_prioritisation(
        standardised_variant_result_path=run.results_dir.joinpath(result_file),
        phenopacket_path=phenopacket_path,
        binary_classification_stats=variant_binary_classification_stats,
    )
111
-
112
-
113
def benchmark_variant_prioritisation(
    benchmark_name: str,
    run: RunConfig,
    score_order: str,
    threshold: float,
):
    """
    Benchmark every phenopacket of a run on variant prioritisation.

    Args:
        benchmark_name (str): Name of the benchmark.
        run (RunConfig): Run configuration.
        score_order (str): The order in which scores are arranged.
        threshold (float): Threshold for assessment.

    Returns:
        BenchmarkRunResults: An object containing benchmarking results for variant prioritisation,
            including rank statistics and binary classification statistics for the run.
    """
    classification_stats = BinaryClassificationStats()
    # Variant table: name of the directory containing the phenopacket directory + "_variant".
    variant_table = f"{run.phenopacket_dir.parents[0].name}_variant"
    benchmarker = AssessVariantPrioritisation(
        BenchmarkDBManager(benchmark_name),
        variant_table,
        run.run_identifier,
        threshold,
        score_order,
    )
    for phenopacket_path in all_files(run.phenopacket_dir):
        assess_phenopacket_variant_prioritisation(
            phenopacket_path,
            run,
            classification_stats,
            benchmarker,
        )
    rank_stats = RankStats()
    rank_stats.add_ranks(
        benchmark_name=benchmark_name,
        table_name=variant_table,
        column_name=str(run.run_identifier),
    )
    return BenchmarkRunResults(
        benchmark_name=run.run_identifier,
        rank_stats=rank_stats,
        binary_classification_stats=classification_stats,
        phenopacket_dir=run.phenopacket_dir,
    )
File without changes