pheval-0.3.9-py3-none-any.whl → pheval-0.4.1-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/analysis.py +61 -150
- pheval/analyse/assess_prioritisation_base.py +108 -0
- pheval/analyse/benchmark_db_manager.py +140 -0
- pheval/analyse/benchmark_generator.py +47 -50
- pheval/analyse/benchmarking_data.py +3 -2
- pheval/analyse/disease_prioritisation_analysis.py +70 -219
- pheval/analyse/gene_prioritisation_analysis.py +66 -242
- pheval/analyse/generate_plots.py +81 -79
- pheval/analyse/generate_summary_outputs.py +64 -134
- pheval/analyse/parse_benchmark_summary.py +50 -37
- pheval/analyse/parse_corpus.py +219 -0
- pheval/analyse/rank_stats.py +177 -144
- pheval/analyse/run_data_parser.py +108 -27
- pheval/analyse/variant_prioritisation_analysis.py +78 -212
- pheval/cli.py +2 -4
- pheval/cli_pheval_utils.py +34 -245
- pheval/prepare/create_noisy_phenopackets.py +78 -67
- pheval-0.4.1.dist-info/METADATA +113 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/RECORD +22 -22
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/WHEEL +1 -1
- pheval/analyse/parse_pheval_result.py +0 -43
- pheval/analyse/prioritisation_rank_recorder.py +0 -83
- pheval/constants.py +0 -8
- pheval-0.3.9.dist-info/METADATA +0 -35
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/LICENSE +0 -0
- {pheval-0.3.9.dist-info → pheval-0.4.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,169 +1,22 @@
|
|
|
1
|
-
import ast
|
|
2
|
-
import re
|
|
3
|
-
from collections import defaultdict
|
|
4
1
|
from pathlib import Path
|
|
5
|
-
from typing import List, Union
|
|
6
2
|
|
|
3
|
+
from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
|
|
4
|
+
from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
|
|
7
5
|
from pheval.analyse.benchmarking_data import BenchmarkRunResults
|
|
8
6
|
from pheval.analyse.binary_classification_stats import BinaryClassificationStats
|
|
9
|
-
from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
|
|
10
|
-
from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
|
|
11
|
-
from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
|
|
12
7
|
from pheval.analyse.rank_stats import RankStats
|
|
13
|
-
from pheval.analyse.run_data_parser import
|
|
8
|
+
from pheval.analyse.run_data_parser import RunConfig
|
|
14
9
|
from pheval.post_processing.post_processing import RankedPhEvalGeneResult
|
|
15
10
|
from pheval.utils.file_utils import all_files
|
|
16
|
-
from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
|
|
17
11
|
|
|
18
12
|
|
|
19
|
-
class AssessGenePrioritisation:
|
|
13
|
+
class AssessGenePrioritisation(AssessPrioritisationBase):
|
|
20
14
|
"""Class for assessing gene prioritisation based on thresholds and scoring orders."""
|
|
21
15
|
|
|
22
|
-
def __init__(
|
|
23
|
-
self,
|
|
24
|
-
phenopacket_path: Path,
|
|
25
|
-
results_dir: Path,
|
|
26
|
-
standardised_gene_results: List[RankedPhEvalGeneResult],
|
|
27
|
-
threshold: float,
|
|
28
|
-
score_order: str,
|
|
29
|
-
proband_causative_genes: List[ProbandCausativeGene],
|
|
30
|
-
):
|
|
31
|
-
"""
|
|
32
|
-
Initialise AssessGenePrioritisation class.
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
phenopacket_path (Path): Path to the phenopacket file
|
|
36
|
-
results_dir (Path): Path to the results directory
|
|
37
|
-
standardised_gene_results (List[RankedPhEvalGeneResult]): List of ranked PhEval gene results
|
|
38
|
-
threshold (float): Threshold for scores
|
|
39
|
-
score_order (str): Score order for results, either ascending or descending
|
|
40
|
-
proband_causative_genes (List[ProbandCausativeGene]): List of proband causative genes
|
|
41
|
-
"""
|
|
42
|
-
self.phenopacket_path = phenopacket_path
|
|
43
|
-
self.results_dir = results_dir
|
|
44
|
-
self.standardised_gene_results = standardised_gene_results
|
|
45
|
-
self.threshold = threshold
|
|
46
|
-
self.score_order = score_order
|
|
47
|
-
self.proband_causative_genes = proband_causative_genes
|
|
48
|
-
|
|
49
|
-
def _record_gene_prioritisation_match(
|
|
50
|
-
self,
|
|
51
|
-
gene: ProbandCausativeGene,
|
|
52
|
-
result_entry: RankedPhEvalGeneResult,
|
|
53
|
-
rank_stats: RankStats,
|
|
54
|
-
) -> GenePrioritisationResult:
|
|
55
|
-
"""
|
|
56
|
-
Record the gene prioritisation rank if found within the results
|
|
57
|
-
|
|
58
|
-
Args:
|
|
59
|
-
gene (ProbandCausativeGene): Diagnosed proband gene
|
|
60
|
-
result_entry (RankedPhEvalGeneResult): Ranked PhEval gene result entry
|
|
61
|
-
rank_stats (RankStats): RankStats class instance
|
|
62
|
-
|
|
63
|
-
Returns:
|
|
64
|
-
GenePrioritisationResult: Recorded correct gene prioritisation rank result
|
|
65
|
-
"""
|
|
66
|
-
rank = result_entry.rank
|
|
67
|
-
rank_stats.add_rank(rank)
|
|
68
|
-
return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)
|
|
69
|
-
|
|
70
|
-
def _assess_gene_with_threshold_ascending_order(
|
|
71
|
-
self,
|
|
72
|
-
result_entry: RankedPhEvalGeneResult,
|
|
73
|
-
gene: ProbandCausativeGene,
|
|
74
|
-
rank_stats: RankStats,
|
|
75
|
-
) -> GenePrioritisationResult:
|
|
76
|
-
"""
|
|
77
|
-
Record the gene prioritisation rank if it meets the ascending order threshold.
|
|
78
|
-
|
|
79
|
-
This method checks if the gene prioritisation rank meets the ascending order threshold.
|
|
80
|
-
If the score of the result entry is less than the threshold, it records the gene rank.
|
|
81
|
-
|
|
82
|
-
Args:
|
|
83
|
-
result_entry (RankedPhEvalGeneResult): Ranked PhEval gene result entry
|
|
84
|
-
gene (ProbandCausativeGene): Diagnosed proband gene
|
|
85
|
-
rank_stats (RankStats): RankStats class instance
|
|
86
|
-
Returns:
|
|
87
|
-
GenePrioritisationResult: Recorded correct gene prioritisation rank result
|
|
88
|
-
"""
|
|
89
|
-
if float(self.threshold) > float(result_entry.score):
|
|
90
|
-
return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
|
|
91
|
-
|
|
92
|
-
def _assess_gene_with_threshold(
|
|
93
|
-
self,
|
|
94
|
-
result_entry: RankedPhEvalGeneResult,
|
|
95
|
-
gene: ProbandCausativeGene,
|
|
96
|
-
rank_stats: RankStats,
|
|
97
|
-
) -> GenePrioritisationResult:
|
|
98
|
-
"""
|
|
99
|
-
Record the gene prioritisation rank if it meets the score threshold.
|
|
100
|
-
This method checks if the gene prioritisation rank meets the score threshold.
|
|
101
|
-
If the score of the result entry is greater than the threshold, it records the gene rank.
|
|
102
|
-
|
|
103
|
-
Args:
|
|
104
|
-
result_entry (RankedPhEvalResult): Ranked PhEval gene result entry
|
|
105
|
-
gene (ProbandCausativeGene): Diagnosed proband gene
|
|
106
|
-
rank_stats (RankStats): RankStats class instance
|
|
107
|
-
|
|
108
|
-
Returns:
|
|
109
|
-
GenePrioritisationResult: Recorded correct gene prioritisation rank result
|
|
110
|
-
"""
|
|
111
|
-
if float(self.threshold) < float(result_entry.score):
|
|
112
|
-
return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
|
|
113
|
-
|
|
114
|
-
def _record_matched_gene(
|
|
115
|
-
self,
|
|
116
|
-
gene: ProbandCausativeGene,
|
|
117
|
-
rank_stats: RankStats,
|
|
118
|
-
standardised_gene_result: RankedPhEvalGeneResult,
|
|
119
|
-
) -> GenePrioritisationResult:
|
|
120
|
-
"""
|
|
121
|
-
Return the gene rank result - handling the specification of a threshold.
|
|
122
|
-
This method determines and returns the gene rank result based on the specified threshold
|
|
123
|
-
and score order. If the threshold is 0.0, it records the gene rank directly.
|
|
124
|
-
Otherwise, it assesses the gene with the threshold based on the score order.
|
|
125
|
-
Args:
|
|
126
|
-
gene (ProbandCausativeGene): Diagnosed proband gene
|
|
127
|
-
rank_stats (RankStats): RankStats class instance
|
|
128
|
-
standardised_gene_result (RankedPhEvalGeneResult): Ranked PhEval gene result entry
|
|
129
|
-
Returns:
|
|
130
|
-
GenePrioritisationResult: Recorded correct gene prioritisation rank result
|
|
131
|
-
"""
|
|
132
|
-
if float(self.threshold) == 0.0:
|
|
133
|
-
return self._record_gene_prioritisation_match(
|
|
134
|
-
gene, standardised_gene_result, rank_stats
|
|
135
|
-
)
|
|
136
|
-
else:
|
|
137
|
-
return (
|
|
138
|
-
self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)
|
|
139
|
-
if self.score_order != "ascending"
|
|
140
|
-
else self._assess_gene_with_threshold_ascending_order(
|
|
141
|
-
standardised_gene_result, gene, rank_stats
|
|
142
|
-
)
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
@staticmethod
|
|
146
|
-
def _check_string_representation(entity: str) -> Union[List[str], str]:
|
|
147
|
-
"""
|
|
148
|
-
Check if the input string is a representation of a list and returns the list if true, otherwise the string.
|
|
149
|
-
|
|
150
|
-
Args:
|
|
151
|
-
entity (str): The input entity to check.
|
|
152
|
-
|
|
153
|
-
Returns:
|
|
154
|
-
Union[List[str], str]: A list if the input string is a list representation, otherwise
|
|
155
|
-
the original string.
|
|
156
|
-
"""
|
|
157
|
-
list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*\]$")
|
|
158
|
-
if list_pattern.match(str(entity)):
|
|
159
|
-
return ast.literal_eval(entity)
|
|
160
|
-
else:
|
|
161
|
-
return entity
|
|
162
|
-
|
|
163
16
|
def assess_gene_prioritisation(
|
|
164
17
|
self,
|
|
165
|
-
|
|
166
|
-
|
|
18
|
+
standardised_gene_result_path: Path,
|
|
19
|
+
phenopacket_path: Path,
|
|
167
20
|
binary_classification_stats: BinaryClassificationStats,
|
|
168
21
|
) -> None:
|
|
169
22
|
"""
|
|
@@ -172,78 +25,47 @@ class AssessGenePrioritisation:
|
|
|
172
25
|
and records ranks using a PrioritisationRankRecorder.
|
|
173
26
|
|
|
174
27
|
Args:
|
|
175
|
-
|
|
176
|
-
|
|
28
|
+
standardised_gene_result_path (Path): Path to the standardised gene TSV result.
|
|
29
|
+
phenopacket_path (Path): Path to the Phenopacket.
|
|
177
30
|
binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
178
31
|
"""
|
|
179
32
|
relevant_ranks = []
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
33
|
+
df = self.conn.execute(
|
|
34
|
+
f"""SELECT * FROM {self.table_name} WHERE phenopacket = '{phenopacket_path.name}'"""
|
|
35
|
+
).fetchdf()
|
|
36
|
+
for _i, row in df.iterrows():
|
|
37
|
+
result = (
|
|
38
|
+
self.conn.execute(
|
|
39
|
+
f"SELECT * FROM '{standardised_gene_result_path}' "
|
|
40
|
+
f"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),"
|
|
41
|
+
f" '{row['gene_identifier']}') "
|
|
42
|
+
f"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), "
|
|
43
|
+
f"'{row['gene_symbol']}')"
|
|
186
44
|
)
|
|
187
|
-
|
|
188
|
-
|
|
45
|
+
.fetchdf()
|
|
46
|
+
.to_dict(orient="records")
|
|
47
|
+
)
|
|
48
|
+
if len(result) > 0:
|
|
49
|
+
gene_match = self._record_matched_entity(RankedPhEvalGeneResult(**result[0]))
|
|
50
|
+
relevant_ranks.append(gene_match)
|
|
51
|
+
primary_key = f"{phenopacket_path.name}-{row['gene_symbol']}"
|
|
52
|
+
self.conn.execute(
|
|
53
|
+
f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
|
|
54
|
+
(gene_match, primary_key),
|
|
189
55
|
)
|
|
190
|
-
if (
|
|
191
|
-
isinstance(gene_identifier, list)
|
|
192
|
-
and gene.gene_identifier in gene_identifier
|
|
193
|
-
or isinstance(gene_identifier, str)
|
|
194
|
-
and gene.gene_identifier == str
|
|
195
|
-
or isinstance(gene_symbol, list)
|
|
196
|
-
and gene.gene_symbol in gene_symbol
|
|
197
|
-
or isinstance(gene_symbol, str)
|
|
198
|
-
and gene.gene_symbol == gene_symbol
|
|
199
|
-
):
|
|
200
|
-
gene_match = self._record_matched_gene(
|
|
201
|
-
gene, rank_stats, standardised_gene_result
|
|
202
|
-
)
|
|
203
|
-
(
|
|
204
|
-
relevant_ranks.append(gene_match.rank)
|
|
205
|
-
if gene_match
|
|
206
|
-
else relevant_ranks.append(0)
|
|
207
|
-
)
|
|
208
|
-
break
|
|
209
|
-
PrioritisationRankRecorder(
|
|
210
|
-
rank_stats.total,
|
|
211
|
-
self.results_dir,
|
|
212
|
-
(
|
|
213
|
-
GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
|
|
214
|
-
if gene_match is None
|
|
215
|
-
else gene_match
|
|
216
|
-
),
|
|
217
|
-
rank_records,
|
|
218
|
-
).record_rank()
|
|
219
|
-
rank_stats.relevant_result_ranks.append(relevant_ranks)
|
|
220
56
|
binary_classification_stats.add_classification(
|
|
221
|
-
|
|
57
|
+
self.db_connection.parse_table_into_dataclass(
|
|
58
|
+
str(standardised_gene_result_path), RankedPhEvalGeneResult
|
|
59
|
+
),
|
|
60
|
+
relevant_ranks,
|
|
222
61
|
)
|
|
223
62
|
|
|
224
63
|
|
|
225
|
-
def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
|
|
226
|
-
"""
|
|
227
|
-
Obtain known genes from a Phenopacket.
|
|
228
|
-
Args:
|
|
229
|
-
phenopacket_path (Path): Path to the Phenopacket file.
|
|
230
|
-
Returns:
|
|
231
|
-
List[ProbandCausativeGene]: A list of known genes associated with the proband,
|
|
232
|
-
extracted from the Phenopacket.
|
|
233
|
-
"""
|
|
234
|
-
phenopacket = phenopacket_reader(phenopacket_path)
|
|
235
|
-
phenopacket_util = PhenopacketUtil(phenopacket)
|
|
236
|
-
return phenopacket_util.diagnosed_genes()
|
|
237
|
-
|
|
238
|
-
|
|
239
64
|
def assess_phenopacket_gene_prioritisation(
|
|
240
65
|
phenopacket_path: Path,
|
|
241
|
-
|
|
242
|
-
results_dir_and_input: TrackInputOutputDirectories,
|
|
243
|
-
threshold: float,
|
|
244
|
-
gene_rank_stats: RankStats,
|
|
245
|
-
gene_rank_comparison: defaultdict,
|
|
66
|
+
run: RunConfig,
|
|
246
67
|
gene_binary_classification_stats: BinaryClassificationStats,
|
|
68
|
+
gene_benchmarker: AssessGenePrioritisation,
|
|
247
69
|
) -> None:
|
|
248
70
|
"""
|
|
249
71
|
Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results
|
|
@@ -251,62 +73,64 @@ def assess_phenopacket_gene_prioritisation(
|
|
|
251
73
|
|
|
252
74
|
Args:
|
|
253
75
|
phenopacket_path (Path): Path to the Phenopacket.
|
|
254
|
-
|
|
255
|
-
results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
|
|
256
|
-
threshold (float): Threshold for assessment.
|
|
257
|
-
gene_rank_stats (RankStats): RankStats class instance.
|
|
258
|
-
gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
|
|
76
|
+
run (RunConfig): Run configuration.
|
|
259
77
|
gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
78
|
+
gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance.
|
|
260
79
|
"""
|
|
261
|
-
|
|
80
|
+
standardised_gene_result_path = run.results_dir.joinpath(
|
|
262
81
|
f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
|
|
263
82
|
)
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
AssessGenePrioritisation(
|
|
83
|
+
gene_benchmarker.assess_gene_prioritisation(
|
|
84
|
+
standardised_gene_result_path,
|
|
267
85
|
phenopacket_path,
|
|
268
|
-
|
|
269
|
-
parse_pheval_result(RankedPhEvalGeneResult, pheval_gene_result),
|
|
270
|
-
threshold,
|
|
271
|
-
score_order,
|
|
272
|
-
proband_causative_genes,
|
|
273
|
-
).assess_gene_prioritisation(
|
|
274
|
-
gene_rank_stats, gene_rank_comparison, gene_binary_classification_stats
|
|
86
|
+
gene_binary_classification_stats,
|
|
275
87
|
)
|
|
276
88
|
|
|
277
89
|
|
|
278
90
|
def benchmark_gene_prioritisation(
|
|
279
|
-
|
|
91
|
+
benchmark_name: str,
|
|
92
|
+
run: RunConfig,
|
|
280
93
|
score_order: str,
|
|
281
94
|
threshold: float,
|
|
282
|
-
gene_rank_comparison: defaultdict,
|
|
283
95
|
) -> BenchmarkRunResults:
|
|
284
96
|
"""
|
|
285
97
|
Benchmark a directory based on gene prioritisation results.
|
|
286
98
|
Args:
|
|
287
|
-
|
|
99
|
+
benchmark_name (str): Name of the benchmark.
|
|
100
|
+
run (RunConfig): Run configuration.
|
|
288
101
|
score_order (str): The order in which scores are arranged.
|
|
289
102
|
threshold (float): Threshold for assessment.
|
|
290
|
-
gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
|
|
291
103
|
Returns:
|
|
292
104
|
BenchmarkRunResults: An object containing benchmarking results for gene prioritisation,
|
|
293
105
|
including ranks and rank statistics for the benchmarked directory.
|
|
294
106
|
"""
|
|
295
|
-
gene_rank_stats = RankStats()
|
|
296
107
|
gene_binary_classification_stats = BinaryClassificationStats()
|
|
297
|
-
|
|
108
|
+
db_connection = BenchmarkDBManager(benchmark_name)
|
|
109
|
+
db_connection.initialise()
|
|
110
|
+
gene_benchmarker = AssessGenePrioritisation(
|
|
111
|
+
db_connection,
|
|
112
|
+
f"{run.phenopacket_dir.parents[0].name}" f"_gene",
|
|
113
|
+
run.run_identifier,
|
|
114
|
+
threshold,
|
|
115
|
+
score_order,
|
|
116
|
+
)
|
|
117
|
+
for phenopacket_path in all_files(run.phenopacket_dir):
|
|
298
118
|
assess_phenopacket_gene_prioritisation(
|
|
299
119
|
phenopacket_path,
|
|
300
|
-
|
|
301
|
-
results_directory_and_input,
|
|
302
|
-
threshold,
|
|
303
|
-
gene_rank_stats,
|
|
304
|
-
gene_rank_comparison,
|
|
120
|
+
run,
|
|
305
121
|
gene_binary_classification_stats,
|
|
122
|
+
gene_benchmarker,
|
|
306
123
|
)
|
|
124
|
+
db_connection.close()
|
|
125
|
+
gene_rank_stats = RankStats()
|
|
126
|
+
gene_rank_stats.add_ranks(
|
|
127
|
+
benchmark_name=benchmark_name,
|
|
128
|
+
table_name=f"{run.phenopacket_dir.parents[0].name}_gene",
|
|
129
|
+
column_name=str(run.run_identifier),
|
|
130
|
+
)
|
|
307
131
|
return BenchmarkRunResults(
|
|
308
|
-
results_dir=results_directory_and_input.results_dir,
|
|
309
|
-
ranks=gene_rank_comparison,
|
|
310
132
|
rank_stats=gene_rank_stats,
|
|
133
|
+
benchmark_name=run.run_identifier,
|
|
311
134
|
binary_classification_stats=gene_binary_classification_stats,
|
|
135
|
+
phenopacket_dir=run.phenopacket_dir,
|
|
312
136
|
)
|