pheval-0.4.6-py3-none-any.whl → pheval-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/benchmark.py +156 -0
- pheval/analyse/benchmark_db_manager.py +16 -134
- pheval/analyse/benchmark_output_type.py +43 -0
- pheval/analyse/binary_classification_curves.py +132 -0
- pheval/analyse/binary_classification_stats.py +164 -307
- pheval/analyse/generate_plots.py +210 -395
- pheval/analyse/generate_rank_comparisons.py +44 -0
- pheval/analyse/rank_stats.py +190 -382
- pheval/analyse/run_data_parser.py +21 -39
- pheval/cli.py +28 -25
- pheval/cli_pheval_utils.py +7 -8
- pheval/post_processing/phenopacket_truth_set.py +235 -0
- pheval/post_processing/post_processing.py +183 -303
- pheval/post_processing/validate_result_format.py +92 -0
- pheval/prepare/update_phenopacket.py +11 -9
- pheval/utils/logger.py +35 -0
- pheval/utils/phenopacket_utils.py +85 -91
- {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
- {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
- {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/WHEEL +1 -1
- pheval/analyse/analysis.py +0 -104
- pheval/analyse/assess_prioritisation_base.py +0 -108
- pheval/analyse/benchmark_generator.py +0 -126
- pheval/analyse/benchmarking_data.py +0 -25
- pheval/analyse/disease_prioritisation_analysis.py +0 -152
- pheval/analyse/gene_prioritisation_analysis.py +0 -147
- pheval/analyse/generate_summary_outputs.py +0 -105
- pheval/analyse/parse_benchmark_summary.py +0 -81
- pheval/analyse/parse_corpus.py +0 -219
- pheval/analyse/prioritisation_result_types.py +0 -52
- pheval/analyse/variant_prioritisation_analysis.py +0 -159
- {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
- {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,13 +1,25 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import operator
|
|
3
|
-
from dataclasses import dataclass
|
|
4
1
|
from enum import Enum
|
|
5
2
|
from pathlib import Path
|
|
6
|
-
from typing import
|
|
3
|
+
from typing import Callable, Tuple
|
|
7
4
|
|
|
8
|
-
import
|
|
5
|
+
import polars as pl
|
|
9
6
|
|
|
10
|
-
|
|
7
|
+
from pheval.post_processing.phenopacket_truth_set import PhenopacketTruthSet
|
|
8
|
+
from pheval.post_processing.validate_result_format import ResultSchema, validate_dataframe
|
|
9
|
+
from pheval.utils.file_utils import all_files
|
|
10
|
+
from pheval.utils.logger import get_logger
|
|
11
|
+
|
|
12
|
+
logger = get_logger()
|
|
13
|
+
|
|
14
|
+
executed_results = set()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ResultType(Enum):
    """Kinds of prioritisation result handled by post-processing.

    Each member's value is the lowercase label that is embedded in result
    file names (``<stem>-<value>_result.parquet``).
    """

    GENE = "gene"
    DISEASE = "disease"
    VARIANT = "variant"
|
|
11
23
|
|
|
12
24
|
|
|
13
25
|
def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
|
|
@@ -22,365 +34,233 @@ def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
|
|
|
22
34
|
return variant_start + len(variant_ref) - 1
|
|
23
35
|
|
|
24
36
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
"""Base class for PhEval results."""
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@dataclass
|
|
31
|
-
class PhEvalGeneResult(PhEvalResult):
|
|
32
|
-
"""Minimal data required from tool-specific output for gene prioritisation result
|
|
33
|
-
Args:
|
|
34
|
-
gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry
|
|
35
|
-
gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry
|
|
36
|
-
score (float): The score for the gene result entry
|
|
37
|
-
Notes:
|
|
38
|
-
While we recommend providing the gene identifier in the ENSEMBL namespace,
|
|
39
|
-
any matching format used in Phenopacket interpretations is acceptable for result matching purposes
|
|
40
|
-
in the analysis.
|
|
41
|
-
"""
|
|
37
|
+
class SortOrder(Enum):
    """Direction in which result scores are ordered before ranking."""

    # Lowest score first.
    ASCENDING = 1
    # Highest score first.
    DESCENDING = 2
|
|
46
44
|
|
|
47
45
|
|
|
48
|
-
|
|
49
|
-
class RankedPhEvalGeneResult(PhEvalGeneResult):
|
|
50
|
-
"""PhEval gene result with corresponding rank
|
|
51
|
-
Args:
|
|
52
|
-
rank (int): The rank for the result entry
|
|
46
|
+
def _rank_results(results: pl.DataFrame, sort_order: SortOrder) -> pl.DataFrame:
    """
    Sort results by score and attach a ``rank`` column.

    Without a ``grouping_id`` column, the rank is the ``max``-method rank of
    ``score``. With one, a dense rank over the ``(score, grouping_id)`` pair
    is stored as ``min_rank`` (cast to Int32), and ``rank`` is the largest
    ``min_rank`` found among rows sharing the same score.

    Args:
        results (pl.DataFrame): The results to rank; must contain ``score``.
        sort_order (SortOrder): The sort order to use.
    Returns:
        pl.DataFrame: The sorted results with ``rank`` added (and ``min_rank``
        when ``grouping_id`` is present).
    """
    descending = sort_order == SortOrder.DESCENDING
    ordered = results.sort("score", descending=descending)

    if "grouping_id" not in results.columns:
        score_rank = pl.col("score").rank(method="max", descending=descending)
        return ordered.with_columns(score_rank.alias("rank"))

    # Dense rank of each distinct (score, grouping_id) combination.
    group_rank = (
        pl.struct(["score", "grouping_id"])
        .rank(method="dense", descending=descending)
        .cast(pl.Int32)
        .alias("min_rank")
    )
    # Rows with equal scores all receive the highest group rank for that score.
    tied_rank = pl.col("min_rank").max().over("score").alias("rank")
    return ordered.with_columns(group_rank).with_columns(tied_rank)
|
|
99
74
|
|
|
100
75
|
|
|
101
|
-
|
|
102
|
-
class RankedPhEvalVariantResult(PhEvalVariantResult):
|
|
103
|
-
"""PhEval variant result with corresponding rank
|
|
104
|
-
Args:
|
|
105
|
-
rank (int): The rank for the result entry
|
|
76
|
+
def _write_results_file(out_file: Path, output_df: pl.DataFrame) -> None:
    """
    Persist a results dataframe as a zstd-compressed Parquet file.

    Args:
        out_file (Path): Destination file.
        output_df (pl.DataFrame): Dataframe to write.
    """
    output_df.write_parquet(out_file, compression="zstd")
|
|
143
84
|
|
|
144
|
-
disease_name: str
|
|
145
|
-
disease_identifier: str
|
|
146
|
-
score: float
|
|
147
85
|
|
|
86
|
+
def _write_gene_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
    """
    Write the gene result columns of a ranked dataframe to a parquet file.

    Args:
        ranked_results (pl.DataFrame): Ranked and classified gene results; must
            contain every column listed in the selection below.
        output_file (Path): Path to the output file.
    """
    gene_columns = ["rank", "score", "gene_symbol", "gene_identifier", "true_positive"]
    _write_results_file(output_file, ranked_results.select(gene_columns))
|
|
229
98
|
|
|
230
99
|
|
|
231
|
-
def
|
|
100
|
+
def _write_variant_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
    """
    Write the variant result columns of a ranked dataframe to a parquet file.

    Args:
        ranked_results (pl.DataFrame): Ranked and classified variant results;
            must contain every column listed in the selection below.
        output_file (Path): Path to the output file.
    """
    # NOTE(review): this selects "chromosome", while VARIANT_RESULT_SCHEMA names
    # the input column "chrom" - confirm the truth-set merge step renames it.
    variant_columns = [
        "rank",
        "score",
        "chromosome",
        "start",
        "end",
        "ref",
        "alt",
        "variant_id",
        "true_positive",
    ]
    _write_results_file(output_file, ranked_results.select(variant_columns))
|
|
251
112
|
|
|
252
113
|
|
|
253
|
-
def
|
|
114
|
+
def _write_disease_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
    """
    Write the disease result columns of a ranked dataframe to a parquet file.

    Args:
        ranked_results (pl.DataFrame): Ranked and classified disease results;
            must contain every column listed in the selection below.
        output_file (Path): Path to the output file.
    """
    disease_columns = ["rank", "score", "disease_name", "disease_identifier", "true_positive"]
    _write_results_file(output_file, ranked_results.select(disease_columns))
|
|
270
126
|
|
|
271
127
|
|
|
272
|
-
def
|
|
128
|
+
def _get_result_type(
|
|
129
|
+
result_type: ResultType, phenopacket_truth_set: PhenopacketTruthSet
|
|
130
|
+
) -> Tuple[Callable, Callable]:
|
|
273
131
|
"""
|
|
274
|
-
|
|
275
|
-
|
|
132
|
+
Get the methods for extracting the entity and writing the result for a given result type.
|
|
276
133
|
Args:
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
134
|
+
result_type (ResultType): The result type.
|
|
135
|
+
phenopacket_truth_set (PhenopacketTruthSet): The phenotype truth set class instance.
|
|
280
136
|
Returns:
|
|
281
|
-
|
|
137
|
+
Tuple[Callable, Callable]: The methods for extracting the entity and the write method.
|
|
282
138
|
"""
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
139
|
+
match result_type:
|
|
140
|
+
case ResultType.GENE:
|
|
141
|
+
return phenopacket_truth_set.classified_gene, _write_gene_result
|
|
142
|
+
case ResultType.VARIANT:
|
|
143
|
+
return phenopacket_truth_set.classified_variant, _write_variant_result
|
|
144
|
+
case ResultType.DISEASE:
|
|
145
|
+
return phenopacket_truth_set.classified_disease, _write_disease_result
|
|
286
146
|
|
|
287
147
|
|
|
288
|
-
def
|
|
289
|
-
|
|
148
|
+
def create_empty_pheval_result(
    phenopacket_dir: Path, output_dir: Path, result_type: ResultType
) -> None:
    """
    Write a placeholder PhEval result for every phenopacket, once per result type.

    Notes:
        Some tools may not produce output for every case. Writing a placeholder
        built from the truth set (per the original notes, the known entity with
        a rank and score of 0) lets missing predictions be counted as false
        negatives during benchmarking.

    Args:
        phenopacket_dir (Path): The directory containing the phenopackets.
        output_dir (Path): The directory the placeholder parquet files are written to.
        result_type (ResultType): The result type.
    """
    # NOTE(review): the guard is keyed on result type only, so within one process a
    # second call for a different phenopacket/output directory is skipped - confirm intended.
    if result_type in executed_results:
        return
    executed_results.add(result_type)

    truth_set = PhenopacketTruthSet(phenopacket_dir)
    classify, write = _get_result_type(result_type, truth_set)
    for phenopacket in all_files(phenopacket_dir):
        placeholder = classify(phenopacket.stem)
        destination = output_dir.joinpath(f"{phenopacket.stem}-{result_type.value}_result.parquet")
        write(placeholder, destination)
|
|
309
177
|
|
|
310
178
|
|
|
311
|
-
|
|
312
|
-
|
|
179
|
+
@validate_dataframe(ResultSchema.GENE_RESULT_SCHEMA)
def generate_gene_result(
    results: pl.DataFrame,
    sort_order: SortOrder,
    output_dir: Path,
    result_path: Path,
    phenopacket_dir: Path,
) -> None:
    """
    Rank, classify and write PhEval gene results as compressed Parquet.

    The decorator validates ``results`` against the gene result schema before
    this body runs.

    Args:
        results (pl.DataFrame): The gene results.
        sort_order (SortOrder): The sort order to use.
        output_dir (Path): Path to the output directory
        result_path (Path): Path to the tool-specific result file; its stem names the output.
        phenopacket_dir (Path): Path to the Phenopacket directory
    """
    gene_results_dir = output_dir.joinpath("pheval_gene_results")
    output_file = output_dir.joinpath(f"pheval_gene_results/{result_path.stem}-gene_result.parquet")
    # Placeholder results are written first (at most once per result type).
    create_empty_pheval_result(phenopacket_dir, gene_results_dir, ResultType.GENE)
    ranked = _rank_results(results, sort_order)
    truth_set = PhenopacketTruthSet(phenopacket_dir)
    classified = truth_set.merge_gene_results(ranked, output_file)
    _write_gene_result(classified, output_file)
|
|
332
205
|
|
|
333
206
|
|
|
334
|
-
|
|
335
|
-
|
|
207
|
+
@validate_dataframe(ResultSchema.VARIANT_RESULT_SCHEMA)
def generate_variant_result(
    results: pl.DataFrame,
    sort_order: SortOrder,
    output_dir: Path,
    result_path: Path,
    phenopacket_dir: Path,
) -> None:
    """
    Generate PhEval variant results to a compressed Parquet output.

    The decorator validates ``results`` against the variant result schema
    before this body runs. After ranking, a ``variant_id`` column is built as
    ``<chrom>-<position>-<ref>-<alt>``.

    Args:
        results (pl.DataFrame): The variant results.
        sort_order (SortOrder): The sort order to use.
        output_dir (Path): Path to the output directory
        result_path (Path): Path to the tool-specific result file; its stem names the output.
        phenopacket_dir (Path): Path to the Phenopacket directory
    """
    output_file = output_dir.joinpath(
        f"pheval_variant_results/{result_path.stem}-variant_result.parquet"
    )
    create_empty_pheval_result(
        phenopacket_dir, output_dir.joinpath("pheval_variant_results"), ResultType.VARIANT
    )
    ranked_results = _rank_results(results, sort_order)
    # The validated schema defines the position column as "start" and has no "pos",
    # so a frame matching the schema exactly would fail on a hard-coded "pos".
    # "pos" is still preferred when supplied so previously working inputs keep
    # producing the same identifiers.
    position_column = "pos" if "pos" in ranked_results.columns else "start"
    ranked_results = ranked_results.with_columns(
        pl.concat_str(["chrom", position_column, "ref", "alt"], separator="-").alias("variant_id")
    )
    classified_results = PhenopacketTruthSet(phenopacket_dir).merge_variant_results(
        ranked_results, output_file
    )
    _write_variant_result(classified_results, output_file)
|
|
355
237
|
|
|
356
238
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
239
|
+
@validate_dataframe(ResultSchema.DISEASE_RESULT_SCHEMA)
def generate_disease_result(
    results: pl.DataFrame,
    sort_order: SortOrder,
    output_dir: Path,
    result_path: Path,
    phenopacket_dir: Path,
) -> None:
    """
    Rank, classify and write PhEval disease results as compressed Parquet.

    The decorator validates ``results`` against the disease result schema
    before this body runs.

    Args:
        results (pl.DataFrame): The disease results.
        sort_order (SortOrder): The sort order to use.
        output_dir (Path): Path to the output directory
        result_path (Path): Path to the tool-specific result file; its stem names the output.
        phenopacket_dir (Path): Path to the Phenopacket directory
    """
    disease_results_dir = output_dir.joinpath("pheval_disease_results")
    output_file = output_dir.joinpath(
        f"pheval_disease_results/{result_path.stem}-disease_result.parquet"
    )
    # Placeholder results are written first (at most once per result type).
    create_empty_pheval_result(phenopacket_dir, disease_results_dir, ResultType.DISEASE)
    ranked = _rank_results(results, sort_order)
    truth_set = PhenopacketTruthSet(phenopacket_dir)
    classified = truth_set.merge_disease_results(ranked, output_file)
    _write_disease_result(classified, output_file)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from functools import wraps
|
|
3
|
+
from typing import Callable
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ResultSchema(Enum):
    """
    Expected column layouts for tool results handed to post-processing.

    Attributes:
        GENE_RESULT_SCHEMA (pl.Schema): Schema for gene-based results.
        VARIANT_RESULT_SCHEMA (pl.Schema): Schema for variant-based results.
        DISEASE_RESULT_SCHEMA (pl.Schema): Schema for disease-based results.
    """

    GENE_RESULT_SCHEMA = pl.Schema(
        {
            "gene_symbol": pl.String,
            "gene_identifier": pl.String,
            "score": pl.Float64,
            "grouping_id": pl.String,
        }
    )
    VARIANT_RESULT_SCHEMA = pl.Schema(
        {
            "chrom": pl.String,
            "start": pl.Int64,
            "end": pl.Int64,
            "ref": pl.String,
            "alt": pl.String,
            "score": pl.Float64,
            "grouping_id": pl.String,
        }
    )
    DISEASE_RESULT_SCHEMA = pl.Schema(
        {
            "disease_name": pl.String,
            "disease_identifier": pl.String,
            "score": pl.Float64,
            "grouping_id": pl.String,
        }
    )

    def validate(self, df: pl.DataFrame) -> bool:
        """
        Check a DataFrame against this member's schema.

        ``grouping_id`` is optional, but when present it must not contain
        nulls. Columns not named in the schema are ignored.

        Args:
            df (pl.DataFrame): The DataFrame to validate.
        Raises:
            ValueError: If a required column is missing or the grouping_id column contains a null value.
            TypeError: If a column exists but has an incorrect data type.
        Returns:
            bool: True if the DataFrame is valid according to the schema.
        """
        if "grouping_id" in df.columns and df["grouping_id"].null_count() > 0:
            raise ValueError("'grouping_id' column should not contain null values if provided.")

        actual_schema = df.schema
        for column, required_type in self.value.items():
            if column in actual_schema:
                found_type = actual_schema[column]
                if found_type != required_type:
                    raise TypeError(
                        f"Column '{column}' has type {found_type}, expected {required_type}"
                    )
            elif column != "grouping_id":
                # Only grouping_id may be absent.
                raise ValueError(f"Missing required column: {column}")

        return True
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def validate_dataframe(schema: "ResultSchema") -> Callable:
    """
    Decorator to validate DataFrame input based on a ResultSchema.

    The DataFrame is taken from the first positional argument, or - when the
    call uses keywords only - from the keyword named after the wrapped
    function's first parameter. The legacy keyword ``df`` is still accepted
    and forwarded positionally.

    Args:
        schema (ResultSchema): The expected schema from the `ResultSchema` enum.
    Returns:
        Callable: A decorator whose wrapper calls ``schema.validate`` on the DataFrame
        before running the wrapped function; validation errors propagate unchanged.
    Raises:
        TypeError: From the wrapper, if no DataFrame argument can be located in the call.
    """
    # Local import keeps this block self-contained without touching file-level imports.
    import inspect

    def decorator(func: Callable) -> Callable:
        try:
            parameter_names = list(inspect.signature(func).parameters)
        except (TypeError, ValueError):
            # Some callables expose no signature; only positional/"df" lookup applies then.
            parameter_names = []
        frame_parameter = parameter_names[0] if parameter_names else None

        @wraps(func)
        def wrapper(*args, **kwargs):
            if args:
                frame = args[0]
            elif frame_parameter is not None and frame_parameter in kwargs:
                # Keyword-only call such as generate_gene_result(results=df, ...).
                frame = kwargs[frame_parameter]
            elif "df" in kwargs:
                # Backward compatibility with the previous wrapper signature.
                frame = kwargs.pop("df")
                args = (frame,)
            else:
                raise TypeError(f"{func.__name__}() missing the DataFrame argument to validate")
            schema.validate(frame)
            return func(*args, **kwargs)

        return wrapper

    return decorator
|