pheval 0.4.6__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (33) hide show
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +28 -25
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +235 -0
  13. pheval/post_processing/post_processing.py +183 -303
  14. pheval/post_processing/validate_result_format.py +92 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
  19. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
  20. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/WHEEL +1 -1
  21. pheval/analyse/analysis.py +0 -104
  22. pheval/analyse/assess_prioritisation_base.py +0 -108
  23. pheval/analyse/benchmark_generator.py +0 -126
  24. pheval/analyse/benchmarking_data.py +0 -25
  25. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  26. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  27. pheval/analyse/generate_summary_outputs.py +0 -105
  28. pheval/analyse/parse_benchmark_summary.py +0 -81
  29. pheval/analyse/parse_corpus.py +0 -219
  30. pheval/analyse/prioritisation_result_types.py +0 -52
  31. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  32. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
  33. {pheval-0.4.6.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -1,13 +1,25 @@
1
- import logging
2
- import operator
3
- from dataclasses import dataclass
4
1
  from enum import Enum
5
2
  from pathlib import Path
6
- from typing import List, Union
3
+ from typing import Callable, Tuple
7
4
 
8
- import pandas as pd
5
+ import polars as pl
9
6
 
10
- info_log = logging.getLogger("info")
7
+ from pheval.post_processing.phenopacket_truth_set import PhenopacketTruthSet
8
+ from pheval.post_processing.validate_result_format import ResultSchema, validate_dataframe
9
+ from pheval.utils.file_utils import all_files
10
+ from pheval.utils.logger import get_logger
11
+
12
+ logger = get_logger()
13
+
14
+ executed_results = set()
15
+
16
+
17
+ class ResultType(Enum):
18
+ """Enumeration of the possible result types."""
19
+
20
+ GENE = "gene"
21
+ DISEASE = "disease"
22
+ VARIANT = "variant"
11
23
 
12
24
 
13
25
  def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
@@ -22,365 +34,233 @@ def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
22
34
  return variant_start + len(variant_ref) - 1
23
35
 
24
36
 
25
- @dataclass
26
- class PhEvalResult:
27
- """Base class for PhEval results."""
28
-
29
-
30
- @dataclass
31
- class PhEvalGeneResult(PhEvalResult):
32
- """Minimal data required from tool-specific output for gene prioritisation result
33
- Args:
34
- gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry
35
- gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry
36
- score (float): The score for the gene result entry
37
- Notes:
38
- While we recommend providing the gene identifier in the ENSEMBL namespace,
39
- any matching format used in Phenopacket interpretations is acceptable for result matching purposes
40
- in the analysis.
41
- """
37
+ class SortOrder(Enum):
38
+ """Enumeration representing sorting orders."""
42
39
 
43
- gene_symbol: Union[List[str], str]
44
- gene_identifier: Union[List[str], str]
45
- score: float
40
+ ASCENDING = 1
41
+ """Ascending sort order."""
42
+ DESCENDING = 2
43
+ """Descending sort order."""
46
44
 
47
45
 
48
- @dataclass
49
- class RankedPhEvalGeneResult(PhEvalGeneResult):
50
- """PhEval gene result with corresponding rank
51
- Args:
52
- rank (int): The rank for the result entry
46
+ def _rank_results(results: pl.DataFrame, sort_order: SortOrder) -> pl.DataFrame:
53
47
  """
54
-
55
- rank: int
56
-
57
- @staticmethod
58
- def from_gene_result(pheval_gene_result: PhEvalGeneResult, rank: int):
59
- """Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank
60
- Args:
61
- pheval_gene_result (PhEvalGeneResult): The gene result entry
62
- rank (int): The corresponding rank for the result entry
63
-
64
- Returns:
65
- RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult
66
- """
67
- return RankedPhEvalGeneResult(
68
- gene_symbol=pheval_gene_result.gene_symbol,
69
- gene_identifier=pheval_gene_result.gene_identifier,
70
- score=pheval_gene_result.score,
71
- rank=rank,
72
- )
73
-
74
-
75
- @dataclass
76
- class PhEvalVariantResult(PhEvalResult):
77
- """Minimal data required from tool-specific output for variant prioritisation
48
+ Rank results with the given sort order.
78
49
  Args:
79
- chromosome (str): The chromosome position of the variant recommended to be provided in the following format.
80
- This includes numerical designations from 1 to 22 representing autosomal chromosomes,
81
- as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT.
82
- start (int): The start position of the variant
83
- end (int): The end position of the variant
84
- ref (str): The reference allele of the variant
85
- alt (str): The alternate allele of the variant
86
- score (float): The score for the variant result entry
87
- Notes:
88
- While we recommend providing the variant's chromosome in the specified format,
89
- any matching format used in Phenopacket interpretations is acceptable for result matching purposes
90
- in the analysis.
50
+ results (pl.DataFrame): The results to rank.
51
+ sort_order (SortOrder): The sort order to use.
52
+ Returns:
53
+ pl.DataFrame: The ranked results.
91
54
  """
55
+ sort_descending = True if sort_order == SortOrder.DESCENDING else False
56
+ has_grouping_id = "grouping_id" in results.columns
57
+ if has_grouping_id:
58
+ results = (
59
+ results.sort("score", descending=sort_descending)
60
+ .with_columns(
61
+ pl.struct(["score", "grouping_id"])
62
+ .rank(method="dense", descending=sort_descending)
63
+ .cast(pl.Int32)
64
+ .alias("min_rank")
65
+ )
66
+ .with_columns(pl.col("min_rank").max().over("score").alias("rank"))
67
+ )
68
+ else:
69
+ results = results.sort("score", descending=sort_descending).with_columns(
70
+ pl.col("score").rank(method="max", descending=sort_descending).alias("rank")
71
+ )
92
72
 
93
- chromosome: str
94
- start: int
95
- end: int
96
- ref: str
97
- alt: str
98
- score: float
73
+ return results
99
74
 
100
75
 
101
- @dataclass
102
- class RankedPhEvalVariantResult(PhEvalVariantResult):
103
- """PhEval variant result with corresponding rank
104
- Args:
105
- rank (int): The rank for the result entry
76
+ def _write_results_file(out_file: Path, output_df: pl.DataFrame) -> None:
106
77
  """
107
-
108
- rank: int
109
-
110
- @staticmethod
111
- def from_variant_result(pheval_variant_result: PhEvalVariantResult, rank: int):
112
- """Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank
113
- Args:
114
- pheval_variant_result (PhEvalVariantResult): The variant result entry
115
- rank (int): The corresponding rank for the result entry
116
-
117
- Returns:
118
- RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult
119
- """
120
- return RankedPhEvalVariantResult(
121
- chromosome=pheval_variant_result.chromosome,
122
- start=pheval_variant_result.start,
123
- end=pheval_variant_result.end,
124
- ref=pheval_variant_result.ref,
125
- alt=pheval_variant_result.alt,
126
- score=pheval_variant_result.score,
127
- rank=rank,
128
- )
129
-
130
-
131
- @dataclass
132
- class PhEvalDiseaseResult(PhEvalResult):
133
- """Minimal data required from tool-specific output for disease prioritisation
78
+ Write results to compressed Parquet output.
134
79
  Args:
135
- disease_name (str): Disease name for the result entry
136
- disease_identifier (str): Identifier for the disease result entry in the OMIM namespace
137
- score (str): Score for the disease result entry
138
- Notes:
139
- While we recommend providing the disease identifier in the OMIM namespace,
140
- any matching format used in Phenopacket interpretations is acceptable for result matching purposes
141
- in the analysis.
80
+ out_file (Path): Output file to write to.
81
+ output_df (pl.DataFrame): Output dataframe.
142
82
  """
83
+ output_df.write_parquet(out_file, compression="zstd")
143
84
 
144
- disease_name: str
145
- disease_identifier: str
146
- score: float
147
85
 
86
+ def _write_gene_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
87
+ """
88
+ Write ranked PhEval gene results to a parquet file.
148
89
 
149
- @dataclass
150
- class RankedPhEvalDiseaseResult(PhEvalDiseaseResult):
151
- """PhEval disease result with corresponding rank
152
90
  Args:
153
- rank (int): The rank for the result entry
91
+ ranked_results ([PhEvalResult]): List of ranked PhEval gene results.
92
+ output_file (Path): Path to the output file.
154
93
  """
155
-
156
- rank: int
157
-
158
- @staticmethod
159
- def from_disease_result(pheval_disease_result: PhEvalDiseaseResult, rank: int):
160
- """Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank
161
- Args:
162
- pheval_disease_result (PhEvalDiseaseResult): The disease result entry
163
- rank (int): The corresponding rank for the result entry
164
-
165
- Returns:
166
- RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult
167
- """
168
- return RankedPhEvalDiseaseResult(
169
- disease_name=pheval_disease_result.disease_name,
170
- disease_identifier=pheval_disease_result.disease_identifier,
171
- score=pheval_disease_result.score,
172
- rank=rank,
173
- )
174
-
175
-
176
- class SortOrder(Enum):
177
- """Enumeration representing sorting orders."""
178
-
179
- ASCENDING = 1
180
- """Ascending sort order."""
181
- DESCENDING = 2
182
- """Descending sort order."""
183
-
184
-
185
- class ResultSorter:
186
- """Class for sorting PhEvalResult instances based on a given sort order."""
187
-
188
- def __init__(self, pheval_results: [PhEvalResult], sort_order: SortOrder):
189
- """
190
- Initialise ResultSorter
191
-
192
- Args:
193
- pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted
194
- sort_order (SortOrder): Sorting order to be applied
195
- """
196
- self.pheval_results = pheval_results
197
- self.sort_order = sort_order
198
-
199
- def _sort_by_decreasing_score(self) -> [PhEvalResult]:
200
- """
201
- Sort results in descending order based on the score
202
-
203
- Returns:
204
- [PhEvalResult]: Sorted list of PhEvalResult instances.
205
- """
206
- return sorted(self.pheval_results, key=operator.attrgetter("score"), reverse=True)
207
-
208
- def _sort_by_increasing_score(self) -> [PhEvalResult]:
209
- """
210
- Sort results in ascending order based on the score
211
-
212
- Returns:
213
- [PhEvalResult]: Sorted list of PhEvalResult instances.
214
- """
215
- return sorted(self.pheval_results, key=operator.attrgetter("score"), reverse=False)
216
-
217
- def sort_pheval_results(self) -> [PhEvalResult]:
218
- """
219
- Sort results based on the specified sort order.
220
-
221
- Returns:
222
- [PhEvalResult]: Sorted list of PhEvalResult instances.
223
- """
224
- return (
225
- self._sort_by_increasing_score()
226
- if self.sort_order == SortOrder.ASCENDING
227
- else self._sort_by_decreasing_score()
228
- )
94
+ gene_output = ranked_results.select(
95
+ ["rank", "score", "gene_symbol", "gene_identifier", "true_positive"]
96
+ )
97
+ _write_results_file(output_file, gene_output)
229
98
 
230
99
 
231
- def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) -> pd.DataFrame:
100
+ def _write_variant_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
232
101
  """
233
- Rank PhEval results post-processed from tool-specific output, managing tied scores (ex aequo)
102
+ Write ranked PhEval variant results to a parquet file.
234
103
 
235
104
  Args:
236
- pheval_result ([PhEvalResult]): PhEval results obtained from tool-specific output
237
- sort_order (SortOrder): Sorting order based on which ranking is performed
238
-
239
- Returns:
240
- pd.DataFrame : Ranked PhEval results with tied scores managed
241
-
242
- Raises:
243
- ValueError: If an incompatible PhEval result type is encountered
105
+ ranked_results ([PhEvalResult]): List of ranked PhEval variant results.
106
+ output_file (Path): Path to the output file.
244
107
  """
245
- pheval_result_df = pd.DataFrame([data.__dict__ for data in pheval_result])
246
- if sort_order == SortOrder.ASCENDING:
247
- pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=True)
248
- elif sort_order == SortOrder.DESCENDING:
249
- pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=False)
250
- return pheval_result_df
108
+ variant_output = ranked_results.select(
109
+ ["rank", "score", "chromosome", "start", "end", "ref", "alt", "variant_id", "true_positive"]
110
+ )
111
+ _write_results_file(output_file, variant_output)
251
112
 
252
113
 
253
- def _return_sort_order(sort_order_str: str) -> SortOrder:
114
+ def _write_disease_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
254
115
  """
255
- Convert a string derived from the config file into SortOrder Enum
116
+ Write ranked PhEval disease results to a parquet file.
256
117
 
257
118
  Args:
258
- sort_order_str (str): String representation of the sorting order
259
-
260
- Returns:
261
- SortOrder: Enum representing the specified sorting order
262
-
263
- Raises:
264
- ValueError: If an incompatible or unknown sorting method is provided
119
+ ranked_results ([PhEvalResult]): List of ranked PhEval disease results.
120
+ output_file (Path): Path to the output file.
265
121
  """
266
- try:
267
- return SortOrder[sort_order_str.upper()]
268
- except KeyError:
269
- raise ValueError("Incompatible ordering method specified.")
122
+ disease_output = ranked_results.select(
123
+ ["rank", "score", "disease_name", "disease_identifier", "true_positive"]
124
+ )
125
+ _write_results_file(output_file, disease_output)
270
126
 
271
127
 
272
- def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) -> pd.DataFrame:
128
+ def _get_result_type(
129
+ result_type: ResultType, phenopacket_truth_set: PhenopacketTruthSet
130
+ ) -> Tuple[Callable, Callable]:
273
131
  """
274
- Create PhEval results with corresponding ranks based on the specified sorting order.
275
-
132
+ Get the methods for extracting the entity and writing the result for a given result type.
276
133
  Args:
277
- pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed.
278
- sort_order_str (str): String representation of the desired sorting order.
279
-
134
+ result_type (ResultType): The result type.
135
+ phenopacket_truth_set (PhenopacketTruthSet): The phenotype truth set class instance.
280
136
  Returns:
281
- pd.DataFrame: PhEval results with ranks assigned.
137
+ Tuple[Callable, Callable]: The methods for extracting the entity and the write method.
282
138
  """
283
- sort_order = _return_sort_order(sort_order_str)
284
- sorted_pheval_result = ResultSorter(pheval_result, sort_order).sort_pheval_results()
285
- return _rank_pheval_result(sorted_pheval_result, sort_order)
139
+ match result_type:
140
+ case ResultType.GENE:
141
+ return phenopacket_truth_set.classified_gene, _write_gene_result
142
+ case ResultType.VARIANT:
143
+ return phenopacket_truth_set.classified_variant, _write_variant_result
144
+ case ResultType.DISEASE:
145
+ return phenopacket_truth_set.classified_disease, _write_disease_result
286
146
 
287
147
 
288
- def _write_pheval_gene_result(
289
- ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
148
+ def create_empty_pheval_result(
149
+ phenopacket_dir: Path, output_dir: Path, result_type: ResultType
290
150
  ) -> None:
291
151
  """
292
- Write ranked PhEval gene results to a TSV file
152
+ Create an empty PhEval result for a given result type (gene, variant, or disease).
153
+
154
+ Notes:
155
+ This is necessary because some tools may not generate a result output for certain cases.
156
+ By explicitly creating an empty result, which will contain the known entity with a rank and score of 0,
157
+ we can track and identify false negatives during benchmarking,
158
+ ensuring that missing predictions are accounted for in the evaluation.
293
159
 
294
160
  Args:
295
- ranked_pheval_result ([PhEvalResult]): List of ranked PhEval gene results
296
- output_dir (Path): Path to the output directory
297
- tool_result_path (Path): Path to the tool-specific result file
161
+ phenopacket_dir (Path): The directory containing the phenopackets.
162
+ output_dir (Path): The output directory.
163
+ result_type (ResultType): The result type.
164
+
298
165
  """
299
- pheval_gene_output = ranked_pheval_result.loc[
300
- :, ["rank", "score", "gene_symbol", "gene_identifier"]
301
- ]
302
- pheval_gene_output.to_csv(
303
- output_dir.joinpath(
304
- "pheval_gene_results/" + tool_result_path.stem + "-pheval_gene_result.tsv"
305
- ),
306
- sep="\t",
307
- index=False,
308
- )
166
+ if result_type in executed_results:
167
+ return
168
+ executed_results.add(result_type)
169
+ phenopacket_truth_set = PhenopacketTruthSet(phenopacket_dir)
170
+ classify_method, write_method = _get_result_type(result_type, phenopacket_truth_set)
171
+ for file in all_files(phenopacket_dir):
172
+ classified_results = classify_method(file.stem)
173
+ write_method(
174
+ classified_results,
175
+ output_dir.joinpath(f"{file.stem}-{result_type.value}_result.parquet"),
176
+ )
309
177
 
310
178
 
311
- def _write_pheval_variant_result(
312
- ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
179
+ @validate_dataframe(ResultSchema.GENE_RESULT_SCHEMA)
180
+ def generate_gene_result(
181
+ results: pl.DataFrame,
182
+ sort_order: SortOrder,
183
+ output_dir: Path,
184
+ result_path: Path,
185
+ phenopacket_dir: Path,
313
186
  ) -> None:
314
187
  """
315
- Write ranked PhEval variant results to a TSV file
316
-
188
+ Generate PhEval gene results to a compressed Parquet output.
317
189
  Args:
318
- ranked_pheval_result ([PhEvalResult]): List of ranked PhEval gene results
190
+ results (pl.DataFrame): The gene results.
191
+ sort_order (SortOrder): The sort order to use.
319
192
  output_dir (Path): Path to the output directory
320
- tool_result_path (Path): Path to the tool-specific result file
193
+ result_path (Path): Path to the tool-specific result file.
194
+ phenopacket_dir (Path): Path to the Phenopacket directory
321
195
  """
322
- pheval_variant_output = ranked_pheval_result.loc[
323
- :, ["rank", "score", "chromosome", "start", "end", "ref", "alt"]
324
- ]
325
- pheval_variant_output.to_csv(
326
- output_dir.joinpath(
327
- "pheval_variant_results/" + tool_result_path.stem + "-pheval_variant_result.tsv"
328
- ),
329
- sep="\t",
330
- index=False,
196
+ output_file = output_dir.joinpath(f"pheval_gene_results/{result_path.stem}-gene_result.parquet")
197
+ create_empty_pheval_result(
198
+ phenopacket_dir, output_dir.joinpath("pheval_gene_results"), ResultType.GENE
199
+ )
200
+ ranked_results = _rank_results(results, sort_order)
201
+ classified_results = PhenopacketTruthSet(phenopacket_dir).merge_gene_results(
202
+ ranked_results, output_file
331
203
  )
204
+ _write_gene_result(classified_results, output_file)
332
205
 
333
206
 
334
- def _write_pheval_disease_result(
335
- ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
207
+ @validate_dataframe(ResultSchema.VARIANT_RESULT_SCHEMA)
208
+ def generate_variant_result(
209
+ results: pl.DataFrame,
210
+ sort_order: SortOrder,
211
+ output_dir: Path,
212
+ result_path: Path,
213
+ phenopacket_dir: Path,
336
214
  ) -> None:
337
215
  """
338
- Write ranked PhEval disease results to a TSV file
339
-
216
+ Generate PhEval variant results to a compressed Parquet output.
340
217
  Args:
341
- ranked_pheval_result ([PhEvalResult]): List of ranked PhEval gene results
218
+ results (pl.DataFrame): The variant results.
219
+ sort_order (SortOrder): The sort order to use.
342
220
  output_dir (Path): Path to the output directory
343
- tool_result_path (Path): Path to the tool-specific result file
221
+ result_path (Path): Path to the tool-specific result file.
222
+ phenopacket_dir (Path): Path to the Phenopacket directory
344
223
  """
345
- pheval_disease_output = ranked_pheval_result.loc[
346
- :, ["rank", "score", "disease_name", "disease_identifier"]
347
- ]
348
- pheval_disease_output.to_csv(
349
- output_dir.joinpath(
350
- "pheval_disease_results/" + tool_result_path.stem + "-pheval_disease_result.tsv"
351
- ),
352
- sep="\t",
353
- index=False,
224
+ output_file = output_dir.joinpath(
225
+ f"pheval_variant_results/{result_path.stem}-variant_result.parquet"
226
+ )
227
+ create_empty_pheval_result(
228
+ phenopacket_dir, output_dir.joinpath("pheval_variant_results"), ResultType.VARIANT
354
229
  )
230
+ ranked_results = _rank_results(results, sort_order).with_columns(
231
+ pl.concat_str(["chrom", "pos", "ref", "alt"], separator="-").alias("variant_id")
232
+ )
233
+ classified_results = PhenopacketTruthSet(phenopacket_dir).merge_variant_results(
234
+ ranked_results, output_file
235
+ )
236
+ _write_variant_result(classified_results, output_file)
355
237
 
356
238
 
357
- def generate_pheval_result(
358
- pheval_result: [PhEvalResult],
359
- sort_order_str: str,
239
+ @validate_dataframe(ResultSchema.DISEASE_RESULT_SCHEMA)
240
+ def generate_disease_result(
241
+ results: pl.DataFrame,
242
+ sort_order: SortOrder,
360
243
  output_dir: Path,
361
- tool_result_path: Path,
244
+ result_path: Path,
245
+ phenopacket_dir: Path,
362
246
  ) -> None:
363
247
  """
364
- Generate PhEval variant, gene or disease TSV result based on input results.
365
-
248
+ Generate PhEval disease results to a compressed Parquet output.
366
249
  Args:
367
- pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed.
368
- sort_order_str (str): String representation of the desired sorting order.
369
- output_dir (Path): Path to the output directory.
370
- tool_result_path (Path): Path to the tool-specific result file.
371
-
372
- Raises:
373
- ValueError: If the results are not all the same type or an error occurs during file writing.
250
+ results (pl.DataFrame): The disease results.
251
+ sort_order (SortOrder): The sort order to use.
252
+ output_dir (Path): Path to the output directory
253
+ result_path (Path): Path to the tool-specific result file.
254
+ phenopacket_dir (Path): Path to the Phenopacket directory
374
255
  """
375
- if not pheval_result:
376
- info_log.warning(f"No results found for {tool_result_path.name}")
377
- return
378
- ranked_pheval_result = _create_pheval_result(pheval_result, sort_order_str)
379
- if all(isinstance(result, PhEvalGeneResult) for result in pheval_result):
380
- _write_pheval_gene_result(ranked_pheval_result, output_dir, tool_result_path)
381
- elif all(isinstance(result, PhEvalVariantResult) for result in pheval_result):
382
- _write_pheval_variant_result(ranked_pheval_result, output_dir, tool_result_path)
383
- elif all(isinstance(result, PhEvalDiseaseResult) for result in pheval_result):
384
- _write_pheval_disease_result(ranked_pheval_result, output_dir, tool_result_path)
385
- else:
386
- raise ValueError("Results are not all of the same type.")
256
+ output_file = output_dir.joinpath(
257
+ f"pheval_disease_results/{result_path.stem}-disease_result.parquet"
258
+ )
259
+ create_empty_pheval_result(
260
+ phenopacket_dir, output_dir.joinpath("pheval_disease_results"), ResultType.DISEASE
261
+ )
262
+ ranked_results = _rank_results(results, sort_order)
263
+ classified_results = PhenopacketTruthSet(phenopacket_dir).merge_disease_results(
264
+ ranked_results, output_file
265
+ )
266
+ _write_disease_result(classified_results, output_file)
@@ -0,0 +1,92 @@
1
+ from enum import Enum
2
+ from functools import wraps
3
+ from typing import Callable
4
+
5
+ import polars as pl
6
+
7
+
8
+ class ResultSchema(Enum):
9
+ """
10
+ Enum for different result schema formats.
11
+ Attributes:
12
+ GENE_RESULT_SCHEMA (pl.Schema): Schema for gene-based results.
13
+ VARIANT_RESULT_SCHEMA (pl.Schema): Schema for variant-based results.
14
+ DISEASE_RESULT_SCHEMA (pl.Schema): Schema for disease-based results.
15
+ """
16
+
17
+ GENE_RESULT_SCHEMA = pl.Schema(
18
+ {
19
+ "gene_symbol": pl.String,
20
+ "gene_identifier": pl.String,
21
+ "score": pl.Float64,
22
+ "grouping_id": pl.Utf8,
23
+ }
24
+ )
25
+ VARIANT_RESULT_SCHEMA = pl.Schema(
26
+ {
27
+ "chrom": pl.String,
28
+ "start": pl.Int64,
29
+ "end": pl.Int64,
30
+ "ref": pl.String,
31
+ "alt": pl.String,
32
+ "score": pl.Float64,
33
+ "grouping_id": pl.Utf8,
34
+ }
35
+ )
36
+ DISEASE_RESULT_SCHEMA = pl.Schema(
37
+ {
38
+ "disease_name": pl.String,
39
+ "disease_identifier": pl.String,
40
+ "score": pl.Float64,
41
+ "grouping_id": pl.Utf8,
42
+ }
43
+ )
44
+
45
+ def validate(self, df: pl.DataFrame) -> bool:
46
+ """
47
+ Validate that a DataFrame follows the expected schema.
48
+ Args:
49
+ df (pl.DataFrame): The DataFrame to validate.
50
+ Raises:
51
+ ValueError: If a required column is missing or the grouping_id column contains a null value.
52
+ TypeError: If a column exists but has an incorrect data type.
53
+ Returns:
54
+ bool: True if the DataFrame is valid according to the schema.
55
+ """
56
+ expected_schema = self.value
57
+
58
+ if "grouping_id" in df.columns and df["grouping_id"].null_count() > 0:
59
+ raise ValueError("'grouping_id' column should not contain null values if provided.")
60
+
61
+ for col_name, expected_type in expected_schema.items():
62
+ if col_name not in df.schema:
63
+ if col_name == "grouping_id":
64
+ continue
65
+ raise ValueError(f"Missing required column: {col_name}")
66
+
67
+ if df.schema[col_name] != expected_type:
68
+ raise TypeError(
69
+ f"Column '{col_name}' has type {df.schema[col_name]}, expected {expected_type}"
70
+ )
71
+
72
+ return True
73
+
74
+
75
+ def validate_dataframe(schema: ResultSchema) -> Callable:
76
+ """
77
+ Decorator to validate DataFrame input based on a ResultSchema.
78
+ Args:
79
+ schema (ResultSchema): The expected schema from the `ResultSchema` enum.
80
+ Returns:
81
+ Callable: A wrapped function that validates the DataFrame before execution.
82
+ """
83
+
84
+ def decorator(func: Callable) -> Callable:
85
+ @wraps(func)
86
+ def wrapper(df: pl.DataFrame, *args, **kwargs):
87
+ schema.validate(df)
88
+ return func(df, *args, **kwargs)
89
+
90
+ return wrapper
91
+
92
+ return decorator