pheval-0.4.7-py3-none-any.whl → pheval-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (33)
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +27 -24
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +250 -0
  13. pheval/post_processing/post_processing.py +179 -345
  14. pheval/post_processing/validate_result_format.py +91 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/METADATA +4 -4
  19. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/RECORD +22 -26
  20. pheval/analyse/analysis.py +0 -104
  21. pheval/analyse/assess_prioritisation_base.py +0 -108
  22. pheval/analyse/benchmark_generator.py +0 -126
  23. pheval/analyse/benchmarking_data.py +0 -25
  24. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  25. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  26. pheval/analyse/generate_summary_outputs.py +0 -105
  27. pheval/analyse/parse_benchmark_summary.py +0 -81
  28. pheval/analyse/parse_corpus.py +0 -219
  29. pheval/analyse/prioritisation_result_types.py +0 -52
  30. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  31. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/LICENSE +0 -0
  32. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/WHEEL +0 -0
  33. {pheval-0.4.7.dist-info → pheval-0.5.1.dist-info}/entry_points.txt +0 -0
pheval/post_processing/post_processing.py +179 -345
@@ -1,418 +1,252 @@
1
- import logging
2
- import operator
3
- from dataclasses import dataclass, field
4
1
  from enum import Enum
5
2
  from pathlib import Path
6
- from typing import List, Union
3
+ from typing import Callable, Tuple
7
4
 
8
- import pandas as pd
5
+ import polars as pl
9
6
 
10
- info_log = logging.getLogger("info")
7
+ from pheval.post_processing.phenopacket_truth_set import PhenopacketTruthSet
8
+ from pheval.post_processing.validate_result_format import ResultSchema, validate_dataframe
9
+ from pheval.utils.file_utils import all_files
10
+ from pheval.utils.logger import get_logger
11
11
 
12
+ logger = get_logger()
12
13
 
13
- def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
14
- """Calculate the end position for a variant
15
- Args:
16
- variant_start (int): The start position of the variant
17
- variant_ref (str): The reference allele of the variant
14
+ executed_results = set()
18
15
 
19
- Returns:
20
- int: The end position of the variant
21
- """
22
- return variant_start + len(variant_ref) - 1
23
16
 
17
+ class ResultType(Enum):
18
+ """Enumeration of the possible result types."""
24
19
 
25
- @dataclass
26
- class PhEvalResult:
27
- """Base class for PhEval results."""
20
+ GENE = "gene"
21
+ DISEASE = "disease"
22
+ VARIANT = "variant"
28
23
 
29
24
 
30
- @dataclass
31
- class PhEvalGeneResult(PhEvalResult):
32
- """Minimal data required from tool-specific output for gene prioritisation result
33
- Args:
34
- gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry
35
- gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry
36
- score (float): The score for the gene result entry
37
- Notes:
38
- While we recommend providing the gene identifier in the ENSEMBL namespace,
39
- any matching format used in Phenopacket interpretations is acceptable for result matching purposes
40
- in the analysis.
41
- """
25
+ class SortOrder(Enum):
26
+ """Enumeration representing sorting orders."""
42
27
 
43
- gene_symbol: Union[List[str], str]
44
- gene_identifier: Union[List[str], str]
45
- score: float
28
+ ASCENDING = 1
29
+ """Ascending sort order."""
30
+ DESCENDING = 2
31
+ """Descending sort order."""
46
32
 
47
33
 
48
- @dataclass
49
- class RankedPhEvalGeneResult(PhEvalGeneResult):
50
- """PhEval gene result with corresponding rank
51
- Args:
52
- rank (int): The rank for the result entry
34
+ def _rank_results(results: pl.DataFrame, sort_order: SortOrder) -> pl.DataFrame:
53
35
  """
54
-
55
- rank: int
56
-
57
- @staticmethod
58
- def from_gene_result(pheval_gene_result: PhEvalGeneResult, rank: int):
59
- """Return RankedPhEvalGeneResult from a PhEvalGeneResult and rank
60
- Args:
61
- pheval_gene_result (PhEvalGeneResult): The gene result entry
62
- rank (int): The corresponding rank for the result entry
63
-
64
- Returns:
65
- RankedPhEvalGeneResult: The result as a RankedPhEvalGeneResult
66
- """
67
- return RankedPhEvalGeneResult(
68
- gene_symbol=pheval_gene_result.gene_symbol,
69
- gene_identifier=pheval_gene_result.gene_identifier,
70
- score=pheval_gene_result.score,
71
- rank=rank,
72
- )
73
-
74
-
75
- @dataclass
76
- class PhEvalVariantResult(PhEvalResult):
77
- """Minimal data required from tool-specific output for variant prioritisation
36
+ Rank results with the given sort order.
78
37
  Args:
79
- chromosome (str): The chromosome position of the variant recommended to be provided in the following format.
80
- This includes numerical designations from 1 to 22 representing autosomal chromosomes,
81
- as well as the sex chromosomes X and Y, and the mitochondrial chromosome MT.
82
- start (int): The start position of the variant
83
- end (int): The end position of the variant
84
- ref (str): The reference allele of the variant
85
- alt (str): The alternate allele of the variant
86
- score (float): The score for the variant result entry
87
- Notes:
88
- While we recommend providing the variant's chromosome in the specified format,
89
- any matching format used in Phenopacket interpretations is acceptable for result matching purposes
90
- in the analysis.
91
- """
38
+ results (pl.DataFrame): The results to rank.
39
+ sort_order (SortOrder): The sort order to use.
40
+ Returns:
41
+ pl.DataFrame: The ranked results.
42
+ """
43
+ sort_descending = True if sort_order == SortOrder.DESCENDING else False
44
+ has_grouping_id = "grouping_id" in results.columns
45
+ if has_grouping_id:
46
+ results = (
47
+ results.sort("score", descending=sort_descending)
48
+ .with_columns(
49
+ pl.struct(["score", "grouping_id"])
50
+ .rank(method="dense", descending=sort_descending)
51
+ .cast(pl.Int32)
52
+ .alias("min_rank")
53
+ )
54
+ .with_columns(pl.col("min_rank").max().over("score").alias("rank"))
55
+ )
56
+ else:
57
+ results = results.sort("score", descending=sort_descending).with_columns(
58
+ pl.col("score").rank(method="max", descending=sort_descending).alias("rank")
59
+ )
92
60
 
93
- chromosome: str
94
- start: int
95
- end: int
96
- ref: str
97
- alt: str
98
- score: float
99
- grouping_id: str = field(default=None)
61
+ return results
100
62
 
101
63
 
102
- @dataclass
103
- class RankedPhEvalVariantResult(PhEvalVariantResult):
104
- """PhEval variant result with corresponding rank
64
+ def _write_results_file(out_file: Path, output_df: pl.DataFrame) -> None:
65
+ """
66
+ Write results to compressed Parquet output.
105
67
  Args:
106
- rank (int): The rank for the result entry
68
+ out_file (Path): Output file to write to.
69
+ output_df (pl.DataFrame): Output dataframe.
107
70
  """
71
+ output_df.write_parquet(out_file, compression="zstd")
108
72
 
109
- rank: int = 0
110
-
111
- @staticmethod
112
- def from_variant_result(pheval_variant_result: PhEvalVariantResult, rank: int):
113
- """Return RankedPhEvalVariantResult from a PhEvalVariantResult and rank
114
- Args:
115
- pheval_variant_result (PhEvalVariantResult): The variant result entry
116
- rank (int): The corresponding rank for the result entry
117
-
118
- Returns:
119
- RankedPhEvalVariantResult: The result as a RankedPhEvalVariantResult
120
- """
121
- return RankedPhEvalVariantResult(
122
- chromosome=pheval_variant_result.chromosome,
123
- start=pheval_variant_result.start,
124
- end=pheval_variant_result.end,
125
- ref=pheval_variant_result.ref,
126
- alt=pheval_variant_result.alt,
127
- score=pheval_variant_result.score,
128
- rank=rank,
129
- )
130
73
 
74
+ def _write_gene_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
75
+ """
76
+ Write ranked PhEval gene results to a parquet file.
131
77
 
132
- @dataclass
133
- class PhEvalDiseaseResult(PhEvalResult):
134
- """Minimal data required from tool-specific output for disease prioritisation
135
78
  Args:
136
- disease_name (str): Disease name for the result entry
137
- disease_identifier (str): Identifier for the disease result entry in the OMIM namespace
138
- score (str): Score for the disease result entry
139
- Notes:
140
- While we recommend providing the disease identifier in the OMIM namespace,
141
- any matching format used in Phenopacket interpretations is acceptable for result matching purposes
142
- in the analysis.
79
+ ranked_results ([PhEvalResult]): List of ranked PhEval gene results.
80
+ output_file (Path): Path to the output file.
143
81
  """
82
+ gene_output = ranked_results.select(
83
+ ["rank", "score", "gene_symbol", "gene_identifier", "true_positive"]
84
+ )
85
+ _write_results_file(output_file, gene_output)
144
86
 
145
- disease_name: str
146
- disease_identifier: str
147
- score: float
148
87
 
88
+ def _write_variant_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
89
+ """
90
+ Write ranked PhEval variant results to a parquet file.
149
91
 
150
- @dataclass
151
- class RankedPhEvalDiseaseResult(PhEvalDiseaseResult):
152
- """PhEval disease result with corresponding rank
153
92
  Args:
154
- rank (int): The rank for the result entry
93
+ ranked_results ([PhEvalResult]): List of ranked PhEval variant results.
94
+ output_file (Path): Path to the output file.
155
95
  """
156
-
157
- rank: int
158
-
159
- @staticmethod
160
- def from_disease_result(pheval_disease_result: PhEvalDiseaseResult, rank: int):
161
- """Return RankedPhEvalDiseaseResult from a PhEvalDiseaseResult and rank
162
- Args:
163
- pheval_disease_result (PhEvalDiseaseResult): The disease result entry
164
- rank (int): The corresponding rank for the result entry
165
-
166
- Returns:
167
- RankedPhEvalDiseaseResult: The result as a RankedPhEvalDiseaseResult
168
- """
169
- return RankedPhEvalDiseaseResult(
170
- disease_name=pheval_disease_result.disease_name,
171
- disease_identifier=pheval_disease_result.disease_identifier,
172
- score=pheval_disease_result.score,
173
- rank=rank,
174
- )
175
-
176
-
177
- class SortOrder(Enum):
178
- """Enumeration representing sorting orders."""
179
-
180
- ASCENDING = 1
181
- """Ascending sort order."""
182
- DESCENDING = 2
183
- """Descending sort order."""
184
-
185
-
186
- class ResultSorter:
187
- """Class for sorting PhEvalResult instances based on a given sort order."""
188
-
189
- def __init__(self, pheval_results: [PhEvalResult], sort_order: SortOrder):
190
- """
191
- Initialise ResultSorter
192
-
193
- Args:
194
- pheval_results ([PhEvalResult]): List of PhEvalResult instances to be sorted
195
- sort_order (SortOrder): Sorting order to be applied
196
- """
197
- self.pheval_results = pheval_results
198
- self.sort_order = sort_order
199
-
200
- def _sort_by_decreasing_score(self) -> [PhEvalResult]:
201
- """
202
- Sort results in descending order based on the score
203
-
204
- Returns:
205
- [PhEvalResult]: Sorted list of PhEvalResult instances.
206
- """
207
- return sorted(self.pheval_results, key=operator.attrgetter("score"), reverse=True)
208
-
209
- def _sort_by_increasing_score(self) -> [PhEvalResult]:
210
- """
211
- Sort results in ascending order based on the score
212
-
213
- Returns:
214
- [PhEvalResult]: Sorted list of PhEvalResult instances.
215
- """
216
- return sorted(self.pheval_results, key=operator.attrgetter("score"), reverse=False)
217
-
218
- def sort_pheval_results(self) -> [PhEvalResult]:
219
- """
220
- Sort results based on the specified sort order.
221
-
222
- Returns:
223
- [PhEvalResult]: Sorted list of PhEvalResult instances.
224
- """
225
- return (
226
- self._sort_by_increasing_score()
227
- if self.sort_order == SortOrder.ASCENDING
228
- else self._sort_by_decreasing_score()
229
- )
230
-
231
-
232
- class ResultRanker:
233
- def __init__(self, pheval_result: List[PhEvalResult], sort_order: SortOrder):
234
- """
235
- Initialise the PhEvalRanker.
236
- Args:
237
- pheval_result (List[PhEvalResult]): PhEval results to rank.
238
- sort_order (SortOrder): Sorting order based on which ranking is performed.
239
- """
240
- self.pheval_result = pheval_result
241
- self.sort_order = sort_order
242
- self.ascending = sort_order == SortOrder.ASCENDING
243
-
244
- def rank(self) -> pd.DataFrame:
245
- """
246
- Rank PhEval results, managing tied scores (ex aequo) and handling grouping_id if present.
247
-
248
- Returns:
249
- pd.DataFrame : Ranked PhEval results with tied scores managed.
250
- """
251
- pheval_result_df = pd.DataFrame([data.__dict__ for data in self.pheval_result])
252
-
253
- if self._has_valid_grouping_id(pheval_result_df):
254
- pheval_result_df = self._rank_with_grouping_id(pheval_result_df)
255
- else:
256
- pheval_result_df = self._rank_without_grouping_id(pheval_result_df)
257
- return pheval_result_df.drop(columns=["min_rank", "grouping_id"], errors="ignore")
258
-
259
- @staticmethod
260
- def _has_valid_grouping_id(pheval_result_df: pd.DataFrame) -> bool:
261
- """Check if grouping_id exists and has no None values."""
262
- return (
263
- "grouping_id" in pheval_result_df.columns
264
- and not pheval_result_df["grouping_id"].isnull().any()
265
- )
266
-
267
- def _rank_with_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame:
268
- """Apply ranking when grouping_id is present and has no None values."""
269
- pheval_result_df["min_rank"] = (
270
- pheval_result_df.groupby(["score", "grouping_id"])
271
- .ngroup()
272
- .rank(method="dense", ascending=self.ascending)
273
- ).astype(int)
274
- pheval_result_df["rank"] = pheval_result_df.groupby("score")["min_rank"].transform("max")
275
- return pheval_result_df
276
-
277
- def _rank_without_grouping_id(self, pheval_result_df: pd.DataFrame) -> pd.DataFrame:
278
- """Apply ranking without using grouping_id."""
279
- pheval_result_df["rank"] = (
280
- pheval_result_df["score"].rank(method="max", ascending=self.ascending).astype(int)
281
- )
282
- return pheval_result_df
96
+ variant_output = ranked_results.select(
97
+ ["rank", "score", "chrom", "start", "end", "ref", "alt", "variant_id", "true_positive"]
98
+ )
99
+ _write_results_file(output_file, variant_output)
283
100
 
284
101
 
285
- def _return_sort_order(sort_order_str: str) -> SortOrder:
102
+ def _write_disease_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
286
103
  """
287
- Convert a string derived from the config file into SortOrder Enum
104
+ Write ranked PhEval disease results to a parquet file.
288
105
 
289
106
  Args:
290
- sort_order_str (str): String representation of the sorting order
291
-
292
- Returns:
293
- SortOrder: Enum representing the specified sorting order
294
-
295
- Raises:
296
- ValueError: If an incompatible or unknown sorting method is provided
107
+ ranked_results ([PhEvalResult]): List of ranked PhEval disease results.
108
+ output_file (Path): Path to the output file.
297
109
  """
298
- try:
299
- return SortOrder[sort_order_str.upper()]
300
- except KeyError:
301
- raise ValueError("Incompatible ordering method specified.")
110
+ disease_output = ranked_results.select(["rank", "score", "disease_identifier", "true_positive"])
111
+ _write_results_file(output_file, disease_output)
302
112
 
303
113
 
304
- def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) -> pd.DataFrame:
114
+ def _get_result_type(
115
+ result_type: ResultType, phenopacket_truth_set: PhenopacketTruthSet
116
+ ) -> Tuple[Callable, Callable]:
305
117
  """
306
- Create PhEval results with corresponding ranks based on the specified sorting order.
307
-
118
+ Get the methods for extracting the entity and writing the result for a given result type.
308
119
  Args:
309
- pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed.
310
- sort_order_str (str): String representation of the desired sorting order.
311
-
120
+ result_type (ResultType): The result type.
121
+ phenopacket_truth_set (PhenopacketTruthSet): The phenotype truth set class instance.
312
122
  Returns:
313
- pd.DataFrame: PhEval results with ranks assigned.
123
+ Tuple[Callable, Callable]: The methods for extracting the entity and the write method.
314
124
  """
315
- sort_order = _return_sort_order(sort_order_str)
316
- sorted_pheval_result = ResultSorter(pheval_result, sort_order).sort_pheval_results()
317
- return ResultRanker(sorted_pheval_result, sort_order).rank()
125
+ match result_type:
126
+ case ResultType.GENE:
127
+ return phenopacket_truth_set.classified_gene, _write_gene_result
128
+ case ResultType.VARIANT:
129
+ return phenopacket_truth_set.classified_variant, _write_variant_result
130
+ case ResultType.DISEASE:
131
+ return phenopacket_truth_set.classified_disease, _write_disease_result
318
132
 
319
133
 
320
- def _write_pheval_gene_result(
321
- ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
134
+ def create_empty_pheval_result(
135
+ phenopacket_dir: Path, output_dir: Path, result_type: ResultType
322
136
  ) -> None:
323
137
  """
324
- Write ranked PhEval gene results to a TSV file
138
+ Create an empty PhEval result for a given result type (gene, variant, or disease).
139
+
140
+ Notes:
141
+ This is necessary because some tools may not generate a result output for certain cases.
142
+ By explicitly creating an empty result, which will contain the known entity with a rank and score of 0,
143
+ we can track and identify false negatives during benchmarking,
144
+ ensuring that missing predictions are accounted for in the evaluation.
325
145
 
326
146
  Args:
327
- ranked_pheval_result ([PhEvalResult]): List of ranked PhEval gene results
328
- output_dir (Path): Path to the output directory
329
- tool_result_path (Path): Path to the tool-specific result file
147
+ phenopacket_dir (Path): The directory containing the phenopackets.
148
+ output_dir (Path): The output directory.
149
+ result_type (ResultType): The result type.
150
+
330
151
  """
331
- pheval_gene_output = ranked_pheval_result.loc[
332
- :, ["rank", "score", "gene_symbol", "gene_identifier"]
333
- ]
334
- pheval_gene_output.to_csv(
335
- output_dir.joinpath(
336
- "pheval_gene_results/" + tool_result_path.stem + "-pheval_gene_result.tsv"
337
- ),
338
- sep="\t",
339
- index=False,
340
- )
152
+ if result_type in executed_results:
153
+ return
154
+ executed_results.add(result_type)
155
+ phenopacket_truth_set = PhenopacketTruthSet(phenopacket_dir)
156
+ classify_method, write_method = _get_result_type(result_type, phenopacket_truth_set)
157
+ for file in all_files(phenopacket_dir):
158
+ classified_results = classify_method(file.stem)
159
+ write_method(
160
+ classified_results,
161
+ output_dir.joinpath(f"{file.stem}-{result_type.value}_result.parquet"),
162
+ )
341
163
 
342
164
 
343
- def _write_pheval_variant_result(
344
- ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
165
+ @validate_dataframe(ResultSchema.GENE_RESULT_SCHEMA)
166
+ def generate_gene_result(
167
+ results: pl.DataFrame,
168
+ sort_order: SortOrder,
169
+ output_dir: Path,
170
+ result_path: Path,
171
+ phenopacket_dir: Path,
345
172
  ) -> None:
346
173
  """
347
- Write ranked PhEval variant results to a TSV file
348
-
174
+ Generate PhEval gene results to a compressed Parquet output.
349
175
  Args:
350
- ranked_pheval_result ([PhEvalResult]): List of ranked PhEval gene results
176
+ results (pl.DataFrame): The gene results.
177
+ sort_order (SortOrder): The sort order to use.
351
178
  output_dir (Path): Path to the output directory
352
- tool_result_path (Path): Path to the tool-specific result file
179
+ result_path (Path): Path to the tool-specific result file.
180
+ phenopacket_dir (Path): Path to the Phenopacket directory
353
181
  """
354
- pheval_variant_output = ranked_pheval_result.loc[
355
- :, ["rank", "score", "chromosome", "start", "end", "ref", "alt"]
356
- ]
357
- pheval_variant_output.to_csv(
358
- output_dir.joinpath(
359
- "pheval_variant_results/" + tool_result_path.stem + "-pheval_variant_result.tsv"
360
- ),
361
- sep="\t",
362
- index=False,
182
+ output_file = output_dir.joinpath(f"pheval_gene_results/{result_path.stem}-gene_result.parquet")
183
+ create_empty_pheval_result(
184
+ phenopacket_dir, output_dir.joinpath("pheval_gene_results"), ResultType.GENE
363
185
  )
186
+ ranked_results = _rank_results(results, sort_order)
187
+ classified_results = PhenopacketTruthSet(phenopacket_dir).merge_gene_results(
188
+ ranked_results, output_file
189
+ )
190
+ _write_gene_result(classified_results, output_file)
364
191
 
365
192
 
366
- def _write_pheval_disease_result(
367
- ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
193
+ @validate_dataframe(ResultSchema.VARIANT_RESULT_SCHEMA)
194
+ def generate_variant_result(
195
+ results: pl.DataFrame,
196
+ sort_order: SortOrder,
197
+ output_dir: Path,
198
+ result_path: Path,
199
+ phenopacket_dir: Path,
368
200
  ) -> None:
369
201
  """
370
- Write ranked PhEval disease results to a TSV file
371
-
202
+ Generate PhEval variant results to a compressed Parquet output.
372
203
  Args:
373
- ranked_pheval_result ([PhEvalResult]): List of ranked PhEval gene results
204
+ results (pl.DataFrame): The variant results.
205
+ sort_order (SortOrder): The sort order to use.
374
206
  output_dir (Path): Path to the output directory
375
- tool_result_path (Path): Path to the tool-specific result file
207
+ result_path (Path): Path to the tool-specific result file.
208
+ phenopacket_dir (Path): Path to the Phenopacket directory
376
209
  """
377
- pheval_disease_output = ranked_pheval_result.loc[
378
- :, ["rank", "score", "disease_name", "disease_identifier"]
379
- ]
380
- pheval_disease_output.to_csv(
381
- output_dir.joinpath(
382
- "pheval_disease_results/" + tool_result_path.stem + "-pheval_disease_result.tsv"
383
- ),
384
- sep="\t",
385
- index=False,
210
+ output_file = output_dir.joinpath(
211
+ f"pheval_variant_results/{result_path.stem}-variant_result.parquet"
212
+ )
213
+ create_empty_pheval_result(
214
+ phenopacket_dir, output_dir.joinpath("pheval_variant_results"), ResultType.VARIANT
215
+ )
216
+ ranked_results = _rank_results(results, sort_order).with_columns(
217
+ pl.concat_str(["chrom", "start", "ref", "alt"], separator="-").alias("variant_id")
386
218
  )
219
+ classified_results = PhenopacketTruthSet(phenopacket_dir).merge_variant_results(
220
+ ranked_results, output_file
221
+ )
222
+ _write_variant_result(classified_results, output_file)
387
223
 
388
224
 
389
- def generate_pheval_result(
390
- pheval_result: [PhEvalResult],
391
- sort_order_str: str,
225
+ @validate_dataframe(ResultSchema.DISEASE_RESULT_SCHEMA)
226
+ def generate_disease_result(
227
+ results: pl.DataFrame,
228
+ sort_order: SortOrder,
392
229
  output_dir: Path,
393
- tool_result_path: Path,
230
+ result_path: Path,
231
+ phenopacket_dir: Path,
394
232
  ) -> None:
395
233
  """
396
- Generate PhEval variant, gene or disease TSV result based on input results.
397
-
234
+ Generate PhEval disease results to a compressed Parquet output.
398
235
  Args:
399
- pheval_result ([PhEvalResult]): List of PhEvalResult instances to be processed.
400
- sort_order_str (str): String representation of the desired sorting order.
401
- output_dir (Path): Path to the output directory.
402
- tool_result_path (Path): Path to the tool-specific result file.
403
-
404
- Raises:
405
- ValueError: If the results are not all the same type or an error occurs during file writing.
236
+ results (pl.DataFrame): The disease results.
237
+ sort_order (SortOrder): The sort order to use.
238
+ output_dir (Path): Path to the output directory
239
+ result_path (Path): Path to the tool-specific result file.
240
+ phenopacket_dir (Path): Path to the Phenopacket directory
406
241
  """
407
- if not pheval_result:
408
- info_log.warning(f"No results found for {tool_result_path.name}")
409
- return
410
- ranked_pheval_result = _create_pheval_result(pheval_result, sort_order_str)
411
- if all(isinstance(result, PhEvalGeneResult) for result in pheval_result):
412
- _write_pheval_gene_result(ranked_pheval_result, output_dir, tool_result_path)
413
- elif all(isinstance(result, PhEvalVariantResult) for result in pheval_result):
414
- _write_pheval_variant_result(ranked_pheval_result, output_dir, tool_result_path)
415
- elif all(isinstance(result, PhEvalDiseaseResult) for result in pheval_result):
416
- _write_pheval_disease_result(ranked_pheval_result, output_dir, tool_result_path)
417
- else:
418
- raise ValueError("Results are not all of the same type.")
242
+ output_file = output_dir.joinpath(
243
+ f"pheval_disease_results/{result_path.stem}-disease_result.parquet"
244
+ )
245
+ create_empty_pheval_result(
246
+ phenopacket_dir, output_dir.joinpath("pheval_disease_results"), ResultType.DISEASE
247
+ )
248
+ ranked_results = _rank_results(results, sort_order)
249
+ classified_results = PhenopacketTruthSet(phenopacket_dir).merge_disease_results(
250
+ ranked_results, output_file
251
+ )
252
+ _write_disease_result(classified_results, output_file)