pheval 0.3.8__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

@@ -11,14 +11,15 @@ class BenchmarkRunResults:
11
11
  Benchmarking results for a run.
12
12
 
13
13
  Attributes:
14
- ranks (dict): Dictionary containing recorded ranks for samples.
15
14
  rank_stats (RankStats): Statistics related to benchmark.
15
+ binary_classification_stats (BinaryClassificationStats): Binary statistics related to benchmark.
16
16
  results_dir (Path, optional): Path to the result directory. Defaults to None.
17
17
  benchmark_name (str, optional): Name of the benchmark run. Defaults to None.
18
+ phenopacket_dir (Path, optional): Path to the phenopacket directory. Defaults to None.
18
19
  """
19
20
 
20
- ranks: dict
21
21
  rank_stats: RankStats
22
22
  binary_classification_stats: BinaryClassificationStats
23
23
  results_dir: Path = None
24
24
  benchmark_name: str = None
25
+ phenopacket_dir: Path = None
@@ -1,155 +1,22 @@
1
- from collections import defaultdict
2
1
  from pathlib import Path
3
- from typing import List
4
2
 
3
+ from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
4
+ from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
5
5
  from pheval.analyse.benchmarking_data import BenchmarkRunResults
6
6
  from pheval.analyse.binary_classification_stats import BinaryClassificationStats
7
- from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
8
- from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
9
- from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResult
10
7
  from pheval.analyse.rank_stats import RankStats
11
- from pheval.analyse.run_data_parser import TrackInputOutputDirectories
8
+ from pheval.analyse.run_data_parser import RunConfig
12
9
  from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
13
10
  from pheval.utils.file_utils import all_files
14
- from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
15
11
 
16
12
 
17
- class AssessDiseasePrioritisation:
13
+ class AssessDiseasePrioritisation(AssessPrioritisationBase):
18
14
  """Class for assessing disease prioritisation based on thresholds and scoring orders."""
19
15
 
20
- def __init__(
21
- self,
22
- phenopacket_path: Path,
23
- results_dir: Path,
24
- standardised_disease_results: List[RankedPhEvalDiseaseResult],
25
- threshold: float,
26
- score_order: str,
27
- proband_diseases: List[ProbandDisease],
28
- ):
29
- """
30
- Initialise AssessDiseasePrioritisation class
31
-
32
- Args:
33
- phenopacket_path (Path): Path to the phenopacket file
34
- results_dir (Path): Path to the results directory
35
- standardised_disease_results (List[RankedPhEvalDiseaseResult]): List of ranked PhEval disease results
36
- threshold (float): Threshold for scores
37
- score_order (str): Score order for results, either ascending or descending
38
- proband_diseases (List[ProbandDisease]): List of proband diseases
39
-
40
- """
41
- self.phenopacket_path = phenopacket_path
42
- self.results_dir = results_dir
43
- self.standardised_disease_results = standardised_disease_results
44
- self.threshold = threshold
45
- self.score_order = score_order
46
- self.proband_diseases = proband_diseases
47
-
48
- def _record_disease_prioritisation_match(
49
- self,
50
- disease: ProbandDisease,
51
- result_entry: RankedPhEvalDiseaseResult,
52
- rank_stats: RankStats,
53
- ) -> DiseasePrioritisationResult:
54
- """
55
- Record the disease prioritisation rank if found within the results
56
- Args:
57
- disease (ProbandDisease): Diagnosed proband disease
58
- result_entry (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
59
- rank_stats (RankStats): RankStats class instance
60
- Returns:
61
- DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
62
- """
63
- rank = result_entry.rank
64
- rank_stats.add_rank(rank)
65
- return DiseasePrioritisationResult(self.phenopacket_path, disease, rank)
66
-
67
- def _assess_disease_with_threshold_ascending_order(
68
- self,
69
- result_entry: RankedPhEvalDiseaseResult,
70
- disease: ProbandDisease,
71
- rank_stats: RankStats,
72
- ) -> DiseasePrioritisationResult:
73
- """
74
- Record the disease prioritisation rank if it meets the ascending order threshold.
75
-
76
- This method checks if the disease prioritisation rank meets the ascending order threshold.
77
- If the score of the result entry is less than the threshold, it records the disease rank.
78
-
79
- Args:
80
- result_entry (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
81
- disease (ProbandDisease): Diagnosed proband disease
82
- rank_stats (RankStats): RankStats class instance
83
-
84
- Returns:
85
- DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
86
- """
87
- if float(self.threshold) > float(result_entry.score):
88
- return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)
89
-
90
- def _assess_disease_with_threshold(
91
- self,
92
- result_entry: RankedPhEvalDiseaseResult,
93
- disease: ProbandDisease,
94
- rank_stats: RankStats,
95
- ) -> DiseasePrioritisationResult:
96
- """
97
- Record the disease prioritisation rank if it meets the score threshold.
98
-
99
- This method checks if the disease prioritisation rank meets the score threshold.
100
- If the score of the result entry is greater than the threshold, it records the disease rank.
101
-
102
- Args:
103
- result_entry (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
104
- disease (ProbandDisease): Diagnosed proband disease
105
- rank_stats (RankStats): RankStats class instance
106
-
107
- Returns:
108
- DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
109
- """
110
- if float(self.threshold) < float(result_entry.score):
111
- return self._record_disease_prioritisation_match(disease, result_entry, rank_stats)
112
-
113
- def _record_matched_disease(
114
- self,
115
- disease: ProbandDisease,
116
- rank_stats: RankStats,
117
- standardised_disease_result: RankedPhEvalDiseaseResult,
118
- ) -> DiseasePrioritisationResult:
119
- """
120
- Return the disease rank result - handling the specification of a threshold.
121
-
122
- This method determines and returns the disease rank result based on the specified threshold
123
- and score order. If the threshold is 0.0, it records the disease rank directly.
124
- Otherwise, it assesses the disease with the threshold based on the score order.
125
-
126
- Args:
127
- disease (ProbandDisease): Diagnosed proband disease
128
- rank_stats (RankStats): RankStats class instance
129
- standardised_disease_result (RankedPhEvalDiseaseResult): Ranked PhEval disease result entry
130
-
131
- Returns:
132
- DiseasePrioritisationResult: Recorded correct disease prioritisation rank result
133
- """
134
- if float(self.threshold) == 0.0:
135
- return self._record_disease_prioritisation_match(
136
- disease, standardised_disease_result, rank_stats
137
- )
138
- else:
139
- return (
140
- self._assess_disease_with_threshold(
141
- standardised_disease_result, disease, rank_stats
142
- )
143
- if self.score_order != "ascending"
144
- else self._assess_disease_with_threshold_ascending_order(
145
- standardised_disease_result, disease, rank_stats
146
- )
147
- )
148
-
149
16
  def assess_disease_prioritisation(
150
17
  self,
151
- rank_stats: RankStats,
152
- rank_records: defaultdict,
18
+ standardised_disease_result_path: Path,
19
+ phenopacket_path: Path,
153
20
  binary_classification_stats: BinaryClassificationStats,
154
21
  ) -> None:
155
22
  """
@@ -159,67 +26,49 @@ class AssessDiseasePrioritisation:
159
26
  and records ranks using a PrioritisationRankRecorder.
160
27
 
161
28
  Args:
162
- rank_stats (RankStats): RankStats class instance
163
- rank_records (defaultdict): A defaultdict to store the correct ranked results.
29
+ standardised_disease_result_path (Path): Path to the standardised disease TSV result.
30
+ phenopacket_path (Path): Path to the phenopacket.
164
31
  binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
165
32
  """
166
33
  relevant_ranks = []
167
- for disease in self.proband_diseases:
168
- rank_stats.total += 1
169
- disease_match = DiseasePrioritisationResult(self.phenopacket_path, disease)
170
- for standardised_disease_result in self.standardised_disease_results:
171
- if (
172
- disease.disease_identifier == standardised_disease_result.disease_identifier
173
- or disease.disease_name == standardised_disease_result.disease_name
174
- ):
175
- disease_match = self._record_matched_disease(
176
- disease, rank_stats, standardised_disease_result
177
- )
178
- (
179
- relevant_ranks.append(disease_match.rank)
180
- if disease_match
181
- else relevant_ranks.append(0)
182
- )
183
- break
184
- PrioritisationRankRecorder(
185
- rank_stats.total,
186
- self.results_dir,
187
- (
188
- DiseasePrioritisationResult(self.phenopacket_path, disease)
189
- if disease_match is None
190
- else disease_match
191
- ),
192
- rank_records,
193
- ).record_rank()
194
- rank_stats.relevant_result_ranks.append(relevant_ranks)
34
+ df = self.conn.execute(
35
+ f"SELECT * FROM {self.table_name} WHERE phenopacket = ? ",
36
+ (phenopacket_path.name,),
37
+ ).fetchdf()
38
+ for _i, row in df.iterrows():
39
+ result = (
40
+ self.conn.execute(
41
+ f"SELECT * FROM '{standardised_disease_result_path}' "
42
+ f"WHERE contains_entity_function(CAST(COALESCE(disease_identifier, '') AS VARCHAR),"
43
+ f" '{row['disease_identifier']}') "
44
+ f"OR contains_entity_function(CAST(COALESCE(disease_name, '') AS VARCHAR), "
45
+ f"'{row['disease_name']}')"
46
+ )
47
+ .fetchdf()
48
+ .to_dict(orient="records")
49
+ )
50
+
51
+ if len(result) > 0:
52
+ disease_match = self._record_matched_entity(RankedPhEvalDiseaseResult(**result[0]))
53
+ relevant_ranks.append(disease_match)
54
+ primary_key = f"{phenopacket_path.name}-{row['disease_identifier']}"
55
+ self.conn.execute(
56
+ f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
57
+ (disease_match, primary_key),
58
+ )
195
59
  binary_classification_stats.add_classification(
196
- self.standardised_disease_results, relevant_ranks
60
+ self.db_connection.parse_table_into_dataclass(
61
+ str(standardised_disease_result_path), RankedPhEvalDiseaseResult
62
+ ),
63
+ relevant_ranks,
197
64
  )
198
65
 
199
66
 
200
- def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
201
- """
202
- Obtain known diseases from a Phenopacket.
203
- Args:
204
- phenopacket_path (Path): Path to the Phenopacket file.
205
-
206
- Returns:
207
- List[ProbandDisease]: A list of known diseases associated with the proband,
208
- extracted from the Phenopacket.
209
- """
210
- phenopacket = phenopacket_reader(phenopacket_path)
211
- phenopacket_util = PhenopacketUtil(phenopacket)
212
- return phenopacket_util.diagnoses()
213
-
214
-
215
67
  def assess_phenopacket_disease_prioritisation(
216
68
  phenopacket_path: Path,
217
- score_order: str,
218
- results_dir_and_input: TrackInputOutputDirectories,
219
- threshold: float,
220
- disease_rank_stats: RankStats,
221
- disease_rank_comparison: defaultdict,
69
+ run: RunConfig,
222
70
  disease_binary_classification_stats: BinaryClassificationStats,
71
+ disease_benchmarker: AssessDiseasePrioritisation,
223
72
  ) -> None:
224
73
  """
225
74
  Assess disease prioritisation for a Phenopacket by comparing PhEval standardised disease results
@@ -227,64 +76,66 @@ def assess_phenopacket_disease_prioritisation(
227
76
 
228
77
  Args:
229
78
  phenopacket_path (Path): Path to the Phenopacket.
230
- score_order (str): The order in which scores are arranged, either ascending or descending.
231
- results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
232
- threshold (float): Threshold for assessment.
233
- disease_rank_stats (RankStats): RankStats class instance.
234
- disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
79
+ run (RunConfig): Run configuration.
235
80
  disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
81
+ disease_benchmarker (AssessDiseasePrioritisation): AssessDiseasePrioritisation class instance.
236
82
  """
237
- standardised_disease_result = results_dir_and_input.results_dir.joinpath(
83
+ standardised_disease_result_path = run.results_dir.joinpath(
238
84
  f"pheval_disease_results/{phenopacket_path.stem}-pheval_disease_result.tsv"
239
85
  )
240
- pheval_disease_result = read_standardised_result(standardised_disease_result)
241
- proband_diseases = _obtain_causative_diseases(phenopacket_path)
242
- AssessDiseasePrioritisation(
86
+ disease_benchmarker.assess_disease_prioritisation(
87
+ standardised_disease_result_path,
243
88
  phenopacket_path,
244
- results_dir_and_input.results_dir.joinpath("pheval_disease_results/"),
245
- parse_pheval_result(RankedPhEvalDiseaseResult, pheval_disease_result),
246
- threshold,
247
- score_order,
248
- proband_diseases,
249
- ).assess_disease_prioritisation(
250
- disease_rank_stats, disease_rank_comparison, disease_binary_classification_stats
89
+ disease_binary_classification_stats,
251
90
  )
252
91
 
253
92
 
254
93
  def benchmark_disease_prioritisation(
255
- results_directory_and_input: TrackInputOutputDirectories,
94
+ benchmark_name: str,
95
+ run: RunConfig,
256
96
  score_order: str,
257
97
  threshold: float,
258
- disease_rank_comparison: defaultdict,
259
98
  ):
260
99
  """
261
100
  Benchmark a directory based on disease prioritisation results.
262
101
 
263
102
  Args:
264
- results_directory_and_input (TrackInputOutputDirectories): Input and output directories.
103
+ benchmark_name (str): Name of the benchmark.
104
+ run (RunConfig): Run configuration.
265
105
  score_order (str): The order in which scores are arranged.
266
106
  threshold (float): Threshold for assessment.
267
- disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
268
107
 
269
108
  Returns:
270
109
  BenchmarkRunResults: An object containing benchmarking results for disease prioritisation,
271
110
  including ranks and rank statistics for the benchmarked directory.
272
111
  """
273
- disease_rank_stats = RankStats()
274
112
  disease_binary_classification_stats = BinaryClassificationStats()
275
- for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
113
+ db_connection = BenchmarkDBManager(benchmark_name)
114
+ db_connection.initialise()
115
+ disease_benchmarker = AssessDiseasePrioritisation(
116
+ db_connection,
117
+ f"{run.phenopacket_dir.parents[0].name}_disease",
118
+ run.run_identifier,
119
+ threshold,
120
+ score_order,
121
+ )
122
+ for phenopacket_path in all_files(run.phenopacket_dir):
276
123
  assess_phenopacket_disease_prioritisation(
277
124
  phenopacket_path,
278
- score_order,
279
- results_directory_and_input,
280
- threshold,
281
- disease_rank_stats,
282
- disease_rank_comparison,
125
+ run,
283
126
  disease_binary_classification_stats,
127
+ disease_benchmarker,
284
128
  )
129
+ db_connection.close()
130
+ disease_rank_stats = RankStats()
131
+ disease_rank_stats.add_ranks(
132
+ benchmark_name=benchmark_name,
133
+ table_name=f"{run.phenopacket_dir.parents[0].name}_disease",
134
+ column_name=str(run.run_identifier),
135
+ )
285
136
  return BenchmarkRunResults(
286
- results_dir=results_directory_and_input.results_dir,
287
- ranks=disease_rank_comparison,
288
137
  rank_stats=disease_rank_stats,
138
+ benchmark_name=run.run_identifier,
289
139
  binary_classification_stats=disease_binary_classification_stats,
140
+ phenopacket_dir=run.phenopacket_dir,
290
141
  )