pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. More details are available on the registry page for this release.

@@ -1,169 +1,22 @@
1
- import ast
2
- import re
3
- from collections import defaultdict
4
1
  from pathlib import Path
5
- from typing import List, Union
6
2
 
3
+ from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
4
+ from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
7
5
  from pheval.analyse.benchmarking_data import BenchmarkRunResults
8
6
  from pheval.analyse.binary_classification_stats import BinaryClassificationStats
9
- from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
10
- from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
11
- from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
12
7
  from pheval.analyse.rank_stats import RankStats
13
- from pheval.analyse.run_data_parser import TrackInputOutputDirectories
8
+ from pheval.analyse.run_data_parser import RunConfig
14
9
  from pheval.post_processing.post_processing import RankedPhEvalGeneResult
15
10
  from pheval.utils.file_utils import all_files
16
- from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
17
11
 
18
12
 
19
- class AssessGenePrioritisation:
13
+ class AssessGenePrioritisation(AssessPrioritisationBase):
20
14
  """Class for assessing gene prioritisation based on thresholds and scoring orders."""
21
15
 
22
- def __init__(
23
- self,
24
- phenopacket_path: Path,
25
- results_dir: Path,
26
- standardised_gene_results: List[RankedPhEvalGeneResult],
27
- threshold: float,
28
- score_order: str,
29
- proband_causative_genes: List[ProbandCausativeGene],
30
- ):
31
- """
32
- Initialise AssessGenePrioritisation class.
33
-
34
- Args:
35
- phenopacket_path (Path): Path to the phenopacket file
36
- results_dir (Path): Path to the results directory
37
- standardised_gene_results (List[RankedPhEvalGeneResult]): List of ranked PhEval gene results
38
- threshold (float): Threshold for scores
39
- score_order (str): Score order for results, either ascending or descending
40
- proband_causative_genes (List[ProbandCausativeGene]): List of proband causative genes
41
- """
42
- self.phenopacket_path = phenopacket_path
43
- self.results_dir = results_dir
44
- self.standardised_gene_results = standardised_gene_results
45
- self.threshold = threshold
46
- self.score_order = score_order
47
- self.proband_causative_genes = proband_causative_genes
48
-
49
- def _record_gene_prioritisation_match(
50
- self,
51
- gene: ProbandCausativeGene,
52
- result_entry: RankedPhEvalGeneResult,
53
- rank_stats: RankStats,
54
- ) -> GenePrioritisationResult:
55
- """
56
- Record the gene prioritisation rank if found within the results
57
-
58
- Args:
59
- gene (ProbandCausativeGene): Diagnosed proband gene
60
- result_entry (RankedPhEvalGeneResult): Ranked PhEval gene result entry
61
- rank_stats (RankStats): RankStats class instance
62
-
63
- Returns:
64
- GenePrioritisationResult: Recorded correct gene prioritisation rank result
65
- """
66
- rank = result_entry.rank
67
- rank_stats.add_rank(rank)
68
- return GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol, rank)
69
-
70
- def _assess_gene_with_threshold_ascending_order(
71
- self,
72
- result_entry: RankedPhEvalGeneResult,
73
- gene: ProbandCausativeGene,
74
- rank_stats: RankStats,
75
- ) -> GenePrioritisationResult:
76
- """
77
- Record the gene prioritisation rank if it meets the ascending order threshold.
78
-
79
- This method checks if the gene prioritisation rank meets the ascending order threshold.
80
- If the score of the result entry is less than the threshold, it records the gene rank.
81
-
82
- Args:
83
- result_entry (RankedPhEvalGeneResult): Ranked PhEval gene result entry
84
- gene (ProbandCausativeGene): Diagnosed proband gene
85
- rank_stats (RankStats): RankStats class instance
86
- Returns:
87
- GenePrioritisationResult: Recorded correct gene prioritisation rank result
88
- """
89
- if float(self.threshold) > float(result_entry.score):
90
- return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
91
-
92
- def _assess_gene_with_threshold(
93
- self,
94
- result_entry: RankedPhEvalGeneResult,
95
- gene: ProbandCausativeGene,
96
- rank_stats: RankStats,
97
- ) -> GenePrioritisationResult:
98
- """
99
- Record the gene prioritisation rank if it meets the score threshold.
100
- This method checks if the gene prioritisation rank meets the score threshold.
101
- If the score of the result entry is greater than the threshold, it records the gene rank.
102
-
103
- Args:
104
- result_entry (RankedPhEvalResult): Ranked PhEval gene result entry
105
- gene (ProbandCausativeGene): Diagnosed proband gene
106
- rank_stats (RankStats): RankStats class instance
107
-
108
- Returns:
109
- GenePrioritisationResult: Recorded correct gene prioritisation rank result
110
- """
111
- if float(self.threshold) < float(result_entry.score):
112
- return self._record_gene_prioritisation_match(gene, result_entry, rank_stats)
113
-
114
- def _record_matched_gene(
115
- self,
116
- gene: ProbandCausativeGene,
117
- rank_stats: RankStats,
118
- standardised_gene_result: RankedPhEvalGeneResult,
119
- ) -> GenePrioritisationResult:
120
- """
121
- Return the gene rank result - handling the specification of a threshold.
122
- This method determines and returns the gene rank result based on the specified threshold
123
- and score order. If the threshold is 0.0, it records the gene rank directly.
124
- Otherwise, it assesses the gene with the threshold based on the score order.
125
- Args:
126
- gene (ProbandCausativeGene): Diagnosed proband gene
127
- rank_stats (RankStats): RankStats class instance
128
- standardised_gene_result (RankedPhEvalGeneResult): Ranked PhEval gene result entry
129
- Returns:
130
- GenePrioritisationResult: Recorded correct gene prioritisation rank result
131
- """
132
- if float(self.threshold) == 0.0:
133
- return self._record_gene_prioritisation_match(
134
- gene, standardised_gene_result, rank_stats
135
- )
136
- else:
137
- return (
138
- self._assess_gene_with_threshold(standardised_gene_result, gene, rank_stats)
139
- if self.score_order != "ascending"
140
- else self._assess_gene_with_threshold_ascending_order(
141
- standardised_gene_result, gene, rank_stats
142
- )
143
- )
144
-
145
- @staticmethod
146
- def _check_string_representation(entity: str) -> Union[List[str], str]:
147
- """
148
- Check if the input string is a representation of a list and returns the list if true, otherwise the string.
149
-
150
- Args:
151
- entity (str): The input entity to check.
152
-
153
- Returns:
154
- Union[List[str], str]: A list if the input string is a list representation, otherwise
155
- the original string.
156
- """
157
- list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*\]$")
158
- if list_pattern.match(str(entity)):
159
- return ast.literal_eval(entity)
160
- else:
161
- return entity
162
-
163
16
  def assess_gene_prioritisation(
164
17
  self,
165
- rank_stats: RankStats,
166
- rank_records: defaultdict,
18
+ standardised_gene_result_path: Path,
19
+ phenopacket_path: Path,
167
20
  binary_classification_stats: BinaryClassificationStats,
168
21
  ) -> None:
169
22
  """
@@ -172,78 +25,47 @@ class AssessGenePrioritisation:
172
25
  and records ranks using a PrioritisationRankRecorder.
173
26
 
174
27
  Args:
175
- rank_stats (RankStats): RankStats class instance
176
- rank_records (defaultdict): A defaultdict to store the correct ranked results.
28
+ standardised_gene_result_path (Path): Path to the standardised gene TSV result.
29
+ phenopacket_path (Path): Path to the Phenopacket.
177
30
  binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
178
31
  """
179
32
  relevant_ranks = []
180
- for gene in self.proband_causative_genes:
181
- rank_stats.total += 1
182
- gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
183
- for standardised_gene_result in self.standardised_gene_results:
184
- gene_identifier = self._check_string_representation(
185
- standardised_gene_result.gene_identifier
33
+ df = self.conn.execute(
34
+ f"""SELECT * FROM {self.table_name} WHERE phenopacket = '{phenopacket_path.name}'"""
35
+ ).fetchdf()
36
+ for _i, row in df.iterrows():
37
+ result = (
38
+ self.conn.execute(
39
+ f"SELECT * FROM '{standardised_gene_result_path}' "
40
+ f"WHERE contains_entity_function(CAST(COALESCE(gene_identifier, '') AS VARCHAR),"
41
+ f" '{row['gene_identifier']}') "
42
+ f"OR contains_entity_function(CAST(COALESCE(gene_symbol, '') AS VARCHAR), "
43
+ f"'{row['gene_symbol']}')"
186
44
  )
187
- gene_symbol = self._check_string_representation(
188
- standardised_gene_result.gene_symbol
45
+ .fetchdf()
46
+ .to_dict(orient="records")
47
+ )
48
+ if len(result) > 0:
49
+ gene_match = self._record_matched_entity(RankedPhEvalGeneResult(**result[0]))
50
+ relevant_ranks.append(gene_match)
51
+ primary_key = f"{phenopacket_path.name}-{row['gene_symbol']}"
52
+ self.conn.execute(
53
+ f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
54
+ (gene_match, primary_key),
189
55
  )
190
- if (
191
- isinstance(gene_identifier, list)
192
- and gene.gene_identifier in gene_identifier
193
- or isinstance(gene_identifier, str)
194
- and gene.gene_identifier == str
195
- or isinstance(gene_symbol, list)
196
- and gene.gene_symbol in gene_symbol
197
- or isinstance(gene_symbol, str)
198
- and gene.gene_symbol == gene_symbol
199
- ):
200
- gene_match = self._record_matched_gene(
201
- gene, rank_stats, standardised_gene_result
202
- )
203
- (
204
- relevant_ranks.append(gene_match.rank)
205
- if gene_match
206
- else relevant_ranks.append(0)
207
- )
208
- break
209
- PrioritisationRankRecorder(
210
- rank_stats.total,
211
- self.results_dir,
212
- (
213
- GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
214
- if gene_match is None
215
- else gene_match
216
- ),
217
- rank_records,
218
- ).record_rank()
219
- rank_stats.relevant_result_ranks.append(relevant_ranks)
220
56
  binary_classification_stats.add_classification(
221
- pheval_results=self.standardised_gene_results, relevant_ranks=relevant_ranks
57
+ self.db_connection.parse_table_into_dataclass(
58
+ str(standardised_gene_result_path), RankedPhEvalGeneResult
59
+ ),
60
+ relevant_ranks,
222
61
  )
223
62
 
224
63
 
225
- def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
226
- """
227
- Obtain known genes from a Phenopacket.
228
- Args:
229
- phenopacket_path (Path): Path to the Phenopacket file.
230
- Returns:
231
- List[ProbandCausativeGene]: A list of known genes associated with the proband,
232
- extracted from the Phenopacket.
233
- """
234
- phenopacket = phenopacket_reader(phenopacket_path)
235
- phenopacket_util = PhenopacketUtil(phenopacket)
236
- return phenopacket_util.diagnosed_genes()
237
-
238
-
239
64
  def assess_phenopacket_gene_prioritisation(
240
65
  phenopacket_path: Path,
241
- score_order: str,
242
- results_dir_and_input: TrackInputOutputDirectories,
243
- threshold: float,
244
- gene_rank_stats: RankStats,
245
- gene_rank_comparison: defaultdict,
66
+ run: RunConfig,
246
67
  gene_binary_classification_stats: BinaryClassificationStats,
68
+ gene_benchmarker: AssessGenePrioritisation,
247
69
  ) -> None:
248
70
  """
249
71
  Assess gene prioritisation for a Phenopacket by comparing PhEval standardised gene results
@@ -251,62 +73,64 @@ def assess_phenopacket_gene_prioritisation(
251
73
 
252
74
  Args:
253
75
  phenopacket_path (Path): Path to the Phenopacket.
254
- score_order (str): The order in which scores are arranged, either ascending or descending.
255
- results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
256
- threshold (float): Threshold for assessment.
257
- gene_rank_stats (RankStats): RankStats class instance.
258
- gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
76
+ run (RunConfig): Run configuration.
259
77
  gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
78
+ gene_benchmarker (AssessGenePrioritisation): AssessGenePrioritisation class instance.
260
79
  """
261
- standardised_gene_result = results_dir_and_input.results_dir.joinpath(
80
+ standardised_gene_result_path = run.results_dir.joinpath(
262
81
  f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
263
82
  )
264
- pheval_gene_result = read_standardised_result(standardised_gene_result)
265
- proband_causative_genes = _obtain_causative_genes(phenopacket_path)
266
- AssessGenePrioritisation(
83
+ gene_benchmarker.assess_gene_prioritisation(
84
+ standardised_gene_result_path,
267
85
  phenopacket_path,
268
- results_dir_and_input.results_dir.joinpath("pheval_gene_results/"),
269
- parse_pheval_result(RankedPhEvalGeneResult, pheval_gene_result),
270
- threshold,
271
- score_order,
272
- proband_causative_genes,
273
- ).assess_gene_prioritisation(
274
- gene_rank_stats, gene_rank_comparison, gene_binary_classification_stats
86
+ gene_binary_classification_stats,
275
87
  )
276
88
 
277
89
 
278
90
  def benchmark_gene_prioritisation(
279
- results_directory_and_input: TrackInputOutputDirectories,
91
+ benchmark_name: str,
92
+ run: RunConfig,
280
93
  score_order: str,
281
94
  threshold: float,
282
- gene_rank_comparison: defaultdict,
283
95
  ) -> BenchmarkRunResults:
284
96
  """
285
97
  Benchmark a directory based on gene prioritisation results.
286
98
  Args:
287
- results_directory_and_input (TrackInputOutputDirectories): Input and output directories.
99
+ benchmark_name (str): Name of the benchmark.
100
+ run (RunConfig): Run configuration.
288
101
  score_order (str): The order in which scores are arranged.
289
102
  threshold (float): Threshold for assessment.
290
- gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
291
103
  Returns:
292
104
  BenchmarkRunResults: An object containing benchmarking results for gene prioritisation,
293
105
  including ranks and rank statistics for the benchmarked directory.
294
106
  """
295
- gene_rank_stats = RankStats()
296
107
  gene_binary_classification_stats = BinaryClassificationStats()
297
- for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
108
+ db_connection = BenchmarkDBManager(benchmark_name)
109
+ db_connection.initialise()
110
+ gene_benchmarker = AssessGenePrioritisation(
111
+ db_connection,
112
+ f"{run.phenopacket_dir.parents[0].name}" f"_gene",
113
+ run.run_identifier,
114
+ threshold,
115
+ score_order,
116
+ )
117
+ for phenopacket_path in all_files(run.phenopacket_dir):
298
118
  assess_phenopacket_gene_prioritisation(
299
119
  phenopacket_path,
300
- score_order,
301
- results_directory_and_input,
302
- threshold,
303
- gene_rank_stats,
304
- gene_rank_comparison,
120
+ run,
305
121
  gene_binary_classification_stats,
122
+ gene_benchmarker,
306
123
  )
124
+ db_connection.close()
125
+ gene_rank_stats = RankStats()
126
+ gene_rank_stats.add_ranks(
127
+ benchmark_name=benchmark_name,
128
+ table_name=f"{run.phenopacket_dir.parents[0].name}_gene",
129
+ column_name=str(run.run_identifier),
130
+ )
307
131
  return BenchmarkRunResults(
308
- results_dir=results_directory_and_input.results_dir,
309
- ranks=gene_rank_comparison,
310
132
  rank_stats=gene_rank_stats,
133
+ benchmark_name=run.run_identifier,
311
134
  binary_classification_stats=gene_binary_classification_stats,
135
+ phenopacket_dir=run.phenopacket_dir,
312
136
  )