pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

@@ -1,148 +1,23 @@
1
- from collections import defaultdict
2
1
  from pathlib import Path
3
- from typing import List
4
2
 
3
+ from pheval.analyse.assess_prioritisation_base import AssessPrioritisationBase
4
+ from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
5
5
  from pheval.analyse.benchmarking_data import BenchmarkRunResults
6
6
  from pheval.analyse.binary_classification_stats import BinaryClassificationStats
7
- from pheval.analyse.parse_pheval_result import parse_pheval_result, read_standardised_result
8
- from pheval.analyse.prioritisation_rank_recorder import PrioritisationRankRecorder
9
- from pheval.analyse.prioritisation_result_types import VariantPrioritisationResult
10
7
  from pheval.analyse.rank_stats import RankStats
11
- from pheval.analyse.run_data_parser import TrackInputOutputDirectories
8
+ from pheval.analyse.run_data_parser import RunConfig
12
9
  from pheval.post_processing.post_processing import RankedPhEvalVariantResult
13
10
  from pheval.utils.file_utils import all_files
14
- from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader
11
+ from pheval.utils.phenopacket_utils import GenomicVariant
15
12
 
16
13
 
17
- class AssessVariantPrioritisation:
14
+ class AssessVariantPrioritisation(AssessPrioritisationBase):
18
15
  """Class for assessing variant prioritisation based on thresholds and scoring orders."""
19
16
 
20
- def __init__(
21
- self,
22
- phenopacket_path: Path,
23
- results_dir: Path,
24
- standardised_variant_results: List[RankedPhEvalVariantResult],
25
- threshold: float,
26
- score_order: str,
27
- proband_causative_variants: List[GenomicVariant],
28
- ):
29
- """
30
- Initialise AssessVariantPrioritisation class
31
-
32
- Args:
33
- phenopacket_path (Path): Path to the phenopacket file
34
- results_dir (Path): Path to the results directory
35
- standardised_variant_results (List[RankedPhEvalVariantResult]): List of ranked PhEval variant results
36
- threshold (float): Threshold for scores
37
- score_order (str): Score order for results, either ascending or descending
38
- proband_causative_variants (List[GenomicVariant]): List of proband variants
39
-
40
- """
41
- self.phenopacket_path = phenopacket_path
42
- self.results_dir = results_dir
43
- self.standardised_variant_results = standardised_variant_results
44
- self.threshold = threshold
45
- self.score_order = score_order
46
- self.proband_causative_variants = proband_causative_variants
47
-
48
- def _record_variant_prioritisation_match(
49
- self,
50
- result_entry: RankedPhEvalVariantResult,
51
- rank_stats: RankStats,
52
- ) -> VariantPrioritisationResult:
53
- """
54
- Record the variant prioritisation rank if found within the results
55
- Args:
56
- result_entry (RankedPhEvalVariantResult): Ranked PhEval variant result entry
57
- rank_stats (RankStats): RankStats class instance
58
- Returns:
59
- VariantPrioritisationResult: Recorded correct variant prioritisation rank result
60
- """
61
- rank = result_entry.rank
62
- rank_stats.add_rank(rank)
63
- return VariantPrioritisationResult(
64
- self.phenopacket_path,
65
- GenomicVariant(
66
- chrom=result_entry.chromosome,
67
- pos=result_entry.start,
68
- ref=result_entry.ref,
69
- alt=result_entry.alt,
70
- ),
71
- rank,
72
- )
73
-
74
- def _assess_variant_with_threshold_ascending_order(
75
- self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
76
- ) -> VariantPrioritisationResult:
77
- """
78
- Record the variant prioritisation rank if it meets the ascending order threshold.
79
-
80
- This method checks if the variant prioritisation rank meets the ascending order threshold.
81
- If the score of the result entry is less than the threshold, it records the variant rank.
82
-
83
- Args:
84
- result_entry (RankedPhEvalVariantResult): Ranked PhEval variant result entry
85
- rank_stats (RankStats): RankStats class instance
86
-
87
- Returns:
88
- VariantPrioritisationResult: Recorded correct variant prioritisation rank result
89
- """
90
- if float(self.threshold) > float(result_entry.score):
91
- return self._record_variant_prioritisation_match(result_entry, rank_stats)
92
-
93
- def _assess_variant_with_threshold(
94
- self, result_entry: RankedPhEvalVariantResult, rank_stats: RankStats
95
- ) -> VariantPrioritisationResult:
96
- """
97
- Record the variant prioritisation rank if it meets the score threshold.
98
-
99
- This method checks if the variant prioritisation rank meets the score threshold.
100
- If the score of the result entry is greater than the threshold, it records the variant rank.
101
-
102
- Args:
103
- result_entry (RankedPhEvalVariantResult): Ranked PhEval variant result entry
104
- rank_stats (RankStats): RankStats class instance
105
-
106
- Returns:
107
- VariantPrioritisationResult: Recorded correct variant prioritisation rank result
108
- """
109
- if float(self.threshold) < float(result_entry.score):
110
- return self._record_variant_prioritisation_match(result_entry, rank_stats)
111
-
112
- def _record_matched_variant(
113
- self, rank_stats: RankStats, standardised_variant_result: RankedPhEvalVariantResult
114
- ) -> VariantPrioritisationResult:
115
- """
116
- Return the variant rank result - handling the specification of a threshold.
117
-
118
- This method determines and returns the variant rank result based on the specified threshold
119
- and score order. If the threshold is 0.0, it records the variant rank directly.
120
- Otherwise, it assesses the variant with the threshold based on the score order.
121
-
122
- Args:
123
- rank_stats (RankStats): RankStats class instance
124
- standardised_variant_result (RankedPhEvalVariantResult): Ranked PhEval variant result entry
125
-
126
- Returns:
127
- VariantPrioritisationResult: Recorded correct variant prioritisation rank result
128
- """
129
- if float(self.threshold) == 0.0:
130
- return self._record_variant_prioritisation_match(
131
- standardised_variant_result, rank_stats
132
- )
133
- else:
134
- return (
135
- self._assess_variant_with_threshold(standardised_variant_result, rank_stats)
136
- if self.score_order != "ascending"
137
- else self._assess_variant_with_threshold_ascending_order(
138
- standardised_variant_result, rank_stats
139
- )
140
- )
141
-
142
17
  def assess_variant_prioritisation(
143
18
  self,
144
- rank_stats: RankStats,
145
- rank_records: defaultdict,
19
+ standardised_variant_result_path: Path,
20
+ phenopacket_path: Path,
146
21
  binary_classification_stats: BinaryClassificationStats,
147
22
  ) -> None:
148
23
  """
@@ -152,68 +27,59 @@ class AssessVariantPrioritisation:
152
27
  and records ranks using a PrioritisationRankRecorder.
153
28
 
154
29
  Args:
155
- rank_stats (RankStats): RankStats class instance
156
- rank_records (defaultdict): A defaultdict to store the correct ranked results.
30
+ standardised_variant_result_path (Path): Path to standardised variant TSV result.
31
+ phenopacket_path (Path): Path to the phenopacket.
157
32
  binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
158
33
  """
159
34
  relevant_ranks = []
160
- for variant in self.proband_causative_variants:
161
- rank_stats.total += 1
162
- variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
163
- for result in self.standardised_variant_results:
164
- result_variant = GenomicVariant(
165
- chrom=str(result.chromosome),
166
- pos=int(result.start),
167
- ref=result.ref,
168
- alt=result.alt,
35
+ df = self.conn.execute(
36
+ f"""SELECT * FROM {self.table_name} WHERE phenopacket = '{phenopacket_path.name}'"""
37
+ ).fetchdf()
38
+ for _i, row in df.iterrows():
39
+ causative_variant = GenomicVariant(
40
+ chrom=row["chrom"],
41
+ pos=int(row["pos"]),
42
+ ref=row["ref"],
43
+ alt=row["alt"],
44
+ )
45
+ result = (
46
+ self.conn.execute(
47
+ f"SELECT * FROM '{standardised_variant_result_path}' "
48
+ f"WHERE "
49
+ f"chromosome == '{causative_variant.chrom}' AND "
50
+ f"start == {causative_variant.pos} AND "
51
+ f"ref == '{causative_variant.ref}' AND "
52
+ f"alt == '{causative_variant.alt}'"
169
53
  )
170
- if variant == result_variant:
171
- variant_match = self._record_matched_variant(rank_stats, result)
172
- (
173
- relevant_ranks.append(variant_match.rank)
174
- if variant_match
175
- else relevant_ranks.append(0)
176
- )
177
- break
178
- PrioritisationRankRecorder(
179
- rank_stats.total,
180
- self.results_dir,
181
- (
182
- VariantPrioritisationResult(self.phenopacket_path, variant)
183
- if variant_match is None
184
- else variant_match
185
- ),
186
- rank_records,
187
- ).record_rank()
188
- rank_stats.relevant_result_ranks.append(relevant_ranks)
189
- binary_classification_stats.add_classification(
190
- self.standardised_variant_results, relevant_ranks
191
- )
192
-
54
+ .fetchdf()
55
+ .to_dict(orient="records")
56
+ )
193
57
 
194
- def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
195
- """
196
- Obtain known variants from a Phenopacket.
197
- Args:
198
- phenopacket_path (Path): Path to the Phenopacket file.
58
+ if len(result) > 0:
59
+ variant_match = self._record_matched_entity(RankedPhEvalVariantResult(**result[0]))
60
+ relevant_ranks.append(variant_match)
61
+ primary_key = (
62
+ f"{phenopacket_path.name}-{causative_variant.chrom}-{causative_variant.pos}-"
63
+ f"{causative_variant.ref}-{causative_variant.alt}"
64
+ )
65
+ self.conn.execute(
66
+ f'UPDATE {self.table_name} SET "{self.column}" = ? WHERE identifier = ?',
67
+ (variant_match, primary_key),
68
+ )
199
69
 
200
- Returns:
201
- List[GenomicVariant]: A list of known variants associated with the proband,
202
- extracted from the Phenopacket.
203
- """
204
- phenopacket = phenopacket_reader(phenopacket_path)
205
- phenopacket_util = PhenopacketUtil(phenopacket)
206
- return phenopacket_util.diagnosed_variants()
70
+ binary_classification_stats.add_classification(
71
+ self.db_connection.parse_table_into_dataclass(
72
+ str(standardised_variant_result_path), RankedPhEvalVariantResult
73
+ ),
74
+ relevant_ranks,
75
+ )
207
76
 
208
77
 
209
78
  def assess_phenopacket_variant_prioritisation(
210
79
  phenopacket_path: Path,
211
- score_order: str,
212
- results_dir_and_input: TrackInputOutputDirectories,
213
- threshold: float,
214
- variant_rank_stats: RankStats,
215
- variant_rank_comparison: defaultdict,
80
+ run: RunConfig,
216
81
  variant_binary_classification_stats: BinaryClassificationStats,
82
+ variant_benchmarker: AssessVariantPrioritisation,
217
83
  ) -> None:
218
84
  """
219
85
  Assess variant prioritisation for a Phenopacket by comparing PhEval standardised variant results
@@ -221,64 +87,64 @@ def assess_phenopacket_variant_prioritisation(
221
87
 
222
88
  Args:
223
89
  phenopacket_path (Path): Path to the Phenopacket.
224
- score_order (str): The order in which scores are arranged, either ascending or descending.
225
- results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
226
- threshold (float): Threshold for assessment.
227
- variant_rank_stats (RankStats): RankStats class instance.
228
- variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
90
+ run (RunConfig): Run configuration.
229
91
  variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
92
+ variant_benchmarker (AssessVariantPrioritisation): AssessVariantPrioritisation class instance.
230
93
  """
231
- proband_causative_variants = _obtain_causative_variants(phenopacket_path)
232
- standardised_variant_result = results_dir_and_input.results_dir.joinpath(
94
+ standardised_variant_result_path = run.results_dir.joinpath(
233
95
  f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
234
96
  )
235
- pheval_variant_result = read_standardised_result(standardised_variant_result)
236
- AssessVariantPrioritisation(
97
+ variant_benchmarker.assess_variant_prioritisation(
98
+ standardised_variant_result_path,
237
99
  phenopacket_path,
238
- results_dir_and_input.results_dir.joinpath("pheval_variant_results/"),
239
- parse_pheval_result(RankedPhEvalVariantResult, pheval_variant_result),
240
- threshold,
241
- score_order,
242
- proband_causative_variants,
243
- ).assess_variant_prioritisation(
244
- variant_rank_stats, variant_rank_comparison, variant_binary_classification_stats
100
+ variant_binary_classification_stats,
245
101
  )
246
102
 
247
103
 
248
104
  def benchmark_variant_prioritisation(
249
- results_directory_and_input: TrackInputOutputDirectories,
105
+ benchmark_name: str,
106
+ run: RunConfig,
250
107
  score_order: str,
251
108
  threshold: float,
252
- variant_rank_comparison: defaultdict,
253
109
  ):
254
110
  """
255
111
  Benchmark a directory based on variant prioritisation results.
256
112
 
257
113
  Args:
258
- results_directory_and_input (TrackInputOutputDirectories): Input and output directories.
114
+ benchmark_name (str): Name of the benchmark.
115
+ run (RunConfig): Run configuration.
259
116
  score_order (str): The order in which scores are arranged.
260
117
  threshold (float): Threshold for assessment.
261
- variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
262
118
 
263
119
  Returns:
264
120
  BenchmarkRunResults: An object containing benchmarking results for variant prioritisation,
265
121
  including ranks and rank statistics for the benchmarked directory.
266
122
  """
267
- variant_rank_stats = RankStats()
268
123
  variant_binary_classification_stats = BinaryClassificationStats()
269
- for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
124
+ db_connection = BenchmarkDBManager(benchmark_name)
125
+ variant_benchmarker = AssessVariantPrioritisation(
126
+ db_connection,
127
+ f"{run.phenopacket_dir.parents[0].name}" f"_variant",
128
+ run.run_identifier,
129
+ threshold,
130
+ score_order,
131
+ )
132
+ for phenopacket_path in all_files(run.phenopacket_dir):
270
133
  assess_phenopacket_variant_prioritisation(
271
134
  phenopacket_path,
272
- score_order,
273
- results_directory_and_input,
274
- threshold,
275
- variant_rank_stats,
276
- variant_rank_comparison,
135
+ run,
277
136
  variant_binary_classification_stats,
137
+ variant_benchmarker,
278
138
  )
139
+ variant_rank_stats = RankStats()
140
+ variant_rank_stats.add_ranks(
141
+ benchmark_name=benchmark_name,
142
+ table_name=f"{run.phenopacket_dir.parents[0].name}_variant",
143
+ column_name=str(run.run_identifier),
144
+ )
279
145
  return BenchmarkRunResults(
280
- results_dir=results_directory_and_input.results_dir,
281
- ranks=variant_rank_comparison,
146
+ benchmark_name=run.run_identifier,
282
147
  rank_stats=variant_rank_stats,
283
148
  binary_classification_stats=variant_binary_classification_stats,
149
+ phenopacket_dir=run.phenopacket_dir,
284
150
  )
pheval/cli.py CHANGED
@@ -6,9 +6,8 @@ import click
6
6
 
7
7
  from .cli_pheval import run
8
8
  from .cli_pheval_utils import (
9
- benchmark,
10
- benchmark_comparison,
11
9
  create_spiked_vcfs_command,
10
+ generate_benchmark_stats,
12
11
  generate_stats_plot,
13
12
  prepare_corpus_command,
14
13
  scramble_phenopackets_command,
@@ -57,8 +56,7 @@ pheval_utils.add_command(semsim_scramble_command)
57
56
  pheval_utils.add_command(scramble_phenopackets_command)
58
57
  pheval_utils.add_command(update_phenopackets_command)
59
58
  pheval_utils.add_command(create_spiked_vcfs_command)
60
- pheval_utils.add_command(benchmark)
61
- pheval_utils.add_command(benchmark_comparison)
59
+ pheval_utils.add_command(generate_benchmark_stats)
62
60
  pheval_utils.add_command(semsim_to_exomiserdb_command)
63
61
  pheval_utils.add_command(generate_stats_plot)
64
62
  pheval_utils.add_command(prepare_corpus_command)