pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (42) hide show
  1. pheval/__init__.py +0 -5
  2. pheval/analyse/__init__.py +0 -0
  3. pheval/analyse/analysis.py +703 -0
  4. pheval/analyse/generate_plots.py +312 -0
  5. pheval/analyse/generate_summary_outputs.py +186 -0
  6. pheval/analyse/rank_stats.py +61 -0
  7. pheval/cli.py +22 -7
  8. pheval/cli_pheval.py +37 -12
  9. pheval/cli_pheval_utils.py +225 -8
  10. pheval/config_parser.py +36 -0
  11. pheval/constants.py +1 -0
  12. pheval/implementations/__init__.py +1 -3
  13. pheval/post_processing/__init__.py +0 -0
  14. pheval/post_processing/post_processing.py +210 -0
  15. pheval/prepare/__init__.py +0 -0
  16. pheval/prepare/create_noisy_phenopackets.py +173 -0
  17. pheval/prepare/create_spiked_vcf.py +366 -0
  18. pheval/prepare/custom_exceptions.py +47 -0
  19. pheval/prepare/update_phenopacket.py +53 -0
  20. pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
  21. pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
  22. pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
  23. pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
  24. pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
  25. pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
  26. pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
  27. pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
  28. pheval/run_metadata.py +27 -0
  29. pheval/runners/runner.py +92 -11
  30. pheval/utils/__init__.py +0 -0
  31. pheval/utils/docs_gen.py +105 -0
  32. pheval/utils/docs_gen.sh +18 -0
  33. pheval/utils/file_utils.py +88 -0
  34. pheval/utils/phenopacket_utils.py +356 -0
  35. pheval/utils/semsim_utils.py +156 -0
  36. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
  37. pheval-0.2.0.dist-info/RECORD +41 -0
  38. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
  39. pheval/utils.py +0 -7
  40. pheval-0.1.0.dist-info/RECORD +0 -13
  41. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
  42. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,312 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+ import matplotlib
5
+ import pandas as pd
6
+ import seaborn as sns
7
+ from matplotlib import pyplot as plt
8
+
9
+ from pheval.analyse.rank_stats import RankStats
10
+ from pheval.constants import PHEVAL_RESULTS_DIRECTORY_SUFFIX
11
+
12
+
13
def trim_corpus_results_directory_suffix(corpus_results_directory: Path) -> Path:
    """Trim the results-directory suffix from the end of a corpus results directory name.

    Only a trailing ``PHEVAL_RESULTS_DIRECTORY_SUFFIX`` is removed. An occurrence
    of the same text elsewhere in the name is left untouched, so a directory
    name that merely contains the suffix text is not mangled.

    Args:
        corpus_results_directory: Directory path or name; it is converted with ``str()``.

    Returns:
        The input without the trailing suffix, wrapped in a ``Path``.
    """
    directory = str(corpus_results_directory)
    suffix = PHEVAL_RESULTS_DIRECTORY_SUFFIX
    # Guard against an empty suffix: directory[:-0] would yield an empty string.
    if suffix and directory.endswith(suffix):
        directory = directory[: -len(suffix)]
    return Path(directory)
16
+
17
+
18
@dataclass
class TrackGenePrioritisation:
    """Gene prioritisation outcome recorded for one run."""

    # Results directory of the run; its name and parent name label the run in plots.
    results_dir: Path
    # Rank data for the run, handed to the rank comparison output writer.
    ranks: dict
    # Aggregated rank statistics for the run.
    rank_stats: RankStats
25
+
26
+
27
@dataclass
class TrackVariantPrioritisation:
    """Variant prioritisation outcome recorded for one run."""

    # Results directory of the run; its name and parent name label the run in plots.
    results_dir: Path
    # Rank data for the run, handed to the rank comparison output writer.
    ranks: dict
    # Aggregated rank statistics for the run.
    rank_stats: RankStats
34
+
35
+
36
@dataclass
class TrackPrioritisation:
    """Bundle of the gene and variant prioritisation outcomes of one run."""

    # Gene-level results of the run.
    gene_prioritisation: TrackGenePrioritisation
    # Variant-level results of the run.
    variant_prioritisation: TrackVariantPrioritisation
42
+
43
+
44
class PlotGenerator:
    """Build summary bar plots of prioritisation rank statistics and save them as SVG.

    Plot rows are collected in ``self.stats`` (rank statistics) and ``self.mrr``
    (mean reciprocal rank) and converted to dataframes for plotting. Every public
    ``generate_*`` method starts from empty collections, so one instance can be
    used for more than one plot without rows leaking between them. All files are
    written relative to the current working directory, and each figure is closed
    once it has been saved.
    """

    def __init__(self, gene_analysis: bool):
        """Create a plot generator.

        Args:
            gene_analysis: True to plot gene prioritisation results, False to plot
                variant prioritisation results.
        """
        self.gene_analysis = gene_analysis
        self.stats, self.mrr = [], []
        # rcParams are process-wide: hiding these spines affects every matplotlib
        # figure created afterwards, not only the ones produced by this class.
        matplotlib.rcParams["axes.spines.right"] = False
        matplotlib.rcParams["axes.spines.top"] = False

    def _reset_plot_data(self) -> None:
        """Drop rows collected by a previous plot so they cannot leak into the next one."""
        self.stats, self.mrr = [], []

    def _retrieve_prioritisation_data(self, prioritisation_result: TrackPrioritisation):
        """Return either gene prioritisation or variant prioritisation stats."""
        return (
            prioritisation_result.gene_prioritisation
            if self.gene_analysis
            else prioritisation_result.variant_prioritisation
        )

    @staticmethod
    def _run_identifier(result) -> str:
        """Return the plot label of a run: ``<parent dir name>_<trimmed results dir name>``."""
        return (
            f"{result.results_dir.parents[0].name}_"
            f"{trim_corpus_results_directory_suffix(result.results_dir.name)}"
        )

    def _generate_stacked_bar_plot_data(self, prioritisation_result: TrackPrioritisation) -> None:
        """Append one row of non-overlapping rank bands (in percent) for the stacked bar plot."""
        result = self._retrieve_prioritisation_data(prioritisation_result)
        rank_stats = result.rank_stats
        self.stats.append(
            {
                "Run": self._run_identifier(result),
                "Top": rank_stats.percentage_top(),
                "2-3": rank_stats.percentage_difference(
                    rank_stats.percentage_top3(), rank_stats.percentage_top()
                ),
                "4-5": rank_stats.percentage_difference(
                    rank_stats.percentage_top5(), rank_stats.percentage_top3()
                ),
                "6-10": rank_stats.percentage_difference(
                    rank_stats.percentage_top10(), rank_stats.percentage_top5()
                ),
                ">10": rank_stats.percentage_difference(
                    rank_stats.percentage_found(), rank_stats.percentage_top10()
                ),
                "FO/NP": rank_stats.percentage_difference(100, rank_stats.percentage_found()),
            }
        )

    def _generate_stats_mrr_bar_plot_data(self, prioritisation_result: TrackPrioritisation) -> None:
        """Append the mean reciprocal rank row of a run for the MRR bar plot."""
        result = self._retrieve_prioritisation_data(prioritisation_result)
        self.mrr.append(
            {
                "Rank": "MRR",
                "Percentage": result.rank_stats.mean_reciprocal_rank(),
                "Run": self._run_identifier(result),
            }
        )

    def _plot_stacked_bars(
        self,
        prioritisation_data,
        stats_ylabel: str,
        stats_file: str,
        mrr_ylabel: str,
        mrr_file: str,
        **stats_plot_kwargs,
    ) -> None:
        """Draw and save the stacked rank plot and the MRR plot for the given runs.

        ``stats_plot_kwargs`` are forwarded to the stacked plot only (e.g. ``figsize``).
        """
        self._reset_plot_data()
        for prioritisation_result in prioritisation_data:
            self._generate_stacked_bar_plot_data(prioritisation_result)
            self._generate_stats_mrr_bar_plot_data(prioritisation_result)
        pd.DataFrame(self.stats).set_index("Run").plot(
            kind="bar",
            stacked=True,
            colormap="tab10",
            ylabel=stats_ylabel,
            **stats_plot_kwargs,
        ).legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
        plt.savefig(stats_file, format="svg", bbox_inches="tight")
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close()
        pd.DataFrame(self.mrr).set_index("Run").plot(
            kind="bar",
            colormap="tab10",
            ylabel=mrr_ylabel,
            legend=False,
        )
        plt.savefig(mrr_file, format="svg", bbox_inches="tight")
        plt.close()

    def generate_stacked_bar_gene(self, prioritisation_data: [TrackPrioritisation]) -> None:
        """Generate stacked bar plot and MRR bar plot for gene prioritisation stats.

        Writes ``gene_rank_stats.svg`` and ``gene_mrr.svg``.
        """
        self._plot_stacked_bars(
            prioritisation_data,
            stats_ylabel="Disease-causing genes (%)",
            stats_file="gene_rank_stats.svg",
            mrr_ylabel="Gene prioritisation mean reciprocal rank",
            mrr_file="gene_mrr.svg",
            figsize=(10, 8),
        )

    def generate_stacked_bar_variant(self, prioritisation_data: [TrackPrioritisation]):
        """Generate stacked bar plot and MRR bar plot for variant prioritisation stats.

        Writes ``variant_rank_stats.svg`` and ``variant_mrr.svg``.
        """
        self._plot_stacked_bars(
            prioritisation_data,
            stats_ylabel="Disease-causing variants (%)",
            stats_file="variant_rank_stats.svg",
            mrr_ylabel="Variant prioritisation mean reciprocal rank",
            mrr_file="variant_mrr.svg",
        )

    def _extend_rank_rows(self, result, rows) -> None:
        """Append ``(rank label, value)`` pairs to ``self.stats`` as rows labelled with the run."""
        run = self._run_identifier(result)
        self.stats.extend(
            {"Rank": rank, "Percentage": value, "Run": run} for rank, value in rows
        )

    def _generate_cumulative_bar_plot_data(
        self, prioritisation_result: TrackPrioritisation
    ) -> None:
        """Append the cumulative rank rows of a run for the cumulative bar plot.

        Percentages are stored as fractions (divided by 100); the MRR row is
        stored as returned by the rank stats.
        """
        result = self._retrieve_prioritisation_data(prioritisation_result)
        rank_stats = result.rank_stats
        self._extend_rank_rows(
            result,
            [
                ("Top", rank_stats.percentage_top() / 100),
                ("Top3", rank_stats.percentage_top3() / 100),
                ("Top5", rank_stats.percentage_top5() / 100),
                ("Top10", rank_stats.percentage_top10() / 100),
                ("Found", rank_stats.percentage_found() / 100),
                (
                    "FO/NP",
                    rank_stats.percentage_difference(100, rank_stats.percentage_found()) / 100,
                ),
                ("MRR", rank_stats.mean_reciprocal_rank()),
            ],
        )

    def _generate_non_cumulative_bar_plot_data(
        self, prioritisation_result: TrackPrioritisation
    ) -> None:
        """Append the non-overlapping rank band rows of a run for the non-cumulative bar plot.

        Percentages are stored as fractions (divided by 100); the MRR row is
        stored as returned by the rank stats.
        """
        result = self._retrieve_prioritisation_data(prioritisation_result)
        rank_stats = result.rank_stats
        self._extend_rank_rows(
            result,
            [
                ("Top", rank_stats.percentage_top() / 100),
                (
                    "2-3",
                    rank_stats.percentage_difference(
                        rank_stats.percentage_top3(), rank_stats.percentage_top()
                    )
                    / 100,
                ),
                (
                    "4-5",
                    rank_stats.percentage_difference(
                        rank_stats.percentage_top5(), rank_stats.percentage_top3()
                    )
                    / 100,
                ),
                (
                    "6-10",
                    rank_stats.percentage_difference(
                        rank_stats.percentage_top10(), rank_stats.percentage_top5()
                    )
                    / 100,
                ),
                (
                    ">10",
                    rank_stats.percentage_difference(
                        rank_stats.percentage_found(), rank_stats.percentage_top10()
                    )
                    / 100,
                ),
                (
                    "FO/NP",
                    rank_stats.percentage_difference(100, rank_stats.percentage_found()) / 100,
                ),
                ("MRR", rank_stats.mean_reciprocal_rank()),
            ],
        )

    def _plot_grouped_bars(
        self, prioritisation_data, collect_rows, ylabel: str, output_file: str
    ) -> None:
        """Collect rows with ``collect_rows`` for every run, then draw and save a grouped bar plot."""
        self._reset_plot_data()
        for prioritisation_result in prioritisation_data:
            collect_rows(prioritisation_result)
        sns.catplot(
            data=pd.DataFrame(self.stats), kind="bar", x="Rank", y="Percentage", hue="Run"
        ).set(xlabel="Rank", ylabel=ylabel)
        plt.savefig(output_file, format="svg", bbox_inches="tight")
        # pyplot keeps figures alive until they are closed explicitly.
        plt.close()

    def generate_cumulative_bar_gene(self, prioritisation_data: [TrackPrioritisation]):
        """Generate cumulative bar plot for gene prioritisation stats (``gene_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_cumulative_bar_plot_data,
            "Disease-causing genes (%)",
            "gene_rank_stats.svg",
        )

    def generate_cumulative_bar_variant(self, prioritisation_data: [TrackPrioritisation]):
        """Generate cumulative bar plot for variant prioritisation stats (``variant_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_cumulative_bar_plot_data,
            "Disease-causing variants (%)",
            "variant_rank_stats.svg",
        )

    def generate_non_cumulative_bar_gene(self, prioritisation_data: [TrackPrioritisation]):
        """Generate non-cumulative bar plot for gene prioritisation stats (``gene_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_non_cumulative_bar_plot_data,
            "Disease-causing genes (%)",
            "gene_rank_stats.svg",
        )

    def generate_non_cumulative_bar_variant(self, prioritisation_data: [TrackPrioritisation]):
        """Generate non-cumulative bar plot for variant prioritisation stats (``variant_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_non_cumulative_bar_plot_data,
            "Disease-causing variants (%)",
            "variant_rank_stats.svg",
        )
291
+
292
+
293
def generate_gene_plots(prioritisation_data: [TrackPrioritisation], plot_type: str) -> None:
    """Draw the gene prioritisation summary plot selected by ``plot_type``.

    Recognised values are "bar_stacked", "bar_cumulative" and
    "bar_non_cumulative"; any other value produces no plot.
    """
    gene_plotter = PlotGenerator(gene_analysis=True)
    plot_methods = (
        ("bar_stacked", gene_plotter.generate_stacked_bar_gene),
        ("bar_cumulative", gene_plotter.generate_cumulative_bar_gene),
        ("bar_non_cumulative", gene_plotter.generate_non_cumulative_bar_gene),
    )
    for supported_type, draw in plot_methods:
        if plot_type == supported_type:
            draw(prioritisation_data)
            break
302
+
303
+
304
def generate_variant_plots(prioritisation_data: [TrackPrioritisation], plot_type: str) -> None:
    """Draw the variant prioritisation summary plot selected by ``plot_type``.

    Recognised values are "bar_stacked", "bar_cumulative" and
    "bar_non_cumulative"; any other value produces no plot.
    """
    variant_plotter = PlotGenerator(gene_analysis=False)
    plot_methods = (
        ("bar_stacked", variant_plotter.generate_stacked_bar_variant),
        ("bar_cumulative", variant_plotter.generate_cumulative_bar_variant),
        ("bar_non_cumulative", variant_plotter.generate_non_cumulative_bar_variant),
    )
    for supported_type, draw in plot_methods:
        if plot_type == supported_type:
            draw(prioritisation_data)
            break
@@ -0,0 +1,186 @@
1
+ import csv
2
+ import itertools
3
+ from collections import defaultdict
4
+ from copy import deepcopy
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+ from pheval.analyse.generate_plots import (
10
+ TrackPrioritisation,
11
+ generate_gene_plots,
12
+ generate_variant_plots,
13
+ )
14
+ from pheval.analyse.rank_stats import RankStats
15
+
16
+
17
class RankComparisonGenerator:
    """Write the run comparison of rank assignment for prioritisation.

    The comparison data is a mapping of row label to a mapping of column name to
    value; it is turned into a dataframe with one row per top-level key and
    written as a tab-separated file.
    """

    def __init__(self, run_comparison: defaultdict):
        """Store the nested rank mapping the outputs are generated from."""
        self.run_comparison = run_comparison

    def _generate_dataframe(self) -> pd.DataFrame:
        """Generate a pandas dataframe with one row per top-level key."""
        return pd.DataFrame.from_dict(self.run_comparison, orient="index")

    def _calculate_rank_difference(self) -> pd.DataFrame:
        """Add a ``rank_decrease`` column: fourth column minus third column.

        The columns are addressed by position, so the dataframe needs at least
        four columns.
        NOTE(review): presumably columns 2 and 3 hold the ranks of the two
        compared runs, with the first run as baseline - confirm against the caller.
        """
        comparison_df = self._generate_dataframe()
        comparison_df["rank_decrease"] = comparison_df.iloc[:, 3] - comparison_df.iloc[:, 2]
        return comparison_df

    def generate_gene_output(self, prefix: str) -> None:
        """Write the gene prioritisation ranks to ``<prefix>-gene_rank_comparison.tsv``."""
        self._generate_dataframe().to_csv(prefix + "-gene_rank_comparison.tsv", sep="\t")

    def generate_variant_output(self, prefix: str) -> None:
        """Write the variant prioritisation ranks to ``<prefix>-variant_rank_comparison.tsv``."""
        self._generate_dataframe().to_csv(prefix + "-variant_rank_comparison.tsv", sep="\t")

    def generate_gene_comparison_output(self, prefix: str) -> None:
        """Write the gene rank comparison, including ``rank_decrease``, to a TSV file."""
        self._calculate_rank_difference().to_csv(prefix + "-gene_rank_comparison.tsv", sep="\t")

    def generate_variant_comparison_output(self, prefix: str) -> None:
        """Write the variant rank comparison, including ``rank_decrease``, to a TSV file."""
        self._calculate_rank_difference().to_csv(prefix + "-variant_rank_comparison.tsv", sep="\t")
49
+
50
+
51
class RankStatsWriter:
    """Write the rank stats for each run to a tab-separated file.

    The file is opened and the header row written on construction. Call
    ``close()`` when done, or use the writer as a context manager so the file
    is closed even if an error occurs while rows are being written.
    """

    def __init__(self, file: Path):
        """Open ``file`` for writing (truncating it) and write the header row."""
        # newline="" is required by the csv module so its own line terminators
        # are not translated a second time (which yields "\r\r\n" on Windows).
        self.file = open(file, "w", newline="")
        self.writer = csv.writer(self.file, delimiter="\t")
        self.writer.writerow(
            [
                "results_directory_path",
                "top",
                "top3",
                "top5",
                "top10",
                "found",
                "total",
                "mean_reciprocal_rank",
                "percentage_top",
                "percentage_top3",
                "percentage_top5",
                "percentage_top10",
                "percentage_found",
            ]
        )

    def __enter__(self) -> "RankStatsWriter":
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the file on leaving the ``with`` block; exceptions are not suppressed."""
        self.close()

    def write_row(self, directory: Path, rank_stats: "RankStats") -> None:
        """Write summary rank stats row for run.

        Args:
            directory: Results directory the stats belong to (first column).
            rank_stats: Rank statistics supplying the counts and percentages.

        An ``IOError`` raised while writing is reported on stdout and not re-raised.
        """
        try:
            self.writer.writerow(
                [
                    directory,
                    rank_stats.top,
                    rank_stats.top3,
                    rank_stats.top5,
                    rank_stats.top10,
                    rank_stats.found,
                    rank_stats.total,
                    rank_stats.mean_reciprocal_rank(),
                    rank_stats.percentage_top(),
                    rank_stats.percentage_top3(),
                    rank_stats.percentage_top5(),
                    rank_stats.percentage_top10(),
                    rank_stats.percentage_found(),
                ]
            )
        except IOError:
            print("Error writing ", self.file)

    def close(self) -> None:
        """Close file; an ``IOError`` while closing is reported on stdout and not re-raised."""
        try:
            self.file.close()
        except IOError:
            print("Error closing ", self.file)
104
+
105
+
106
def generate_benchmark_gene_output(
    prioritisation_data: TrackPrioritisation, plot_type: str
) -> None:
    """Write the gene rank output file and the gene summary plot for a single run.

    The output file is prefixed with the name of the run's gene results directory.
    """
    rank_writer = RankComparisonGenerator(prioritisation_data.gene_prioritisation.ranks)
    output_prefix = f"{prioritisation_data.gene_prioritisation.results_dir.name}"
    rank_writer.generate_gene_output(output_prefix)
    generate_gene_plots([prioritisation_data], plot_type)
114
+
115
+
116
def generate_benchmark_variant_output(
    prioritisation_data: TrackPrioritisation, plot_type: str
) -> None:
    """Generate variant prioritisation outputs for benchmarking single run.

    Writes the variant rank output file, prefixed with the name of the run's
    variant results directory, and draws the requested summary plot.

    Args:
        prioritisation_data: Tracked prioritisation results of the run.
        plot_type: Plot type forwarded to ``generate_variant_plots``.
    """
    variant_results = prioritisation_data.variant_prioritisation
    # Take the prefix from the variant results so it always matches the data
    # written to the file (it was previously read from the gene results).
    RankComparisonGenerator(variant_results.ranks).generate_variant_output(
        f"{variant_results.results_dir.name}"
    )
    generate_variant_plots([prioritisation_data], plot_type)
124
+
125
+
126
def merge_results(result1: dict, result2: dict) -> dict:
    """Merge two nested dictionaries containing results on commonalities.

    ``result1`` is updated in place and returned:

    * a key holding a dict in both inputs is merged recursively;
    * a key holding a dict in ``result1`` but something else (or nothing) in
      ``result2`` keeps the ``result1`` value;
    * any other key present in both takes the ``result2`` value;
    * keys only present in ``result2`` are added.

    Args:
        result1: Dictionary that receives the merged content (mutated).
        result2: Dictionary merged into ``result1`` (left unchanged).

    Returns:
        ``result1`` after merging.
    """
    for key, val in result1.items():
        if isinstance(val, dict):
            # .get() avoids creating the key when result2 is a defaultdict.
            if isinstance(result2.get(key), dict):
                merge_results(val, result2[key])
        elif key in result2:
            result1[key] = result2[key]

    for key, val in result2.items():
        if key not in result1:
            result1[key] = val
    return result1
140
+
141
+
142
def generate_gene_rank_comparisons(comparison_ranks: [tuple]) -> None:
    """Write a gene rank comparison file for every pair of runs.

    Each file is prefixed ``<parent>_<results dir>__v__<parent>_<results dir>``,
    built from the gene results directories of the two runs in the pair.
    """
    for pair in comparison_ranks:
        baseline = pair[0].gene_prioritisation
        contender = pair[1].gene_prioritisation
        # Merge copies so the tracked ranks of the runs stay untouched.
        merged_results = merge_results(deepcopy(baseline.ranks), deepcopy(contender.ranks))
        output_prefix = (
            f"{baseline.results_dir.parents[0].name}_"
            f"{baseline.results_dir.name}"
            f"__v__{contender.results_dir.parents[0].name}_"
            f"{contender.results_dir.name}"
        )
        RankComparisonGenerator(merged_results).generate_gene_comparison_output(output_prefix)
154
+
155
+
156
def generate_variant_rank_comparisons(comparison_ranks: [tuple]) -> None:
    """Generate the variant rank comparison of two result directories.

    For every pair of runs the variant ranks are merged (on copies, so the
    tracked ranks stay untouched) and written to a file prefixed
    ``<parent>_<results dir>__v__<parent>_<results dir>``.

    Args:
        comparison_ranks: Pairs of tracked prioritisation results to compare.
    """
    for pair in comparison_ranks:
        first_run = pair[0].variant_prioritisation
        second_run = pair[1].variant_prioritisation
        merged_results = merge_results(
            deepcopy(first_run.ranks),
            deepcopy(second_run.ranks),
        )
        # Each half of the prefix is built from its own run. The second half
        # previously reused the first run's parent directory, mislabelling the file.
        RankComparisonGenerator(merged_results).generate_variant_comparison_output(
            f"{first_run.results_dir.parents[0].name}_"
            f"{first_run.results_dir.name}"
            f"__v__{second_run.results_dir.parents[0].name}_"
            f"{second_run.results_dir.name}"
        )
169
+
170
+
171
def generate_benchmark_comparison_gene_output(
    prioritisation_stats_for_runs: [TrackPrioritisation], plot_type: str
) -> None:
    """Write pairwise gene rank comparisons and the gene summary plot for several runs."""
    # Every unordered pair of runs gets its own comparison file.
    run_pairs = list(itertools.combinations(prioritisation_stats_for_runs, 2))
    generate_gene_rank_comparisons(run_pairs)
    generate_gene_plots(prioritisation_stats_for_runs, plot_type)
177
+
178
+
179
def generate_benchmark_comparison_variant_output(
    prioritisation_stats_for_runs: [TrackPrioritisation], plot_type: str
) -> None:
    """Write pairwise variant rank comparisons and the variant summary plot for several runs."""
    # Every unordered pair of runs gets its own comparison file.
    run_pairs = list(itertools.combinations(prioritisation_stats_for_runs, 2))
    generate_variant_rank_comparisons(run_pairs)
    generate_variant_plots(prioritisation_stats_for_runs, plot_type)
@@ -0,0 +1,61 @@
1
+ from dataclasses import dataclass, field
2
+ from statistics import mean
3
+
4
+
5
@dataclass
class RankStats:
    """Class for keeping track of the rank stats.

    ``top``, ``top3``, ``top5`` and ``top10`` are cumulative counts of added
    ranks at or above that position; ``found`` counts every added rank.
    ``total`` is not updated by this class and must be set by the caller.
    The percentage and mean helpers return 0.0 instead of raising when there
    is nothing to divide by (no rank added yet, or ``total`` still 0).
    """

    top: int = 0
    top3: int = 0
    top5: int = 0
    top10: int = 0
    found: int = 0
    total: int = 0
    reciprocal_ranks: list = field(default_factory=list)

    def add_rank(self, rank: int) -> None:
        """Add rank for phenopacket.

        Args:
            rank: Rank of the result, a non-zero number (1 is the top rank).

        Raises:
            ZeroDivisionError: If ``rank`` is 0.
        """
        self.reciprocal_ranks.append(1 / rank)
        self.found += 1
        # The buckets are cumulative: a rank of 1 counts towards all of them.
        if rank == 1:
            self.top += 1
        if rank <= 3:
            self.top3 += 1
        if rank <= 5:
            self.top5 += 1
        if rank <= 10:
            self.top10 += 1

    def percentage_rank(self, value: int) -> float:
        """Return ``value`` as a percentage of the found ranks (0.0 if none were found)."""
        if not self.found:
            return 0.0
        return 100 * value / self.found

    def percentage_top(self) -> float:
        """Return percentage of top matches."""
        return self.percentage_rank(self.top)

    def percentage_top3(self) -> float:
        """Return percentage of matches in the top3."""
        return self.percentage_rank(self.top3)

    def percentage_top5(self) -> float:
        """Return percentage of matches in the top5."""
        return self.percentage_rank(self.top5)

    def percentage_top10(self) -> float:
        """Return percentage of matches in the top10."""
        return self.percentage_rank(self.top10)

    def percentage_found(self) -> float:
        """Return found matches as a percentage of ``total`` (0.0 if ``total`` is 0)."""
        if not self.total:
            return 0.0
        return 100 * self.found / self.total

    @staticmethod
    def percentage_difference(percentage_value_1: float, percentage_value_2: float) -> float:
        """Return percentage difference between two percentage values."""
        return percentage_value_1 - percentage_value_2

    def mean_reciprocal_rank(self) -> float:
        """Return the mean reciprocal rank of the added ranks (0.0 if none were added)."""
        if not self.reciprocal_ranks:
            return 0.0
        return mean(self.reciprocal_ranks)
pheval/cli.py CHANGED
@@ -3,8 +3,16 @@ import logging
3
3
 
4
4
  import click
5
5
 
6
+ from pheval.analyse.analysis import benchmark, benchmark_comparison
7
+
6
8
  from .cli_pheval import run
7
- from .cli_pheval_utils import scramble_phenopacket, scramble_semsim
9
+ from .cli_pheval_utils import (
10
+ create_spiked_vcfs_command,
11
+ scramble_phenopackets_command,
12
+ scramble_semsim,
13
+ semsim_comparison,
14
+ update_phenopackets_command,
15
+ )
8
16
 
9
17
  info_log = logging.getLogger("info")
10
18
 
@@ -13,11 +21,11 @@ info_log = logging.getLogger("info")
13
21
  @click.option("-v", "--verbose", count=True)
14
22
  @click.option("-q", "--quiet")
15
23
  def main(verbose=1, quiet=False) -> None:
16
- """main CLI method for pheval
24
+ """main CLI method for PhEval
17
25
 
18
26
  Args:
19
- verbose (int): _description_
20
- quiet (bool): _description_
27
+ verbose (int, optional): Verbose flag.
28
+ quiet (bool, optional): Queit Flag.
21
29
  """
22
30
  if verbose >= 2:
23
31
  info_log.setLevel(level=logging.DEBUG)
@@ -34,15 +42,22 @@ def pheval():
34
42
  """pheval"""
35
43
 
36
44
 
45
+ pheval.add_command(run)
46
+
47
+
37
48
  @click.group()
38
49
  def pheval_utils():
39
50
  """pheval_utils"""
40
51
 
41
52
 
42
- pheval.add_command(run)
43
-
44
53
  pheval_utils.add_command(scramble_semsim)
45
- pheval_utils.add_command(scramble_phenopacket)
54
+ pheval_utils.add_command(semsim_comparison)
55
+ pheval_utils.add_command(scramble_phenopackets_command)
56
+ pheval_utils.add_command(update_phenopackets_command)
57
+ pheval_utils.add_command(create_spiked_vcfs_command)
58
+ pheval_utils.add_command(benchmark)
59
+ pheval_utils.add_command(benchmark_comparison)
60
+
46
61
 
47
62
  if __name__ == "__main__":
48
63
  main()