pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/__init__.py +0 -5
- pheval/analyse/__init__.py +0 -0
- pheval/analyse/analysis.py +703 -0
- pheval/analyse/generate_plots.py +312 -0
- pheval/analyse/generate_summary_outputs.py +186 -0
- pheval/analyse/rank_stats.py +61 -0
- pheval/cli.py +22 -7
- pheval/cli_pheval.py +37 -12
- pheval/cli_pheval_utils.py +225 -8
- pheval/config_parser.py +36 -0
- pheval/constants.py +1 -0
- pheval/implementations/__init__.py +1 -3
- pheval/post_processing/__init__.py +0 -0
- pheval/post_processing/post_processing.py +210 -0
- pheval/prepare/__init__.py +0 -0
- pheval/prepare/create_noisy_phenopackets.py +173 -0
- pheval/prepare/create_spiked_vcf.py +366 -0
- pheval/prepare/custom_exceptions.py +47 -0
- pheval/prepare/update_phenopacket.py +53 -0
- pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
- pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
- pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
- pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
- pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
- pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
- pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
- pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
- pheval/run_metadata.py +27 -0
- pheval/runners/runner.py +92 -11
- pheval/utils/__init__.py +0 -0
- pheval/utils/docs_gen.py +105 -0
- pheval/utils/docs_gen.sh +18 -0
- pheval/utils/file_utils.py +88 -0
- pheval/utils/phenopacket_utils.py +356 -0
- pheval/utils/semsim_utils.py +156 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
- pheval-0.2.0.dist-info/RECORD +41 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
- pheval/utils.py +0 -7
- pheval-0.1.0.dist-info/RECORD +0 -13
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import matplotlib
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import seaborn as sns
|
|
7
|
+
from matplotlib import pyplot as plt
|
|
8
|
+
|
|
9
|
+
from pheval.analyse.rank_stats import RankStats
|
|
10
|
+
from pheval.constants import PHEVAL_RESULTS_DIRECTORY_SUFFIX
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def trim_corpus_results_directory_suffix(corpus_results_directory: Path) -> Path:
    """Strip the PhEval results-directory suffix from a directory path or name.

    The argument is converted to text first, so a plain directory name works as
    well as a ``Path``. Every occurrence of ``PHEVAL_RESULTS_DIRECTORY_SUFFIX``
    in that text is removed (not only a trailing one) and the remainder is
    returned as a ``Path``.
    """
    directory_text = str(corpus_results_directory)
    trimmed_text = directory_text.replace(PHEVAL_RESULTS_DIRECTORY_SUFFIX, "")
    return Path(trimmed_text)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class TrackGenePrioritisation:
    """Gene prioritisation outcome recorded for a single run.

    The field order below is the positional constructor order.
    """

    # Directory holding the results of the run; its name and its parent's name
    # are used to label the run in plots and output files.
    results_dir: Path
    # Rank records for the run; turned into a table with
    # ``pd.DataFrame.from_dict(..., orient="index")`` when written out.
    ranks: dict
    # Aggregated rank counts and percentages for the run.
    rank_stats: RankStats
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class TrackVariantPrioritisation:
    """Variant prioritisation outcome recorded for a single run.

    The field order below is the positional constructor order.
    """

    # Directory holding the results of the run; its name and its parent's name
    # are used to label the run in plots and output files.
    results_dir: Path
    # Rank records for the run; turned into a table with
    # ``pd.DataFrame.from_dict(..., orient="index")`` when written out.
    ranks: dict
    # Aggregated rank counts and percentages for the run.
    rank_stats: RankStats
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class TrackPrioritisation:
    """Gene and variant prioritisation outcomes of a single run, kept together.

    The field order below is the positional constructor order.
    """

    # Gene-level results of the run.
    gene_prioritisation: TrackGenePrioritisation
    # Variant-level results of the same run.
    variant_prioritisation: TrackVariantPrioritisation
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class PlotGenerator:
    """Build bar plots of prioritisation rank statistics and save them as SVG files.

    One instance handles either gene or variant prioritisation, chosen by
    ``gene_analysis``. Plot rows are accumulated on the instance in
    ``self.stats`` and ``self.mrr`` and are never cleared, so use a fresh
    instance for every plot. All figures are written to the current working
    directory; the three gene plot styles share the file name
    ``gene_rank_stats.svg`` and the three variant styles share
    ``variant_rank_stats.svg``.
    """

    def __init__(self, gene_analysis: bool):
        """Store the analysis mode and hide the top/right axes spines.

        Args:
            gene_analysis: True to plot gene prioritisation, False for variants.
        """
        self.gene_analysis = gene_analysis
        self.stats, self.mrr = [], []
        # rcParams are process-wide matplotlib settings, not per-instance ones.
        matplotlib.rcParams["axes.spines.right"] = False
        matplotlib.rcParams["axes.spines.top"] = False

    def _retrieve_prioritisation_data(self, prioritisation_result: TrackPrioritisation):
        """Return either gene prioritisation or variant prioritisation stats."""
        return (
            prioritisation_result.gene_prioritisation
            if self.gene_analysis
            else prioritisation_result.variant_prioritisation
        )

    @staticmethod
    def _run_label(result) -> str:
        """Return the plot label of a run: ``<parent dir name>_<trimmed results dir name>``."""
        return (
            f"{result.results_dir.parents[0].name}_"
            f"{trim_corpus_results_directory_suffix(result.results_dir.name)}"
        )

    @staticmethod
    def _non_cumulative_percentages(rank_stats) -> dict:
        """Return the percentage falling into each exclusive rank bin, in plot order."""
        difference = rank_stats.percentage_difference
        top = rank_stats.percentage_top()
        top3 = rank_stats.percentage_top3()
        top5 = rank_stats.percentage_top5()
        top10 = rank_stats.percentage_top10()
        found = rank_stats.percentage_found()
        return {
            "Top": top,
            "2-3": difference(top3, top),
            "4-5": difference(top5, top3),
            "6-10": difference(top10, top5),
            ">10": difference(found, top10),
            "FO/NP": difference(100, found),
        }

    @staticmethod
    def _cumulative_percentages(rank_stats) -> dict:
        """Return the cumulative percentage for each rank threshold, in plot order."""
        found = rank_stats.percentage_found()
        return {
            "Top": rank_stats.percentage_top(),
            "Top3": rank_stats.percentage_top3(),
            "Top5": rank_stats.percentage_top5(),
            "Top10": rank_stats.percentage_top10(),
            "Found": found,
            "FO/NP": rank_stats.percentage_difference(100, found),
        }

    @staticmethod
    def _long_format_rows(run_label: str, percentages: dict, mrr: float) -> list:
        """Return one row per rank category (scaled to 0-1) plus a final unscaled MRR row."""
        rows = [
            {"Rank": rank_name, "Percentage": percentage / 100, "Run": run_label}
            for rank_name, percentage in percentages.items()
        ]
        rows.append({"Rank": "MRR", "Percentage": mrr, "Run": run_label})
        return rows

    def _generate_stacked_bar_plot_data(self, prioritisation_result: TrackPrioritisation) -> None:
        """Generate data in correct format for dataframe creation for stacked bar plot."""
        result = self._retrieve_prioritisation_data(prioritisation_result)
        self.stats.append(
            {
                "Run": self._run_label(result),
                **self._non_cumulative_percentages(result.rank_stats),
            }
        )

    def _generate_stats_mrr_bar_plot_data(self, prioritisation_result: TrackPrioritisation) -> None:
        """Generate data in correct format for dataframe creation for MRR bar plot."""
        result = self._retrieve_prioritisation_data(prioritisation_result)
        self.mrr.append(
            {
                "Rank": "MRR",
                "Percentage": result.rank_stats.mean_reciprocal_rank(),
                "Run": self._run_label(result),
            }
        )

    def _generate_cumulative_bar_plot_data(self, prioritisation_result: TrackPrioritisation):
        """Generate data in correct format for dataframe creation for cumulative bar plot."""
        result = self._retrieve_prioritisation_data(prioritisation_result)
        rank_stats = result.rank_stats
        self.stats.extend(
            self._long_format_rows(
                self._run_label(result),
                self._cumulative_percentages(rank_stats),
                rank_stats.mean_reciprocal_rank(),
            )
        )

    def _generate_non_cumulative_bar_plot_data(
        self, prioritisation_result: TrackPrioritisation
    ) -> None:
        """Generate data in correct format for dataframe creation for non-cumulative bar plot."""
        result = self._retrieve_prioritisation_data(prioritisation_result)
        rank_stats = result.rank_stats
        self.stats.extend(
            self._long_format_rows(
                self._run_label(result),
                self._non_cumulative_percentages(rank_stats),
                rank_stats.mean_reciprocal_rank(),
            )
        )

    def _plot_stacked_bars(
        self,
        prioritisation_data,
        stats_ylabel: str,
        stats_file: str,
        mrr_ylabel: str,
        mrr_file: str,
        **stats_plot_kwargs,
    ) -> None:
        """Collect rows for every run, then save the stacked rank plot and the MRR plot.

        ``stats_plot_kwargs`` are forwarded to the stacked plot only.
        """
        for prioritisation_result in prioritisation_data:
            self._generate_stacked_bar_plot_data(prioritisation_result)
            self._generate_stats_mrr_bar_plot_data(prioritisation_result)
        stats_df = pd.DataFrame(self.stats)
        stats_df.set_index("Run").plot(
            kind="bar",
            stacked=True,
            colormap="tab10",
            ylabel=stats_ylabel,
            **stats_plot_kwargs,
        ).legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
        plt.savefig(stats_file, format="svg", bbox_inches="tight")
        mrr_df = pd.DataFrame(self.mrr)
        mrr_df.set_index("Run").plot(
            kind="bar",
            colormap="tab10",
            ylabel=mrr_ylabel,
            legend=False,
        )
        plt.savefig(mrr_file, format="svg", bbox_inches="tight")

    def _plot_grouped_bars(
        self, prioritisation_data, collect_rows, ylabel: str, output_file: str
    ) -> None:
        """Collect rows for every run with ``collect_rows``, then save a grouped bar plot."""
        for prioritisation_result in prioritisation_data:
            collect_rows(prioritisation_result)
        stats_df = pd.DataFrame(self.stats)
        sns.catplot(data=stats_df, kind="bar", x="Rank", y="Percentage", hue="Run").set(
            xlabel="Rank", ylabel=ylabel
        )
        plt.savefig(output_file, format="svg", bbox_inches="tight")

    def generate_stacked_bar_gene(self, prioritisation_data: [TrackPrioritisation]) -> None:
        """Generate stacked bar plot and MRR bar plot for gene prioritisation stats.

        Writes ``gene_rank_stats.svg`` and ``gene_mrr.svg``.
        """
        self._plot_stacked_bars(
            prioritisation_data,
            stats_ylabel="Disease-causing genes (%)",
            stats_file="gene_rank_stats.svg",
            mrr_ylabel="Gene prioritisation mean reciprocal rank",
            mrr_file="gene_mrr.svg",
            figsize=(10, 8),
        )

    def generate_stacked_bar_variant(self, prioritisation_data: [TrackPrioritisation]):
        """Generate stacked bar plot and MRR bar plot for variant prioritisation stats.

        Writes ``variant_rank_stats.svg`` and ``variant_mrr.svg``. Unlike the gene
        plot, the stacked plot uses the default figure size.
        """
        self._plot_stacked_bars(
            prioritisation_data,
            stats_ylabel="Disease-causing variants (%)",
            stats_file="variant_rank_stats.svg",
            mrr_ylabel="Variant prioritisation mean reciprocal rank",
            mrr_file="variant_mrr.svg",
        )

    def generate_cumulative_bar_gene(self, prioritisation_data: [TrackPrioritisation]):
        """Generate cumulative bar plot for gene prioritisation stats (``gene_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_cumulative_bar_plot_data,
            "Disease-causing genes (%)",
            "gene_rank_stats.svg",
        )

    def generate_cumulative_bar_variant(self, prioritisation_data: [TrackPrioritisation]):
        """Generate cumulative bar plot for variant prioritisation stats (``variant_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_cumulative_bar_plot_data,
            "Disease-causing variants (%)",
            "variant_rank_stats.svg",
        )

    def generate_non_cumulative_bar_gene(self, prioritisation_data: [TrackPrioritisation]):
        """Generate non-cumulative bar plot for gene prioritisation stats (``gene_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_non_cumulative_bar_plot_data,
            "Disease-causing genes (%)",
            "gene_rank_stats.svg",
        )

    def generate_non_cumulative_bar_variant(self, prioritisation_data: [TrackPrioritisation]):
        """Generate non-cumulative bar plot for variant prioritisation stats (``variant_rank_stats.svg``)."""
        self._plot_grouped_bars(
            prioritisation_data,
            self._generate_non_cumulative_bar_plot_data,
            "Disease-causing variants (%)",
            "variant_rank_stats.svg",
        )
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
def generate_gene_plots(prioritisation_data: [TrackPrioritisation], plot_type: str) -> None:
    """Draw the gene prioritisation summary plot selected by ``plot_type``.

    Recognised values are ``"bar_stacked"``, ``"bar_cumulative"`` and
    ``"bar_non_cumulative"``; any other value produces no plot.
    """
    plot_generator = PlotGenerator(gene_analysis=True)
    plotters = {
        "bar_stacked": plot_generator.generate_stacked_bar_gene,
        "bar_cumulative": plot_generator.generate_cumulative_bar_gene,
        "bar_non_cumulative": plot_generator.generate_non_cumulative_bar_gene,
    }
    plotter = plotters.get(plot_type)
    if plotter is not None:
        plotter(prioritisation_data)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def generate_variant_plots(prioritisation_data: [TrackPrioritisation], plot_type: str) -> None:
    """Draw the variant prioritisation summary plot selected by ``plot_type``.

    Recognised values are ``"bar_stacked"``, ``"bar_cumulative"`` and
    ``"bar_non_cumulative"``; any other value produces no plot.
    """
    plot_generator = PlotGenerator(gene_analysis=False)
    plotters = {
        "bar_stacked": plot_generator.generate_stacked_bar_variant,
        "bar_cumulative": plot_generator.generate_cumulative_bar_variant,
        "bar_non_cumulative": plot_generator.generate_non_cumulative_bar_variant,
    }
    plotter = plotters.get(plot_type)
    if plotter is not None:
        plotter(prioritisation_data)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import itertools
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from pheval.analyse.generate_plots import (
|
|
10
|
+
TrackPrioritisation,
|
|
11
|
+
generate_gene_plots,
|
|
12
|
+
generate_variant_plots,
|
|
13
|
+
)
|
|
14
|
+
from pheval.analyse.rank_stats import RankStats
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class RankComparisonGenerator:
    """Write the run comparison of rank assignment for prioritisation.

    ``run_comparison`` is a mapping of row key to a mapping of column name to
    value; it is turned into a table with one row per key. All outputs are
    tab-separated files written relative to the current working directory.
    """

    def __init__(self, run_comparison: defaultdict):
        """Store the rank records to be written."""
        self.run_comparison = run_comparison

    def _generate_dataframe(self) -> pd.DataFrame:
        """Generate pandas dataframe with one row per key of the stored records."""
        return pd.DataFrame.from_dict(self.run_comparison, orient="index")

    def _calculate_rank_difference(self) -> pd.DataFrame:
        """Calculate the rank decrease for runs - taking the first directory as a baseline.

        Adds a ``rank_decrease`` column holding the fourth column minus the third.
        NOTE(review): the positional lookup presumes the third and fourth columns
        are the ranks of the first and second run - confirm against the caller.
        """
        comparison_df = self._generate_dataframe()
        comparison_df["rank_decrease"] = comparison_df.iloc[:, 3] - comparison_df.iloc[:, 2]
        return comparison_df

    def generate_gene_output(self, prefix: str) -> None:
        """Generate the output for gene prioritisation ranks.

        Writes ``<prefix>-gene_rank_comparison.tsv``.
        """
        self._generate_dataframe().to_csv(f"{prefix}-gene_rank_comparison.tsv", sep="\t")

    def generate_variant_output(self, prefix: str) -> None:
        """Generate the output for variant prioritisation ranks.

        Writes ``<prefix>-variant_rank_comparison.tsv``.
        """
        self._generate_dataframe().to_csv(f"{prefix}-variant_rank_comparison.tsv", sep="\t")

    def generate_gene_comparison_output(self, prefix: str) -> None:
        """Generate the output for gene prioritisation rank comparison.

        Writes ``<prefix>-gene_rank_comparison.tsv`` including ``rank_decrease``.
        """
        self._calculate_rank_difference().to_csv(
            f"{prefix}-gene_rank_comparison.tsv", sep="\t"
        )

    def generate_variant_comparison_output(self, prefix: str) -> None:
        """Generate the output for variant prioritisation rank comparison.

        Writes ``<prefix>-variant_rank_comparison.tsv`` including ``rank_decrease``.
        """
        self._calculate_rank_difference().to_csv(
            f"{prefix}-variant_rank_comparison.tsv", sep="\t"
        )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class RankStatsWriter:
    """Write the rank stats for each run to a tab-separated file.

    The file is opened, and its header row written, on construction. Call
    ``close()`` when finished, or use the writer as a context manager so the
    file is closed even if writing fails part-way.
    """

    def __init__(self, file: Path):
        """Open ``file`` for writing (truncating it) and write the header row."""
        # newline="" lets the csv module control line endings itself; without it
        # rows end in "\r\r\n" on platforms that translate "\n" on write.
        self.file = open(file, "w", newline="")
        self.writer = csv.writer(self.file, delimiter="\t")
        self.writer.writerow(
            [
                "results_directory_path",
                "top",
                "top3",
                "top5",
                "top10",
                "found",
                "total",
                "mean_reciprocal_rank",
                "percentage_top",
                "percentage_top3",
                "percentage_top5",
                "percentage_top10",
                "percentage_found",
            ]
        )

    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the file when leaving the ``with`` block; exceptions propagate."""
        self.close()

    def write_row(self, directory: Path, rank_stats: "RankStats") -> None:
        """Write summary rank stats row for run.

        Columns follow the header order. An I/O failure is reported on stdout
        and not re-raised.
        """
        try:
            self.writer.writerow(
                [
                    directory,
                    rank_stats.top,
                    rank_stats.top3,
                    rank_stats.top5,
                    rank_stats.top10,
                    rank_stats.found,
                    rank_stats.total,
                    rank_stats.mean_reciprocal_rank(),
                    rank_stats.percentage_top(),
                    rank_stats.percentage_top3(),
                    rank_stats.percentage_top5(),
                    rank_stats.percentage_top10(),
                    rank_stats.percentage_found(),
                ]
            )
        except IOError:
            print("Error writing ", self.file)

    def close(self) -> None:
        """Close file. An I/O failure is reported on stdout and not re-raised."""
        try:
            self.file.close()
        except IOError:
            print("Error closing ", self.file)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def generate_benchmark_gene_output(
    prioritisation_data: TrackPrioritisation, plot_type: str
) -> None:
    """Write the gene rank table and summary plot when benchmarking a single run.

    The table's file-name prefix is the name of the run's gene results directory.
    """
    gene_prioritisation = prioritisation_data.gene_prioritisation
    output_prefix = f"{gene_prioritisation.results_dir.name}"
    rank_table_writer = RankComparisonGenerator(gene_prioritisation.ranks)
    rank_table_writer.generate_gene_output(output_prefix)
    generate_gene_plots([prioritisation_data], plot_type)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def generate_benchmark_variant_output(
    prioritisation_data: TrackPrioritisation, plot_type: str
) -> None:
    """Generate variant prioritisation outputs for benchmarking single run.

    Writes the variant rank table, prefixed with the name of the run's variant
    results directory, then draws the plot selected by ``plot_type``.
    """
    variant_prioritisation = prioritisation_data.variant_prioritisation
    # Name the output after the variant results directory; the gene results
    # directory was used here before, which mislabels the file whenever the
    # two directories differ.
    RankComparisonGenerator(variant_prioritisation.ranks).generate_variant_output(
        f"{variant_prioritisation.results_dir.name}"
    )
    generate_variant_plots([prioritisation_data], plot_type)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def merge_results(result1: dict, result2: dict) -> dict:
    """Merge two nested dictionaries containing results on commonalities.

    ``result1`` is updated in place and returned:

    * a key whose value is a dictionary in both inputs is merged recursively;
    * a key whose value in ``result1`` is a dictionary but not in ``result2``
      keeps the ``result1`` value;
    * any other shared key takes the value from ``result2``;
    * keys present only in ``result2`` are added.

    Args:
        result1: Dictionary to merge into (mutated).
        result2: Dictionary whose entries are merged in (left unchanged at the
            top level).

    Returns:
        The updated ``result1``.
    """
    for key, val in result1.items():
        if key not in result2:
            continue
        if isinstance(val, dict):
            # Only recurse when both sides are dictionaries; recursing into a
            # non-dictionary value would raise.
            if isinstance(result2[key], dict):
                merge_results(val, result2[key])
        else:
            result1[key] = result2[key]

    for key, val in result2.items():
        if key not in result1:
            result1[key] = val
    return result1
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def generate_gene_rank_comparisons(comparison_ranks: [tuple]) -> None:
    """Write a gene rank comparison table for every pair of runs.

    Each pair's rank records are deep-copied and merged, so the runs' own
    records stay untouched. The output prefix is
    ``<parent>_<results dir>__v__<parent>_<results dir>`` for the two runs.
    """
    for pair in comparison_ranks:
        first_run = pair[0].gene_prioritisation
        second_run = pair[1].gene_prioritisation
        merged_results = merge_results(deepcopy(first_run.ranks), deepcopy(second_run.ranks))
        first_label = f"{first_run.results_dir.parents[0].name}_{first_run.results_dir.name}"
        second_label = f"{second_run.results_dir.parents[0].name}_{second_run.results_dir.name}"
        comparison_writer = RankComparisonGenerator(merged_results)
        comparison_writer.generate_gene_comparison_output(f"{first_label}__v__{second_label}")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def generate_variant_rank_comparisons(comparison_ranks: [tuple]) -> None:
    """Generate the variant rank comparison of two result directories.

    For every pair of runs the variant rank records are deep-copied and merged
    (the runs' own records stay untouched) and written out with the prefix
    ``<parent>_<results dir>__v__<parent>_<results dir>``.

    Args:
        comparison_ranks: Pairs of runs to compare; the first element of each
            pair is the baseline.
    """
    for pair in comparison_ranks:
        first_run = pair[0].variant_prioritisation
        second_run = pair[1].variant_prioritisation
        merged_results = merge_results(deepcopy(first_run.ranks), deepcopy(second_run.ranks))
        # Each half of the prefix is built from that run's own variant results
        # directory. The second half previously reused the first run's parent
        # directory, so the file name could name the wrong run.
        RankComparisonGenerator(merged_results).generate_variant_comparison_output(
            f"{first_run.results_dir.parents[0].name}_"
            f"{first_run.results_dir.name}"
            f"__v__{second_run.results_dir.parents[0].name}_"
            f"{second_run.results_dir.name}"
        )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def generate_benchmark_comparison_gene_output(
    prioritisation_stats_for_runs: [TrackPrioritisation], plot_type: str
) -> None:
    """Write gene rank comparisons for every pair of runs, then plot all runs together."""
    run_pairs = list(itertools.combinations(prioritisation_stats_for_runs, 2))
    generate_gene_rank_comparisons(run_pairs)
    generate_gene_plots(prioritisation_stats_for_runs, plot_type)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def generate_benchmark_comparison_variant_output(
    prioritisation_stats_for_runs: [TrackPrioritisation], plot_type: str
) -> None:
    """Write variant rank comparisons for every pair of runs, then plot all runs together."""
    run_pairs = list(itertools.combinations(prioritisation_stats_for_runs, 2))
    generate_variant_rank_comparisons(run_pairs)
    generate_variant_plots(prioritisation_stats_for_runs, plot_type)
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from statistics import mean
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class RankStats:
    """Class for keeping track of the rank stats.

    ``top``, ``top3``, ``top5`` and ``top10`` are cumulative counts (a rank of 1
    is counted in all four). ``found`` counts the ranks added through
    ``add_rank``; ``total`` is maintained by the caller. All percentages use
    ``total`` as the denominator so that they can be subtracted from one
    another and from 100.
    """

    top: int = 0
    top3: int = 0
    top5: int = 0
    top10: int = 0
    found: int = 0
    total: int = 0
    reciprocal_ranks: list = field(default_factory=list)

    def add_rank(self, rank: int) -> None:
        """Add rank for phenopacket.

        Records the reciprocal rank and increments ``found`` and every
        cumulative counter whose threshold the rank meets.

        Raises:
            ZeroDivisionError: If ``rank`` is 0 (ranks are 1-based).
        """
        self.reciprocal_ranks.append(1 / rank)
        self.found += 1
        if rank == 1:
            self.top += 1
        if rank <= 3:
            self.top3 += 1
        if rank <= 5:
            self.top5 += 1
        if rank <= 10:
            self.top10 += 1

    def percentage_rank(self, value: int) -> float:
        """Return ``value`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
        if not self.total:
            return 0.0
        # Same denominator as percentage_found(): dividing by ``found`` made the
        # rank-bin differences negative and stopped them summing to 100.
        return 100 * value / self.total

    def percentage_top(self) -> float:
        """Return percentage of top matches."""
        return self.percentage_rank(self.top)

    def percentage_top3(self) -> float:
        """Return percentage of matches in the top3."""
        return self.percentage_rank(self.top3)

    def percentage_top5(self) -> float:
        """Return percentage of matches in the top5."""
        return self.percentage_rank(self.top5)

    def percentage_top10(self) -> float:
        """Return percentage of matches in the top10."""
        return self.percentage_rank(self.top10)

    def percentage_found(self) -> float:
        """Return percentage of matches found (0.0 when ``total`` is 0)."""
        return self.percentage_rank(self.found)

    @staticmethod
    def percentage_difference(percentage_value_1: float, percentage_value_2: float) -> float:
        """Return percentage difference between two percentage values."""
        return percentage_value_1 - percentage_value_2

    def mean_reciprocal_rank(self) -> float:
        """Return the mean reciprocal rank (0.0 when no rank has been added).

        NOTE(review): the mean is taken over found ranks only; entries that
        were never found do not contribute a 0 - confirm this is intended.
        """
        if not self.reciprocal_ranks:
            return 0.0
        return mean(self.reciprocal_ranks)
|
pheval/cli.py
CHANGED
|
@@ -3,8 +3,16 @@ import logging
|
|
|
3
3
|
|
|
4
4
|
import click
|
|
5
5
|
|
|
6
|
+
from pheval.analyse.analysis import benchmark, benchmark_comparison
|
|
7
|
+
|
|
6
8
|
from .cli_pheval import run
|
|
7
|
-
from .cli_pheval_utils import
|
|
9
|
+
from .cli_pheval_utils import (
|
|
10
|
+
create_spiked_vcfs_command,
|
|
11
|
+
scramble_phenopackets_command,
|
|
12
|
+
scramble_semsim,
|
|
13
|
+
semsim_comparison,
|
|
14
|
+
update_phenopackets_command,
|
|
15
|
+
)
|
|
8
16
|
|
|
9
17
|
info_log = logging.getLogger("info")
|
|
10
18
|
|
|
@@ -13,11 +21,11 @@ info_log = logging.getLogger("info")
|
|
|
13
21
|
@click.option("-v", "--verbose", count=True)
|
|
14
22
|
@click.option("-q", "--quiet")
|
|
15
23
|
def main(verbose=1, quiet=False) -> None:
|
|
16
|
-
"""main CLI method for
|
|
24
|
+
"""main CLI method for PhEval
|
|
17
25
|
|
|
18
26
|
Args:
|
|
19
|
-
verbose (int):
|
|
20
|
-
quiet (bool):
|
|
27
|
+
verbose (int, optional): Verbose flag.
|
|
28
|
+
quiet (bool, optional): Quiet Flag.
|
|
21
29
|
"""
|
|
22
30
|
if verbose >= 2:
|
|
23
31
|
info_log.setLevel(level=logging.DEBUG)
|
|
@@ -34,15 +42,22 @@ def pheval():
|
|
|
34
42
|
"""pheval"""
|
|
35
43
|
|
|
36
44
|
|
|
45
|
+
pheval.add_command(run)
|
|
46
|
+
|
|
47
|
+
|
|
37
48
|
@click.group()
|
|
38
49
|
def pheval_utils():
|
|
39
50
|
"""pheval_utils"""
|
|
40
51
|
|
|
41
52
|
|
|
42
|
-
pheval.add_command(run)
|
|
43
|
-
|
|
44
53
|
pheval_utils.add_command(scramble_semsim)
|
|
45
|
-
pheval_utils.add_command(
|
|
54
|
+
pheval_utils.add_command(semsim_comparison)
|
|
55
|
+
pheval_utils.add_command(scramble_phenopackets_command)
|
|
56
|
+
pheval_utils.add_command(update_phenopackets_command)
|
|
57
|
+
pheval_utils.add_command(create_spiked_vcfs_command)
|
|
58
|
+
pheval_utils.add_command(benchmark)
|
|
59
|
+
pheval_utils.add_command(benchmark_comparison)
|
|
60
|
+
|
|
46
61
|
|
|
47
62
|
if __name__ == "__main__":
|
|
48
63
|
main()
|