pheval 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Note: this release of pheval has been flagged as potentially problematic; see its registry listing for details.
- pheval/analyse/benchmark.py +12 -23
- pheval/analyse/benchmark_output_type.py +3 -5
- pheval/analyse/binary_classification_curves.py +3 -9
- pheval/analyse/binary_classification_stats.py +1 -4
- pheval/analyse/generate_plots.py +8 -18
- pheval/analyse/generate_rank_comparisons.py +1 -2
- pheval/analyse/rank_stats.py +8 -25
- pheval/analyse/run_data_parser.py +15 -9
- pheval/cli.py +1 -1
- pheval/cli_pheval_utils.py +10 -23
- pheval/config_parser.py +1 -1
- pheval/implementations/__init__.py +3 -5
- pheval/infra/exomiserdb.py +7 -15
- pheval/post_processing/phenopacket_truth_set.py +10 -31
- pheval/post_processing/post_processing.py +12 -33
- pheval/post_processing/validate_result_format.py +2 -4
- pheval/prepare/create_noisy_phenopackets.py +18 -29
- pheval/prepare/create_spiked_vcf.py +25 -56
- pheval/prepare/custom_exceptions.py +6 -7
- pheval/prepare/prepare_corpus.py +6 -17
- pheval/prepare/update_phenopacket.py +6 -17
- pheval/utils/docs_gen.py +3 -3
- pheval/utils/file_utils.py +1 -2
- pheval/utils/phenopacket_utils.py +41 -73
- pheval/utils/semsim_utils.py +6 -10
- pheval/utils/utils.py +3 -4
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/METADATA +1 -1
- pheval-0.6.4.dist-info/RECORD +57 -0
- pheval-0.6.2.dist-info/RECORD +0 -57
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/LICENSE +0 -0
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/WHEEL +0 -0
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/entry_points.txt +0 -0
pheval/analyse/benchmark.py
CHANGED
@@ -1,6 +1,6 @@
+import sys
 import time
 from pathlib import Path
-from typing import List, Tuple

 import duckdb
 import polars as pl
@@ -53,8 +53,8 @@ def scan_directory(run: RunConfig, benchmark_type: BenchmarkOutputType) -> pl.LazyFrame:


 def process_stats(
-    runs: List[RunConfig], benchmark_type: BenchmarkOutputType
-) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
+    runs: list[RunConfig], benchmark_type: BenchmarkOutputType
+) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
     """
     Processes stats outputs for specified runs to compare.
     Args:
@@ -74,9 +74,7 @@ def process_stats(
         curve_results.append(compute_curves(run.run_identifier, result_scan))
         true_positive_cases.append(
             result_scan.filter(pl.col("true_positive"))
-            .select(
-                ["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)]
-            )
+            .select(["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)])
             .sort(["result_file", *benchmark_type.columns])
         )
     return (
@@ -86,11 +84,7 @@ def process_stats(
         [true_positive_cases[0]]
         + [
             df.select(
-                [
-                    col
-                    for col in df.collect_schema().keys()
-                    if col not in ["result_file", *benchmark_type.columns]
-                ]
+                [col for col in df.collect_schema().keys() if col not in ["result_file", *benchmark_type.columns]]
             )
             for df in true_positive_cases[1:]
         ],
@@ -108,20 +102,14 @@ def benchmark(config: Config, benchmark_type: BenchmarkOutputType) -> None:
     """
     conn = duckdb.connect(f"{config.benchmark_name}.duckdb")
     stats, curve_results, true_positive_cases = process_stats(config.runs, benchmark_type)
-    write_table(
-        conn, stats, f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_summary"
-    )
+    write_table(conn, stats, f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_summary")
     write_table(
         conn,
         curve_results,
         f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_binary_classification_curves",
     )
-    calculate_rank_changes(
-        conn, [run.run_identifier for run in config.runs], true_positive_cases, benchmark_type
-    )
-    generate_plots(
-        config.benchmark_name, stats, curve_results, benchmark_type, config.plot_customisation
-    )
+    calculate_rank_changes(conn, [run.run_identifier for run in config.runs], true_positive_cases, benchmark_type)
+    generate_plots(config.benchmark_name, stats, curve_results, benchmark_type, config.plot_customisation)
     conn.close()


@@ -135,6 +123,9 @@ def benchmark_runs(benchmark_config_file: Path) -> None:
     start_time = time.perf_counter()
     logger.info("Initiated benchmarking process.")
     config = parse_run_config(benchmark_config_file)
+    if Path(f"{config.benchmark_name}.duckdb").exists():
+        logger.error(f"{config.benchmark_name}.duckdb already exists! Exiting.")
+        sys.exit(1)
     gene_analysis_runs = [run for run in config.runs if run.gene_analysis]
     variant_analysis_runs = [run for run in config.runs if run.variant_analysis]
     disease_analysis_runs = [run for run in config.runs if run.disease_analysis]
@@ -171,6 +162,4 @@ def benchmark_runs(benchmark_config_file: Path) -> None:
         BenchmarkOutputTypeEnum.DISEASE.value,
     )
     logger.info("Finished benchmarking for disease results.")
-    logger.info(
-        f"Finished benchmarking! Total time: {time.perf_counter() - start_time:.2f} seconds."
-    )
+    logger.info(f"Finished benchmarking! Total time: {time.perf_counter() - start_time:.2f} seconds.")
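The one behavioural change in this file is the new guard in benchmark_runs: benchmarking now refuses to start if a DuckDB results database with the configured benchmark name already exists. A minimal sketch of that behaviour, using a hypothetical standalone helper rather than pheval's own function:

import sys
from pathlib import Path


def ensure_fresh_benchmark_db(benchmark_name: str) -> None:
    # Mirrors the guard added to benchmark_runs(): never overwrite an existing
    # <benchmark_name>.duckdb results database; exit with a non-zero status instead.
    db_file = Path(f"{benchmark_name}.duckdb")
    if db_file.exists():
        print(f"{db_file} already exists! Exiting.", file=sys.stderr)
        sys.exit(1)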
pheval/analyse/benchmark_output_type.py
CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, NamedTuple
+from typing import NamedTuple


 class BenchmarkOutputType(NamedTuple):
@@ -15,7 +15,7 @@ class BenchmarkOutputType(NamedTuple):

     prioritisation_type_string: str
     y_label: str
-    columns: List[str]
+    columns: list[str]
     result_directory: str


@@ -35,9 +35,7 @@ class BenchmarkOutputTypeEnum(Enum):
         ["gene_identifier", "gene_symbol"],
         "pheval_gene_results",
     )
-    VARIANT = BenchmarkOutputType(
-        "variant", "Disease-causing variants (%)", ["variant_id"], "pheval_variant_results"
-    )
+    VARIANT = BenchmarkOutputType("variant", "Disease-causing variants (%)", ["variant_id"], "pheval_variant_results")
     DISEASE = BenchmarkOutputType(
         "disease",
         "Known diseases (%)",
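The edits here are the typing clean-up applied across the release: built-in generics such as list[str] replace typing.List, so only NamedTuple still needs importing. Reassembled from the hunks above (other enum members omitted), the relevant part of the file looks roughly like this:

from enum import Enum
from typing import NamedTuple


class BenchmarkOutputType(NamedTuple):
    prioritisation_type_string: str
    y_label: str
    columns: list[str]  # built-in generic, no typing.List import needed
    result_directory: str


class BenchmarkOutputTypeEnum(Enum):
    VARIANT = BenchmarkOutputType(
        "variant", "Disease-causing variants (%)", ["variant_id"], "pheval_variant_results"
    )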
pheval/analyse/binary_classification_curves.py
CHANGED
@@ -1,5 +1,3 @@
-from typing import Tuple
-
 import numpy as np
 import polars as pl
 from sklearn.metrics import precision_recall_curve, roc_curve
@@ -11,7 +9,7 @@ class BinaryClassificationCurves:
     """Class for computing and storing ROC & Precision-Recall curves in Polars."""

     @staticmethod
-    def _compute_finite_bounds(result_scan: pl.LazyFrame) -> Tuple[float, float]:
+    def _compute_finite_bounds(result_scan: pl.LazyFrame) -> tuple[float, float]:
         """
         Compute min and max finite values in the 'score' column to handle NaN and Inf values.
         Args:
@@ -32,9 +30,7 @@ class BinaryClassificationCurves:
         )

     @staticmethod
-    def _clean_and_extract_data(
-        result_scan: pl.LazyFrame, max_finite: float, min_finite: float
-    ) -> pl.LazyFrame:
+    def _clean_and_extract_data(result_scan: pl.LazyFrame, max_finite: float, min_finite: float) -> pl.LazyFrame:
         """
         Normalise the 'score' column (handling NaNs and Inf values) and extract 'true_positive' labels.

@@ -64,9 +60,7 @@ class BinaryClassificationCurves:
         )

     @staticmethod
-    def _compute_roc_pr_curves(
-        run_identifier: str, labels: np.ndarray, scores: np.ndarray
-    ) -> pl.LazyFrame:
+    def _compute_roc_pr_curves(run_identifier: str, labels: np.ndarray, scores: np.ndarray) -> pl.LazyFrame:
         """
         Compute ROC and Precision-Recall curves.

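The changes above are signature reflows and the Tuple-to-tuple swap; the curve computation itself is unchanged. For readers unfamiliar with the scikit-learn calls this module wraps, a self-contained sketch with synthetic labels and scores (not pheval's own wrapper):

import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve

labels = np.array([1, 0, 1, 1, 0])             # true_positive flags per candidate
scores = np.array([0.9, 0.4, 0.75, 0.6, 0.2])  # prioritisation scores

fpr, tpr, _ = roc_curve(labels, scores)                        # ROC curve points
precision, recall, _ = precision_recall_curve(labels, scores)  # Precision-Recall curve points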
pheval/analyse/binary_classification_stats.py
CHANGED
@@ -103,10 +103,7 @@ class BinaryClassificationStats:
     )

     F1_SCORE = (
-        pl.when(
-            2 * (pl.col("true_positives") + pl.col("false_positives") + pl.col("false_negatives"))
-            != 0
-        )
+        pl.when(2 * (pl.col("true_positives") + pl.col("false_positives") + pl.col("false_negatives")) != 0)
         .then(
             2
             * pl.col("true_positives")
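The F1 expression is only reflowed; the pl.when(...) guard still protects against a zero denominator (all counts zero). In plain Python the statistic it computes is the standard F1 score; a scalar sketch, noting that the tail of the Polars expression continues beyond this hunk:

def f1_score(true_positives: int, false_positives: int, false_negatives: int) -> float:
    # F1 = 2TP / (2TP + FP + FN), defined as 0.0 when there are no positives at all.
    denominator = 2 * true_positives + false_positives + false_negatives
    return 2 * true_positives / denominator if denominator != 0 else 0.0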
pheval/analyse/generate_plots.py
CHANGED
@@ -1,5 +1,6 @@
 from enum import Enum
 from pathlib import Path
+from typing import ClassVar

 import duckdb
 import matplotlib
@@ -32,7 +33,7 @@ class PlotTypes(Enum):
 class PlotGenerator:
     """Class to generate plots."""

-    palette_hex_codes = [
+    palette_hex_codes: ClassVar[list[str]] = [
         "#f4ae3d",
         "#ee5825",
         "#2b7288",
@@ -91,9 +92,7 @@ class PlotGenerator:
             {"run_identifier": "Run", "mrr": "Percentage"}
         )

-    def _save_fig(
-        self, benchmark_output_type: BenchmarkOutputType, y_lower_limit: int, y_upper_limit: int
-    ) -> None:
+    def _save_fig(self, benchmark_output_type: BenchmarkOutputType, y_lower_limit: int, y_upper_limit: int) -> None:
         """
         Save the generated figure.
         Args:
@@ -140,9 +139,7 @@ class PlotGenerator:
             legend=False,
             edgecolor="white",
         )
-        plt.title(
-            f"{benchmark_output_type.prioritisation_type_string.capitalize()} results - mean reciprocal rank"
-        )
+        plt.title(f"{benchmark_output_type.prioritisation_type_string.capitalize()} results - mean reciprocal rank")
         self._save_fig(benchmark_output_type, 0, 1)

     @staticmethod
@@ -189,17 +186,13 @@ class PlotGenerator:
         plt.title(plot_customisation.rank_plot_title, loc="center", fontsize=15)
         self._save_fig(benchmark_output_type, 0, 1)

-    def _generate_non_cumulative_bar_plot_data(
-        self, benchmarking_results_df: pl.DataFrame
-    ) -> pl.DataFrame:
+    def _generate_non_cumulative_bar_plot_data(self, benchmarking_results_df: pl.DataFrame) -> pl.DataFrame:
         """
         Generate data in the correct format for dataframe creation for a non-cumulative bar plot,
         appending to the self.stats attribute of the class.
         """
         return self._generate_stacked_data(benchmarking_results_df).hstack(
-            self._extract_mrr_data(benchmarking_results_df).select(
-                pl.col("Percentage").alias("MRR")
-            )
+            self._extract_mrr_data(benchmarking_results_df).select(pl.col("Percentage").alias("MRR"))
         )

     def generate_cumulative_bar(
@@ -309,9 +302,7 @@ def generate_plots(
     This method generates summary statistics bar plots based on the provided benchmarking results and plot type.
     """
     plot_generator = PlotGenerator(benchmark_name)
-    plot_customisation_type = getattr(
-        plot_customisation, f"{benchmark_output_type.prioritisation_type_string}_plots"
-    )
+    plot_customisation_type = getattr(plot_customisation, f"{benchmark_output_type.prioritisation_type_string}_plots")
     logger.info("Generating ROC curve visualisations.")
     plot_generator.generate_roc_curve(curves, benchmark_output_type, plot_customisation_type)
     logger.info("Generating Precision-Recall curves visualisations.")
@@ -355,8 +346,7 @@ def generate_plots_from_db(db_path: Path, config: Path) -> None:
         }
     for benchmark_output_type in BenchmarkOutputTypeEnum:
         summary_table = (
-            f"{benchmark_config_file.benchmark_name}_"
-            f"{benchmark_output_type.value.prioritisation_type_string}_summary"
+            f"{benchmark_config_file.benchmark_name}_{benchmark_output_type.value.prioritisation_type_string}_summary"
         )
         curve_table = (
             f"{benchmark_config_file.benchmark_name}_"
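Besides line reflows, the notable change here is the ClassVar annotation on the shared colour palette, which makes explicit that the list is a class-level constant rather than a per-instance attribute (and satisfies linters such as Ruff's mutable-class-default rule). A trimmed illustration:

from typing import ClassVar


class PlotGenerator:
    # One palette shared by every PlotGenerator instance; ClassVar documents
    # (and lets type checkers enforce) that this is not an instance attribute.
    palette_hex_codes: ClassVar[list[str]] = ["#f4ae3d", "#ee5825", "#2b7288"]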
pheval/analyse/generate_rank_comparisons.py
CHANGED
@@ -1,5 +1,4 @@
 from itertools import combinations
-from typing import List

 import polars as pl
 from duckdb.duckdb import DuckDBPyConnection
@@ -11,7 +10,7 @@ from pheval.utils.logger import get_logger

 def calculate_rank_changes(
     conn: DuckDBPyConnection,
-    run_identifiers: List[str],
+    run_identifiers: list[str],
     true_positive_cases: pl.DataFrame,
     benchmark_type: BenchmarkOutputType,
 ) -> None:
pheval/analyse/rank_stats.py
CHANGED
@@ -1,5 +1,4 @@
 from dataclasses import dataclass
-from typing import List

 import numpy as np
 import polars as pl
@@ -81,11 +80,7 @@ class Ranks:
         """
         precision_expr = pl.col(f"top{k}") / (pl.col("number_of_samples") * k)
         recall_expr = pl.col(f"top{k}") / pl.col("total")
-        return (
-            ((2 * precision_expr * recall_expr) / (precision_expr + recall_expr))
-            .fill_nan(0)
-            .alias(f"f_beta@{k}")
-        )
+        return ((2 * precision_expr * recall_expr) / (precision_expr + recall_expr)).fill_nan(0).alias(f"f_beta@{k}")

     @classmethod
     def _average_precision_at_k(cls, df: pl.LazyFrame, k: int) -> pl.LazyFrame:
@@ -103,9 +98,7 @@ class Ranks:
         filtered_df = cls._filter_results(df, k)
         df_grouped = filtered_df.with_columns(
             pl.struct("ranks")
-            .map_elements(
-                lambda row: cls._compute_ap_k(np.array(row["ranks"])), return_dtype=pl.Float64
-            )
+            .map_elements(lambda row: cls._compute_ap_k(np.array(row["ranks"])), return_dtype=pl.Float64)
             .alias(f"ap@{k}")
         )
         return df_grouped.select(["file_path", f"ap@{k}"])
@@ -131,7 +124,7 @@ class Ranks:
         return ap_sum / num_samples

     @classmethod
-    def _calculate_ndcg_at_k(cls, ranks: List[int], k: int) -> float:
+    def _calculate_ndcg_at_k(cls, ranks: list[int], k: int) -> float:
         """
         Compute NDCG@K for a single query.
         Args:
@@ -146,9 +139,7 @@ class Ranks:
         result_ranks[valid_indices] = 3
         ideal_ranking = np.sort(result_ranks)[::-1]
         return (
-            ndcg_score(result_ranks.reshape(1, -1), ideal_ranking.reshape(1, -1))
-            if np.sum(result_ranks) > 0
-            else 0.0
+            ndcg_score(result_ranks.reshape(1, -1), ideal_ranking.reshape(1, -1)) if np.sum(result_ranks) > 0 else 0.0
         )

     @classmethod
@@ -156,9 +147,7 @@ class Ranks:
         filtered_df = cls._filter_results(df, k)
         ndcg_df = filtered_df.with_columns(
             pl.struct("ranks")
-            .map_elements(
-                lambda row: cls._calculate_ndcg_at_k(row["ranks"], k), return_dtype=pl.Float64
-            )
+            .map_elements(lambda row: cls._calculate_ndcg_at_k(row["ranks"], k), return_dtype=pl.Float64)
             .alias(f"NDCG@{k}")
         )
         ndcg_sum = ndcg_df.select(pl.col(f"NDCG@{k}").sum()).collect().item()
@@ -218,14 +207,8 @@ def compute_rank_stats(run_identifier: str, result_scan: pl.LazyFrame) -> pl.LazyFrame:
             pl.lit(Ranks.mean_average_precision_at_k(true_positive_scan, 3)).alias("MAP@3"),
             pl.lit(Ranks.mean_average_precision_at_k(true_positive_scan, 5)).alias("MAP@5"),
             pl.lit(Ranks.mean_average_precision_at_k(true_positive_scan, 10)).alias("MAP@10"),
-            pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 3)).alias(
-                "NDCG@3"
-            ),
-            pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 5)).alias(
-                "NDCG@5"
-            ),
-            pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 10)).alias(
-                "NDCG@10"
-            ),
+            pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 3)).alias("NDCG@3"),
+            pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 5)).alias("NDCG@5"),
+            pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 10)).alias("NDCG@10"),
         ]
     )
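The f_beta@k expression above is the harmonic mean of precision@k and recall@k, with the 0/0 case mapped to 0 via fill_nan(0). A scalar sketch with a worked example (helper name and numbers are illustrative only):

def f_beta_at_k(top_k: int, number_of_samples: int, total: int, k: int) -> float:
    # precision@k and recall@k as in the Polars expression above.
    precision = top_k / (number_of_samples * k)
    recall = top_k / total
    return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0


# e.g. 30 hits in the top 3 over 50 samples with 60 causal entities in total:
# precision@3 = 30/150 = 0.2, recall@3 = 30/60 = 0.5, f_beta@3 = 2*0.2*0.5/0.7 ≈ 0.286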
pheval/analyse/run_data_parser.py
CHANGED
@@ -1,5 +1,4 @@
 from pathlib import Path
-from typing import List, Optional

 import yaml
 from pydantic import BaseModel, field_validator
@@ -28,8 +27,8 @@ class RunConfig(BaseModel):
     gene_analysis: bool
     variant_analysis: bool
     disease_analysis: bool
-    threshold: Optional[float]
-    score_order: Optional[str]
+    threshold: float | None
+    score_order: str | None

     @field_validator("threshold", mode="before")
     @classmethod
@@ -41,6 +40,13 @@ class RunConfig(BaseModel):
     def set_score_order(cls, score_order):
         return score_order or "descending"

+    @field_validator("results_dir", mode="after")
+    @classmethod
+    def check_results_dir_exists(cls, results_dir: Path):
+        if not results_dir.exists():
+            raise FileNotFoundError(f"The specified results directory does not exist: {results_dir}")
+        return results_dir
+

 class SinglePlotCustomisation(BaseModel):
     """
@@ -53,10 +59,10 @@ class SinglePlotCustomisation(BaseModel):
         precision_recall_title (str): The title for the precision-recall plot.
     """

-    plot_type: Optional[str] = "bar_cumulative"
-    rank_plot_title: Optional[str]
-    roc_curve_title: Optional[str]
-    precision_recall_title: Optional[str]
+    plot_type: str | None = "bar_cumulative"
+    rank_plot_title: str | None
+    roc_curve_title: str | None
+    precision_recall_title: str | None

     @field_validator("plot_type", mode="before")
     @classmethod
@@ -86,7 +92,7 @@ class Config(BaseModel):
     """

     benchmark_name: str
-    runs: List[RunConfig]
+    runs: list[RunConfig]
     plot_customisation: PlotCustomisation


@@ -100,7 +106,7 @@ def parse_run_config(run_config: Path) -> Config:
     """
     logger = get_logger()
     logger.info(f"Loading benchmark configuration from {run_config}")
-    with open(run_config, "r") as f:
+    with open(run_config) as f:
         config_data = yaml.safe_load(f)
         f.close()
     config = Config(**config_data)
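The main addition is the results_dir validator, so a missing results directory now fails at configuration-parsing time rather than partway through benchmarking. A reduced sketch of the pattern, showing only the field the validator needs:

from pathlib import Path

from pydantic import BaseModel, field_validator


class RunConfig(BaseModel):
    results_dir: Path  # other required fields are omitted in this sketch

    @field_validator("results_dir", mode="after")
    @classmethod
    def check_results_dir_exists(cls, results_dir: Path) -> Path:
        # Fail fast: raise as soon as the config is parsed if the directory is missing.
        if not results_dir.exists():
            raise FileNotFoundError(f"The specified results directory does not exist: {results_dir}")
        return results_dir


# Constructing RunConfig(results_dir=Path("missing/")) for a non-existent path
# now raises immediately instead of failing later during benchmarking.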
pheval/cli.py
CHANGED
pheval/cli_pheval_utils.py
CHANGED
@@ -1,7 +1,6 @@
 """PhEval utils Command Line Interface"""

 from pathlib import Path
-from typing import List

 import click

@@ -39,9 +38,7 @@ from pheval.utils.utils import semsim_scramble
     "-c",
     required=True,
     multiple=True,
-    type=click.Choice(
-        ["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False
-    ),
+    type=click.Choice(["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False),
     help="Score column that will be scrambled",
 )
 @click.option(
@@ -54,9 +51,7 @@ from pheval.utils.utils import semsim_scramble
     help="""Scramble Magnitude (noise)
     that will be applied to semantic similarity score column (e.g. jaccard similarity).""",
 )
-def semsim_scramble_command(
-    input: Path, output: Path, score_column: List[str], scramble_factor: float
-):
+def semsim_scramble_command(input: Path, output: Path, score_column: list[str], scramble_factor: float):
     """Scrambles semsim profile multiplying score value by scramble factor
     Args:
         input (Path): Path file that points out to the semsim profile
@@ -125,9 +120,7 @@ def scramble_phenopackets_command(
     if phenopacket_path is None and phenopacket_dir is None:
         raise InputError("Either a phenopacket or phenopacket directory must be specified")
     else:
-        scramble_phenopackets(
-            output_dir, phenopacket_path, phenopacket_dir, scramble_factor, local_ontology_cache
-        )
+        scramble_phenopackets(output_dir, phenopacket_path, phenopacket_dir, scramble_factor, local_ontology_cache)


 @click.command("semsim-comparison")
@@ -149,9 +142,7 @@ def scramble_phenopackets_command(
     "--score-column",
     "-c",
     required=True,
-    type=click.Choice(
-        ["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False
-    ),
+    type=click.Choice(["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False),
     help="Score column that will be used in comparison",
 )
 @click.option(
@@ -232,9 +223,7 @@ def semsim_comparison(
     help="Gene identifier to add to phenopacket",
     type=click.Choice(["ensembl_id", "entrez_id", "hgnc_id"]),
 )
-def update_phenopackets_command(
-    phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path, gene_identifier: str
-):
+def update_phenopackets_command(phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path, gene_identifier: str):
     """Update gene symbols and identifiers for phenopackets."""
     if phenopacket_path is None and phenopacket_dir is None:
         raise InputError("Either a phenopacket or phenopacket directory must be specified")
@@ -313,10 +302,10 @@ def create_spiked_vcfs_command(
     phenopacket_path: Path,
     phenopacket_dir: Path,
     output_dir: Path,
-    hg19_template_vcf: Path = None,
-    hg38_template_vcf: Path = None,
-    hg19_vcf_dir: Path = None,
-    hg38_vcf_dir: Path = None,
+    hg19_template_vcf: Path | None = None,
+    hg38_template_vcf: Path | None = None,
+    hg19_vcf_dir: Path | None = None,
+    hg38_vcf_dir: Path | None = None,
 ):
     """
     Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -394,9 +383,7 @@ def benchmark(
     This is the path where the phenotypic database folder will be written out.""",
     type=Path,
 )
-def semsim_to_exomiserdb_command(
-    input_file: Path, object_prefix: str, subject_prefix: str, db_path: Path
-):
+def semsim_to_exomiserdb_command(input_file: Path, object_prefix: str, subject_prefix: str, db_path: Path):
     """ingests semsim file into exomiser phenotypic database

     Args:
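These hunks are line-length reflows plus explicit Path | None defaults for the optional VCF inputs. As a reminder of how the repeated click.Choice option used above behaves, a cut-down hypothetical command (not the full pheval CLI):

import click


@click.command("semsim-scramble")
@click.option(
    "--score-column",
    "-c",
    required=True,
    multiple=True,  # the flag may be passed several times; click collects the values into a tuple
    type=click.Choice(["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False),
    help="Score column that will be scrambled",
)
def semsim_scramble_demo(score_column: tuple[str, ...]):
    click.echo(f"Scrambling columns: {', '.join(score_column)}")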
pheval/config_parser.py
CHANGED
@@ -39,7 +39,7 @@ class InputDirConfig:
 def parse_input_dir_config(input_dir: Path) -> InputDirConfig:
     """Reads the config file."""
     logger.info(f"Parsing config.yaml located in {input_dir}.")
-    with open(Path(input_dir).joinpath("config.yaml"), "r") as config_file:
+    with open(Path(input_dir).joinpath("config.yaml")) as config_file:
         config = yaml.safe_load(config_file)
         config_file.close()
     return from_yaml(InputDirConfig, yaml.dump(config))
pheval/implementations/__init__.py
CHANGED
@@ -15,11 +15,9 @@ def get_implementation_resolver() -> ClassResolver[PhEvalRunner]:
     Returns:
         ClassResolver[PhEvalRunner]: _description_
     """
-    implementation_resolver: PhevalClassResolver[PhEvalRunner] = (
-        PhevalClassResolver.from_subclasses(
-            PhEvalRunner,
-            suffix="Implementation",
-        )
+    implementation_resolver: PhevalClassResolver[PhEvalRunner] = PhevalClassResolver.from_subclasses(
+        PhEvalRunner,
+        suffix="Implementation",
     )

     # implementation_resolver.synonyms.update(
pheval/infra/exomiserdb.py
CHANGED
@@ -12,9 +12,7 @@ info_debug = log.getLogger("debug")


 class DBConnector:
-    def __init__(
-        self, jar: Path, driver: str, server: str, database: str, user: str, password: str
-    ):
+    def __init__(self, jar: Path, driver: str, server: str, database: str, user: str, password: str):
         self.jar = jar
         self.driver = driver
         self.server = server
@@ -63,7 +61,7 @@ class DBConnection:
 class ExomiserDB:
     def __init__(self, db_path: Path):
         try:
-            self.connector = DBConnector(
+            self.connector = DBConnector(
                 jar=os.path.join(os.path.dirname(__file__), "../../../lib/h2-1.4.199.jar"),
                 driver="org.h2.Driver",
                 server=f"jdbc:h2:{db_path}",
@@ -89,7 +87,7 @@ class ExomiserDB:
         batches = reader.next_batches(batch_length)
         cursor = conn.get_cursor()
         # # TODO: Refactor this
-        with open(input_file, "r") as f:
+        with open(input_file) as f:
             total = sum(1 for line in f)
             pbar = tqdm(total=total - 1)
             mapping_id = 1
@@ -112,12 +110,10 @@ def _format_row(mapping_id, data):
         data (_type_): row data
     """
     # TODO:Improve string escaping. Replace this code with parametrised query
-    return f"""({mapping_id}, '{data[
+    return f"""({mapping_id}, '{data["subject_id"]}', '{data["subject_label"].replace("'", "")}', '{data["object_id"]}', '{data["object_label"].replace("'", "")}', {data["jaccard_similarity"]}, {data["ancestor_information_content"]}, {data["phenodigm_score"]}, '{data["ancestor_id"].split(",")[0]}', '{data["ancestor_label"].replace("'", "")}')"""  # noqa


-def _semsim2h2(
-    input_data: pl.DataFrame, subject_prefix: str, object_prefix: str, mapping_id=1
-) -> None:
+def _semsim2h2(input_data: pl.DataFrame, subject_prefix: str, object_prefix: str, mapping_id=1) -> None:
     """This function is responsible for generate sql insertion query for each semsim profile row

     Args:
@@ -130,12 +126,8 @@ def _semsim2h2(
     if mapping_id == 1:
         sql += f"TRUNCATE TABLE EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS;\n"

-    object_id = (
-        f"{object_prefix}_ID_HIT" if subject_prefix == object_prefix else f"{object_prefix}_ID"
-    )
-    object_term = (
-        f"{object_prefix}_HIT_TERM" if subject_prefix == object_prefix else f"{object_prefix}_TERM"
-    )
+    object_id = f"{object_prefix}_ID_HIT" if subject_prefix == object_prefix else f"{object_prefix}_ID"
+    object_term = f"{object_prefix}_HIT_TERM" if subject_prefix == object_prefix else f"{object_prefix}_TERM"
     sql += f"""INSERT INTO EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS
     (MAPPING_ID, {subject_prefix}_ID, {subject_prefix}_TERM, {object_id}, {object_term}, SIMJ, IC, SCORE, LCS_ID, LCS_TERM)
     VALUES"""