pheval 0.6.3__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)
  1. {pheval-0.6.3 → pheval-0.6.4}/PKG-INFO +1 -1
  2. {pheval-0.6.3 → pheval-0.6.4}/pyproject.toml +37 -11
  3. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/benchmark.py +8 -23
  4. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/benchmark_output_type.py +3 -5
  5. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/binary_classification_curves.py +3 -9
  6. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/binary_classification_stats.py +1 -4
  7. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/generate_plots.py +8 -18
  8. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/generate_rank_comparisons.py +1 -2
  9. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/rank_stats.py +8 -25
  10. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/run_data_parser.py +9 -12
  11. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/cli.py +1 -1
  12. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/cli_pheval_utils.py +10 -23
  13. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/config_parser.py +1 -1
  14. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/implementations/__init__.py +3 -5
  15. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/infra/exomiserdb.py +7 -15
  16. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/post_processing/phenopacket_truth_set.py +10 -31
  17. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/post_processing/post_processing.py +12 -33
  18. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/post_processing/validate_result_format.py +2 -4
  19. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/prepare/create_noisy_phenopackets.py +18 -29
  20. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/prepare/create_spiked_vcf.py +25 -56
  21. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/prepare/custom_exceptions.py +6 -7
  22. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/prepare/prepare_corpus.py +6 -17
  23. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/prepare/update_phenopacket.py +6 -17
  24. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/docs_gen.py +3 -3
  25. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/file_utils.py +1 -2
  26. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/phenopacket_utils.py +41 -73
  27. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/semsim_utils.py +6 -10
  28. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/utils.py +3 -4
  29. {pheval-0.6.3 → pheval-0.6.4}/LICENSE +0 -0
  30. {pheval-0.6.3 → pheval-0.6.4}/README.md +0 -0
  31. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/__init__.py +0 -0
  32. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/__init__.py +0 -0
  33. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/benchmark_db_manager.py +0 -0
  34. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/cli_pheval.py +0 -0
  35. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/implementations/pheval_class_resolver.py +0 -0
  36. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/infra/__init__.py +0 -0
  37. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/post_processing/__init__.py +0 -0
  38. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/post_processing/mondo_mapping.py +0 -0
  39. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/prepare/__init__.py +0 -0
  40. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
  41. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
  42. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
  43. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
  44. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
  45. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
  46. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
  47. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/hgnc_complete_set.txt +0 -0
  48. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/metadata.json +0 -0
  49. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/resources/mondo.sssom.tsv +0 -0
  50. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/run_metadata.py +0 -0
  51. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/runners/__init__.py +0 -0
  52. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/runners/runner.py +0 -0
  53. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/__init__.py +0 -0
  54. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/docs_gen.sh +0 -0
  55. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/exomiser.py +0 -0
  56. {pheval-0.6.3 → pheval-0.6.4}/src/pheval/utils/logger.py +0 -0
{pheval-0.6.3 → pheval-0.6.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pheval
- Version: 0.6.3
+ Version: 0.6.4
  Summary:
  Author: Yasemin Bridges
  Author-email: y.bridges@qmul.ac.uk
{pheval-0.6.3 → pheval-0.6.4}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "pheval"
- version = "0.6.3"
+ version = "0.6.4"
  description = ""
  authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
  "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",
@@ -54,23 +54,49 @@ mkdocstrings-python = "^1.16.11"
  mkdocstrings = "^0.29.1"

  [tool.poetry.group.dev.dependencies]
- black = "^22.12.0"
+ ruff = "^0.4.6"

  [tool.pytest.ini_options]
  pythonpath = [
      "src"
  ]

- [tool.black]
- line-length = 100
- target-version = ["py39", "py310"]
+ [tool.ruff]
+ line-length = 120
+ target-version = "py311"

- [tool.isort]
- profile = "black"
- multi_line_output = 3
- line_length = 100
- include_trailing_comma = true
- reverse_relative = true
+ src = ["src", "tests"]
+ respect-gitignore = true
+
+ [tool.ruff.lint]
+ select = [
+     "E", # pycodestyle errors
+     "W", # pycodestyle warnings
+     "F", # pyflakes
+     "B", # flake8-bugbear
+     "I", # isort
+     "N", # pep8-naming
+     "S", # flake8-bandit
+     "C90", # mccabe complexity
+     "PL", # pylint
+     "RUF", # Ruff-specific rules
+     "UP", # pyupgrade
+     "NPY", # NumPy-specific
+ ]
+ ignore = [
+     "E203", "S311", "S101", "S106", "S404", "S108", "S307", "S603", "S607", "S608",
+     "B024", "B027", "N801", "N803", "N806", "N815", "E731", "C901", "B019", "PLR0913", "PLW0211"
+ ]
+ fixable = ["ALL"]
+ unfixable = ["B"]
+
+ [tool.ruff.lint.mccabe]
+ max-complexity = 13
+
+ [tool.ruff.format]
+ quote-style = "double"
+ indent-style = "space"
+ line-ending = "auto"

  [build-system]
  requires = ["poetry-core"]
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/benchmark.py
@@ -1,7 +1,6 @@
  import sys
  import time
  from pathlib import Path
- from typing import List, Tuple

  import duckdb
  import polars as pl
@@ -54,8 +53,8 @@ def scan_directory(run: RunConfig, benchmark_type: BenchmarkOutputType) -> pl.La


  def process_stats(
-     runs: List[RunConfig], benchmark_type: BenchmarkOutputType
- ) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
+     runs: list[RunConfig], benchmark_type: BenchmarkOutputType
+ ) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
      """
      Processes stats outputs for specified runs to compare.
      Args:
@@ -75,9 +74,7 @@ def process_stats(
          curve_results.append(compute_curves(run.run_identifier, result_scan))
          true_positive_cases.append(
              result_scan.filter(pl.col("true_positive"))
-             .select(
-                 ["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)]
-             )
+             .select(["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)])
              .sort(["result_file", *benchmark_type.columns])
          )
      return (
@@ -87,11 +84,7 @@ def process_stats(
            [true_positive_cases[0]]
            + [
                df.select(
-                   [
-                       col
-                       for col in df.collect_schema().keys()
-                       if col not in ["result_file", *benchmark_type.columns]
-                   ]
+                   [col for col in df.collect_schema().keys() if col not in ["result_file", *benchmark_type.columns]]
                )
                for df in true_positive_cases[1:]
            ],
@@ -109,20 +102,14 @@ def benchmark(config: Config, benchmark_type: BenchmarkOutputType) -> None:
      """
      conn = duckdb.connect(f"{config.benchmark_name}.duckdb")
      stats, curve_results, true_positive_cases = process_stats(config.runs, benchmark_type)
-     write_table(
-         conn, stats, f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_summary"
-     )
+     write_table(conn, stats, f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_summary")
      write_table(
          conn,
          curve_results,
          f"{config.benchmark_name}_{benchmark_type.prioritisation_type_string}_binary_classification_curves",
      )
-     calculate_rank_changes(
-         conn, [run.run_identifier for run in config.runs], true_positive_cases, benchmark_type
-     )
-     generate_plots(
-         config.benchmark_name, stats, curve_results, benchmark_type, config.plot_customisation
-     )
+     calculate_rank_changes(conn, [run.run_identifier for run in config.runs], true_positive_cases, benchmark_type)
+     generate_plots(config.benchmark_name, stats, curve_results, benchmark_type, config.plot_customisation)
      conn.close()


@@ -175,6 +162,4 @@ def benchmark_runs(benchmark_config_file: Path) -> None:
          BenchmarkOutputTypeEnum.DISEASE.value,
      )
      logger.info("Finished benchmarking for disease results.")
-     logger.info(
-         f"Finished benchmarking! Total time: {time.perf_counter() - start_time:.2f} seconds."
-     )
+     logger.info(f"Finished benchmarking! Total time: {time.perf_counter() - start_time:.2f} seconds.")
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/benchmark_output_type.py
@@ -1,5 +1,5 @@
  from enum import Enum
- from typing import List, NamedTuple
+ from typing import NamedTuple


  class BenchmarkOutputType(NamedTuple):
@@ -15,7 +15,7 @@ class BenchmarkOutputType(NamedTuple):

      prioritisation_type_string: str
      y_label: str
-     columns: List[str]
+     columns: list[str]
      result_directory: str


@@ -35,9 +35,7 @@ class BenchmarkOutputTypeEnum(Enum):
          ["gene_identifier", "gene_symbol"],
          "pheval_gene_results",
      )
-     VARIANT = BenchmarkOutputType(
-         "variant", "Disease-causing variants (%)", ["variant_id"], "pheval_variant_results"
-     )
+     VARIANT = BenchmarkOutputType("variant", "Disease-causing variants (%)", ["variant_id"], "pheval_variant_results")
      DISEASE = BenchmarkOutputType(
          "disease",
          "Known diseases (%)",
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/binary_classification_curves.py
@@ -1,5 +1,3 @@
- from typing import Tuple
-
  import numpy as np
  import polars as pl
  from sklearn.metrics import precision_recall_curve, roc_curve
@@ -11,7 +9,7 @@ class BinaryClassificationCurves:
      """Class for computing and storing ROC & Precision-Recall curves in Polars."""

      @staticmethod
-     def _compute_finite_bounds(result_scan: pl.LazyFrame) -> Tuple[float, float]:
+     def _compute_finite_bounds(result_scan: pl.LazyFrame) -> tuple[float, float]:
          """
          Compute min and max finite values in the 'score' column to handle NaN and Inf values.
          Args:
@@ -32,9 +30,7 @@ class BinaryClassificationCurves:
          )

      @staticmethod
-     def _clean_and_extract_data(
-         result_scan: pl.LazyFrame, max_finite: float, min_finite: float
-     ) -> pl.LazyFrame:
+     def _clean_and_extract_data(result_scan: pl.LazyFrame, max_finite: float, min_finite: float) -> pl.LazyFrame:
          """
          Normalise the 'score' column (handling NaNs and Inf values) and extract 'true_positive' labels.

@@ -64,9 +60,7 @@ class BinaryClassificationCurves:
          )

      @staticmethod
-     def _compute_roc_pr_curves(
-         run_identifier: str, labels: np.ndarray, scores: np.ndarray
-     ) -> pl.LazyFrame:
+     def _compute_roc_pr_curves(run_identifier: str, labels: np.ndarray, scores: np.ndarray) -> pl.LazyFrame:
          """
          Compute ROC and Precision-Recall curves.

{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/binary_classification_stats.py
@@ -103,10 +103,7 @@ class BinaryClassificationStats:
      )

      F1_SCORE = (
-         pl.when(
-             2 * (pl.col("true_positives") + pl.col("false_positives") + pl.col("false_negatives"))
-             != 0
-         )
+         pl.when(2 * (pl.col("true_positives") + pl.col("false_positives") + pl.col("false_negatives")) != 0)
          .then(
              2
              * pl.col("true_positives")
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/generate_plots.py
@@ -1,5 +1,6 @@
  from enum import Enum
  from pathlib import Path
+ from typing import ClassVar

  import duckdb
  import matplotlib
@@ -32,7 +33,7 @@ class PlotTypes(Enum):
  class PlotGenerator:
      """Class to generate plots."""

-     palette_hex_codes = [
+     palette_hex_codes: ClassVar[list[str]] = [
          "#f4ae3d",
          "#ee5825",
          "#2b7288",
@@ -91,9 +92,7 @@ class PlotGenerator:
              {"run_identifier": "Run", "mrr": "Percentage"}
          )

-     def _save_fig(
-         self, benchmark_output_type: BenchmarkOutputType, y_lower_limit: int, y_upper_limit: int
-     ) -> None:
+     def _save_fig(self, benchmark_output_type: BenchmarkOutputType, y_lower_limit: int, y_upper_limit: int) -> None:
          """
          Save the generated figure.
          Args:
@@ -140,9 +139,7 @@ class PlotGenerator:
              legend=False,
              edgecolor="white",
          )
-         plt.title(
-             f"{benchmark_output_type.prioritisation_type_string.capitalize()} results - mean reciprocal rank"
-         )
+         plt.title(f"{benchmark_output_type.prioritisation_type_string.capitalize()} results - mean reciprocal rank")
          self._save_fig(benchmark_output_type, 0, 1)

      @staticmethod
@@ -189,17 +186,13 @@ class PlotGenerator:
          plt.title(plot_customisation.rank_plot_title, loc="center", fontsize=15)
          self._save_fig(benchmark_output_type, 0, 1)

-     def _generate_non_cumulative_bar_plot_data(
-         self, benchmarking_results_df: pl.DataFrame
-     ) -> pl.DataFrame:
+     def _generate_non_cumulative_bar_plot_data(self, benchmarking_results_df: pl.DataFrame) -> pl.DataFrame:
          """
          Generate data in the correct format for dataframe creation for a non-cumulative bar plot,
          appending to the self.stats attribute of the class.
          """
          return self._generate_stacked_data(benchmarking_results_df).hstack(
-             self._extract_mrr_data(benchmarking_results_df).select(
-                 pl.col("Percentage").alias("MRR")
-             )
+             self._extract_mrr_data(benchmarking_results_df).select(pl.col("Percentage").alias("MRR"))
          )

      def generate_cumulative_bar(
@@ -309,9 +302,7 @@ def generate_plots(
      This method generates summary statistics bar plots based on the provided benchmarking results and plot type.
      """
      plot_generator = PlotGenerator(benchmark_name)
-     plot_customisation_type = getattr(
-         plot_customisation, f"{benchmark_output_type.prioritisation_type_string}_plots"
-     )
+     plot_customisation_type = getattr(plot_customisation, f"{benchmark_output_type.prioritisation_type_string}_plots")
      logger.info("Generating ROC curve visualisations.")
      plot_generator.generate_roc_curve(curves, benchmark_output_type, plot_customisation_type)
      logger.info("Generating Precision-Recall curves visualisations.")
@@ -355,8 +346,7 @@ def generate_plots_from_db(db_path: Path, config: Path) -> None:
      }
      for benchmark_output_type in BenchmarkOutputTypeEnum:
          summary_table = (
-             f"{benchmark_config_file.benchmark_name}_"
-             f"{benchmark_output_type.value.prioritisation_type_string}_summary"
+             f"{benchmark_config_file.benchmark_name}_{benchmark_output_type.value.prioritisation_type_string}_summary"
          )
          curve_table = (
              f"{benchmark_config_file.benchmark_name}_"
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/generate_rank_comparisons.py
@@ -1,5 +1,4 @@
  from itertools import combinations
- from typing import List

  import polars as pl
  from duckdb.duckdb import DuckDBPyConnection
@@ -11,7 +10,7 @@ from pheval.utils.logger import get_logger

  def calculate_rank_changes(
      conn: DuckDBPyConnection,
-     run_identifiers: List[str],
+     run_identifiers: list[str],
      true_positive_cases: pl.DataFrame,
      benchmark_type: BenchmarkOutputType,
  ) -> None:
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/rank_stats.py
@@ -1,5 +1,4 @@
  from dataclasses import dataclass
- from typing import List

  import numpy as np
  import polars as pl
@@ -81,11 +80,7 @@ class Ranks:
          """
          precision_expr = pl.col(f"top{k}") / (pl.col("number_of_samples") * k)
          recall_expr = pl.col(f"top{k}") / pl.col("total")
-         return (
-             ((2 * precision_expr * recall_expr) / (precision_expr + recall_expr))
-             .fill_nan(0)
-             .alias(f"f_beta@{k}")
-         )
+         return ((2 * precision_expr * recall_expr) / (precision_expr + recall_expr)).fill_nan(0).alias(f"f_beta@{k}")

      @classmethod
      def _average_precision_at_k(cls, df: pl.LazyFrame, k: int) -> pl.LazyFrame:
@@ -103,9 +98,7 @@ class Ranks:
          filtered_df = cls._filter_results(df, k)
          df_grouped = filtered_df.with_columns(
              pl.struct("ranks")
-             .map_elements(
-                 lambda row: cls._compute_ap_k(np.array(row["ranks"])), return_dtype=pl.Float64
-             )
+             .map_elements(lambda row: cls._compute_ap_k(np.array(row["ranks"])), return_dtype=pl.Float64)
              .alias(f"ap@{k}")
          )
          return df_grouped.select(["file_path", f"ap@{k}"])
@@ -131,7 +124,7 @@ class Ranks:
          return ap_sum / num_samples

      @classmethod
-     def _calculate_ndcg_at_k(cls, ranks: List[int], k: int) -> float:
+     def _calculate_ndcg_at_k(cls, ranks: list[int], k: int) -> float:
          """
          Compute NDCG@K for a single query.
          Args:
@@ -146,9 +139,7 @@ class Ranks:
          result_ranks[valid_indices] = 3
          ideal_ranking = np.sort(result_ranks)[::-1]
          return (
-             ndcg_score(result_ranks.reshape(1, -1), ideal_ranking.reshape(1, -1))
-             if np.sum(result_ranks) > 0
-             else 0.0
+             ndcg_score(result_ranks.reshape(1, -1), ideal_ranking.reshape(1, -1)) if np.sum(result_ranks) > 0 else 0.0
          )

      @classmethod
@@ -156,9 +147,7 @@
          filtered_df = cls._filter_results(df, k)
          ndcg_df = filtered_df.with_columns(
              pl.struct("ranks")
-             .map_elements(
-                 lambda row: cls._calculate_ndcg_at_k(row["ranks"], k), return_dtype=pl.Float64
-             )
+             .map_elements(lambda row: cls._calculate_ndcg_at_k(row["ranks"], k), return_dtype=pl.Float64)
              .alias(f"NDCG@{k}")
          )
          ndcg_sum = ndcg_df.select(pl.col(f"NDCG@{k}").sum()).collect().item()
@@ -218,14 +207,8 @@ def compute_rank_stats(run_identifier: str, result_scan: pl.LazyFrame) -> pl.Laz
              pl.lit(Ranks.mean_average_precision_at_k(true_positive_scan, 3)).alias("MAP@3"),
              pl.lit(Ranks.mean_average_precision_at_k(true_positive_scan, 5)).alias("MAP@5"),
              pl.lit(Ranks.mean_average_precision_at_k(true_positive_scan, 10)).alias("MAP@10"),
-             pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 3)).alias(
-                 "NDCG@3"
-             ),
-             pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 5)).alias(
-                 "NDCG@5"
-             ),
-             pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 10)).alias(
-                 "NDCG@10"
-             ),
+             pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 3)).alias("NDCG@3"),
+             pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 5)).alias("NDCG@5"),
+             pl.lit(Ranks.mean_normalised_discounted_cumulative_gain(true_positive_scan, 10)).alias("NDCG@10"),
          ]
      )
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/analyse/run_data_parser.py
@@ -1,5 +1,4 @@
  from pathlib import Path
- from typing import List, Optional

  import yaml
  from pydantic import BaseModel, field_validator
@@ -28,8 +27,8 @@ class RunConfig(BaseModel):
      gene_analysis: bool
      variant_analysis: bool
      disease_analysis: bool
-     threshold: Optional[float]
-     score_order: Optional[str]
+     threshold: float | None
+     score_order: str | None

      @field_validator("threshold", mode="before")
      @classmethod
@@ -45,9 +44,7 @@ class RunConfig(BaseModel):
      @classmethod
      def check_results_dir_exists(cls, results_dir: Path):
          if not results_dir.exists():
-             raise FileNotFoundError(
-                 f"The specified results directory does not exist: {results_dir}"
-             )
+             raise FileNotFoundError(f"The specified results directory does not exist: {results_dir}")
          return results_dir


@@ -62,10 +59,10 @@ class SinglePlotCustomisation(BaseModel):
          precision_recall_title (str): The title for the precision-recall plot.
      """

-     plot_type: Optional[str] = "bar_cumulative"
-     rank_plot_title: Optional[str]
-     roc_curve_title: Optional[str]
-     precision_recall_title: Optional[str]
+     plot_type: str | None = "bar_cumulative"
+     rank_plot_title: str | None
+     roc_curve_title: str | None
+     precision_recall_title: str | None

      @field_validator("plot_type", mode="before")
      @classmethod
@@ -95,7 +92,7 @@ class Config(BaseModel):
      """

      benchmark_name: str
-     runs: List[RunConfig]
+     runs: list[RunConfig]
      plot_customisation: PlotCustomisation


@@ -109,7 +106,7 @@ def parse_run_config(run_config: Path) -> Config:
      """
      logger = get_logger()
      logger.info(f"Loading benchmark configuration from {run_config}")
-     with open(run_config, "r") as f:
+     with open(run_config) as f:
          config_data = yaml.safe_load(f)
          f.close()
      config = Config(**config_data)
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/cli.py
@@ -29,7 +29,7 @@ def main(ctx, verbose=1, quiet=False):
      """Main CLI method for PhEval."""
      initialise_context(ctx)

-     if verbose >= 2:
+     if verbose >= 2:  # noqa
          logger.setLevel(logging.DEBUG)
      elif verbose == 1:
          logger.setLevel(logging.INFO)
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/cli_pheval_utils.py
@@ -1,7 +1,6 @@
  """PhEval utils Command Line Interface"""

  from pathlib import Path
- from typing import List

  import click

@@ -39,9 +38,7 @@ from pheval.utils.utils import semsim_scramble
      "-c",
      required=True,
      multiple=True,
-     type=click.Choice(
-         ["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False
-     ),
+     type=click.Choice(["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False),
      help="Score column that will be scrambled",
  )
  @click.option(
@@ -54,9 +51,7 @@ from pheval.utils.utils import semsim_scramble
      help="""Scramble Magnitude (noise)
      that will be applied to semantic similarity score column (e.g. jaccard similarity).""",
  )
- def semsim_scramble_command(
-     input: Path, output: Path, score_column: List[str], scramble_factor: float
- ):
+ def semsim_scramble_command(input: Path, output: Path, score_column: list[str], scramble_factor: float):
      """Scrambles semsim profile multiplying score value by scramble factor
      Args:
          input (Path): Path file that points out to the semsim profile
@@ -125,9 +120,7 @@ def scramble_phenopackets_command(
      if phenopacket_path is None and phenopacket_dir is None:
          raise InputError("Either a phenopacket or phenopacket directory must be specified")
      else:
-         scramble_phenopackets(
-             output_dir, phenopacket_path, phenopacket_dir, scramble_factor, local_ontology_cache
-         )
+         scramble_phenopackets(output_dir, phenopacket_path, phenopacket_dir, scramble_factor, local_ontology_cache)


  @click.command("semsim-comparison")
@@ -149,9 +142,7 @@ def scramble_phenopackets_command(
      "--score-column",
      "-c",
      required=True,
-     type=click.Choice(
-         ["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False
-     ),
+     type=click.Choice(["jaccard_similarity", "dice_similarity", "phenodigm_score"], case_sensitive=False),
      help="Score column that will be used in comparison",
  )
  @click.option(
@@ -232,9 +223,7 @@ def semsim_comparison(
      help="Gene identifier to add to phenopacket",
      type=click.Choice(["ensembl_id", "entrez_id", "hgnc_id"]),
  )
- def update_phenopackets_command(
-     phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path, gene_identifier: str
- ):
+ def update_phenopackets_command(phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path, gene_identifier: str):
      """Update gene symbols and identifiers for phenopackets."""
      if phenopacket_path is None and phenopacket_dir is None:
          raise InputError("Either a phenopacket or phenopacket directory must be specified")
@@ -313,10 +302,10 @@ def create_spiked_vcfs_command(
      phenopacket_path: Path,
      phenopacket_dir: Path,
      output_dir: Path,
-     hg19_template_vcf: Path = None,
-     hg38_template_vcf: Path = None,
-     hg19_vcf_dir: Path = None,
-     hg38_vcf_dir: Path = None,
+     hg19_template_vcf: Path | None = None,
+     hg38_template_vcf: Path | None = None,
+     hg19_vcf_dir: Path | None = None,
+     hg38_vcf_dir: Path | None = None,
  ):
      """
      Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -394,9 +383,7 @@ def benchmark(
      This is the path where the phenotypic database folder will be written out.""",
      type=Path,
  )
- def semsim_to_exomiserdb_command(
-     input_file: Path, object_prefix: str, subject_prefix: str, db_path: Path
- ):
+ def semsim_to_exomiserdb_command(input_file: Path, object_prefix: str, subject_prefix: str, db_path: Path):
      """ingests semsim file into exomiser phenotypic database

      Args:
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/config_parser.py
@@ -39,7 +39,7 @@ class InputDirConfig:
  def parse_input_dir_config(input_dir: Path) -> InputDirConfig:
      """Reads the config file."""
      logger.info(f"Parsing config.yaml located in {input_dir}.")
-     with open(Path(input_dir).joinpath("config.yaml"), "r") as config_file:
+     with open(Path(input_dir).joinpath("config.yaml")) as config_file:
          config = yaml.safe_load(config_file)
          config_file.close()
      return from_yaml(InputDirConfig, yaml.dump(config))
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/implementations/__init__.py
@@ -15,11 +15,9 @@ def get_implementation_resolver() -> ClassResolver[PhEvalRunner]:
      Returns:
          ClassResolver[PhEvalRunner]: _description_
      """
-     implementation_resolver: PhevalClassResolver[PhEvalRunner] = (
-         PhevalClassResolver.from_subclasses(
-             PhEvalRunner,
-             suffix="Implementation",
-         )
+     implementation_resolver: PhevalClassResolver[PhEvalRunner] = PhevalClassResolver.from_subclasses(
+         PhEvalRunner,
+         suffix="Implementation",
      )

      # implementation_resolver.synonyms.update(
{pheval-0.6.3 → pheval-0.6.4}/src/pheval/infra/exomiserdb.py
@@ -12,9 +12,7 @@ info_debug = log.getLogger("debug")


  class DBConnector:
-     def __init__(
-         self, jar: Path, driver: str, server: str, database: str, user: str, password: str
-     ):
+     def __init__(self, jar: Path, driver: str, server: str, database: str, user: str, password: str):
          self.jar = jar
          self.driver = driver
          self.server = server
@@ -63,7 +61,7 @@ class DBConnection:
  class ExomiserDB:
      def __init__(self, db_path: Path):
          try:
-             self.connector = DBConnector(  # noqa
+             self.connector = DBConnector(
                  jar=os.path.join(os.path.dirname(__file__), "../../../lib/h2-1.4.199.jar"),
                  driver="org.h2.Driver",
                  server=f"jdbc:h2:{db_path}",
@@ -89,7 +87,7 @@ class ExomiserDB:
              batches = reader.next_batches(batch_length)
              cursor = conn.get_cursor()
              # # TODO: Refactor this
-             with open(input_file, "r") as f:
+             with open(input_file) as f:
                  total = sum(1 for line in f)
              pbar = tqdm(total=total - 1)
              mapping_id = 1
@@ -112,12 +110,10 @@ def _format_row(mapping_id, data):
          data (_type_): row data
      """
      # TODO:Improve string escaping. Replace this code with parametrised query
-     return f"""({mapping_id}, '{data['subject_id']}', '{data['subject_label'].replace("'", "")}', '{data['object_id']}', '{data['object_label'].replace("'", "")}', {data['jaccard_similarity']}, {data['ancestor_information_content']}, {data['phenodigm_score']}, '{data['ancestor_id'].split(",")[0]}', '{data['ancestor_label'].replace("'", "")}')"""  # noqa
+     return f"""({mapping_id}, '{data["subject_id"]}', '{data["subject_label"].replace("'", "")}', '{data["object_id"]}', '{data["object_label"].replace("'", "")}', {data["jaccard_similarity"]}, {data["ancestor_information_content"]}, {data["phenodigm_score"]}, '{data["ancestor_id"].split(",")[0]}', '{data["ancestor_label"].replace("'", "")}')"""  # noqa


- def _semsim2h2(
-     input_data: pl.DataFrame, subject_prefix: str, object_prefix: str, mapping_id=1
- ) -> None:
+ def _semsim2h2(input_data: pl.DataFrame, subject_prefix: str, object_prefix: str, mapping_id=1) -> None:
      """This function is responsible for generate sql insertion query for each semsim profile row

      Args:
@@ -130,12 +126,8 @@ def _semsim2h2(
      if mapping_id == 1:
          sql += f"TRUNCATE TABLE EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS;\n"

-     object_id = (
-         f"{object_prefix}_ID_HIT" if subject_prefix == object_prefix else f"{object_prefix}_ID"
-     )
-     object_term = (
-         f"{object_prefix}_HIT_TERM" if subject_prefix == object_prefix else f"{object_prefix}_TERM"
-     )
+     object_id = f"{object_prefix}_ID_HIT" if subject_prefix == object_prefix else f"{object_prefix}_ID"
+     object_term = f"{object_prefix}_HIT_TERM" if subject_prefix == object_prefix else f"{object_prefix}_TERM"
      sql += f"""INSERT INTO EXOMISER.{subject_prefix}_{object_prefix}_MAPPINGS
  (MAPPING_ID, {subject_prefix}_ID, {subject_prefix}_TERM, {object_id}, {object_term}, SIMJ, IC, SCORE, LCS_ID, LCS_TERM)
  VALUES"""