pheval 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


pheval/analyse/binary_classification_stats.py

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from math import sqrt
 from typing import List, Union
 
@@ -29,6 +29,8 @@ class BinaryClassificationStats:
     true_negatives: int = 0
     false_positives: int = 0
     false_negatives: int = 0
+    labels: List = field(default_factory=list)
+    scores: List = field(default_factory=list)
 
     @staticmethod
     def remove_relevant_ranks(
@@ -84,6 +86,31 @@ class BinaryClassificationStats:
             elif rank != 1:
                 self.true_negatives += 1
 
+    def add_labels_and_scores(
+        self,
+        pheval_results: Union[
+            List[RankedPhEvalGeneResult],
+            List[RankedPhEvalVariantResult],
+            List[RankedPhEvalDiseaseResult],
+        ],
+        relevant_ranks: List[int],
+    ):
+        """
+        Adds scores and labels from the PhEval results.
+
+        Args:
+            pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult],
+            List[RankedPhEvalDiseaseResult]]):
+                List of all PhEval results
+            relevant_ranks (List[int]): A list of the ranks associated with the known entities.
+        """
+        relevant_ranks_copy = relevant_ranks.copy()
+        for result in pheval_results:
+            self.scores.append(result.score)
+            label = 1 if result.rank in relevant_ranks_copy else 0
+            self.labels.append(label)
+            relevant_ranks_copy.remove(result.rank) if label == 1 else None
+
     def add_classification(
         self,
         pheval_results: Union[
@@ -105,6 +132,7 @@ class BinaryClassificationStats:
         self.add_classification_for_other_entities(
             self.remove_relevant_ranks(pheval_results, relevant_ranks)
         )
+        self.add_labels_and_scores(pheval_results, relevant_ranks)
 
     def sensitivity(self) -> float:
         """

pheval/analyse/disease_prioritisation_analysis.py

@@ -10,7 +10,11 @@ from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResu
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
-from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
+from pheval.utils.file_utils import (
+    all_files,
+    files_with_suffix,
+    obtain_phenopacket_path_from_pheval_result,
+)
 from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
 
 
@@ -234,7 +238,7 @@ def assess_phenopacket_disease_prioritisation(
         disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
         disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_closest_file_name(
+    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
         standardised_disease_result, all_files(results_dir_and_input.phenopacket_dir)
     )
     pheval_disease_result = read_standardised_result(standardised_disease_result)

pheval/analyse/gene_prioritisation_analysis.py

@@ -10,7 +10,11 @@ from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalGeneResult
-from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
+from pheval.utils.file_utils import (
+    all_files,
+    files_with_suffix,
+    obtain_phenopacket_path_from_pheval_result,
+)
 from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
 
 
@@ -226,7 +230,7 @@ def assess_phenopacket_gene_prioritisation(
         gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
         gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_closest_file_name(
+    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
         standardised_gene_result, all_files(results_dir_and_input.phenopacket_dir)
     )
     pheval_gene_result = read_standardised_result(standardised_gene_result)

pheval/analyse/generate_plots.py

@@ -5,6 +5,7 @@ import matplotlib
 import pandas as pd
 import seaborn as sns
 from matplotlib import pyplot as plt
+from sklearn.metrics import auc, precision_recall_curve, roc_curve
 
 from pheval.analyse.benchmark_generator import (
     BenchmarkRunOutputGenerator,
@@ -357,6 +358,82 @@ class PlotGenerator:
             ]
         )
 
+    def generate_roc_curve(
+        self,
+        benchmarking_results: List[BenchmarkRunResults],
+        benchmark_generator: BenchmarkRunOutputGenerator,
+    ):
+        """
+        Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results.
+
+        Args:
+            benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs.
+            benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
+        """
+        for i, benchmark_result in enumerate(benchmarking_results):
+            fpr, tpr, thresh = roc_curve(
+                benchmark_result.binary_classification_stats.labels,
+                benchmark_result.binary_classification_stats.scores,
+                pos_label=1,
+            )
+            roc_auc = auc(fpr, tpr)
+
+            plt.plot(
+                fpr,
+                tpr,
+                label=f"{self.return_benchmark_name(benchmark_result)} ROC Curve (AUC = {roc_auc:.2f})",
+                color=self.palette_hex_codes[i],
+            )
+
+        plt.plot(linestyle="--", color="gray")
+        plt.xlabel("False Positive Rate")
+        plt.ylabel("True Positive Rate")
+        plt.title("Receiver Operating Characteristic (ROC) Curve")
+        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.15))
+        plt.savefig(
+            f"{benchmark_generator.prioritisation_type_file_prefix}_roc_curve.svg",
+            format="svg",
+            bbox_inches="tight",
+        )
+
+    def generate_precision_recall(
+        self,
+        benchmarking_results: List[BenchmarkRunResults],
+        benchmark_generator: BenchmarkRunOutputGenerator,
+    ):
+        """
+        Generate and plot Precision-Recall curves for binary classification benchmark results.
+
+        Args:
+            benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs.
+            benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
+        """
+        plt.figure()
+        for i, benchmark_result in enumerate(benchmarking_results):
+            precision, recall, thresh = precision_recall_curve(
+                benchmark_result.binary_classification_stats.labels,
+                benchmark_result.binary_classification_stats.scores,
+            )
+            precision_recall_auc = auc(recall, precision)
+            plt.plot(
+                recall,
+                precision,
+                label=f"{self.return_benchmark_name(benchmark_result)} Precision-Recall Curve "
+                f"(AUC = {precision_recall_auc:.2f})",
+                color=self.palette_hex_codes[i],
+            )
+
+        plt.plot(linestyle="--", color="gray")
+        plt.xlabel("Recall")
+        plt.ylabel("Precision")
+        plt.title("Precision-Recall Curve")
+        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.15))
+        plt.savefig(
+            f"{benchmark_generator.prioritisation_type_file_prefix}_precision_recall_curve.svg",
+            format="svg",
+            bbox_inches="tight",
+        )
+
     def generate_non_cumulative_bar(
         self,
         benchmarking_results: List[BenchmarkRunResults],
@@ -418,6 +495,8 @@ def generate_plots(
         title (str, optional): Title for the generated plot. Defaults to None.
     """
     plot_generator = PlotGenerator()
+    plot_generator.generate_roc_curve(benchmarking_results, benchmark_generator)
+    plot_generator.generate_precision_recall(benchmarking_results, benchmark_generator)
     if plot_type == "bar_stacked":
         plot_generator.generate_stacked_bar_plot(benchmarking_results, benchmark_generator, title)
     elif plot_type == "bar_cumulative":
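
The AUC values printed in the plot legends come directly from scikit-learn. A self-contained sketch of the same computation, using made-up labels and scores rather than a real benchmark run:

from sklearn.metrics import auc, precision_recall_curve, roc_curve

# Toy data: a label of 1 marks a known (relevant) entity, scores are the tool's prediction scores.
labels = [1, 0, 1, 0, 0]
scores = [0.9, 0.8, 0.7, 0.4, 0.1]

fpr, tpr, _ = roc_curve(labels, scores, pos_label=1)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(labels, scores)
pr_auc = auc(recall, precision)

print(f"ROC AUC = {roc_auc:.2f}, PR AUC = {pr_auc:.2f}")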

pheval/analyse/generate_summary_outputs.py

@@ -3,6 +3,7 @@ from collections import defaultdict
 from copy import deepcopy
 from typing import List
 
+import numpy as np
 import pandas as pd
 
 from pheval.analyse.benchmark_generator import BenchmarkRunOutputGenerator
@@ -40,7 +41,19 @@ class RankComparisonGenerator:
             pd.DataFrame: DataFrame containing the calculated rank differences.
         """
         comparison_df = self._generate_dataframe()
-        comparison_df["rank_decrease"] = comparison_df.iloc[:, 3] - comparison_df.iloc[:, 2]
+        comparison_df["rank_change"] = comparison_df.iloc[:, 2] - comparison_df.iloc[:, 3]
+        comparison_df["rank_change"] = np.where(
+            (comparison_df.iloc[:, 2] == 0) & (comparison_df.iloc[:, 3] != 0),
+            "GAINED",
+            np.where(
+                (comparison_df.iloc[:, 3] == 0) & (comparison_df.iloc[:, 2] != 0),
+                "LOST",
+                comparison_df["rank_change"],
+            ),
+        )
+        comparison_df["rank_change"] = comparison_df["rank_change"].apply(
+            lambda x: int(x) if str(x).lstrip("-").isdigit() else x
+        )
         return comparison_df
 
     def generate_output(self, prefix: str, suffix: str) -> None:
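
The renamed rank_change column is derived from the two rank columns of the comparison DataFrame (the third and fourth columns via iloc), where a rank of 0 means the entity was not ranked in that run: a positive number means the rank improved between runs, "GAINED" means the entity was unranked in the first run but ranked in the second, and "LOST" means the reverse. Because np.where coerces the numeric differences to strings, the final apply() converts digit strings back to integers. A toy reproduction of the logic (the column layout is assumed for illustration):

import numpy as np
import pandas as pd

# Third and fourth columns (iloc[:, 2] and iloc[:, 3]) hold the ranks from the two compared runs.
df = pd.DataFrame(
    {
        "entity": ["GENE_A", "GENE_B", "GENE_C"],
        "identifier": ["a", "b", "c"],
        "run_1": [3, 0, 5],
        "run_2": [1, 5, 0],
    }
)
df["rank_change"] = df.iloc[:, 2] - df.iloc[:, 3]
df["rank_change"] = np.where(
    (df.iloc[:, 2] == 0) & (df.iloc[:, 3] != 0),
    "GAINED",
    np.where((df.iloc[:, 3] == 0) & (df.iloc[:, 2] != 0), "LOST", df["rank_change"]),
)
df["rank_change"] = df["rank_change"].apply(lambda x: int(x) if str(x).lstrip("-").isdigit() else x)
print(df["rank_change"].tolist())  # [2, 'GAINED', 'LOST']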

pheval/analyse/variant_prioritisation_analysis.py

@@ -10,7 +10,11 @@ from pheval.analyse.prioritisation_result_types import VariantPrioritisationResu
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalVariantResult
-from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
+from pheval.utils.file_utils import (
+    all_files,
+    files_with_suffix,
+    obtain_phenopacket_path_from_pheval_result,
+)
 from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader
 
 
@@ -228,7 +232,7 @@ def assess_phenopacket_variant_prioritisation(
         variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
         variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_closest_file_name(
+    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
         standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir)
     )
     proband_causative_variants = _obtain_causative_variants(phenopacket_path)

pheval/utils/file_utils.py

@@ -1,4 +1,3 @@
-import difflib
 import itertools
 import re
 import unicodedata
@@ -71,23 +70,33 @@ def normalise_file_name(file_path: Path) -> str:
     return re.sub("[\u0300-\u036f]", "", normalised_file_name)
 
 
-def obtain_closest_file_name(file_to_be_queried: Path, file_paths: list[Path]) -> Path:
+def obtain_phenopacket_path_from_pheval_result(
+    pheval_result_path: Path, phenopacket_paths: list[Path]
+) -> Path:
     """
-    Obtains the closest file name when given a template file name
-    and a list of full paths of files to be queried.
+    Obtains the phenopacket file name when given a pheval result file name
+    and a list of full paths of phenopackets to be queried.
 
     Args:
-        file_to_be_queried (Path): The template file name to find the closest match.
-        file_paths (list[Path]): List of full paths of files to be queried.
+        pheval_result_path (Path): The PhEval result.
+        phenopacket_paths (list[Path]): List of full paths of phenopackets to be queried.
 
     Returns:
-        Path: The closest matching file path from the provided list.
+        Path: The matching phenopacket file path from the provided list.
     """
-    stems = [Path(file_path).stem for file_path in file_paths]
-    closest_file_match = difflib.get_close_matches(
-        str(Path(file_to_be_queried).stem), stems, cutoff=0.1, n=1
-    )[0]
-    return [file_path for file_path in file_paths if closest_file_match == str(file_path.stem)][0]
+    pheval_result_path_stem_stripped = pheval_result_path.stem.split("-pheval_")[0]
+    matching_phenopacket_paths = [
+        phenopacket_path
+        for phenopacket_path in phenopacket_paths
+        if phenopacket_path.stem == pheval_result_path_stem_stripped
+    ]
+    if matching_phenopacket_paths:
+        return matching_phenopacket_paths[0]
+    else:
+        raise FileNotFoundError(
+            f"Unable to find matching phenopacket file named "
+            f"{pheval_result_path_stem_stripped}.json for {pheval_result_path.name}"
+        )
 
 
 def ensure_file_exists(*files: str):
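
The fuzzy difflib lookup is replaced by an exact match on the part of the result file stem before "-pheval_", and a missing match now raises FileNotFoundError instead of silently returning the closest name. A minimal illustration, assuming result files follow a <phenopacket-stem>-pheval_<type>_result naming pattern (the file names below are hypothetical):

from pathlib import Path

from pheval.utils.file_utils import obtain_phenopacket_path_from_pheval_result

result = Path("results/Patient_1-pheval_gene_result.tsv")  # hypothetical result file
phenopackets = [Path("phenopackets/Patient_1.json"), Path("phenopackets/Patient_2.json")]

# "Patient_1-pheval_gene_result" is stripped to "Patient_1" and matched against the phenopacket stems.
print(obtain_phenopacket_path_from_pheval_result(result, phenopackets))  # phenopackets/Patient_1.json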

pheval-0.3.1.dist-info/METADATA → pheval-0.3.2.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pheval
-Version: 0.3.1
+Version: 0.3.2
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk

pheval-0.3.1.dist-info/RECORD → pheval-0.3.2.dist-info/RECORD

@@ -3,18 +3,18 @@ pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/analyse/analysis.py,sha256=ponm3P8nvzJNmcrNZ2_KudEhWSaWshd_Gd30D-aau8s,7743
 pheval/analyse/benchmark_generator.py,sha256=AeuwbaPb4j_dyBGPRgEBxQk2NahDb5u4xHyFiqp5Fes,5943
 pheval/analyse/benchmarking_data.py,sha256=aNZkWdmWemlnC1Tg35MtR60S9YC71QWS2rMuzkUc3w0,768
-pheval/analyse/binary_classification_stats.py,sha256=ZBAvhMVPYSFg3asONUG1w24JhYTjG03RG_C9uohQntI,11373
-pheval/analyse/disease_prioritisation_analysis.py,sha256=ttdgUX5ZKT74gKgsRrnyH8zKFxhcJxVOtZTsAdheGxU,12596
-pheval/analyse/gene_prioritisation_analysis.py,sha256=raEjzJFvAvS3wE0yrYcSIQzBe6s_lOgJMqe_p_AFgZY,12320
-pheval/analyse/generate_plots.py,sha256=gU7NYr1zgnXEXAZR-nHLql3farQEaUN5gkgu2ywTJho,17779
-pheval/analyse/generate_summary_outputs.py,sha256=tpHjbyme3FlkflGcTIgQ4H4xyN6FZ5Jmm-ImjAbSpYU,6071
+pheval/analyse/binary_classification_stats.py,sha256=E35YjvGM-zFnuEt8M3pgN03vBab4MH6ih726QKvuogg,12519
+pheval/analyse/disease_prioritisation_analysis.py,sha256=qadEVhBMtBgtjGCJLhNQA510F8Pd0Ll4NAQXoT23BYs,12649
+pheval/analyse/gene_prioritisation_analysis.py,sha256=lAN171xfXqweK8ie6191s_6WPPGjZKJXL1Z0dIqp54k,12373
+pheval/analyse/generate_plots.py,sha256=zjsVzf-WsMG7jb5Y_FVYeOHQwu9lz_V90a9LApUlsDo,21163
+pheval/analyse/generate_summary_outputs.py,sha256=s9pXMSW6xm4ZBe1aCd0UJSaFiKBvpUfPwJ2BI4qfTas,6591
 pheval/analyse/parse_benchmark_summary.py,sha256=Y8uPTlHTEiaeVBOqxMcdOqjY3ZBtOS3DoRycL78Dzxg,2384
 pheval/analyse/parse_pheval_result.py,sha256=j8YFVA0YXfySOkm8gMwrfIuV45DI9AX3ETn7h-r8ayE,1211
 pheval/analyse/prioritisation_rank_recorder.py,sha256=EVe8DoEvvp0_WMAcjfVxmDGGRFPEELi7hEVjH3sIpLY,3223
 pheval/analyse/prioritisation_result_types.py,sha256=qJoB6O-lFYmzAMcTQeDJZQNLJ6hleoKDYATTkhvFF98,1228
 pheval/analyse/rank_stats.py,sha256=knj1tsKrly17QgtOUVpqA14UjbO99N3ydkWN4xU6c2k,15785
 pheval/analyse/run_data_parser.py,sha256=HzBKsJL2skjmrRZdrF3VYzswtKNgbX6U5qhY_kqq9mA,1552
-pheval/analyse/variant_prioritisation_analysis.py,sha256=_yYgknFHqL0_nlpBeQdo9D1Jnd99BcUkA733uxTPpcg,12331
+pheval/analyse/variant_prioritisation_analysis.py,sha256=ApmUeTW0cl_BPh7LusbApxtgjEXEkhuNFyh0DxKKpgU,12384
 pheval/cli.py,sha256=4l9xZfxBfLCcm7PDdhMWgTvTKbQt5sJ2bYHf7kU1dO4,1493
 pheval/cli_pheval.py,sha256=fWbKUcPTZZSa1EJEtH_lNn1XE6qRApRHihqUZS5owrA,2424
 pheval/cli_pheval_utils.py,sha256=wVLH0Bk2WrvTBkH-G5wC3Xgo6KftX9zSwonC2DVBpP8,16929
@@ -45,12 +45,12 @@ pheval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/utils/docs_gen.py,sha256=6FGtHicBC0rZKi0tdL3Epsg8d4osE44I9f1Ga0j4JLA,3193
 pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
 pheval/utils/exomiser.py,sha256=m2u0PH2z9lFPaB3LVkZCmPmH5e55q1NoTzNl46zRRP8,683
-pheval/utils/file_utils.py,sha256=ESAXWtfpCAZX6T6nU6vb1x0of5S-eYhu639geJBu1es,4361
+pheval/utils/file_utils.py,sha256=9HoCmtF73D3wY6bBhFLefMBI5uhvCe_meZeHXQzF_ts,4640
 pheval/utils/phenopacket_utils.py,sha256=hBEWl9mOP9D7odSaL6lIY__dbXn7Sc3TZX0Si-nPYaE,24379
 pheval/utils/semsim_utils.py,sha256=s7ZCR2VfPYnOh7ApX6rv66eGoVSm9QJaVYOWBEhlXpo,6151
 pheval/utils/utils.py,sha256=9V6vCT8l1g4O2-ZATYqsVyd7AYZdWGd-Ksy7_oIC3eE,2343
-pheval-0.3.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-pheval-0.3.1.dist-info/METADATA,sha256=wVyoDa-Xs4ztciDaO56ogC3rjhukYhCe3HFqmqEtClA,1810
-pheval-0.3.1.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-pheval-0.3.1.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
-pheval-0.3.1.dist-info/RECORD,,
+pheval-0.3.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+pheval-0.3.2.dist-info/METADATA,sha256=8dvmkrDAkmmwiOvMWltnb_oXGo8IQTC1-iJImlsW-m8,1810
+pheval-0.3.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pheval-0.3.2.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
+pheval-0.3.2.dist-info/RECORD,,

pheval-0.3.1.dist-info/WHEEL → pheval-0.3.2.dist-info/WHEEL

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.8.1
+Generator: poetry-core 1.9.0
 Root-Is-Purelib: true
 Tag: py3-none-any