pheval 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (33) hide show
  1. pheval/analyse/benchmark.py +156 -0
  2. pheval/analyse/benchmark_db_manager.py +16 -134
  3. pheval/analyse/benchmark_output_type.py +43 -0
  4. pheval/analyse/binary_classification_curves.py +132 -0
  5. pheval/analyse/binary_classification_stats.py +164 -307
  6. pheval/analyse/generate_plots.py +210 -395
  7. pheval/analyse/generate_rank_comparisons.py +44 -0
  8. pheval/analyse/rank_stats.py +190 -382
  9. pheval/analyse/run_data_parser.py +21 -39
  10. pheval/cli.py +27 -24
  11. pheval/cli_pheval_utils.py +7 -8
  12. pheval/post_processing/phenopacket_truth_set.py +235 -0
  13. pheval/post_processing/post_processing.py +185 -337
  14. pheval/post_processing/validate_result_format.py +92 -0
  15. pheval/prepare/update_phenopacket.py +11 -9
  16. pheval/utils/logger.py +35 -0
  17. pheval/utils/phenopacket_utils.py +85 -91
  18. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/METADATA +4 -4
  19. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/RECORD +22 -26
  20. pheval/analyse/analysis.py +0 -104
  21. pheval/analyse/assess_prioritisation_base.py +0 -108
  22. pheval/analyse/benchmark_generator.py +0 -126
  23. pheval/analyse/benchmarking_data.py +0 -25
  24. pheval/analyse/disease_prioritisation_analysis.py +0 -152
  25. pheval/analyse/gene_prioritisation_analysis.py +0 -147
  26. pheval/analyse/generate_summary_outputs.py +0 -105
  27. pheval/analyse/parse_benchmark_summary.py +0 -81
  28. pheval/analyse/parse_corpus.py +0 -219
  29. pheval/analyse/prioritisation_result_types.py +0 -52
  30. pheval/analyse/variant_prioritisation_analysis.py +0 -159
  31. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/LICENSE +0 -0
  32. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/WHEEL +0 -0
  33. {pheval-0.4.7.dist-info → pheval-0.5.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,92 @@
1
+ from enum import Enum
2
+ from functools import wraps
3
+ from typing import Callable
4
+
5
+ import polars as pl
6
+
7
+
class ResultSchema(Enum):
    """
    Expected column layouts for the supported result types.

    Each member's value is a ``pl.Schema`` mapping column names to Polars data types.

    Attributes:
        GENE_RESULT_SCHEMA (pl.Schema): Schema for gene-based results.
        VARIANT_RESULT_SCHEMA (pl.Schema): Schema for variant-based results.
        DISEASE_RESULT_SCHEMA (pl.Schema): Schema for disease-based results.
    """

    GENE_RESULT_SCHEMA = pl.Schema(
        {
            "gene_symbol": pl.String,
            "gene_identifier": pl.String,
            "score": pl.Float64,
            "grouping_id": pl.Utf8,
        }
    )
    VARIANT_RESULT_SCHEMA = pl.Schema(
        {
            "chrom": pl.String,
            "start": pl.Int64,
            "end": pl.Int64,
            "ref": pl.String,
            "alt": pl.String,
            "score": pl.Float64,
            "grouping_id": pl.Utf8,
        }
    )
    DISEASE_RESULT_SCHEMA = pl.Schema(
        {
            "disease_name": pl.String,
            "disease_identifier": pl.String,
            "score": pl.Float64,
            "grouping_id": pl.Utf8,
        }
    )

    def validate(self, df: pl.DataFrame) -> bool:
        """
        Check a DataFrame against this member's schema.

        The ``grouping_id`` column is optional: it may be absent, but when present it
        must not contain nulls and must have the expected type. Columns of ``df`` that
        are not named in the schema are not inspected.

        Args:
            df (pl.DataFrame): The DataFrame to validate.
        Raises:
            ValueError: If a required column is missing or the grouping_id column contains a null value.
            TypeError: If a column exists but has an incorrect data type.
        Returns:
            bool: True if the DataFrame is valid according to the schema.
        """
        # The null check runs first, so it is reported before any type mismatch.
        if "grouping_id" in df.columns:
            if df["grouping_id"].null_count() > 0:
                raise ValueError("'grouping_id' column should not contain null values if provided.")

        actual_schema = df.schema
        for name, wanted in self.value.items():
            if name in actual_schema:
                found = actual_schema[name]
                if found != wanted:
                    raise TypeError(f"Column '{name}' has type {found}, expected {wanted}")
            elif name != "grouping_id":
                # Only grouping_id may be left out entirely.
                raise ValueError(f"Missing required column: {name}")

        return True
73
+
74
+
def validate_dataframe(schema: ResultSchema) -> Callable:
    """
    Build a decorator that validates a function's DataFrame argument.

    The decorated function must take the DataFrame as its first positional
    argument; it is checked with ``schema.validate`` before the function runs,
    so any exception raised by validation propagates to the caller.

    Args:
        schema (ResultSchema): The expected schema from the `ResultSchema` enum.
    Returns:
        Callable: A wrapped function that validates the DataFrame before execution.
    """

    def _decorate(target: Callable) -> Callable:
        def _validated(df: pl.DataFrame, *args, **kwargs):
            schema.validate(df)
            return target(df, *args, **kwargs)

        # Carry over the wrapped function's name, docstring and other metadata.
        return wraps(target)(_validated)

    return _decorate
@@ -1,7 +1,7 @@
1
- from collections import defaultdict
2
1
  from pathlib import Path
3
2
  from typing import Union
4
3
 
4
+ import polars as pl
5
5
  from phenopackets import Family, Phenopacket
6
6
 
7
7
  from pheval.utils.file_utils import all_files
@@ -9,14 +9,14 @@ from pheval.utils.phenopacket_utils import (
9
9
  GeneIdentifierUpdater,
10
10
  PhenopacketRebuilder,
11
11
  PhenopacketUtil,
12
- create_hgnc_dict,
12
+ create_gene_identifier_map,
13
13
  phenopacket_reader,
14
14
  write_phenopacket,
15
15
  )
16
16
 
17
17
 
18
18
  def update_outdated_gene_context(
19
- phenopacket_path: Path, gene_identifier: str, hgnc_data: defaultdict
19
+ phenopacket_path: Path, gene_identifier: str, identifier_map: pl.DataFrame
20
20
  ) -> Union[Phenopacket, Family]:
21
21
  """
22
22
  Update the gene context of the Phenopacket.
@@ -24,7 +24,7 @@ def update_outdated_gene_context(
24
24
  Args:
25
25
  phenopacket_path (Path): The path to the Phenopacket file.
26
26
  gene_identifier (str): Identifier to update the gene context.
27
- hgnc_data (defaultdict): The HGNC data used for updating.
27
+ identifier_map (pl.DataFrame): The gene identifier map used for updating.
28
28
 
29
29
  Returns:
30
30
  Union[Phenopacket, Family]: The updated Phenopacket or Family.
@@ -37,7 +37,7 @@ def update_outdated_gene_context(
37
37
  phenopacket = phenopacket_reader(phenopacket_path)
38
38
  interpretations = PhenopacketUtil(phenopacket).interpretations()
39
39
  updated_interpretations = GeneIdentifierUpdater(
40
- hgnc_data=hgnc_data, gene_identifier=gene_identifier
40
+ identifier_map=identifier_map, gene_identifier=gene_identifier
41
41
  ).update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)
42
42
  return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)
43
43
 
@@ -57,8 +57,10 @@ def create_updated_phenopacket(
57
57
  to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace
58
58
  to describe the gene identifiers.
59
59
  """
60
- hgnc_data = create_hgnc_dict()
61
- updated_phenopacket = update_outdated_gene_context(phenopacket_path, gene_identifier, hgnc_data)
60
+ identifier_map = create_gene_identifier_map()
61
+ updated_phenopacket = update_outdated_gene_context(
62
+ phenopacket_path, gene_identifier, identifier_map
63
+ )
62
64
  write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
63
65
 
64
66
 
@@ -78,10 +80,10 @@ def create_updated_phenopackets(
78
80
  to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace
79
81
  to describe the gene identifiers.
80
82
  """
81
- hgnc_data = create_hgnc_dict()
83
+ identifier_map = create_gene_identifier_map()
82
84
  for phenopacket_path in all_files(phenopacket_dir):
83
85
  updated_phenopacket = update_outdated_gene_context(
84
- phenopacket_path, gene_identifier, hgnc_data
86
+ phenopacket_path, gene_identifier, identifier_map
85
87
  )
86
88
  write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
87
89
 
pheval/utils/logger.py ADDED
@@ -0,0 +1,35 @@
1
+ import logging
2
+
# Configure logging as an import side effect of this module: INFO level, with
# each record rendered as "[timestamp] [LEVEL] [file:line] - message".
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] [%(levelname)s] [%(filename)s:%(lineno)d] - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
8
+
9
+
def get_logger(name="PHEVAL"):
    """
    Return the logger registered under the given name.

    Args:
        name (str): Logger name to look up (default: "PHEVAL").

    Returns:
        logging.Logger: The logger obtained from ``logging.getLogger``.
    """
    logger = logging.getLogger(name)
    return logger
12
+
13
+
def print_ascii_banner():
    """
    Print the PhEval ASCII banner at most once per process.

    A ``_ascii_printed`` flag stored on the ``logging`` module records that the
    banner has been shown; later calls return without printing.
    """
    if getattr(logging, "_ascii_printed", False):
        return
    logging._ascii_printed = True
    # NOTE(review): banner spacing is reproduced as it appears in the diff view,
    # which collapses whitespace - confirm against the packaged file.
    pheval_banner = """
Welcome to:
██████╗ ██╗ ██╗███████╗██╗ ██╗ █████╗ ██╗
██╔══██╗██║ ██║██╔════╝██║ ██║██╔══██╗██║
██████╔╝███████║█████╗ ██║ ██║███████║██║
██╔═══╝ ██╔══██║██╔══╝ ╚██╗ ██╔╝██╔══██║██║
██║ ██║ ██║███████╗ ╚████╔╝ ██║ ██║███████╗
╚═╝ ╚═╝ ╚═╝╚══════╝ ╚═══╝ ╚═╝ ╚═╝╚══════╝
A framework for the empirical evaluation of phenotype-driven prioritisation tools.
"""
    print(pheval_banner)
29
+
30
+
def initialise_context(ctx):
    """
    Prepare a CLI context and show the banner the first time it is seen.

    Args:
        ctx: Context object exposing ``ensure_object``; presumably a click
            Context - confirm against the callers.
    """
    ctx.ensure_object(dict)
    banner_shown = getattr(ctx, "ascii_printed", False)
    if banner_shown:
        return
    # Mark the context before printing so a repeated call is a no-op.
    ctx.ascii_printed = True
    print_ascii_banner()
@@ -1,13 +1,12 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
- from collections import defaultdict
5
4
  from copy import copy
6
5
  from dataclasses import dataclass
7
6
  from pathlib import Path
8
7
  from typing import List, Union
9
8
 
10
- import pandas as pd
9
+ import polars as pl
11
10
  from google.protobuf.json_format import MessageToJson, Parse
12
11
  from phenopackets import (
13
12
  Disease,
@@ -122,79 +121,65 @@ class ProbandDisease:
122
121
  disease_identifier: str
123
122
 
124
123
 
def parse_hgnc_data() -> pl.DataFrame:
    """
    Read HGNC data from a file and return it as a Polars DataFrame.

    The tab-separated file ``resources/hgnc_complete_set.txt`` (located next to the
    ``utils`` package directory) is read, reduced to the identifier columns, and the
    pipe-delimited ``prev_symbol`` field is split into a list of symbols.

    Returns:
        pl.DataFrame: DataFrame containing the HGNC data with the columns ``hgnc_id``,
        ``gene_symbol``, ``ensembl_id``, ``entrez_id``, ``refseq_accession``,
        ``previous_symbol_raw`` and ``prev_symbols``.
    """
    # Resolve the resource relative to this module. A plain string replace of "utils"
    # would also rewrite any other "utils" occurring earlier in the install path.
    hgnc_file = Path(__file__).parent.parent / "resources" / "hgnc_complete_set.txt"
    return (
        pl.read_csv(
            hgnc_file,
            separator="\t",
            # Scan every row when inferring dtypes. `infer_schema` is a boolean switch,
            # so passing a large integer to it left inference at the default row count.
            infer_schema_length=None,
            # `schema_overrides` is the current name of the deprecated `dtypes` argument.
            schema_overrides={"omim_id": pl.Utf8},
        )
        .select(
            [
                pl.col("hgnc_id"),
                pl.col("symbol").alias("gene_symbol"),
                pl.col("ensembl_gene_id").alias("ensembl_id"),
                pl.col("entrez_id"),
                pl.col("refseq_accession"),
                pl.col("prev_symbol").alias("previous_symbol_raw"),
            ]
        )
        .with_columns(
            # Previous symbols are "|"-separated and may be wrapped in double quotes.
            pl.col("previous_symbol_raw")
            .str.split("|")
            .list.eval(pl.element().str.strip_chars('"'))
            .alias("prev_symbols")
        )
    )
137
155
 
138
156
 
139
- def create_hgnc_dict() -> defaultdict:
140
- """
141
- Create a dictionary as a reference for updating gene symbols and identifiers based on HGNC data.
142
-
143
-
144
- Returns:
145
- defaultdict: A dictionary containing gene symbols as keys and their associated gene information.
146
-
147
- Notes:
148
- The dictionary structure:
149
- {
150
- 'gene_symbol': {
151
- 'ensembl_id': str,
152
- 'hgnc_id': str,
153
- 'entrez_id': str,
154
- 'refseq_accession': str,
155
- 'previous_symbol': [str, ...]
156
- },
157
- ...
158
- }
159
- """
160
- hgnc_df = read_hgnc_data()
161
- hgnc_data = defaultdict(dict)
162
- for _index, row in hgnc_df.iterrows():
163
- previous_names = []
164
- hgnc_data[row["symbol"]]["ensembl_id"] = row["ensembl_gene_id"]
165
- hgnc_data[row["symbol"]]["hgnc_id"] = row["hgnc_id"]
166
- hgnc_data[row["symbol"]]["entrez_id"] = row["entrez_id"]
167
- hgnc_data[row["symbol"]]["refseq_accession"] = row["refseq_accession"]
168
- previous = str(row["prev_symbol"]).split("|")
169
- for p in previous:
170
- previous_names.append(p.strip('"'))
171
- hgnc_data[row["symbol"]]["previous_symbol"] = previous_names
172
-
173
- return hgnc_data
174
-
175
-
176
- def create_gene_identifier_map() -> dict:
157
+ def create_gene_identifier_map() -> pl.DataFrame:
177
158
  """
178
159
  Create a mapping of gene identifiers to gene symbols using HGNC data.
179
160
 
180
161
  Returns:
181
- dict: A mapping of gene identifiers to gene symbols.
182
-
183
- Notes:
184
- The dictionary structure:
185
- {
186
- 'identifier': 'gene_symbol',
187
- ...
188
- }
162
+ pl.DataFrame: A mapping of gene identifiers to gene symbols.
189
163
  """
190
- hgnc_df = read_hgnc_data()
191
- identifier_map = {}
192
- for _index, row in hgnc_df.iterrows():
193
- identifier_map[row["ensembl_gene_id"]] = row["symbol"]
194
- identifier_map[row["hgnc_id"]] = row["symbol"]
195
- identifier_map[row["entrez_id"]] = row["symbol"]
196
- identifier_map[row["refseq_accession"]] = row["symbol"]
197
- return identifier_map
164
+ hgnc_df = parse_hgnc_data()
165
+ return hgnc_df.melt(
166
+ id_vars=["gene_symbol", "prev_symbols"],
167
+ value_vars=["ensembl_id", "hgnc_id", "entrez_id", "refseq_accession"],
168
+ variable_name="identifier_type",
169
+ value_name="identifier",
170
+ ).with_columns(
171
+ pl.col("identifier_type")
172
+ .replace(
173
+ {
174
+ "ensembl_id": "ensembl:",
175
+ "hgnc_id": "",
176
+ "entrez_id": "ncbigene:",
177
+ "refseq_accession": "",
178
+ },
179
+ default="",
180
+ )
181
+ .alias("prefix")
182
+ )
198
183
 
199
184
 
200
185
  def phenopacket_reader(file: Path) -> Union[Phenopacket, Family]:
@@ -651,17 +636,19 @@ def write_phenopacket(phenopacket: Union[Phenopacket, Family], output_file: Path
651
636
  class GeneIdentifierUpdater:
652
637
  """Class for updating gene identifiers within genomic interpretations."""
653
638
 
    def __init__(
        self,
        gene_identifier: str,
        identifier_map: pl.DataFrame = None,
    ):
        """
        Initialise the GeneIdentifierUpdater.

        Args:
            gene_identifier (str): The gene identifier to update to.
            identifier_map (pl.DataFrame): A Polars DataFrame mapping gene identifiers
                to gene symbols (default: None). The lookup methods call
                ``identifier_map.filter``, so a DataFrame must be supplied before they are used.
        """

        self.gene_identifier = gene_identifier
        self.identifier_map = identifier_map
654
 
@@ -675,13 +662,20 @@ class GeneIdentifierUpdater:
675
662
  Returns:
676
663
  str: The identified gene identifier.
677
664
  """
678
- if gene_symbol in self.hgnc_data.keys():
679
- return self.hgnc_data[gene_symbol][self.gene_identifier]
680
- else:
681
- for _symbol, data in self.hgnc_data.items():
682
- for prev_symbol in data["previous_symbol"]:
683
- if prev_symbol == gene_symbol:
684
- return data[self.gene_identifier]
665
+ matches = self.identifier_map.filter(
666
+ (pl.col("gene_symbol") == gene_symbol)
667
+ & (pl.col("identifier_type") == self.gene_identifier)
668
+ )
669
+
670
+ if matches.height > 0:
671
+ return matches["identifier"][0]
672
+ prev_symbol_matches = self.identifier_map.filter(
673
+ (pl.col("identifier_type") == self.gene_identifier)
674
+ & (pl.col("prev_symbols").list.contains(gene_symbol))
675
+ )
676
+ if prev_symbol_matches.height > 0:
677
+ return prev_symbol_matches["identifier"][0]
678
+ return None
685
679
 
686
680
  def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
687
681
  """
@@ -693,7 +687,9 @@ class GeneIdentifierUpdater:
693
687
  Returns:
694
688
  str: The gene symbol corresponding to the identifier.
695
689
  """
696
- return self.identifier_map[query_gene_identifier]
690
+ return self.identifier_map.filter(pl.col("identifier") == query_gene_identifier)[
691
+ "gene_symbol"
692
+ ][0]
697
693
 
698
694
  def _find_alternate_ids(self, gene_symbol: str) -> List[str]:
699
695
  """
@@ -705,23 +701,20 @@ class GeneIdentifierUpdater:
705
701
  Returns:
706
702
  List[str]: List of alternate IDs for the gene symbol.
707
703
  """
708
- if gene_symbol in self.hgnc_data.keys():
709
- return [
710
- self.hgnc_data[gene_symbol]["hgnc_id"],
711
- "ncbigene:" + self.hgnc_data[gene_symbol]["entrez_id"],
712
- "ensembl:" + self.hgnc_data[gene_symbol]["ensembl_id"],
713
- "symbol:" + gene_symbol,
704
+ matches = self.identifier_map.filter((pl.col("gene_symbol") == gene_symbol))
705
+ if matches.height > 0:
706
+ return [f"{row['prefix']}{row['identifier']}" for row in matches.rows(named=True)] + [
707
+ f"symbol:{gene_symbol}"
714
708
  ]
715
- else:
716
- for symbol, data in self.hgnc_data.items():
717
- for prev_symbol in data["previous_symbol"]:
718
- if prev_symbol == gene_symbol:
719
- return [
720
- data["hgnc_id"],
721
- "ncbigene:" + data["entrez_id"],
722
- "ensembl:" + data["ensembl_id"],
723
- "symbol:" + symbol,
724
- ]
709
+ prev_symbol_matches = self.identifier_map.filter(
710
+ (pl.col("prev_symbols").list.contains(gene_symbol))
711
+ )
712
+ if prev_symbol_matches.height > 0:
713
+ return [
714
+ f"{row['prefix']}{row['identifier']}"
715
+ for row in prev_symbol_matches.rows(named=True)
716
+ ] + [f"symbol:{gene_symbol}"]
717
+ return None
725
718
 
726
719
  def update_genomic_interpretations_gene_identifier(
727
720
  self, interpretations: List[Interpretation], phenopacket_path: Path
@@ -731,6 +724,7 @@ class GeneIdentifierUpdater:
731
724
 
732
725
  Args:
733
726
  interpretations (List[Interpretation]): List of Interpretation objects.
727
+ phenopacket_path (Path): The Path to the Phenopacket.
734
728
 
735
729
  Returns:
736
730
  List[Interpretation]: Updated list of Interpretation objects.
@@ -1,12 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: pheval
3
- Version: 0.4.7
3
+ Version: 0.5.0
4
4
  Summary:
5
5
  Author: Yasemin Bridges
6
6
  Author-email: y.bridges@qmul.ac.uk
7
- Requires-Python: >=3.9,<4.0.0
7
+ Requires-Python: >=3.10,<4.0.0
8
8
  Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.9
10
9
  Classifier: Programming Language :: Python :: 3.10
11
10
  Classifier: Programming Language :: Python :: 3.11
12
11
  Classifier: Programming Language :: Python :: 3.12
@@ -22,8 +21,9 @@ Requires-Dist: oaklib (>=0.5.6)
22
21
  Requires-Dist: pandas (>=1.5.1)
23
22
  Requires-Dist: phenopackets (>=2.0.2,<3.0.0)
24
23
  Requires-Dist: plotly (>=5.13.0,<6.0.0)
25
- Requires-Dist: polars (>=0.19.15,<0.20.0)
24
+ Requires-Dist: polars (>=1.23,<2.0)
26
25
  Requires-Dist: pyaml (>=21.10.1,<22.0.0)
26
+ Requires-Dist: pyarrow (>=19.0.1,<20.0.0)
27
27
  Requires-Dist: pyserde (>=0.9.8,<0.10.0)
28
28
  Requires-Dist: scikit-learn (>=1.4.0,<2.0.0)
29
29
  Requires-Dist: seaborn (>=0.12.2,<0.13.0)
@@ -1,36 +1,31 @@
1
1
  pheval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- pheval/analyse/analysis.py,sha256=Yt2xH0WS_2NO13-wYvywzmCRCj8RinQ1MeozJQuGe3o,4009
4
- pheval/analyse/assess_prioritisation_base.py,sha256=znBscRTqIKxxZMHR-H6KrjFJ6Uv5P5HzwTQUWS6Eoos,3434
5
- pheval/analyse/benchmark_db_manager.py,sha256=IRqu5fUpLBboHpS4lx0AaAkor_W7whuvUSKiMr2-GhM,5185
6
- pheval/analyse/benchmark_generator.py,sha256=-LljszuKAT3oJfGQn7JHAILCGg5QXYny4nPPf273g_E,5896
7
- pheval/analyse/benchmarking_data.py,sha256=aRvDmwqjFGKvWDRGjMwaQxfDZscptRBwI-rcSqY-X5s,913
8
- pheval/analyse/binary_classification_stats.py,sha256=E35YjvGM-zFnuEt8M3pgN03vBab4MH6ih726QKvuogg,12519
9
- pheval/analyse/disease_prioritisation_analysis.py,sha256=t__1lhyw1PnbDBbXDxDaFgLFbdRz20D1s-8tfLQJjLs,6186
10
- pheval/analyse/gene_prioritisation_analysis.py,sha256=Bapg0VcHOz5vp1dI4bDba04SpX6UdDu35VGTI6sZyOk,6026
11
- pheval/analyse/generate_plots.py,sha256=5oxsdnAbbVgQj8ZrWTLs12rSM24EXp-IdLCjy5QB1_g,21992
12
- pheval/analyse/generate_summary_outputs.py,sha256=nKqwbpA-9bbL5mCySiuyV_AUDIokmCg3vD8_JAsg1ls,4157
13
- pheval/analyse/parse_benchmark_summary.py,sha256=vyAOIdIWF4rZjGTPFE69ajhEC9AkkN3QBVqSe_uYZsg,2946
14
- pheval/analyse/parse_corpus.py,sha256=pxhoKTgd-DnwAMP081UMG-NKbj89qAYBQhHve8aphfI,8698
15
- pheval/analyse/prioritisation_result_types.py,sha256=qJoB6O-lFYmzAMcTQeDJZQNLJ6hleoKDYATTkhvFF98,1228
16
- pheval/analyse/rank_stats.py,sha256=vNLVuG_NzhKDXxKmklYNPz44MczlyKUqcuHqbiuOXwI,17993
17
- pheval/analyse/run_data_parser.py,sha256=VQBUoOIRYRWc5uqURUvaWdaW3E3C7Su0JvLavQLHQaY,4105
18
- pheval/analyse/variant_prioritisation_analysis.py,sha256=HhDeczF7wmJjXt0ejAtF0qdczyMe25glqiS6uX_TFl8,6408
19
- pheval/cli.py,sha256=SPB8-BCIRt1fUaAalhZ5Y6JUlnJX6Cj2S52QXCovJR8,1526
3
+ pheval/analyse/benchmark.py,sha256=1ysz1peGb21DhgNpEam9NgUOS5eGv7K0CI3RNjy0crQ,6275
4
+ pheval/analyse/benchmark_db_manager.py,sha256=zS1TI76YuV2_YXLipHLSyh-XDR5kTxyOwhRhHRFHfjQ,764
5
+ pheval/analyse/benchmark_output_type.py,sha256=bh-qQvV4AF7BHQyr_bdY8HTTzYZVe7KvoIoUF0D9k-g,1468
6
+ pheval/analyse/binary_classification_curves.py,sha256=Crb45rJWc5rxDdx82sgoHRvYHE2D5pus91fgl39FyRw,5007
7
+ pheval/analyse/binary_classification_stats.py,sha256=sOuEp6IxZ6SVp-KC6MJkZNTkZucZTNK25xApP5tU6Mk,6944
8
+ pheval/analyse/generate_plots.py,sha256=g98DxhTw1dPRfRRYoKBmt51XfIa2KzlL_Z7weFSoBUg,14550
9
+ pheval/analyse/generate_rank_comparisons.py,sha256=KcQJ9rm1nvvTcqLNuxAkXRXuV18vEsiP0giQ-ryHyYc,1684
10
+ pheval/analyse/rank_stats.py,sha256=qHrqlIsZVSV2ASc5cZ6TsmKaMq3bZtCzS1ZURjL8mks,9211
11
+ pheval/analyse/run_data_parser.py,sha256=Lr0ao_Mlp8EYLaM4XmiEjo7P7jt_rCBR2y2hb_D3c70,3366
12
+ pheval/cli.py,sha256=rpvTTCKAvH75XkZUh0xaKv7Ftl9zIt2RncsMGIlrq9U,1556
20
13
  pheval/cli_pheval.py,sha256=fWbKUcPTZZSa1EJEtH_lNn1XE6qRApRHihqUZS5owrA,2424
21
- pheval/cli_pheval_utils.py,sha256=O6tWnE85QQHGNcP08OwJGANMfXJPsZtFEu-D6ATld00,16700
14
+ pheval/cli_pheval_utils.py,sha256=sh6kx36jYfuSIWBMlrdW3g-LPftxBy-xw4b7hg8bdj4,16545
22
15
  pheval/config_parser.py,sha256=lh-Dy_FflXJUnRC3HYaEdSvPAsNZWQZlEr1hHQigrTM,1227
23
16
  pheval/implementations/__init__.py,sha256=BMUTotjTdgy5j5xubWCIQgRXrSQ1ZIcjooer7r299Zo,1228
24
17
  pheval/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
18
  pheval/infra/exomiserdb.py,sha256=pM9-TfjrgurtH4OtM1Enk5oVhIxGQN3rKRlrxHuObTM,5080
26
19
  pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- pheval/post_processing/post_processing.py,sha256=MdacHoVjmwvmWBnHCSSKBboCgMW4MRGP-d_7-t1zZew,14808
20
+ pheval/post_processing/phenopacket_truth_set.py,sha256=ue3pNeg_GZiGyuKrm6_4MsJWpW0LWtfG9wja2Cc8SLg,8873
21
+ pheval/post_processing/post_processing.py,sha256=4xP-gjZ3VoXydU9ClPvmRtuDaSMUeJImgLugurOS5_k,9480
22
+ pheval/post_processing/validate_result_format.py,sha256=4U6AfHt01EexwU_OnpmytQAhGVS6ZWF1S-5NVBx1oaM,2916
28
23
  pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
24
  pheval/prepare/create_noisy_phenopackets.py,sha256=ydhA4mpqKTDc4hBu8YfvNW2nMubHK3dbO-cv0lA4JFQ,11504
30
25
  pheval/prepare/create_spiked_vcf.py,sha256=90A-Mi8QKhvN036vtFEVWAHgzHO37itiLYrqYlG4LiA,23953
31
26
  pheval/prepare/custom_exceptions.py,sha256=_G3_95dPtHIs1SviYBV1j7cYc-hxlhuw8hhnYdzByYY,1719
32
27
  pheval/prepare/prepare_corpus.py,sha256=YFnklpeVXeqeme9DVmd_jfsK04ytIe9cH5uXYcgK5cY,4650
33
- pheval/prepare/update_phenopacket.py,sha256=21fzUPbwKN6Ey5TSh9PFzjT2x86U19RAE6WmkjG8u28,4770
28
+ pheval/prepare/update_phenopacket.py,sha256=Bjru0ptNKyzLaYElouKZe2GYRQbETTC0FMiMojrP8Lg,4850
34
29
  pheval/resources/alternate_ouputs/CADA_results.txt,sha256=Rinn2TtfwFNsx0aEWegKJOkjKnBm-Mf54gdaT3bWP0k,547
35
30
  pheval/resources/alternate_ouputs/DeepPVP_results.txt,sha256=MF9MZJYa4r4PEvFzALpi-lNGLxjENOnq_YgrgFMn-oQ,1508
36
31
  pheval/resources/alternate_ouputs/OVA_results.txt,sha256=_5XFCR4W04D-W7DObpALLsa0-693g2kiIUB_uo79aHk,9845
@@ -47,11 +42,12 @@ pheval/utils/docs_gen.py,sha256=6FGtHicBC0rZKi0tdL3Epsg8d4osE44I9f1Ga0j4JLA,3193
47
42
  pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
48
43
  pheval/utils/exomiser.py,sha256=m2u0PH2z9lFPaB3LVkZCmPmH5e55q1NoTzNl46zRRP8,683
49
44
  pheval/utils/file_utils.py,sha256=m21cz-qjDYqnI8ClUv3J9fKizex98a-9bSEerQ75i_c,3576
50
- pheval/utils/phenopacket_utils.py,sha256=6xQ8WCLdR1VhiU3nCDzaqEVKjGvDWrzvPA50_6ZAHXM,27310
45
+ pheval/utils/logger.py,sha256=5DZl5uMltUDQorhkvg_B7_ZhFwApAmEkWneFIOKfRGQ,1566
46
+ pheval/utils/phenopacket_utils.py,sha256=AfV_mWac6n5HCc5zjfH6CGP8T0qI0LR0VBrooaKmgdY,26978
51
47
  pheval/utils/semsim_utils.py,sha256=s7ZCR2VfPYnOh7ApX6rv66eGoVSm9QJaVYOWBEhlXpo,6151
52
48
  pheval/utils/utils.py,sha256=9V6vCT8l1g4O2-ZATYqsVyd7AYZdWGd-Ksy7_oIC3eE,2343
53
- pheval-0.4.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
54
- pheval-0.4.7.dist-info/METADATA,sha256=JfraeowwRp8eQjKiFBBrIVUoA0fchchznGj4t8sXgFE,6469
55
- pheval-0.4.7.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
56
- pheval-0.4.7.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
57
- pheval-0.4.7.dist-info/RECORD,,
49
+ pheval-0.5.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
50
+ pheval-0.5.0.dist-info/METADATA,sha256=v7UNSBKUzJQAs8oBSq8XScwKnDiNXlzWZV0A70xR3M8,6456
51
+ pheval-0.5.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
52
+ pheval-0.5.0.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
53
+ pheval-0.5.0.dist-info/RECORD,,
@@ -1,104 +0,0 @@
1
- from pheval.analyse.benchmark_generator import (
2
- BenchmarkRunOutputGenerator,
3
- DiseaseBenchmarkRunOutputGenerator,
4
- GeneBenchmarkRunOutputGenerator,
5
- VariantBenchmarkRunOutputGenerator,
6
- )
7
- from pheval.analyse.generate_summary_outputs import generate_benchmark_comparison_output
8
- from pheval.analyse.parse_corpus import CorpusParser
9
- from pheval.analyse.rank_stats import RankStatsWriter
10
- from pheval.analyse.run_data_parser import Config
11
-
12
-
def _run_benchmark_comparison(
    run_config: Config,
    benchmark_generator: BenchmarkRunOutputGenerator,
) -> None:
    """
    Run a benchmark on several result directories.

    Every distinct phenopacket directory is parsed once, each run is benchmarked and
    its statistics recorded, and a comparison output is generated per directory.

    Args:
        run_config (Config): Run configuration holding the benchmark name and the runs to compare.
        benchmark_generator (BenchmarkRunOutputGenerator): Generator for benchmark run output.
    """
    stats_writer = RankStatsWriter(
        run_config.benchmark_name, benchmark_generator.stats_comparison_file
    )
    # Several runs may share a corpus; de-duplicate so each one is handled once.
    unique_test_corpora_directories = {result.phenopacket_dir for result in run_config.runs}
    for test_corpora_directory in unique_test_corpora_directories:
        CorpusParser(run_config.benchmark_name, test_corpora_directory).parse_corpus(
            benchmark_generator
        )
    benchmarking_results = []
    for run in run_config.runs:
        benchmark_result = benchmark_generator.generate_benchmark_run_results(
            run_config.benchmark_name, run, run.score_order, run.threshold
        )
        stats_writer.add_statistics_entry(
            run.run_identifier,
            benchmark_result.rank_stats,
            benchmark_result.binary_classification_stats,
        )
        benchmarking_results.append(benchmark_result)
    run_identifiers = [run.run_identifier for run in run_config.runs]
    for unique_test_corpora_directory in unique_test_corpora_directories:
        generate_benchmark_comparison_output(
            run_config.benchmark_name,
            benchmarking_results,
            run_identifiers,
            benchmark_generator,
            f"{unique_test_corpora_directory.parents[0].name}_"
            f"{benchmark_generator.prioritisation_type_string}",
        )
58
-
59
-
def benchmark_run_comparisons(
    run_config: Config,
) -> None:
    """
    Benchmark prioritisation performance for several runs.

    The runs are split by the analysis flags they enable (gene, variant, disease) and a
    comparison is executed for every non-empty group, in that order.

    Args:
        run_config (Config): Run configurations.
    """

    def _runs_with(flag: str) -> Config:
        # Copy of the configuration restricted to runs that enable the given analysis.
        return Config(
            benchmark_name=run_config.benchmark_name,
            runs=[run for run in run_config.runs if getattr(run, flag)],
            plot_customisation=run_config.plot_customisation,
        )

    gene_cfg = _runs_with("gene_analysis")
    variant_cfg = _runs_with("variant_analysis")
    disease_cfg = _runs_with("disease_analysis")

    if gene_cfg.runs:
        gene_generator = GeneBenchmarkRunOutputGenerator(
            plot_customisation=gene_cfg.plot_customisation.gene_plots
        )
        _run_benchmark_comparison(run_config=gene_cfg, benchmark_generator=gene_generator)
    if variant_cfg.runs:
        variant_generator = VariantBenchmarkRunOutputGenerator(
            plot_customisation=variant_cfg.plot_customisation.variant_plots
        )
        _run_benchmark_comparison(run_config=variant_cfg, benchmark_generator=variant_generator)
    if disease_cfg.runs:
        disease_generator = DiseaseBenchmarkRunOutputGenerator(
            plot_customisation=disease_cfg.plot_customisation.disease_plots
        )
        _run_benchmark_comparison(run_config=disease_cfg, benchmark_generator=disease_generator)