pheval 0.3.9__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

@@ -1,55 +1,34 @@
1
+ from dataclasses import dataclass
1
2
  from pathlib import Path
2
3
  from typing import List
3
4
 
4
5
  import pandas as pd
5
6
 
7
+ from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
6
8
  from pheval.analyse.benchmarking_data import BenchmarkRunResults
7
9
  from pheval.analyse.binary_classification_stats import BinaryClassificationStats
8
10
  from pheval.analyse.rank_stats import RankStats
9
11
 
10
12
 
11
- def read_benchmark_tsv_result_summary(benchmarking_tsv: Path) -> pd.DataFrame:
12
- """
13
- Read the summary benchmark TSV output generated from the benchmark-comparison command.
13
@dataclass
class BenchmarkSummaryResults:
    """
    Bundle of the parsed summary results for each benchmarking category.

    Each field holds the list of BenchmarkRunResults parsed for one category.
    NOTE(review): parse_benchmark_db passes None for a category whose summary table
    does not exist, so consumers should not assume every field is a list.
    """

    # Results parsed from the gene summary table.
    gene_results: List[BenchmarkRunResults]
    # Results parsed from the disease summary table.
    disease_results: List[BenchmarkRunResults]
    # Results parsed from the variant summary table.
    variant_results: List[BenchmarkRunResults]
14
18
 
15
- Args:
16
- benchmarking_tsv (Path): Path to the summary benchmark TSV output file.
17
19
 
18
- Returns:
19
- pd.DataFrame: A pandas DataFrame containing specific columns from the TSV file, including:
20
- 'results_directory_path', 'top', 'top3', 'top5', 'top10', 'found',
21
- 'total', 'mean_reciprocal_rank'.
20
+ def parse_benchmark_results(benchmark_summary_table: pd.DataFrame) -> List[BenchmarkRunResults]:
22
21
  """
23
- return pd.read_csv(
24
- benchmarking_tsv,
25
- delimiter="\t",
26
- usecols=[
27
- "results_directory_path",
28
- "top",
29
- "top3",
30
- "top5",
31
- "top10",
32
- "found",
33
- "total",
34
- "mean_reciprocal_rank",
35
- ],
36
- )
37
-
38
-
39
- def parse_benchmark_result_summary(benchmarking_df: pd.DataFrame) -> List[BenchmarkRunResults]:
40
- """
41
- Parse the summary benchmark DataFrame into a list of BenchmarkRunResults.
22
+ Parse benchmark results from a DataFrame.
42
23
 
43
24
  Args:
44
- benchmarking_df (pd.DataFrame): Summary benchmark DataFrame containing columns such as
45
- 'results_directory_path', 'top', 'top3', 'top5', 'top10',
46
- 'found', 'total', 'mean_reciprocal_rank'.
25
+ benchmark_summary_table (pd.DataFrame): DataFrame containing benchmark results.
47
26
 
48
27
  Returns:
49
- List[BenchmarkRunResults]: A list of BenchmarkRunResults instances generated from the DataFrame.
28
+ List[BenchmarkRunResults]: A list of BenchmarkRunResults objects parsed from the DataFrame.
50
29
  """
51
- benchmarking_results = []
52
- for _, row in benchmarking_df.iterrows():
30
+ results = []
31
+ for _, row in benchmark_summary_table.iterrows():
53
32
  benchmarking_result = BenchmarkRunResults(
54
33
  rank_stats=RankStats(
55
34
  top=row["top"],
@@ -60,9 +39,43 @@ def parse_benchmark_result_summary(benchmarking_df: pd.DataFrame) -> List[Benchm
60
39
  total=row["total"],
61
40
  mrr=row["mean_reciprocal_rank"],
62
41
  ),
63
- ranks={},
64
42
  benchmark_name=row["results_directory_path"],
65
43
  binary_classification_stats=BinaryClassificationStats(),
66
44
  )
67
- benchmarking_results.append(benchmarking_result)
68
- return benchmarking_results
45
+ results.append(benchmarking_result)
46
+ return results
47
+
48
+
49
def parse_benchmark_db(benchmarking_db: Path) -> BenchmarkSummaryResults:
    """
    Read the gene, disease and variant summary tables from a benchmark database.

    Each of the ``gene_summary``, ``disease_summary`` and ``variant_summary`` tables is
    parsed only if it exists in the database; the matching field of the returned
    dataclass is ``None`` when its table is absent.

    Args:
        benchmarking_db (Path): Path to the benchmark db.

    Returns:
        BenchmarkSummaryResults: A dataclass containing all benchmarking results contained in the db.
    """
    db_connector = BenchmarkDBManager(benchmarking_db)
    parsed_results = {}
    try:
        for category in ("gene", "disease", "variant"):
            summary_table = f"{category}_summary"
            if db_connector.check_table_exists(summary_table):
                # fetchdf() materialises the rows into a DataFrame, so the parsed
                # results do not depend on the connection staying open.
                parsed_results[category] = parse_benchmark_results(
                    db_connector.conn.execute(f"SELECT * FROM {summary_table}").fetchdf()
                )
            else:
                parsed_results[category] = None
    finally:
        # Release the database handle even when a query or the parsing fails.
        db_connector.conn.close()
    return BenchmarkSummaryResults(
        gene_results=parsed_results["gene"],
        disease_results=parsed_results["disease"],
        variant_results=parsed_results["variant"],
    )
@@ -0,0 +1,219 @@
1
+ from pathlib import Path
2
+ from typing import List
3
+
4
+ from pheval.analyse.benchmark_db_manager import BenchmarkDBManager
5
+ from pheval.analyse.benchmark_generator import (
6
+ BenchmarkRunOutputGenerator,
7
+ DiseaseBenchmarkRunOutputGenerator,
8
+ GeneBenchmarkRunOutputGenerator,
9
+ VariantBenchmarkRunOutputGenerator,
10
+ )
11
+ from pheval.utils.file_utils import all_files
12
+ from pheval.utils.phenopacket_utils import (
13
+ GenomicVariant,
14
+ PhenopacketUtil,
15
+ ProbandCausativeGene,
16
+ ProbandDisease,
17
+ phenopacket_reader,
18
+ )
19
+
20
+
21
def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
    """
    Load a Phenopacket and return the diagnoses recorded for the proband.

    Args:
        phenopacket_path (Path): Path to the Phenopacket file.

    Returns:
        List[ProbandDisease]: The diseases reported by PhenopacketUtil.diagnoses()
        for the Phenopacket read from the given path.
    """
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnoses()
34
+
35
+
36
def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
    """
    Load a Phenopacket and return the diagnosed variants recorded for the proband.

    Args:
        phenopacket_path (Path): Path to the Phenopacket file.

    Returns:
        List[GenomicVariant]: The variants reported by PhenopacketUtil.diagnosed_variants()
        for the Phenopacket read from the given path.
    """
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_variants()
49
+
50
+
51
def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene]:
    """
    Load a Phenopacket and return the diagnosed genes recorded for the proband.

    Args:
        phenopacket_path (Path): Path to the Phenopacket file.

    Returns:
        List[ProbandCausativeGene]: The genes reported by PhenopacketUtil.diagnosed_genes()
        for the Phenopacket read from the given path.
    """
    return PhenopacketUtil(phenopacket_reader(phenopacket_path)).diagnosed_genes()
63
+
64
+
65
class CorpusParser:
    """Class for parsing phenopacket corpus and retrieving known variants/genes/diseases."""

    def __init__(self, benchmark_name: str, phenopacket_dir: Path) -> None:
        """
        Initialise the CorpusParser class.
        Args:
            benchmark_name (str): Name passed to BenchmarkDBManager to obtain the database connection.
            phenopacket_dir (Path): Path to the Phenopacket directory.
        """
        self.phenopacket_dir = phenopacket_dir
        self.conn = BenchmarkDBManager(benchmark_name).conn
        # Tables are prefixed with the name of the directory that contains the Phenopacket directory.
        # NOTE(review): this name is interpolated unquoted into SQL statements below, so a
        # directory name that is not a valid SQL identifier would presumably fail - confirm.
        self.table_name = phenopacket_dir.parents[0].name

    def _create_gene_table(self) -> None:
        """
        Create the Gene benchmarking table if it doesn't already exist.
        """
        self.conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self.table_name}_gene (
                identifier VARCHAR(255) PRIMARY KEY,
                phenopacket VARCHAR,
                gene_symbol VARCHAR,
                gene_identifier VARCHAR
            )
            """
        )

    def _create_variant_table(self) -> None:
        """
        Create the Variant benchmarking table if it doesn't already exist.
        """
        self.conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self.table_name}_variant (
                identifier VARCHAR(255) PRIMARY KEY,
                phenopacket VARCHAR,
                chrom VARCHAR,
                pos INTEGER,
                "ref" VARCHAR,
                alt VARCHAR
            )
            """
        )

    def _create_disease_table(self) -> None:
        """
        Create the Disease benchmarking table if it doesn't already exist.
        """
        self.conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self.table_name}_disease (
                identifier VARCHAR(255) PRIMARY KEY,
                phenopacket VARCHAR,
                disease_identifier VARCHAR,
                disease_name VARCHAR
            )
            """
        )

    def _create_tables(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
        """
        Create tables based on the benchmarking analysis specified.
        Args:
            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type.
        """
        if isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator):
            self._create_gene_table()
        if isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator):
            self._create_variant_table()
        if isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator):
            self._create_disease_table()

    def _insert_genes(self, phenopacket_path: Path, genes: List[ProbandCausativeGene]) -> None:
        """
        Insert known disease-causing genes into the Gene benchmarking table.
        Args:
            phenopacket_path (Path): Path to the Phenopacket file.
            genes (List[ProbandCausativeGene]): List of known genes associated with the proband.
        """
        for gene in genes:
            # The primary key combines file name and gene symbol; INSERT OR IGNORE then
            # skips a row whose identifier is already present.
            identifier = f"{phenopacket_path.name}-{gene.gene_symbol}"
            self.conn.execute(
                f"""
                INSERT OR IGNORE INTO {self.table_name}_gene (identifier, phenopacket, gene_symbol, gene_identifier)
                VALUES (?, ?, ?, ?)
                """,
                (identifier, phenopacket_path.name, gene.gene_symbol, gene.gene_identifier),
            )

    def _insert_variants(self, phenopacket_path: Path, variants: List[GenomicVariant]) -> None:
        """
        Insert known variants into the Variant benchmarking table.
        Args:
            phenopacket_path (Path): Path to the Phenopacket file.
            variants (List[GenomicVariant]): List of known variants associated with the proband.
        """
        for variant in variants:
            # The primary key combines file name with chrom/pos/ref/alt of the variant.
            identifier = (
                f"{phenopacket_path.name}-{variant.chrom}-{variant.pos}-{variant.ref}-{variant.alt}"
            )
            self.conn.execute(
                f"""
                INSERT OR IGNORE INTO {self.table_name}_variant (identifier, phenopacket, chrom, pos, "ref", alt)
                VALUES (?, ?, ?, ?, ?, ?)
                """,
                (
                    identifier,
                    phenopacket_path.name,
                    variant.chrom,
                    variant.pos,
                    variant.ref,
                    variant.alt,
                ),
            )

    def _insert_diseases(self, phenopacket_path: Path, diseases: List[ProbandDisease]) -> None:
        """
        Insert known diseases into the Disease benchmarking table.
        Args:
            phenopacket_path (Path): Path to the Phenopacket file.
            diseases (List[ProbandDisease]): List of known diseases associated with the proband.
        """
        for disease in diseases:
            # The primary key combines file name and disease identifier.
            identifier = f"{phenopacket_path.name}-{disease.disease_identifier}"
            self.conn.execute(
                f"INSERT OR IGNORE INTO {self.table_name}_disease "
                f"(identifier, phenopacket, disease_identifier, disease_name) VALUES (?, ?, ?, ?)",
                (
                    identifier,
                    phenopacket_path.name,
                    disease.disease_identifier,
                    disease.disease_name,
                ),
            )

    def parse_corpus(self, benchmark_generator: BenchmarkRunOutputGenerator) -> None:
        """
        Parse the phenopacket corpus and add known genes/variants/diseases to relevant benchmarking tables.

        The database connection is closed when this method finishes, whether it
        succeeds or raises, so the instance cannot be reused afterwards.

        Args:
            benchmark_generator (BenchmarkRunOutputGenerator): Class instance of the benchmark generator type.
        """
        # The generator type does not change between files, so decide once which tables to fill.
        wants_genes = isinstance(benchmark_generator, GeneBenchmarkRunOutputGenerator)
        wants_variants = isinstance(benchmark_generator, VariantBenchmarkRunOutputGenerator)
        wants_diseases = isinstance(benchmark_generator, DiseaseBenchmarkRunOutputGenerator)
        try:
            self._create_tables(benchmark_generator)
            for phenopacket_path in all_files(self.phenopacket_dir):
                if wants_genes:
                    self._insert_genes(phenopacket_path, _obtain_causative_genes(phenopacket_path))
                if wants_variants:
                    self._insert_variants(
                        phenopacket_path, _obtain_causative_variants(phenopacket_path)
                    )
                if wants_diseases:
                    self._insert_diseases(
                        phenopacket_path, _obtain_causative_diseases(phenopacket_path)
                    )
        finally:
            # Close the connection even if reading a Phenopacket or an insert fails,
            # so the database handle is not leaked.
            self.conn.close()