pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/__init__.py +0 -5
- pheval/analyse/__init__.py +0 -0
- pheval/analyse/analysis.py +703 -0
- pheval/analyse/generate_plots.py +312 -0
- pheval/analyse/generate_summary_outputs.py +186 -0
- pheval/analyse/rank_stats.py +61 -0
- pheval/cli.py +22 -7
- pheval/cli_pheval.py +37 -12
- pheval/cli_pheval_utils.py +225 -8
- pheval/config_parser.py +36 -0
- pheval/constants.py +1 -0
- pheval/implementations/__init__.py +1 -3
- pheval/post_processing/__init__.py +0 -0
- pheval/post_processing/post_processing.py +210 -0
- pheval/prepare/__init__.py +0 -0
- pheval/prepare/create_noisy_phenopackets.py +173 -0
- pheval/prepare/create_spiked_vcf.py +366 -0
- pheval/prepare/custom_exceptions.py +47 -0
- pheval/prepare/update_phenopacket.py +53 -0
- pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
- pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
- pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
- pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
- pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
- pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
- pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
- pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
- pheval/run_metadata.py +27 -0
- pheval/runners/runner.py +92 -11
- pheval/utils/__init__.py +0 -0
- pheval/utils/docs_gen.py +105 -0
- pheval/utils/docs_gen.sh +18 -0
- pheval/utils/file_utils.py +88 -0
- pheval/utils/phenopacket_utils.py +356 -0
- pheval/utils/semsim_utils.py +156 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
- pheval-0.2.0.dist-info/RECORD +41 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
- pheval/utils.py +0 -7
- pheval-0.1.0.dist-info/RECORD +0 -13
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
# import logging
|
|
4
|
+
import os
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from copy import copy
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from google.protobuf.json_format import MessageToJson, Parse
|
|
12
|
+
from phenopackets import (
|
|
13
|
+
Family,
|
|
14
|
+
File,
|
|
15
|
+
GenomicInterpretation,
|
|
16
|
+
Interpretation,
|
|
17
|
+
Phenopacket,
|
|
18
|
+
PhenotypicFeature,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from pheval.prepare.custom_exceptions import IncorrectFileFormatError
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class IncompatibleGenomeAssemblyError(Exception):
    """Raised when a phenopacket declares a genome assembly we cannot handle."""

    def __init__(self, assembly, phenopacket, message="Incompatible Genome Assembly"):
        # Keep the offending assembly and source phenopacket so __str__ can
        # report exactly where the unsupported assembly came from.
        self.assembly: str = assembly
        self.phenopacket: Path = phenopacket
        self.message: str = message
        super().__init__(self.message)

    def __str__(self):
        return "{} -> {} in {}".format(self.message, self.assembly, self.phenopacket)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class GenomicVariant:
    """Minimal VCF-style representation of a single genomic variant."""

    # Chromosome name as found in the phenopacket VCF record.
    chrom: str
    # Variant position; presumably 1-based VCF coordinates — TODO confirm
    # against the spiked-VCF writer.
    pos: int
    # Reference allele.
    ref: str
    # Alternate allele.
    alt: str
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ProbandCausativeVariant:
    """A causative variant extracted from a phenopacket interpretation."""

    # Identifier of the proband the variant belongs to.
    proband_id: str
    # Genome assembly the variant coordinates refer to (e.g. "GRCh37").
    assembly: str
    # The variant itself, in VCF-style coordinates.
    variant: GenomicVariant
    # Allelic-state label taken from the variation descriptor
    # (e.g. "heterozygous").
    genotype: str
    # Raw VCF INFO string; defaults to None when the record has none.
    # NOTE(review): annotated str but defaulted to None — effectively Optional.
    info: str = None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class ProbandCausativeGene:
    """A disease-causative gene with its symbol and identifier."""

    # Gene symbol, e.g. "FGFR2".
    gene_symbol: str
    # Gene identifier from the descriptor's value_id (scheme not enforced here).
    gene_identifier: str
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def read_hgnc_data() -> pd.DataFrame:
    """Read the bundled HGNC complete-set resource into a dataframe.

    Returns:
        pd.DataFrame: HGNC gene data with every column read as a string.
    """
    # Resolve the resource relative to this module with pathlib instead of
    # str.replace on the directory path: the old
    # os.path.dirname(__file__).replace("utils", ...) call would corrupt any
    # installation path that happens to contain "utils" elsewhere in it.
    return pd.read_csv(
        Path(__file__).parent.parent.joinpath("resources/hgnc_complete_set_2022-10-01.txt"),
        delimiter="\t",
        dtype=str,
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def create_hgnc_dict() -> defaultdict:
    """Creates reference for updating gene symbols and identifiers.

    Returns a defaultdict keyed by current gene symbol; each value maps
    identifier names (ensembl_id, hgnc_id, entrez_id, refseq_accession,
    previous_symbol) to the values found in the HGNC complete set.
    """
    hgnc_df = read_hgnc_data()
    hgnc_data = defaultdict(dict)
    for _index, row in hgnc_df.iterrows():
        entry = hgnc_data[row["symbol"]]
        entry["ensembl_id"] = row["ensembl_gene_id"]
        entry["hgnc_id"] = row["hgnc_id"]
        entry["entrez_id"] = row["entrez_id"]
        entry["refseq_accession"] = row["refseq_accession"]
        # prev_symbol is pipe-delimited; strip stray quotes from each entry.
        entry["previous_symbol"] = [
            prev.strip('"') for prev in str(row["prev_symbol"]).split("|")
        ]
    return hgnc_data
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def create_gene_identifier_map() -> dict:
    """Build a map from gene identifiers (Ensembl, HGNC, Entrez, RefSeq)
    back to the current gene symbol."""
    hgnc_df = read_hgnc_data()
    identifier_map = {}
    identifier_columns = ("ensembl_gene_id", "hgnc_id", "entrez_id", "refseq_accession")
    for _index, row in hgnc_df.iterrows():
        for column in identifier_columns:
            identifier_map[row[column]] = row["symbol"]
    return identifier_map
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def phenopacket_reader(file: Path):
    """Read a phenopacket JSON file and parse it into a protobuf message.

    Args:
        file: Path to the phenopacket JSON file.

    Returns:
        A Family message when the JSON contains a "proband" key,
        otherwise a Phenopacket message.
    """
    # Use a context manager so the handle is closed even when json.load
    # raises (the previous open/close pair leaked the handle on error).
    with open(file, "r") as phenopacket_file:
        phenopacket = json.load(phenopacket_file)
    if "proband" in phenopacket:
        return Parse(json.dumps(phenopacket), Family())
    return Parse(json.dumps(phenopacket), Phenopacket())
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class PhenopacketUtil:
    """Retrieves relevant data from a phenopacket.

    Wraps either a Phenopacket or a Family protobuf message; methods that
    differ between the two dispatch on the presence of a "proband" field.
    """

    def __init__(self, phenopacket_contents: Phenopacket):
        # May be a Phenopacket or a Family message despite the annotation.
        self.phenopacket_contents = phenopacket_contents

    def sample_id(self) -> str:
        """Retrieve the sample ID from a phenopacket or proband of a family."""
        # hasattr distinguishes Family (has a proband field) from Phenopacket.
        if hasattr(self.phenopacket_contents, "proband"):
            return self.phenopacket_contents.proband.subject.id
        else:
            return self.phenopacket_contents.subject.id

    def phenotypic_features(self) -> list[PhenotypicFeature]:
        """Retrieves a list of all HPO terms."""
        if hasattr(self.phenopacket_contents, "proband"):
            return self.phenopacket_contents.proband.phenotypic_features
        else:
            return self.phenopacket_contents.phenotypic_features

    def observed_phenotypic_features(self) -> list[PhenotypicFeature]:
        """Removes any HPO terms labelled as excluded."""
        phenotypic_features = []
        all_phenotypic_features = self.phenotypic_features()
        for p in all_phenotypic_features:
            # excluded features are negated findings; skip them here.
            if p.excluded:
                continue
            phenotypic_features.append(p)
        return phenotypic_features

    def negated_phenotypic_features(self) -> list[PhenotypicFeature]:
        """Retrieve negated (excluded) phenotypic features."""
        negated_phenotypic_features = []
        all_phenotypic_features = self.phenotypic_features()
        for p in all_phenotypic_features:
            if p.excluded:
                negated_phenotypic_features.append(p)
        return negated_phenotypic_features

    def interpretations(self) -> list[Interpretation]:
        """Returns all interpretations of a phenopacket."""
        if hasattr(self.phenopacket_contents, "proband"):
            return self.phenopacket_contents.proband.interpretations
        else:
            return self.phenopacket_contents.interpretations

    def causative_variants(self) -> list[ProbandCausativeVariant]:
        """Returns a list of all causative variants listed in a phenopacket."""
        all_variants = []
        interpretation = self.interpretations()
        for i in interpretation:
            for g in i.diagnosis.genomic_interpretations:
                vcf_record = g.variant_interpretation.variation_descriptor.vcf_record
                genotype = g.variant_interpretation.variation_descriptor.allelic_state
                # NOTE(review): reads subject.id directly — for a Family
                # message the proband's subject may be the intended source;
                # confirm this is correct for Family inputs.
                variant_data = ProbandCausativeVariant(
                    self.phenopacket_contents.subject.id,
                    vcf_record.genome_assembly,
                    GenomicVariant(
                        vcf_record.chrom,
                        vcf_record.pos,
                        vcf_record.ref,
                        vcf_record.alt,
                    ),
                    genotype.label,
                    vcf_record.info,
                )
                all_variants.append(variant_data)
        return all_variants

    def files(self) -> list:
        """Returns all files associated with a phenopacket."""
        return self.phenopacket_contents.files

    def vcf_file_data(self, phenopacket_path: Path, vcf_dir: Path) -> File:
        """Retrieves the genome assembly and vcf name from a phenopacket.

        Raises IncorrectFileFormatError when the referenced file is not a
        .vcf/.vcf.gz, and IncompatibleGenomeAssemblyError for unsupported
        assemblies. Mutates the returned File's uri to point into vcf_dir.
        """
        compatible_genome_assembly = ["GRCh37", "hg19", "GRCh38", "hg38"]
        # Assumes at least one file entry has fileFormat == "vcf";
        # IndexError otherwise — TODO confirm callers guarantee this.
        vcf_data = [file for file in self.files() if file.file_attributes["fileFormat"] == "vcf"][0]
        if not Path(vcf_data.uri).name.endswith(".vcf") and not Path(vcf_data.uri).name.endswith(
            ".vcf.gz"
        ):
            raise IncorrectFileFormatError(Path(vcf_data.uri), ".vcf or .vcf.gz file")
        if vcf_data.file_attributes["genomeAssembly"] not in compatible_genome_assembly:
            raise IncompatibleGenomeAssemblyError(
                vcf_data.file_attributes["genomeAssembly"], phenopacket_path
            )
        # Rewrite the uri in place so it points at the local vcf directory.
        vcf_data.uri = str(vcf_dir.joinpath(Path(vcf_data.uri).name))
        return vcf_data

    @staticmethod
    def _extract_diagnosed_gene(
        genomic_interpretation: GenomicInterpretation,
    ) -> ProbandCausativeGene:
        """Returns the disease causative gene from the variant descriptor field if not empty,
        otherwise, returns from the gene descriptor from a phenopacket."""
        # ByteSize() != 0 is used as a "submessage is populated" sentinel.
        if genomic_interpretation.variant_interpretation.ByteSize() != 0:
            return ProbandCausativeGene(
                genomic_interpretation.variant_interpretation.variation_descriptor.gene_context.symbol,
                genomic_interpretation.variant_interpretation.variation_descriptor.gene_context.value_id,
            )

        else:
            return ProbandCausativeGene(
                gene_symbol=genomic_interpretation.gene.symbol,
                gene_identifier=genomic_interpretation.gene.value_id,
            )

    def diagnosed_genes(self) -> list[ProbandCausativeGene]:
        """Returns a unique list of all causative genes and the corresponding gene identifiers from a phenopacket."""
        pheno_interpretation = self.interpretations()
        genes = []
        for i in pheno_interpretation:
            for g in i.diagnosis.genomic_interpretations:
                genes.append(self._extract_diagnosed_gene(g))
        # De-duplicate by gene symbol, keeping the last occurrence.
        genes = list({gene.gene_symbol: gene for gene in genes}.values())
        return genes

    def diagnosed_variants(self) -> list[GenomicVariant]:
        """Returns a list of all variants from a phenopacket - for use in assess-prioritisation."""
        variants = []
        pheno_interpretation = self.interpretations()
        for i in pheno_interpretation:
            for g in i.diagnosis.genomic_interpretations:
                variant = GenomicVariant(
                    chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom,
                    pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
                    ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
                    alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
                )
                variants.append(variant)
        return variants
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class PhenopacketRebuilder:
    """Rebuilds a Phenopacket (or Family) with replaced interpretations,
    phenotypic profiles or file references."""

    def __init__(self, phenopacket: Phenopacket or Family):
        # NOTE: "Phenopacket or Family" evaluates to Phenopacket at runtime;
        # it serves as documentation only.
        self.phenopacket = phenopacket

    def update_interpretations(self, interpretations) -> Phenopacket or Family:
        """Adds the updated interpretations to a phenopacket.

        Returns a message whose interpretations have been replaced with
        *interpretations*.
        """
        # NOTE(review): copy() may be shallow for protobuf messages, in which
        # case the del/extend below also mutate self.phenopacket — confirm
        # callers do not rely on the original being untouched.
        phenopacket = copy(self.phenopacket)
        if hasattr(phenopacket, "proband"):
            del phenopacket.proband.interpretations[:]
            phenopacket.proband.interpretations.extend(interpretations)
        else:
            del phenopacket.interpretations[:]
            phenopacket.interpretations.extend(interpretations)
        return phenopacket

    def add_randomised_hpo(self, randomised_hpo) -> Phenopacket or Family:
        """Adds randomised phenotypic profile to phenopacket.

        Replaces the existing phenotypic_features with *randomised_hpo*.
        """
        phenopacket = copy(self.phenopacket)
        if hasattr(phenopacket, "proband"):
            del phenopacket.proband.phenotypic_features[:]
            phenopacket.proband.phenotypic_features.extend(randomised_hpo)
        else:
            del phenopacket.phenotypic_features[:]
            phenopacket.phenotypic_features.extend(randomised_hpo)
        return phenopacket

    def add_spiked_vcf_path(self, spiked_vcf_file_data: File) -> Phenopacket or Family:
        """Adds spiked vcf path to phenopacket.

        Any existing vcf file entry is dropped and replaced by
        *spiked_vcf_file_data*; non-vcf file entries are preserved.
        """
        phenopacket = copy(self.phenopacket)
        # Keep every non-vcf file, then append the spiked vcf entry.
        phenopacket_files = [
            file for file in phenopacket.files if file.file_attributes["fileFormat"] != "vcf"
        ]
        phenopacket_files.append(spiked_vcf_file_data)
        del phenopacket.files[:]
        phenopacket.files.extend(phenopacket_files)
        return phenopacket
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def create_json_message(phenopacket: Phenopacket or Family) -> str:
    """Serialise a phenopacket (or family) message into a JSON string
    suitable for writing to file."""
    json_message = MessageToJson(phenopacket)
    return json_message
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def write_phenopacket(phenopacket: Phenopacket or Family, output_file: Path) -> None:
    """Write a phenopacket (or family) message to *output_file* as JSON.

    Args:
        phenopacket: Phenopacket or Family message to serialise.
        output_file: Destination path for the JSON output.
    """
    phenopacket_json = create_json_message(phenopacket)
    # The context manager closes the handle; the explicit close() that
    # previously sat inside the with-block was redundant and has been removed.
    with open(output_file, "w") as outfile:
        outfile.write(phenopacket_json)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
class GeneIdentifierUpdater:
    """Updates gene identifiers and alternate IDs using HGNC reference data.

    Args:
        gene_identifier: Name of the identifier to resolve to, e.g.
            "ensembl_id", "hgnc_id" or "entrez_id" — must match a key in the
            per-symbol dicts of *hgnc_data*.
        hgnc_data: Mapping of gene symbol -> identifier dict, as produced by
            create_hgnc_dict().
        identifier_map: Mapping of identifier -> gene symbol, as produced by
            create_gene_identifier_map().
    """

    def __init__(self, gene_identifier: str, hgnc_data: dict = None, identifier_map: dict = None):
        self.hgnc_data = hgnc_data
        self.gene_identifier = gene_identifier
        self.identifier_map = identifier_map

    def find_identifier(self, gene_symbol: str) -> str:
        """Finds the specified gene identifier for a gene symbol.

        Falls back to searching previous symbols; implicitly returns None
        when the symbol is unknown.
        """
        # Direct membership test instead of the redundant ".keys()" call.
        if gene_symbol in self.hgnc_data:
            return self.hgnc_data[gene_symbol][self.gene_identifier]
        # Fall back to previous symbols; first match wins.
        for data in self.hgnc_data.values():
            for prev_symbol in data["previous_symbol"]:
                if prev_symbol == gene_symbol:
                    return data[self.gene_identifier]

    def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
        """Obtain a gene symbol from a gene identifier.

        Example:
            obtain_gene_symbol_from_identifier(query_gene_identifier="HGNC:5")

        Raises:
            KeyError: when the identifier is not present in the map.
        """
        return self.identifier_map[query_gene_identifier]

    def _find_alternate_ids(self, gene_symbol: str) -> list[str]:
        """Finds the alternate IDs for a gene symbol.

        Implicitly returns None when the symbol (current or previous) is
        unknown.
        """
        if gene_symbol in self.hgnc_data:
            return [
                self.hgnc_data[gene_symbol]["hgnc_id"],
                "ncbigene:" + self.hgnc_data[gene_symbol]["entrez_id"],
                "ensembl:" + self.hgnc_data[gene_symbol]["ensembl_id"],
                "symbol:" + gene_symbol,
            ]
        for symbol, data in self.hgnc_data.items():
            for prev_symbol in data["previous_symbol"]:
                if prev_symbol == gene_symbol:
                    # Report the *current* symbol for a matched previous one.
                    return [
                        data["hgnc_id"],
                        "ncbigene:" + data["entrez_id"],
                        "ensembl:" + data["ensembl_id"],
                        "symbol:" + symbol,
                    ]

    def update_genomic_interpretations_gene_identifier(
        self, interpretations: list[Interpretation]
    ) -> list[Interpretation]:
        """Updates the genomic interpretations of a phenopacket.

        Rewrites each gene_context's value_id and alternate_ids in place on
        the (shallow-copied) interpretation list and returns it.
        """
        updated_interpretations = copy(list(interpretations))
        for updated_interpretation in updated_interpretations:
            for g in updated_interpretation.diagnosis.genomic_interpretations:
                g.variant_interpretation.variation_descriptor.gene_context.value_id = (
                    self.find_identifier(
                        g.variant_interpretation.variation_descriptor.gene_context.symbol
                    )
                )
                del g.variant_interpretation.variation_descriptor.gene_context.alternate_ids[:]
                g.variant_interpretation.variation_descriptor.gene_context.alternate_ids.extend(
                    self._find_alternate_ids(
                        g.variant_interpretation.variation_descriptor.gene_context.symbol
                    )
                )
        return updated_interpretations
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Contains all pheval utility methods
|
|
3
|
+
"""
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import plotly.express as px
|
|
9
|
+
|
|
10
|
+
import pheval.utils.file_utils as file_utils
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def filter_non_0_score(data: pd.DataFrame, col: str) -> pd.DataFrame:
    """Drop every row whose value in the given column equals 0

    Args:
        data (pd.DataFrame): Dirty dataframe
        col (str): Column to be filtered

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    non_zero_mask = data[col] != 0
    return data[non_zero_mask]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_semsim(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Parses semantic similarity profiles converting the score column as a numeric value and dropping the null ones

    Args:
        df (pd.DataFrame): semantic similarity profile dataframe
        cols (list): list of columns that will be selected on semsim data

    Returns:
        pd.DataFrame: parsed semantic similarity dataframe
    """
    score_column = cols[-1]
    # errors="coerce" turns non-numeric entries (including the literal string
    # "None") into NaN, so a single dropna removes them all afterwards.
    df[score_column] = pd.to_numeric(df[score_column], errors="coerce")
    # BUG FIX: the previous code chained dropna(inplace=True) onto the
    # temporary frame returned by replace(), so null scores were never
    # actually dropped from df. Drop them on df itself, with subset as a list.
    df.dropna(subset=[score_column], inplace=True)
    return df
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def diff_semsim(
    semsim_left: pd.DataFrame, semsim_right: pd.DataFrame, score_column: str, absolute_diff: bool
) -> pd.DataFrame:
    """Calculates score difference between two semantic similarity profiles

    Args:
        semsim_left (pd.DataFrame): first semantic similarity dataframe
        semsim_right (pd.DataFrame): second semantic similarity dataframe
        score_column (str): Score column that will be computed (e.g. jaccard_similarity)
        absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False).
        Defaults to True.

    Returns:
        pd.DataFrame: A dataframe with terms and its scores differences
    """
    merged = pd.merge(semsim_left, semsim_right, on=["subject_id", "object_id"], how="outer")
    # After the merge the two score columns carry pandas' _x/_y suffixes.
    left_scores = f"{score_column}_x"
    right_scores = f"{score_column}_y"
    if absolute_diff:
        merged["diff"] = merged[left_scores] - merged[right_scores]
        return merged[["subject_id", "object_id", "diff"]]
    merged["diff"] = merged.apply(
        lambda row: get_percentage_diff(row[left_scores], row[right_scores]), axis=1
    )
    return merged[["subject_id", "object_id", left_scores, right_scores, "diff"]]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def percentage_diff(semsim_left: Path, semsim_right: Path, score_column: str, output: Path):
    """Compares two semantic similarity profiles

    Args:
        semsim_left (Path): File path of the first semantic similarity profile
        semsim_right (Path): File path of the second semantic similarity profile
        score_column (str): Score column that will be computed (e.g. jaccard_similarity)
        output (Path): Output path for the difference tsv file
    """
    comparison_df = semsim_analysis(semsim_left, semsim_right, score_column, absolute_diff=False)
    # Largest differences first, written out as a tab-separated file.
    ranked_df = comparison_df.sort_values(by="diff", ascending=False)
    ranked_df.to_csv(output, sep="\t", index=False)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def semsim_heatmap_plot(semsim_left: Path, semsim_right: Path, score_column: str):
    """Plots semantic similarity profiles heatmap

    Args:
        semsim_left (Path): File path of the first semantic similarity profile
        semsim_right (Path): File path of the second semantic similarity profile
        score_column (str): Score column that will be computed (e.g. jaccard_similarity)
    """
    diff_df = semsim_analysis(semsim_left, semsim_right, score_column)
    # Pivot into a subject x object matrix of score differences.
    heatmap_data = diff_df.pivot(index="subject_id", columns="object_id", values="diff")
    figure = px.imshow(heatmap_data, text_auto=True)
    figure.show()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def semsim_analysis(
    semsim_left: Path, semsim_right: Path, score_column: str, absolute_diff=True
) -> pd.DataFrame:
    """Compute the differences between two semantic similarity profiles.

    Args:
        semsim_left (Path): File path of the first semantic similarity profile
        semsim_right (Path): File path of the second semantic similarity profile
        score_column (str): Score column that will be computed (e.g. jaccard_similarity)
        absolute_diff (bool, optional): Whether the difference is absolute (True) or percentage (False).
        Defaults to True.

    Returns:
        pd.DataFrame: DataFrame with the differences between two semantic similarity profiles
    """
    validate_semsim_file_comparison(semsim_left, semsim_right)
    expected_cols = ["subject_id", "object_id", score_column]
    left_df = pd.read_csv(semsim_left, sep="\t")
    right_df = pd.read_csv(semsim_right, sep="\t")
    file_utils.ensure_columns_exists(
        cols=expected_cols,
        err_message="must exist in semsim dataframes",
        dataframes=[left_df, right_df],
    )
    left_df = parse_semsim(left_df, expected_cols)
    right_df = parse_semsim(right_df, expected_cols)
    diff_df = diff_semsim(left_df, right_df, score_column, absolute_diff)
    # Rows with a zero difference carry no signal; filter them out.
    return filter_non_0_score(diff_df, "diff")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def validate_semsim_file_comparison(semsim_left: Path, semsim_right: Path):
    """Checks that the two profiles are distinct and that both files exist

    Args:
        semsim_left (Path): File path of the first semantic similarity profile
        semsim_right (Path): File path of the second semantic similarity profile

    Raises:
        Exception: when both arguments point at the same file
    """
    if semsim_left == semsim_right:
        raise Exception(
            "Semantic similarity profiles are equal. Make sure you have selected different files to analyze"
        )
    file_utils.ensure_file_exists(semsim_left, semsim_right)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def get_percentage_diff(current_number: float, previous_number: float) -> str:
    """Gets the percentage difference between two numbers, formatted as a
    string such as "25.00%".

    Args:
        current_number (float): second number in comparison
        previous_number (float): first number in comparison

    Returns:
        str: formatted percentage difference between the two numbers, or
        None when a zero operand makes the calculation impossible.
    """
    # FIX: the return annotation previously said float, but the function has
    # always returned a formatted string (or None on division by zero).
    try:
        if current_number == previous_number:
            return "{:.2%}".format(0)
        if current_number > previous_number:
            number = (1 - (current_number / previous_number)) * 100
        else:
            number = (100 - (previous_number / current_number) * 100) * -1
        return "{:.2%}".format(number / 100)
    except ZeroDivisionError:
        return None
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: pheval
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary:
|
|
5
|
-
Author:
|
|
6
|
-
Author-email:
|
|
7
|
-
Requires-Python: >=3.9
|
|
5
|
+
Author: Yasemin Bridges
|
|
6
|
+
Author-email: y.bridges@qmul.ac.uk
|
|
7
|
+
Requires-Python: >=3.9,<4.0.0
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: Programming Language :: Python :: 3.9
|
|
10
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
@@ -12,8 +12,16 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
12
12
|
Requires-Dist: class-resolver (>=0.3.10,<0.4.0)
|
|
13
13
|
Requires-Dist: click (>=8.1.3)
|
|
14
14
|
Requires-Dist: deprecation (>=2.1.0)
|
|
15
|
+
Requires-Dist: google (>=3.0.0,<4.0.0)
|
|
15
16
|
Requires-Dist: jaydebeapi (>=1.2.3)
|
|
17
|
+
Requires-Dist: matplotlib (>=3.7.0,<4.0.0)
|
|
18
|
+
Requires-Dist: oaklib (>=0.1.55,<0.2.0)
|
|
16
19
|
Requires-Dist: pandas (>=1.5.1)
|
|
20
|
+
Requires-Dist: phenopackets (>=2.0.2,<3.0.0)
|
|
21
|
+
Requires-Dist: plotly (>=5.13.0,<6.0.0)
|
|
22
|
+
Requires-Dist: pyaml (>=21.10.1,<22.0.0)
|
|
23
|
+
Requires-Dist: pyserde (>=0.9.8,<0.10.0)
|
|
24
|
+
Requires-Dist: seaborn (>=0.12.2,<0.13.0)
|
|
17
25
|
Requires-Dist: tqdm (>=4.64.1)
|
|
18
26
|
Description-Content-Type: text/markdown
|
|
19
27
|
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
pheval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
pheval/analyse/analysis.py,sha256=4Yhlkokx3pezXrslJDZtYfF2Y-BrP6y4_mCetpegkT4,24825
|
|
4
|
+
pheval/analyse/generate_plots.py,sha256=KDV1x7JnS9hX5cwMEUI63TVndC59-Fm0HNfaLtP8tJ4,14483
|
|
5
|
+
pheval/analyse/generate_summary_outputs.py,sha256=HGpg916t5MthSpTSjKQI3sS5Y7jjO1QVqzn4TdR0veE,7266
|
|
6
|
+
pheval/analyse/rank_stats.py,sha256=HMmLECGGCJrCdNstmCbEkODoYgqYdJRfv3NgqaUaA94,1933
|
|
7
|
+
pheval/cli.py,sha256=Ubw4Rup_hF18UszJIFTUB7_dhnr2P88dD33T0WzLblc,1412
|
|
8
|
+
pheval/cli_pheval.py,sha256=aP7UAvmNZdj74raSANF6uSxHk_wCQ9ckHOVOySaW4dE,2423
|
|
9
|
+
pheval/cli_pheval_utils.py,sha256=_rmgFcee8-u4A5eQg4TtY1KbduERSHsm2DQGboYb91E,7307
|
|
10
|
+
pheval/config_parser.py,sha256=R_ivbMBVHMOuctQxVUIl9ojQTE0cX-X6v6YToLlwh64,1030
|
|
11
|
+
pheval/constants.py,sha256=07xfY0nVEkHeDiZXfo5X7TTCOV0GrsERkm2mx6-JiiI,45
|
|
12
|
+
pheval/implementations/__init__.py,sha256=2mkbEaA7o-NAkfoLYkn2q50xp82cavu_qGcjt3k8m-I,1227
|
|
13
|
+
pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
pheval/post_processing/post_processing.py,sha256=clrWnGds8QXcOTeKt3bl4FuqyyuYeM19GjScYpQrA04,7244
|
|
15
|
+
pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
pheval/prepare/create_noisy_phenopackets.py,sha256=405f_kAOFlBN6J9ClB5ZyJ4Z094JE6XEQgrwPZK7LyQ,6920
|
|
17
|
+
pheval/prepare/create_spiked_vcf.py,sha256=1cYMcaU66lnsWoF7PRJ5SZXLZy0Ao3Myux2T_4YIXNU,12988
|
|
18
|
+
pheval/prepare/custom_exceptions.py,sha256=_G3_95dPtHIs1SviYBV1j7cYc-hxlhuw8hhnYdzByYY,1719
|
|
19
|
+
pheval/prepare/update_phenopacket.py,sha256=ZC-i8VVQbzAP_pebat9_Xy13c4MRWSdVplRdUigdXrM,2252
|
|
20
|
+
pheval/resources/alternate_ouputs/CADA_results.txt,sha256=Rinn2TtfwFNsx0aEWegKJOkjKnBm-Mf54gdaT3bWP0k,547
|
|
21
|
+
pheval/resources/alternate_ouputs/DeepPVP_results.txt,sha256=MF9MZJYa4r4PEvFzALpi-lNGLxjENOnq_YgrgFMn-oQ,1508
|
|
22
|
+
pheval/resources/alternate_ouputs/OVA_results.txt,sha256=_5XFCR4W04D-W7DObpALLsa0-693g2kiIUB_uo79aHk,9845
|
|
23
|
+
pheval/resources/alternate_ouputs/Phen2Gene_results.json,sha256=xxKsuiHKW9qQOz2baFlLW9RYphA4kxjoTsg1weZkTY8,14148
|
|
24
|
+
pheval/resources/alternate_ouputs/Phenolyzer_results.txt,sha256=TltiEzYm2PY79u6EdZR3f4ZqadNDCUN_d4f0TFF-t5A,594
|
|
25
|
+
pheval/resources/alternate_ouputs/lirical_results.tsv,sha256=0juf5HY6ttg-w7aWgYJUmSP5zmoaooEQDY8xhOcerLk,431068
|
|
26
|
+
pheval/resources/alternate_ouputs/svanna_results.tsv,sha256=OpTamPhJwh12wkdAxoIGb0wWs_T7TcqNWgqkQzgOek4,714
|
|
27
|
+
pheval/resources/hgnc_complete_set_2022-10-01.txt,sha256=PLD2-FJizl0detUtjvgeC1qc1FNq2jnykRvfw7ahF2w,16274884
|
|
28
|
+
pheval/run_metadata.py,sha256=lDiLNFSRueX2pfyuRwNRbcRo_XxWQbSTLy45Yhgicsc,919
|
|
29
|
+
pheval/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
+
pheval/runners/runner.py,sha256=FOpFxUuEeV2-2vYQkaDVye8BTfN9WqSJHIpBF0X14Os,3774
|
|
31
|
+
pheval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
32
|
+
pheval/utils/docs_gen.py,sha256=rYP_76SSRx-G95r25aJcGtEEHCYgfi6-1hR0aV2UZXA,3192
|
|
33
|
+
pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
|
|
34
|
+
pheval/utils/file_utils.py,sha256=n3GKOOkd1mENpWOWcWHIUFvhh1iV2TCp1daMPOP0f_c,3068
|
|
35
|
+
pheval/utils/phenopacket_utils.py,sha256=uZ_SGtrctZgg6iJqyOaseV9bqQ5paYN8FJvnTSg1qMg,14245
|
|
36
|
+
pheval/utils/semsim_utils.py,sha256=jDqSUYBP6Q5yPNq024kDe2fpqWmwwRzv41o2fP1q-vA,6150
|
|
37
|
+
pheval-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
38
|
+
pheval-0.2.0.dist-info/METADATA,sha256=2Elv0CtIfOBLW79u8EtQtbgmWjTqa-QCyX1Hyzwt5Gw,1688
|
|
39
|
+
pheval-0.2.0.dist-info/WHEEL,sha256=WGfLGfLX43Ei_YORXSnT54hxFygu34kMpcQdmgmEwCQ,88
|
|
40
|
+
pheval-0.2.0.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
|
|
41
|
+
pheval-0.2.0.dist-info/RECORD,,
|
pheval/utils.py
DELETED
pheval-0.1.0.dist-info/RECORD
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
pheval/__init__.py,sha256=amR2KDYWGlaKIqzfpeFlXYvDacBV-zCV_KH-Po9SQVk,44
|
|
2
|
-
pheval/cli.py,sha256=FAj7ImzjoAMGu41L6odPUamtLLYli9sbWPYGioQSCnk,970
|
|
3
|
-
pheval/cli_pheval.py,sha256=ThAKnnodHwrDxTeb0NLysZJHL1fxN3TRUGKJgpB90zw,1851
|
|
4
|
-
pheval/cli_pheval_utils.py,sha256=tmFzWhQMZYrhQml8XC7HP8dcIYwCDvFlzrzmzGxUgo0,606
|
|
5
|
-
pheval/implementations/__init__.py,sha256=rZoxRBHzQWTUlNeofu_gZcVCYZNF9kEbiw3BKaeMHso,1241
|
|
6
|
-
pheval/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
-
pheval/runners/runner.py,sha256=A9m_ZfAKvFhWvrj7dRml8JsDIotc6HR-8krTxV2IP44,942
|
|
8
|
-
pheval/utils.py,sha256=M63N6EkO_-AajNWy1YgxxgdE1bqdaFLy5nD47DBYhE4,77
|
|
9
|
-
pheval-0.1.0.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
|
|
10
|
-
pheval-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
11
|
-
pheval-0.1.0.dist-info/WHEEL,sha256=vxFmldFsRN_Hx10GDvsdv1wroKq8r5Lzvjp6GZ4OO8c,88
|
|
12
|
-
pheval-0.1.0.dist-info/METADATA,sha256=CBwt3Ufa7WM9-ffr_lTp-asefXOmbXc4KinIA81sj1Y,1360
|
|
13
|
-
pheval-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|