pheval 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/benchmark.py +12 -23
- pheval/analyse/benchmark_output_type.py +3 -5
- pheval/analyse/binary_classification_curves.py +3 -9
- pheval/analyse/binary_classification_stats.py +1 -4
- pheval/analyse/generate_plots.py +8 -18
- pheval/analyse/generate_rank_comparisons.py +1 -2
- pheval/analyse/rank_stats.py +8 -25
- pheval/analyse/run_data_parser.py +15 -9
- pheval/cli.py +1 -1
- pheval/cli_pheval_utils.py +10 -23
- pheval/config_parser.py +1 -1
- pheval/implementations/__init__.py +3 -5
- pheval/infra/exomiserdb.py +7 -15
- pheval/post_processing/phenopacket_truth_set.py +10 -31
- pheval/post_processing/post_processing.py +12 -33
- pheval/post_processing/validate_result_format.py +2 -4
- pheval/prepare/create_noisy_phenopackets.py +18 -29
- pheval/prepare/create_spiked_vcf.py +25 -56
- pheval/prepare/custom_exceptions.py +6 -7
- pheval/prepare/prepare_corpus.py +6 -17
- pheval/prepare/update_phenopacket.py +6 -17
- pheval/utils/docs_gen.py +3 -3
- pheval/utils/file_utils.py +1 -2
- pheval/utils/phenopacket_utils.py +41 -73
- pheval/utils/semsim_utils.py +6 -10
- pheval/utils/utils.py +3 -4
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/METADATA +1 -1
- pheval-0.6.4.dist-info/RECORD +57 -0
- pheval-0.6.2.dist-info/RECORD +0 -57
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/LICENSE +0 -0
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/WHEEL +0 -0
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/entry_points.txt +0 -0
pheval/prepare/prepare_corpus.py
CHANGED
|
@@ -56,15 +56,11 @@ def prepare_corpus(
|
|
|
56
56
|
for phenopacket_path in all_files(phenopacket_dir):
|
|
57
57
|
phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
|
|
58
58
|
if not phenopacket_util.observed_phenotypic_features():
|
|
59
|
-
logger.warning(
|
|
60
|
-
f"Removed {phenopacket_path.name} from the corpus due to no observed phenotypic features."
|
|
61
|
-
)
|
|
59
|
+
logger.warning(f"Removed {phenopacket_path.name} from the corpus due to no observed phenotypic features.")
|
|
62
60
|
continue
|
|
63
61
|
if variant_analysis:
|
|
64
62
|
if phenopacket_util.check_incomplete_variant_record():
|
|
65
|
-
logger.warning(
|
|
66
|
-
f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
|
|
67
|
-
)
|
|
63
|
+
logger.warning(f"Removed {phenopacket_path.name} from the corpus due to missing variant fields.")
|
|
68
64
|
continue
|
|
69
65
|
elif phenopacket_util.check_variant_alleles():
|
|
70
66
|
logger.warning(
|
|
@@ -73,15 +69,11 @@ def prepare_corpus(
|
|
|
73
69
|
)
|
|
74
70
|
if gene_analysis:
|
|
75
71
|
if phenopacket_util.check_incomplete_gene_record():
|
|
76
|
-
logger.warning(
|
|
77
|
-
f"Removed {phenopacket_path.name} from the corpus due to missing gene fields."
|
|
78
|
-
)
|
|
72
|
+
logger.warning(f"Removed {phenopacket_path.name} from the corpus due to missing gene fields.")
|
|
79
73
|
continue
|
|
80
74
|
if disease_analysis:
|
|
81
75
|
if phenopacket_util.check_incomplete_disease_record():
|
|
82
|
-
logger.warning(
|
|
83
|
-
f"Removed {phenopacket_path.name} from the corpus due to missing disease fields."
|
|
84
|
-
)
|
|
76
|
+
logger.warning(f"Removed {phenopacket_path.name} from the corpus due to missing disease fields.")
|
|
85
77
|
continue
|
|
86
78
|
logger.info(f"{phenopacket_path.name} OK!")
|
|
87
79
|
if hg19_template_vcf or hg38_template_vcf:
|
|
@@ -107,13 +99,10 @@ def prepare_corpus(
|
|
|
107
99
|
else:
|
|
108
100
|
# if not updating phenopacket gene identifiers then copy phenopacket as is to output directory
|
|
109
101
|
(
|
|
110
|
-
shutil.copy(
|
|
111
|
-
phenopacket_path, output_dir.joinpath(f"phenopackets/{phenopacket_path.name}")
|
|
112
|
-
)
|
|
102
|
+
shutil.copy(phenopacket_path, output_dir.joinpath(f"phenopackets/{phenopacket_path.name}"))
|
|
113
103
|
if phenopacket_path != output_dir.joinpath(f"phenopackets/{phenopacket_path.name}")
|
|
114
104
|
else None
|
|
115
105
|
)
|
|
116
106
|
logger.info(
|
|
117
|
-
f"Finished preparing corpus for {phenopacket_dir}. "
|
|
118
|
-
f"Total time: {time.perf_counter() - start_time:.2f} seconds."
|
|
107
|
+
f"Finished preparing corpus for {phenopacket_dir}. Total time: {time.perf_counter() - start_time:.2f} seconds."
|
|
119
108
|
)
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import time
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Union
|
|
4
3
|
|
|
5
4
|
import polars as pl
|
|
6
5
|
from phenopackets import Family, Phenopacket
|
|
@@ -21,7 +20,7 @@ logger = get_logger()
|
|
|
21
20
|
|
|
22
21
|
def update_outdated_gene_context(
|
|
23
22
|
phenopacket_path: Path, gene_identifier: str, identifier_map: pl.DataFrame
|
|
24
|
-
) ->
|
|
23
|
+
) -> Phenopacket | Family:
|
|
25
24
|
"""
|
|
26
25
|
Update the gene context of the Phenopacket.
|
|
27
26
|
|
|
@@ -66,15 +65,11 @@ def create_updated_phenopacket(
|
|
|
66
65
|
to describe the gene identifiers.
|
|
67
66
|
"""
|
|
68
67
|
identifier_map = create_gene_identifier_map() if identifier_map is None else identifier_map
|
|
69
|
-
updated_phenopacket = update_outdated_gene_context(
|
|
70
|
-
phenopacket_path, gene_identifier, identifier_map
|
|
71
|
-
)
|
|
68
|
+
updated_phenopacket = update_outdated_gene_context(phenopacket_path, gene_identifier, identifier_map)
|
|
72
69
|
write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
|
|
73
70
|
|
|
74
71
|
|
|
75
|
-
def create_updated_phenopackets(
|
|
76
|
-
gene_identifier: str, phenopacket_dir: Path, output_dir: Path
|
|
77
|
-
) -> None:
|
|
72
|
+
def create_updated_phenopackets(gene_identifier: str, phenopacket_dir: Path, output_dir: Path) -> None:
|
|
78
73
|
"""
|
|
79
74
|
Update the gene context within the interpretations for a directory of Phenopackets
|
|
80
75
|
and writes the updated Phenopackets.
|
|
@@ -91,15 +86,11 @@ def create_updated_phenopackets(
|
|
|
91
86
|
identifier_map = create_gene_identifier_map()
|
|
92
87
|
for phenopacket_path in all_files(phenopacket_dir):
|
|
93
88
|
logger.info(f"Updating gene context for: {phenopacket_path.name}")
|
|
94
|
-
updated_phenopacket = update_outdated_gene_context(
|
|
95
|
-
phenopacket_path, gene_identifier, identifier_map
|
|
96
|
-
)
|
|
89
|
+
updated_phenopacket = update_outdated_gene_context(phenopacket_path, gene_identifier, identifier_map)
|
|
97
90
|
write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
|
|
98
91
|
|
|
99
92
|
|
|
100
|
-
def update_phenopackets(
|
|
101
|
-
gene_identifier: str, phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path
|
|
102
|
-
) -> None:
|
|
93
|
+
def update_phenopackets(gene_identifier: str, phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path) -> None:
|
|
103
94
|
"""
|
|
104
95
|
Update the gene identifiers in either a single phenopacket or a directory of phenopackets.
|
|
105
96
|
|
|
@@ -122,8 +113,6 @@ def update_phenopackets(
|
|
|
122
113
|
logger.info(f"Updating {phenopacket_path}.")
|
|
123
114
|
create_updated_phenopacket(gene_identifier, phenopacket_path, output_dir)
|
|
124
115
|
elif phenopacket_dir is not None:
|
|
125
|
-
logger.info(
|
|
126
|
-
f"Updating {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}."
|
|
127
|
-
)
|
|
116
|
+
logger.info(f"Updating {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}.")
|
|
128
117
|
create_updated_phenopackets(gene_identifier, phenopacket_dir, output_dir)
|
|
129
118
|
logger.info(f"Updating finished! Total time: {time.perf_counter() - start_time:.2f} seconds.")
|
pheval/utils/docs_gen.py
CHANGED
|
@@ -13,7 +13,7 @@ def find_methods_in_python_file(file_path):
|
|
|
13
13
|
file_path ([type]): [description]
|
|
14
14
|
"""
|
|
15
15
|
methods = []
|
|
16
|
-
with open(file_path,
|
|
16
|
+
with open(file_path, encoding="utf-8") as file:
|
|
17
17
|
text = file.read()
|
|
18
18
|
parsed = ast.parse(text)
|
|
19
19
|
for node in ast.walk(parsed):
|
|
@@ -73,8 +73,8 @@ def print_cli_doc(file_item):
|
|
|
73
73
|
for method in methods:
|
|
74
74
|
content = f"""
|
|
75
75
|
::: mkdocs-click
|
|
76
|
-
:package: {file_item[
|
|
77
|
-
:module: {file_item[
|
|
76
|
+
:package: {file_item["folder"].replace("./", "").replace("/", ".")}.{file_item["basename"]}
|
|
77
|
+
:module: {file_item["folder"].replace("./", "").replace("/", ".").replace("src.", "")}.{file_item["basename"]}
|
|
78
78
|
:command: {method}
|
|
79
79
|
:depth: 4
|
|
80
80
|
:style: table
|
pheval/utils/file_utils.py
CHANGED
|
@@ -3,7 +3,6 @@ import re
|
|
|
3
3
|
import unicodedata
|
|
4
4
|
from os import path
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import List
|
|
7
6
|
|
|
8
7
|
import pandas as pd
|
|
9
8
|
import yaml
|
|
@@ -80,7 +79,7 @@ def ensure_file_exists(*files: str):
|
|
|
80
79
|
raise FileNotFoundError(f"File {file} not found")
|
|
81
80
|
|
|
82
81
|
|
|
83
|
-
def ensure_columns_exists(cols: list, dataframes:
|
|
82
|
+
def ensure_columns_exists(cols: list, dataframes: list[pd.DataFrame], err_message: str = ""):
|
|
84
83
|
"""Ensures the columns exist in dataframes passed as argument (e.g)
|
|
85
84
|
|
|
86
85
|
"
|
|
@@ -3,7 +3,6 @@ import os
|
|
|
3
3
|
from copy import copy
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import List, Union
|
|
7
6
|
|
|
8
7
|
import polars as pl
|
|
9
8
|
from google.protobuf.json_format import MessageToJson, Parse
|
|
@@ -183,7 +182,7 @@ def create_gene_identifier_map() -> pl.DataFrame:
|
|
|
183
182
|
)
|
|
184
183
|
|
|
185
184
|
|
|
186
|
-
def phenopacket_reader(file: Path) ->
|
|
185
|
+
def phenopacket_reader(file: Path) -> Phenopacket | Family:
|
|
187
186
|
"""
|
|
188
187
|
Read a Phenopacket file and returns its contents as a Phenopacket or Family object
|
|
189
188
|
|
|
@@ -194,7 +193,7 @@ def phenopacket_reader(file: Path) -> Union[Phenopacket, Family]:
|
|
|
194
193
|
Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object
|
|
195
194
|
"""
|
|
196
195
|
logger.info(f"Parsing Phenopacket: {file.name}")
|
|
197
|
-
file = open(file
|
|
196
|
+
file = open(file)
|
|
198
197
|
phenopacket = json.load(file)
|
|
199
198
|
file.close()
|
|
200
199
|
if "proband" in phenopacket:
|
|
@@ -206,7 +205,7 @@ def phenopacket_reader(file: Path) -> Union[Phenopacket, Family]:
|
|
|
206
205
|
class PhenopacketUtil:
|
|
207
206
|
"""Class for retrieving data from a Phenopacket or Family object"""
|
|
208
207
|
|
|
209
|
-
def __init__(self, phenopacket_contents:
|
|
208
|
+
def __init__(self, phenopacket_contents: Phenopacket | Family):
|
|
210
209
|
"""Initialise PhenopacketUtil
|
|
211
210
|
|
|
212
211
|
Args:
|
|
@@ -226,7 +225,7 @@ class PhenopacketUtil:
|
|
|
226
225
|
else:
|
|
227
226
|
return self.phenopacket_contents.subject.id
|
|
228
227
|
|
|
229
|
-
def phenotypic_features(self) ->
|
|
228
|
+
def phenotypic_features(self) -> list[PhenotypicFeature]:
|
|
230
229
|
"""
|
|
231
230
|
Retrieve a list of all HPO terms
|
|
232
231
|
|
|
@@ -238,7 +237,7 @@ class PhenopacketUtil:
|
|
|
238
237
|
else:
|
|
239
238
|
return self.phenopacket_contents.phenotypic_features
|
|
240
239
|
|
|
241
|
-
def observed_phenotypic_features(self) ->
|
|
240
|
+
def observed_phenotypic_features(self) -> list[PhenotypicFeature]:
|
|
242
241
|
"""
|
|
243
242
|
Retrieve a list of all observed HPO terms
|
|
244
243
|
|
|
@@ -253,7 +252,7 @@ class PhenopacketUtil:
|
|
|
253
252
|
phenotypic_features.append(p)
|
|
254
253
|
return phenotypic_features
|
|
255
254
|
|
|
256
|
-
def negated_phenotypic_features(self) ->
|
|
255
|
+
def negated_phenotypic_features(self) -> list[PhenotypicFeature]:
|
|
257
256
|
"""
|
|
258
257
|
Retrieve a list of all negated HPO terms
|
|
259
258
|
|
|
@@ -267,7 +266,7 @@ class PhenopacketUtil:
|
|
|
267
266
|
negated_phenotypic_features.append(p)
|
|
268
267
|
return negated_phenotypic_features
|
|
269
268
|
|
|
270
|
-
def diseases(self) ->
|
|
269
|
+
def diseases(self) -> list[Disease]:
|
|
271
270
|
"""
|
|
272
271
|
Retrieve a list of Diseases associated with the proband
|
|
273
272
|
|
|
@@ -279,7 +278,7 @@ class PhenopacketUtil:
|
|
|
279
278
|
else:
|
|
280
279
|
return self.phenopacket_contents.diseases
|
|
281
280
|
|
|
282
|
-
def _diagnosis_from_interpretations(self) ->
|
|
281
|
+
def _diagnosis_from_interpretations(self) -> list[ProbandDisease]:
|
|
283
282
|
"""
|
|
284
283
|
Retrieve a list of disease diagnoses associated with the proband from the interpretations object
|
|
285
284
|
|
|
@@ -301,7 +300,7 @@ class PhenopacketUtil:
|
|
|
301
300
|
)
|
|
302
301
|
return diagnoses
|
|
303
302
|
|
|
304
|
-
def _diagnosis_from_disease(self) ->
|
|
303
|
+
def _diagnosis_from_disease(self) -> list[ProbandDisease]:
|
|
305
304
|
"""
|
|
306
305
|
Retrieve a list of disease diagnoses associated with the proband from the diseases object
|
|
307
306
|
|
|
@@ -310,12 +309,10 @@ class PhenopacketUtil:
|
|
|
310
309
|
"""
|
|
311
310
|
diagnoses = []
|
|
312
311
|
for disease in self.diseases():
|
|
313
|
-
diagnoses.append(
|
|
314
|
-
ProbandDisease(disease_name=disease.term.label, disease_identifier=disease.term.id)
|
|
315
|
-
)
|
|
312
|
+
diagnoses.append(ProbandDisease(disease_name=disease.term.label, disease_identifier=disease.term.id))
|
|
316
313
|
return diagnoses
|
|
317
314
|
|
|
318
|
-
def diagnoses(self) ->
|
|
315
|
+
def diagnoses(self) -> list[ProbandDisease]:
|
|
319
316
|
"""
|
|
320
317
|
Retrieve a unique list of disease diagnoses associated with the proband from a Phenopacket
|
|
321
318
|
|
|
@@ -324,7 +321,7 @@ class PhenopacketUtil:
|
|
|
324
321
|
"""
|
|
325
322
|
return list(set(self._diagnosis_from_interpretations() + self._diagnosis_from_disease()))
|
|
326
323
|
|
|
327
|
-
def interpretations(self) ->
|
|
324
|
+
def interpretations(self) -> list[Interpretation]:
|
|
328
325
|
"""
|
|
329
326
|
Retrieve a list of interpretations from a Phenopacket
|
|
330
327
|
|
|
@@ -336,7 +333,7 @@ class PhenopacketUtil:
|
|
|
336
333
|
else:
|
|
337
334
|
return self.phenopacket_contents.interpretations
|
|
338
335
|
|
|
339
|
-
def causative_variants(self) ->
|
|
336
|
+
def causative_variants(self) -> list[ProbandCausativeVariant]:
|
|
340
337
|
"""
|
|
341
338
|
Retrieve a list of causative variants listed in a Phenopacket
|
|
342
339
|
|
|
@@ -364,7 +361,7 @@ class PhenopacketUtil:
|
|
|
364
361
|
all_variants.append(variant_data)
|
|
365
362
|
return all_variants
|
|
366
363
|
|
|
367
|
-
def files(self) ->
|
|
364
|
+
def files(self) -> list[File]:
|
|
368
365
|
"""
|
|
369
366
|
Retrieve a list of files associated with a phenopacket
|
|
370
367
|
|
|
@@ -394,15 +391,11 @@ class PhenopacketUtil:
|
|
|
394
391
|
URI of the VCF file to the specified directory and returns the modified file object.
|
|
395
392
|
"""
|
|
396
393
|
compatible_genome_assembly = ["GRCh37", "hg19", "GRCh38", "hg38"]
|
|
397
|
-
vcf_data =
|
|
398
|
-
if not Path(vcf_data.uri).name.endswith(".vcf") and not Path(vcf_data.uri).name.endswith(
|
|
399
|
-
".vcf.gz"
|
|
400
|
-
):
|
|
394
|
+
vcf_data = next(file for file in self.files() if file.file_attributes["fileFormat"] == "vcf")
|
|
395
|
+
if not Path(vcf_data.uri).name.endswith(".vcf") and not Path(vcf_data.uri).name.endswith(".vcf.gz"):
|
|
401
396
|
raise IncorrectFileFormatError(Path(vcf_data.uri), ".vcf or .vcf.gz file")
|
|
402
397
|
if vcf_data.file_attributes["genomeAssembly"] not in compatible_genome_assembly:
|
|
403
|
-
raise IncompatibleGenomeAssemblyError(
|
|
404
|
-
vcf_data.file_attributes["genomeAssembly"], phenopacket_path
|
|
405
|
-
)
|
|
398
|
+
raise IncompatibleGenomeAssemblyError(vcf_data.file_attributes["genomeAssembly"], phenopacket_path)
|
|
406
399
|
vcf_data.uri = str(vcf_dir.joinpath(Path(vcf_data.uri).name))
|
|
407
400
|
return vcf_data
|
|
408
401
|
|
|
@@ -430,7 +423,7 @@ class PhenopacketUtil:
|
|
|
430
423
|
gene_identifier=genomic_interpretation.gene.value_id,
|
|
431
424
|
)
|
|
432
425
|
|
|
433
|
-
def diagnosed_genes(self) ->
|
|
426
|
+
def diagnosed_genes(self) -> list[ProbandCausativeGene]:
|
|
434
427
|
"""
|
|
435
428
|
Retrieve the disease causing genes from a phenopacket.
|
|
436
429
|
Returns:
|
|
@@ -444,7 +437,7 @@ class PhenopacketUtil:
|
|
|
444
437
|
genes = list({gene.gene_symbol: gene for gene in genes}.values())
|
|
445
438
|
return genes
|
|
446
439
|
|
|
447
|
-
def diagnosed_variants(self) ->
|
|
440
|
+
def diagnosed_variants(self) -> list[GenomicVariant]:
|
|
448
441
|
"""
|
|
449
442
|
Retrieve a list of all known causative variants from a phenopacket.
|
|
450
443
|
Returns:
|
|
@@ -455,11 +448,7 @@ class PhenopacketUtil:
|
|
|
455
448
|
for i in pheno_interpretation:
|
|
456
449
|
for g in i.diagnosis.genomic_interpretations:
|
|
457
450
|
variant = GenomicVariant(
|
|
458
|
-
chrom=str(
|
|
459
|
-
g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
|
|
460
|
-
"chr", ""
|
|
461
|
-
)
|
|
462
|
-
),
|
|
451
|
+
chrom=str(g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace("chr", "")),
|
|
463
452
|
pos=int(g.variant_interpretation.variation_descriptor.vcf_record.pos),
|
|
464
453
|
ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
|
|
465
454
|
alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
|
|
@@ -480,13 +469,7 @@ class PhenopacketUtil:
|
|
|
480
469
|
"""
|
|
481
470
|
variants = self.diagnosed_variants()
|
|
482
471
|
for variant in variants:
|
|
483
|
-
if (
|
|
484
|
-
variant.chrom == ""
|
|
485
|
-
or variant.pos == 0
|
|
486
|
-
or variant.pos == ""
|
|
487
|
-
or variant.ref == ""
|
|
488
|
-
or variant.alt == ""
|
|
489
|
-
):
|
|
472
|
+
if variant.chrom == "" or variant.pos in (0, "") or variant.ref == "" or variant.alt == "":
|
|
490
473
|
return True
|
|
491
474
|
return False
|
|
492
475
|
|
|
@@ -537,7 +520,7 @@ class PhenopacketUtil:
|
|
|
537
520
|
class PhenopacketRebuilder:
|
|
538
521
|
"""Class for rebuilding a Phenopacket"""
|
|
539
522
|
|
|
540
|
-
def __init__(self, phenopacket:
|
|
523
|
+
def __init__(self, phenopacket: Phenopacket | Family):
|
|
541
524
|
"""Initialise PhenopacketUtil
|
|
542
525
|
|
|
543
526
|
Attributes:
|
|
@@ -545,9 +528,7 @@ class PhenopacketRebuilder:
|
|
|
545
528
|
"""
|
|
546
529
|
self.phenopacket = phenopacket
|
|
547
530
|
|
|
548
|
-
def update_interpretations(
|
|
549
|
-
self, interpretations: [Interpretation]
|
|
550
|
-
) -> Union[Phenopacket, Family]:
|
|
531
|
+
def update_interpretations(self, interpretations: [Interpretation]) -> Phenopacket | Family:
|
|
551
532
|
"""
|
|
552
533
|
Add the updated interpretations to a Phenopacket or Family.
|
|
553
534
|
|
|
@@ -566,7 +547,7 @@ class PhenopacketRebuilder:
|
|
|
566
547
|
phenopacket.interpretations.extend(interpretations)
|
|
567
548
|
return phenopacket
|
|
568
549
|
|
|
569
|
-
def add_randomised_hpo(self, randomised_hpo: [PhenotypicFeature]) ->
|
|
550
|
+
def add_randomised_hpo(self, randomised_hpo: [PhenotypicFeature]) -> Phenopacket | Family:
|
|
570
551
|
"""
|
|
571
552
|
Add randomised phenotypic profiles to a Phenopacket or Family.
|
|
572
553
|
|
|
@@ -585,7 +566,7 @@ class PhenopacketRebuilder:
|
|
|
585
566
|
phenopacket.phenotypic_features.extend(randomised_hpo)
|
|
586
567
|
return phenopacket
|
|
587
568
|
|
|
588
|
-
def add_spiked_vcf_path(self, spiked_vcf_file_data: File) ->
|
|
569
|
+
def add_spiked_vcf_path(self, spiked_vcf_file_data: File) -> Phenopacket | Family:
|
|
589
570
|
"""
|
|
590
571
|
Add a spiked VCF path to a Phenopacket or Family.
|
|
591
572
|
|
|
@@ -597,16 +578,14 @@ class PhenopacketRebuilder:
|
|
|
597
578
|
"""
|
|
598
579
|
logger.info(f"Adding spiked VCF path {spiked_vcf_file_data.uri} to phenopacket.")
|
|
599
580
|
phenopacket = copy(self.phenopacket)
|
|
600
|
-
phenopacket_files = [
|
|
601
|
-
file for file in phenopacket.files if file.file_attributes["fileFormat"] != "vcf"
|
|
602
|
-
]
|
|
581
|
+
phenopacket_files = [file for file in phenopacket.files if file.file_attributes["fileFormat"] != "vcf"]
|
|
603
582
|
phenopacket_files.append(spiked_vcf_file_data)
|
|
604
583
|
del phenopacket.files[:]
|
|
605
584
|
phenopacket.files.extend(phenopacket_files)
|
|
606
585
|
return phenopacket
|
|
607
586
|
|
|
608
587
|
|
|
609
|
-
def create_json_message(phenopacket:
|
|
588
|
+
def create_json_message(phenopacket: Phenopacket | Family) -> str:
|
|
610
589
|
"""
|
|
611
590
|
Create a JSON message for writing to a file.
|
|
612
591
|
|
|
@@ -619,7 +598,7 @@ def create_json_message(phenopacket: Union[Phenopacket, Family]) -> str:
|
|
|
619
598
|
return MessageToJson(phenopacket)
|
|
620
599
|
|
|
621
600
|
|
|
622
|
-
def write_phenopacket(phenopacket:
|
|
601
|
+
def write_phenopacket(phenopacket: Phenopacket | Family, output_file: Path) -> None:
|
|
623
602
|
"""
|
|
624
603
|
Write a Phenopacket or Family object to a file in JSON format.
|
|
625
604
|
|
|
@@ -667,15 +646,13 @@ class GeneIdentifierUpdater:
|
|
|
667
646
|
str: The identified gene identifier.
|
|
668
647
|
"""
|
|
669
648
|
matches = self.identifier_map.filter(
|
|
670
|
-
(pl.col("gene_symbol") == gene_symbol)
|
|
671
|
-
& (pl.col("identifier_type") == self.gene_identifier)
|
|
649
|
+
(pl.col("gene_symbol") == gene_symbol) & (pl.col("identifier_type") == self.gene_identifier)
|
|
672
650
|
)
|
|
673
651
|
|
|
674
652
|
if matches.height > 0:
|
|
675
653
|
return matches["identifier"][0]
|
|
676
654
|
prev_symbol_matches = self.identifier_map.filter(
|
|
677
|
-
(pl.col("identifier_type") == self.gene_identifier)
|
|
678
|
-
& (pl.col("prev_symbols").list.contains(gene_symbol))
|
|
655
|
+
(pl.col("identifier_type") == self.gene_identifier) & (pl.col("prev_symbols").list.contains(gene_symbol))
|
|
679
656
|
)
|
|
680
657
|
if prev_symbol_matches.height > 0:
|
|
681
658
|
return prev_symbol_matches["identifier"][0]
|
|
@@ -692,11 +669,9 @@ class GeneIdentifierUpdater:
|
|
|
692
669
|
Returns:
|
|
693
670
|
str: The gene symbol corresponding to the identifier.
|
|
694
671
|
"""
|
|
695
|
-
return self.identifier_map.filter(pl.col("identifier") == query_gene_identifier)[
|
|
696
|
-
"gene_symbol"
|
|
697
|
-
][0]
|
|
672
|
+
return self.identifier_map.filter(pl.col("identifier") == query_gene_identifier)["gene_symbol"][0]
|
|
698
673
|
|
|
699
|
-
def _find_alternate_ids(self, gene_symbol: str) ->
|
|
674
|
+
def _find_alternate_ids(self, gene_symbol: str) -> list[str]:
|
|
700
675
|
"""
|
|
701
676
|
Find the alternate IDs for a gene symbol.
|
|
702
677
|
|
|
@@ -706,24 +681,21 @@ class GeneIdentifierUpdater:
|
|
|
706
681
|
Returns:
|
|
707
682
|
List[str]: List of alternate IDs for the gene symbol.
|
|
708
683
|
"""
|
|
709
|
-
matches = self.identifier_map.filter(
|
|
684
|
+
matches = self.identifier_map.filter(pl.col("gene_symbol") == gene_symbol)
|
|
710
685
|
if matches.height > 0:
|
|
711
686
|
return [f"{row['prefix']}{row['identifier']}" for row in matches.rows(named=True)] + [
|
|
712
687
|
f"symbol:{gene_symbol}"
|
|
713
688
|
]
|
|
714
|
-
prev_symbol_matches = self.identifier_map.filter(
|
|
715
|
-
(pl.col("prev_symbols").list.contains(gene_symbol))
|
|
716
|
-
)
|
|
689
|
+
prev_symbol_matches = self.identifier_map.filter(pl.col("prev_symbols").list.contains(gene_symbol))
|
|
717
690
|
if prev_symbol_matches.height > 0:
|
|
718
|
-
return [
|
|
719
|
-
f"{
|
|
720
|
-
|
|
721
|
-
] + [f"symbol:{gene_symbol}"]
|
|
691
|
+
return [f"{row['prefix']}{row['identifier']}" for row in prev_symbol_matches.rows(named=True)] + [
|
|
692
|
+
f"symbol:{gene_symbol}"
|
|
693
|
+
]
|
|
722
694
|
return None
|
|
723
695
|
|
|
724
696
|
def update_genomic_interpretations_gene_identifier(
|
|
725
|
-
self, interpretations:
|
|
726
|
-
) ->
|
|
697
|
+
self, interpretations: list[Interpretation], phenopacket_path: Path
|
|
698
|
+
) -> list[Interpretation]:
|
|
727
699
|
"""
|
|
728
700
|
Update the genomic interpretations of a Phenopacket.
|
|
729
701
|
|
|
@@ -745,13 +717,9 @@ class GeneIdentifierUpdater:
|
|
|
745
717
|
f"{g.variant_interpretation.variation_descriptor.gene_context.value_id}"
|
|
746
718
|
f" to {updated_gene_identifier}"
|
|
747
719
|
)
|
|
748
|
-
g.variant_interpretation.variation_descriptor.gene_context.value_id =
|
|
749
|
-
updated_gene_identifier
|
|
750
|
-
)
|
|
720
|
+
g.variant_interpretation.variation_descriptor.gene_context.value_id = updated_gene_identifier
|
|
751
721
|
del g.variant_interpretation.variation_descriptor.gene_context.alternate_ids[:]
|
|
752
722
|
g.variant_interpretation.variation_descriptor.gene_context.alternate_ids.extend(
|
|
753
|
-
self._find_alternate_ids(
|
|
754
|
-
g.variant_interpretation.variation_descriptor.gene_context.symbol
|
|
755
|
-
)
|
|
723
|
+
self._find_alternate_ids(g.variant_interpretation.variation_descriptor.gene_context.symbol)
|
|
756
724
|
)
|
|
757
725
|
return updated_interpretations
|
pheval/utils/semsim_utils.py
CHANGED
|
@@ -8,7 +8,7 @@ import numpy
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
import plotly.express as px
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
from pheval.utils import file_utils
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def filter_non_0_score(data: pd.DataFrame, col: str) -> pd.DataFrame:
|
|
@@ -58,9 +58,7 @@ def diff_semsim(
|
|
|
58
58
|
if absolute_diff:
|
|
59
59
|
df["diff"] = df[f"{score_column}_x"] - df[f"{score_column}_y"]
|
|
60
60
|
return df[["subject_id", "object_id", "diff"]]
|
|
61
|
-
df["diff"] = df.apply(
|
|
62
|
-
lambda row: get_percentage_diff(row[f"{score_column}_x"], row[f"{score_column}_y"]), axis=1
|
|
63
|
-
)
|
|
61
|
+
df["diff"] = df.apply(lambda row: get_percentage_diff(row[f"{score_column}_x"], row[f"{score_column}_y"]), axis=1)
|
|
64
62
|
return df[["subject_id", "object_id", f"{score_column}_x", f"{score_column}_y", "diff"]]
|
|
65
63
|
|
|
66
64
|
|
|
@@ -91,9 +89,7 @@ def semsim_heatmap_plot(semsim_left: Path, semsim_right: Path, score_column: str
|
|
|
91
89
|
fig.show()
|
|
92
90
|
|
|
93
91
|
|
|
94
|
-
def semsim_analysis(
|
|
95
|
-
semsim_left: Path, semsim_right: Path, score_column: str, absolute_diff=True
|
|
96
|
-
) -> pd.DataFrame:
|
|
92
|
+
def semsim_analysis(semsim_left: Path, semsim_right: Path, score_column: str, absolute_diff=True) -> pd.DataFrame:
|
|
97
93
|
"""semsim_analysis
|
|
98
94
|
|
|
99
95
|
Args:
|
|
@@ -147,11 +143,11 @@ def get_percentage_diff(current_number: float, previous_number: float) -> float:
|
|
|
147
143
|
"""
|
|
148
144
|
try:
|
|
149
145
|
if current_number == previous_number:
|
|
150
|
-
return "{:.2%}"
|
|
146
|
+
return f"{0:.2%}"
|
|
151
147
|
if current_number > previous_number:
|
|
152
|
-
number = (1 - (
|
|
148
|
+
number = (1 - (current_number / previous_number)) * 100
|
|
153
149
|
else:
|
|
154
150
|
number = (100 - ((previous_number / current_number) * 100)) * -1
|
|
155
|
-
return "{:.2%}"
|
|
151
|
+
return f"{number / 100:.2%}"
|
|
156
152
|
except ZeroDivisionError:
|
|
157
153
|
return None
|
pheval/utils/utils.py
CHANGED
|
@@ -4,7 +4,6 @@ import json
|
|
|
4
4
|
import random
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import List
|
|
8
7
|
|
|
9
8
|
import pandas as pd
|
|
10
9
|
import requests
|
|
@@ -42,7 +41,7 @@ def rand(df: pd.DataFrame, min_num: int, max_num: int, scramble_factor: float) -
|
|
|
42
41
|
def semsim_scramble(
|
|
43
42
|
input: Path,
|
|
44
43
|
output: Path,
|
|
45
|
-
columns_to_be_scrambled:
|
|
44
|
+
columns_to_be_scrambled: list[str],
|
|
46
45
|
scramble_factor: float = 0.5,
|
|
47
46
|
) -> pd.DataFrame:
|
|
48
47
|
"""
|
|
@@ -66,7 +65,7 @@ def semsim_scramble(
|
|
|
66
65
|
|
|
67
66
|
def semsim_scramble_df(
|
|
68
67
|
dataframe: pd.DataFrame,
|
|
69
|
-
columns_to_be_scrambled:
|
|
68
|
+
columns_to_be_scrambled: list[str],
|
|
70
69
|
scramble_factor: float,
|
|
71
70
|
) -> pd.DataFrame:
|
|
72
71
|
"""scramble_semsim_df
|
|
@@ -136,6 +135,6 @@ def get_resource_timestamp(file_name: str) -> str | None:
|
|
|
136
135
|
file_name (str): The file name.
|
|
137
136
|
"""
|
|
138
137
|
if METADATA_PATH.exists():
|
|
139
|
-
with open(METADATA_PATH
|
|
138
|
+
with open(METADATA_PATH) as f:
|
|
140
139
|
return json.load(f).get(file_name)
|
|
141
140
|
return None
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
pheval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
pheval/analyse/benchmark.py,sha256=Ktjovg3uEuaBi02AIiGX4OaZ73yAruwcsohhfBrDY44,6874
|
|
4
|
+
pheval/analyse/benchmark_db_manager.py,sha256=zS1TI76YuV2_YXLipHLSyh-XDR5kTxyOwhRhHRFHfjQ,764
|
|
5
|
+
pheval/analyse/benchmark_output_type.py,sha256=AG4HtEfscbDqESMBQ_M5Brnj8AmfrFxU6q7Gi2FOebw,1493
|
|
6
|
+
pheval/analyse/binary_classification_curves.py,sha256=b5YseLqv519DT7rsOweMRx7ElxYv9LcukXtLeAxflQE,4953
|
|
7
|
+
pheval/analyse/binary_classification_stats.py,sha256=oWkaj-A2-2MaUIsJjlehwLApx-wGLx-TQ49v9O4lMAs,6910
|
|
8
|
+
pheval/analyse/generate_plots.py,sha256=fyUMOgmbqYeYMPW843VR-CVArt3R75HgGbyq1i2XO7A,14489
|
|
9
|
+
pheval/analyse/generate_rank_comparisons.py,sha256=BmksHkvJhpR0Rcrnc-r2_OF5L3ROHB8o3HuDSXjgeK4,1660
|
|
10
|
+
pheval/analyse/rank_stats.py,sha256=io8UWTEUZfZQSnxXBjk2Z_1u6WWoxX0kSfEvxtpC-Kg,8241
|
|
11
|
+
pheval/analyse/run_data_parser.py,sha256=da8-J9sSwyOUow80A2ETKdVP7GUX-zuEiiBix-M18Mo,3601
|
|
12
|
+
pheval/cli.py,sha256=1kPhBYFSKjvPv9YcpknDj3Y7DZl5CA41ucqDRR7fAjk,1599
|
|
13
|
+
pheval/cli_pheval.py,sha256=N8xp3r8avYqLswPhakxtTQyemVdgHAvnpAIj_FmoN5k,3510
|
|
14
|
+
pheval/cli_pheval_utils.py,sha256=fLbskjHQSTN29qFXmjvsXYn7dE8-3OZuJUqlEQB-wyI,16481
|
|
15
|
+
pheval/config_parser.py,sha256=6wK8x9hXHg-kTDkkqfKshYvIdn91SGhucTYUul3fjNs,1353
|
|
16
|
+
pheval/implementations/__init__.py,sha256=BUTnw9urZOApRFVy6NYsq8TCLphHWsr3vhxvsx2RZ3E,1318
|
|
17
|
+
pheval/implementations/pheval_class_resolver.py,sha256=YNNk3PoQbSvbKmbihlt_bsHNxRM95O-VrtUIbQnfBcw,1567
|
|
18
|
+
pheval/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
+
pheval/infra/exomiserdb.py,sha256=0NXkqYT59Ueu2F5o7u4iCWJCAqTIL4KxShQuePFmpLo,5015
|
|
20
|
+
pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
pheval/post_processing/mondo_mapping.py,sha256=Vetk0acOtR__FXnx9wy2pcDDWFz1mQkc88alKUn0muI,937
|
|
22
|
+
pheval/post_processing/phenopacket_truth_set.py,sha256=rK_iIZm7OwdDEj_7SV7jMEROPHPe06baXLlHVO8wrSE,9889
|
|
23
|
+
pheval/post_processing/post_processing.py,sha256=mQWBpGmWd3ZPwZtDutekn_osJdGQFj4fPf6ibb7o8xA,10040
|
|
24
|
+
pheval/post_processing/validate_result_format.py,sha256=jg3HjvMwGI8rsLtOM0gpcVlGB3weGZiZek5JwFzw1zE,2862
|
|
25
|
+
pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
pheval/prepare/create_noisy_phenopackets.py,sha256=U-tYex4kbXT9OEkGnMl4p4pt4LRuqBKwKTgVBvhA7xU,12142
|
|
27
|
+
pheval/prepare/create_spiked_vcf.py,sha256=Po7WSQZAHaBObLa7SU50djUJ_XwAlUfTBeE9C0-7GA8,24299
|
|
28
|
+
pheval/prepare/custom_exceptions.py,sha256=8Bwi5SmDgWuvzLWPI7foUl2m5C6QdirlxPeeeC594AU,1637
|
|
29
|
+
pheval/prepare/prepare_corpus.py,sha256=726Ez5xLyvsibfEN3NqC1CFN0BgRl4Uswj4CaNTHS4o,5264
|
|
30
|
+
pheval/prepare/update_phenopacket.py,sha256=IeLKtp2i60LmBbGOh387ccq5J7Mk8y0hx4K55J7Mnbs,5550
|
|
31
|
+
pheval/resources/alternate_ouputs/CADA_results.txt,sha256=Rinn2TtfwFNsx0aEWegKJOkjKnBm-Mf54gdaT3bWP0k,547
|
|
32
|
+
pheval/resources/alternate_ouputs/DeepPVP_results.txt,sha256=MF9MZJYa4r4PEvFzALpi-lNGLxjENOnq_YgrgFMn-oQ,1508
|
|
33
|
+
pheval/resources/alternate_ouputs/OVA_results.txt,sha256=_5XFCR4W04D-W7DObpALLsa0-693g2kiIUB_uo79aHk,9845
|
|
34
|
+
pheval/resources/alternate_ouputs/Phen2Gene_results.json,sha256=xxKsuiHKW9qQOz2baFlLW9RYphA4kxjoTsg1weZkTY8,14148
|
|
35
|
+
pheval/resources/alternate_ouputs/Phenolyzer_results.txt,sha256=TltiEzYm2PY79u6EdZR3f4ZqadNDCUN_d4f0TFF-t5A,594
|
|
36
|
+
pheval/resources/alternate_ouputs/lirical_results.tsv,sha256=0juf5HY6ttg-w7aWgYJUmSP5zmoaooEQDY8xhOcerLk,431068
|
|
37
|
+
pheval/resources/alternate_ouputs/svanna_results.tsv,sha256=OpTamPhJwh12wkdAxoIGb0wWs_T7TcqNWgqkQzgOek4,714
|
|
38
|
+
pheval/resources/hgnc_complete_set.txt,sha256=9-aNcyGZzarD1DnO_780NK0r-ppwbyu9e4-cQDmtUC8,16593567
|
|
39
|
+
pheval/resources/metadata.json,sha256=aabSMPCwE-KR6cAxBCMRZmDD1fGD7qCeSvPLduvO3gA,112
|
|
40
|
+
pheval/resources/mondo.sssom.tsv,sha256=Egu8UqHPL6TbSRQKlRgQ7kNq5S4e1VElR5JWHWgc7F0,12700455
|
|
41
|
+
pheval/run_metadata.py,sha256=isEs63c-O6LZ6TBmk9wmAOmzjNE3Nf_k9V9uiYu0x8s,1122
|
|
42
|
+
pheval/runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
+
pheval/runners/runner.py,sha256=zkS6yvbnC6UBCtV2VR3FEfr5naWRDdNcQFN1CoH1ha8,4902
|
|
44
|
+
pheval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
|
+
pheval/utils/docs_gen.py,sha256=8roEavgcVlEgOg54zTn9-xUOTu3qsQF-JfKSj3-mKMg,3188
|
|
46
|
+
pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
|
|
47
|
+
pheval/utils/exomiser.py,sha256=m2u0PH2z9lFPaB3LVkZCmPmH5e55q1NoTzNl46zRRP8,683
|
|
48
|
+
pheval/utils/file_utils.py,sha256=FY4B17NVS7tXeTso0OZYgde-H9KJdUbQbWYT6fMdnCM,3552
|
|
49
|
+
pheval/utils/logger.py,sha256=5DZl5uMltUDQorhkvg_B7_ZhFwApAmEkWneFIOKfRGQ,1566
|
|
50
|
+
pheval/utils/phenopacket_utils.py,sha256=qt816uYkJKAe2MATFoulqEr78zg0-Z2Z_FxfEO9iSFE,26740
|
|
51
|
+
pheval/utils/semsim_utils.py,sha256=tSDin3PwCdtMjtMXubIXTiGaCEFNz7iF4IngrjNHprI,6104
|
|
52
|
+
pheval/utils/utils.py,sha256=T9zzqMlzY2hrcYn9ObatYgOHtKWTpWbW5nU0tTTcYxI,4489
|
|
53
|
+
pheval-0.6.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
54
|
+
pheval-0.6.4.dist-info/METADATA,sha256=QNcXx4I83wCz7jEkPn1sbKC8phoFxYuil5Gzlb6JSIQ,6494
|
|
55
|
+
pheval-0.6.4.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
56
|
+
pheval-0.6.4.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
|
|
57
|
+
pheval-0.6.4.dist-info/RECORD,,
|