pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/__init__.py +0 -5
- pheval/analyse/__init__.py +0 -0
- pheval/analyse/analysis.py +703 -0
- pheval/analyse/generate_plots.py +312 -0
- pheval/analyse/generate_summary_outputs.py +186 -0
- pheval/analyse/rank_stats.py +61 -0
- pheval/cli.py +22 -7
- pheval/cli_pheval.py +37 -12
- pheval/cli_pheval_utils.py +225 -8
- pheval/config_parser.py +36 -0
- pheval/constants.py +1 -0
- pheval/implementations/__init__.py +1 -3
- pheval/post_processing/__init__.py +0 -0
- pheval/post_processing/post_processing.py +210 -0
- pheval/prepare/__init__.py +0 -0
- pheval/prepare/create_noisy_phenopackets.py +173 -0
- pheval/prepare/create_spiked_vcf.py +366 -0
- pheval/prepare/custom_exceptions.py +47 -0
- pheval/prepare/update_phenopacket.py +53 -0
- pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
- pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
- pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
- pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
- pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
- pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
- pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
- pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
- pheval/run_metadata.py +27 -0
- pheval/runners/runner.py +92 -11
- pheval/utils/__init__.py +0 -0
- pheval/utils/docs_gen.py +105 -0
- pheval/utils/docs_gen.sh +18 -0
- pheval/utils/file_utils.py +88 -0
- pheval/utils/phenopacket_utils.py +356 -0
- pheval/utils/semsim_utils.py +156 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
- pheval-0.2.0.dist-info/RECORD +41 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
- pheval/utils.py +0 -7
- pheval-0.1.0.dist-info/RECORD +0 -13
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
- {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
|
|
5
|
+
from oaklib.resource import OntologyResource
|
|
6
|
+
from phenopackets import Family, OntologyClass, Phenopacket, PhenotypicFeature
|
|
7
|
+
|
|
8
|
+
from pheval.utils.file_utils import files_with_suffix
|
|
9
|
+
from pheval.utils.phenopacket_utils import (
|
|
10
|
+
PhenopacketRebuilder,
|
|
11
|
+
PhenopacketUtil,
|
|
12
|
+
phenopacket_reader,
|
|
13
|
+
write_phenopacket,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_ontology():
    """Load the human phenotype ontology (hp.obo) via a pronto-backed implementation."""
    return ProntoImplementation(OntologyResource(slug="hp.obo", local=False))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HpoRandomiser:
    """Randomises phenopacket phenotypic features.

    For a profile of N observed terms, ``number_of_scrambled_terms`` terms
    (derived from ``scramble_factor``) are swapped for their hierarchical
    parent term and the same number of random phenotypic-abnormality terms
    is appended; the remaining original terms are retained unchanged.
    """

    def __init__(self, hpo_ontology, scramble_factor: float):
        # Ontology implementation used for alias/parent/metadata lookups.
        self.hpo_ontology = hpo_ontology
        # Pool of candidate random replacement terms: roots of the
        # phenotypic abnormality (HP:0000118) sub-hierarchy.
        self.phenotypic_abnormalities = set(hpo_ontology.roots(predicates=["HP:0000118"]))
        self.scramble_factor = scramble_factor

    def scramble_factor_proportions(self, phenotypic_features: list[PhenotypicFeature]) -> int:
        """Calculate proportion of scrambled hpo terms from scramble factor."""
        # A single-term profile is always scrambled in full.
        if len(phenotypic_features) == 1:
            return 1
        else:
            return int(round(len(phenotypic_features) * self.scramble_factor, 0))

    def retrieve_hpo_term(self, hpo_id: str) -> PhenotypicFeature:
        """Retrieves term for hpo id."""
        rels = self.hpo_ontology.entity_alias_map(hpo_id)
        # The first alias entry is taken as the term's display label.
        hpo_term = "".join(rels[(list(rels.keys())[0])])
        return PhenotypicFeature(type=OntologyClass(id=hpo_id, label=hpo_term))

    @staticmethod
    def retain_real_patient_terms(
        phenotypic_features: list[PhenotypicFeature],
        number_of_scrambled_terms: int,
    ) -> list[PhenotypicFeature]:
        """Returns a list of the maximum number of real patient HPO terms."""
        if len(phenotypic_features) > 1:
            number_of_real_id = len(phenotypic_features) - number_of_scrambled_terms
        else:
            # Always keep at least one real term for single-term profiles.
            number_of_real_id = 1
        return random.sample(phenotypic_features, number_of_real_id)

    def convert_patient_terms_to_parent(
        self,
        phenotypic_features: list[PhenotypicFeature],
        retained_phenotypic_features: list[PhenotypicFeature],
        number_of_scrambled_terms: int,
    ) -> list[PhenotypicFeature]:
        """Returns a list of the HPO terms that have been converted to a parent term."""
        # Candidates for parent-conversion are the terms NOT retained above.
        remaining_hpo = [i for i in phenotypic_features if i not in retained_phenotypic_features]
        if len(remaining_hpo) == 0:
            number_of_scrambled_terms = 0
        hpo_terms_to_be_changed = list(random.sample(remaining_hpo, number_of_scrambled_terms))
        parent_terms = []
        for term in hpo_terms_to_be_changed:
            try:
                # NOTE: 'hierararchical_parents' is the method name as spelled in oaklib.
                parent_terms.append(
                    self.retrieve_hpo_term(
                        self.hpo_ontology.hierararchical_parents(term.type.id)[0]
                    )
                )
            except IndexError:
                # No parent found — presumably an obsolete term whose metadata
                # points at a replacement id; TODO confirm metadata layout.
                obsolete_term = self.hpo_ontology.entity_metadata_map(term.type.id)
                updated_term = list(obsolete_term.values())[0][0]
                parent_terms.append(
                    self.retrieve_hpo_term(
                        self.hpo_ontology.hierararchical_parents(updated_term)[0]
                    )
                )
        return parent_terms

    def create_random_hpo_terms(self, number_of_scrambled_terms: int) -> list[PhenotypicFeature]:
        """Returns a list of random HPO terms"""
        # sorted() makes the sample population deterministic given a seeded RNG.
        random_ids = list(
            random.sample(sorted(self.phenotypic_abnormalities), number_of_scrambled_terms)
        )
        return [self.retrieve_hpo_term(random_id) for random_id in random_ids]

    def randomise_hpo_terms(
        self,
        phenotypic_features: list[PhenotypicFeature],
    ) -> list[PhenotypicFeature]:
        """Returns a list of randomised HPO terms."""
        number_of_scrambled_terms = self.scramble_factor_proportions(phenotypic_features)
        retained_patient_terms = self.retain_real_patient_terms(
            phenotypic_features, number_of_scrambled_terms
        )
        # Result = retained originals + parent-converted terms + random terms.
        return (
            retained_patient_terms
            + self.convert_patient_terms_to_parent(
                phenotypic_features, retained_patient_terms, number_of_scrambled_terms
            )
            + self.create_random_hpo_terms(number_of_scrambled_terms)
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def add_noise_to_phenotypic_profile(
    hpo_randomiser: HpoRandomiser,
    phenopacket: Phenopacket or Family,
) -> Phenopacket or Family:
    """Randomise the phenotypic profile of a phenopacket (or family) and return the rebuilt message."""
    observed_features = PhenopacketUtil(phenopacket).observed_phenotypic_features()
    scrambled_features = hpo_randomiser.randomise_hpo_terms(observed_features)
    return PhenopacketRebuilder(phenopacket).add_randomised_hpo(scrambled_features)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def create_scrambled_phenopacket(
    output_dir: Path, phenopacket_path: Path, scramble_factor: float
) -> None:
    """Create a scrambled copy of a single phenopacket.

    Args:
        output_dir: Directory the scrambled phenopacket is written to
            (created if it does not already exist).
        phenopacket_path: Path of the phenopacket to scramble.
        scramble_factor: Proportion of phenotypic features to randomise.
    """
    # Idempotent replacement for the try/except FileExistsError pattern.
    output_dir.mkdir(exist_ok=True)
    hpo_randomiser = HpoRandomiser(load_ontology(), scramble_factor)
    phenopacket = phenopacket_reader(phenopacket_path)
    created_noisy_phenopacket = add_noise_to_phenotypic_profile(
        hpo_randomiser,
        phenopacket,
    )
    write_phenopacket(
        created_noisy_phenopacket,
        output_dir.joinpath(phenopacket_path.name),
    )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def create_scrambled_phenopackets(
    output_dir: Path, phenopacket_dir: Path, scramble_factor: float
) -> None:
    """Create scrambled copies of every ``.json`` phenopacket in a directory.

    Args:
        output_dir: Directory the scrambled phenopackets are written to
            (created if it does not already exist).
        phenopacket_dir: Directory containing the phenopackets to scramble.
        scramble_factor: Proportion of phenotypic features to randomise.
    """
    # Idempotent replacement for the try/except FileExistsError pattern.
    output_dir.mkdir(exist_ok=True)
    # The ontology and randomiser are built once and shared across all files.
    hpo_randomiser = HpoRandomiser(load_ontology(), scramble_factor)
    for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
        phenopacket = phenopacket_reader(phenopacket_path)
        created_noisy_phenopacket = add_noise_to_phenotypic_profile(hpo_randomiser, phenopacket)
        write_phenopacket(
            created_noisy_phenopacket,
            output_dir.joinpath(phenopacket_path.name),
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def scramble_phenopackets(
    output_dir: Path, phenopacket_path: Path, phenopacket_dir: Path, scramble_factor: float
) -> None:
    """Create scrambled phenopackets from either a single phenopacket or directory of phenopackets."""
    if phenopacket_path is not None:
        create_scrambled_phenopacket(output_dir, phenopacket_path, scramble_factor)
        return
    if phenopacket_dir is not None:
        create_scrambled_phenopackets(output_dir, phenopacket_dir, scramble_factor)
|
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
import gzip
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import secrets
|
|
6
|
+
import urllib.parse
|
|
7
|
+
from copy import copy
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from phenopackets import Family, File, Phenopacket
|
|
12
|
+
|
|
13
|
+
from pheval.utils.phenopacket_utils import (
|
|
14
|
+
IncompatibleGenomeAssemblyError,
|
|
15
|
+
PhenopacketRebuilder,
|
|
16
|
+
PhenopacketUtil,
|
|
17
|
+
ProbandCausativeVariant,
|
|
18
|
+
phenopacket_reader,
|
|
19
|
+
write_phenopacket,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from .custom_exceptions import InputError
|
|
23
|
+
from ..utils.file_utils import all_files, files_with_suffix, is_gzipped
|
|
24
|
+
|
|
25
|
+
info_log = logging.getLogger("info")
|
|
26
|
+
|
|
27
|
+
# Lengths (bp) of autosomal contigs 1-22 for each supported genome assembly.
# Used by VcfHeaderParser.parse_assembly to fingerprint which assembly a
# VCF's ##contig header lines describe.
genome_assemblies = {
    "GRCh38": {
        "1": 248956422,
        "2": 242193529,
        "3": 198295559,
        "4": 190214555,
        "5": 181538259,
        "6": 170805979,
        "7": 159345973,
        "8": 145138636,
        "9": 138394717,
        "10": 133797422,
        "11": 135086622,
        "12": 133275309,
        "13": 114364328,
        "14": 107043718,
        "15": 101991189,
        "16": 90338345,
        "17": 83257441,
        "18": 80373285,
        "19": 58617616,
        "20": 64444167,
        "21": 46709983,
        "22": 50818468,
    },
    "GRCh37": {
        "1": 249250621,
        "2": 243199373,
        "3": 198022430,
        "4": 191154276,
        "5": 180915260,
        "6": 171115067,
        "7": 159138663,
        "8": 146364022,
        "9": 141213431,
        "10": 135534747,
        "11": 135006516,
        "12": 133851895,
        "13": 115169878,
        "14": 107349540,
        "15": 102531392,
        "16": 90354753,
        "17": 81195210,
        "18": 78077248,
        "19": 59128983,
        "20": 63025520,
        "21": 48129895,
        "22": 51304566,
    },
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class VcfHeader:
    """Data obtained from VCF header"""

    sample_id: str  # sample name from the #CHROM header line (column 10)
    assembly: str  # key of the matching entry in genome_assemblies (e.g. "GRCh37")
    chr_status: bool  # True when contig IDs carry a "chr" prefix
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class VcfPicker:
    """Chooses a VCF file from random for a directory if provided, otherwise selects the single template."""

    def __init__(self, template_vcf: Path or None, vcf_dir: Path or None):
        self.template_vcf = template_vcf
        self.vcf_dir = vcf_dir

    def pick_file_from_dir(self) -> Path:
        """Selects a file from a directory at random."""
        candidates = all_files(self.vcf_dir)
        return secrets.choice(candidates)

    def pick_file(self) -> Path:
        """Selects a VCF file from random when given a directory, if not, template vcf is assigned."""
        if self.vcf_dir is None:
            return self.template_vcf
        return self.pick_file_from_dir()
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def read_vcf(vcf_file: Path) -> list[str]:
    """Read the contents of a VCF file into memory — handles both plain and gzipped files.

    Args:
        vcf_file: Path to the (optionally gzip-compressed) VCF.

    Returns:
        The file's lines as strings, newline terminators preserved.
    """
    # Context managers guarantee the handle is closed even on a read error;
    # text-mode gzip ("rt") decodes for us, replacing the manual .decode() loop.
    if is_gzipped(vcf_file):
        with gzip.open(vcf_file, "rt") as vcf:
            return vcf.readlines()
    with open(vcf_file) as vcf:
        return vcf.readlines()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class VcfHeaderParser:
    """Parses the header of a VCF file."""

    def __init__(self, vcf_contents: list[str]):
        # Full file contents, one string per line, as produced by read_vcf().
        self.vcf_contents = vcf_contents

    def parse_assembly(self) -> tuple[str, bool]:
        """Parses the genome assembly and format of vcf_records.

        Builds a {chromosome: contig_length} map from the ##contig header
        lines and matches it against the known tables in genome_assemblies.
        """
        vcf_assembly = {}
        chr_status = False
        for line in self.vcf_contents:
            if line.startswith("##contig=<ID"):
                tokens = line.split(",")
                # Strip everything up to and including "ID=" to get the contig name.
                chromosome = re.sub(
                    r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0]
                )
                if "chr" in chromosome:
                    chr_status = True
                    chromosome = chromosome.replace("chr", "")
                contig_length = re.sub(
                    "[^0-9]+",
                    "",
                    [token for token in tokens if "length=" in token][0],
                )
                vcf_assembly[chromosome] = int(contig_length)
        # Only autosomes (numeric names) take part in the fingerprint match.
        vcf_assembly = {i: vcf_assembly[i] for i in vcf_assembly if i.isdigit()}
        # NOTE(review): raises IndexError when no known assembly matches —
        # consider surfacing a clearer error.
        assembly = [k for k, v in genome_assemblies.items() if v == vcf_assembly][0]
        return assembly, chr_status

    def parse_sample_id(self) -> str:
        """Parses the sample ID of the VCF.

        Returns the 10th column of the #CHROM header line; implicitly returns
        None when no #CHROM line is present.
        """
        for line in self.vcf_contents:
            if line.startswith("#CHROM"):
                return line.split("\t")[9].rstrip()

    def parse_vcf_header(self) -> VcfHeader:
        """Parses the header of the VCF."""
        assembly, chr_status = self.parse_assembly()
        sample_id = self.parse_sample_id()
        return VcfHeader(sample_id, assembly, chr_status)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def check_variant_assembly(
    proband_causative_variants: list[ProbandCausativeVariant],
    vcf_header: VcfHeader,
    phenopacket_path: Path,
):
    """Check the genome assembly of the phenopacket variants against the VCF.

    Raises:
        ValueError: if the variants do not agree on exactly one genome assembly.
        IncompatibleGenomeAssemblyError: if the assembly is unsupported, or
            does not match the template VCF's assembly.
    """
    compatible_genome_assembly = {"GRCh37", "hg19", "GRCh38", "hg38"}
    phenopacket_assembly = list({variant.assembly for variant in proband_causative_variants})
    if len(phenopacket_assembly) > 1:
        raise ValueError("Too many genome assemblies!")
    # Guard the empty case explicitly instead of crashing with a bare IndexError below.
    if not phenopacket_assembly:
        raise ValueError("No genome assembly found for phenopacket variants!")
    if phenopacket_assembly[0] not in compatible_genome_assembly:
        raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
    if phenopacket_assembly[0] != vcf_header.assembly:
        raise IncompatibleGenomeAssemblyError(
            assembly=phenopacket_assembly, phenopacket=phenopacket_path
        )
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
class VcfSpiker:
    """Spikes proband variants into template VCF file contents."""

    def __init__(
        self,
        vcf_contents: list[str],
        proband_causative_variants: list[ProbandCausativeVariant],
        vcf_header: VcfHeader,
    ):
        self.vcf_contents = vcf_contents
        self.proband_causative_variants = proband_causative_variants
        self.vcf_header = vcf_header

    def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> list[str]:
        """Constructs variant entries.

        Returns the columns of one VCF record (CHROM..GT), the final column
        carrying the trailing newline.
        """
        genotype_codes = {
            "hemizygous": "0/1",
            "homozygous": "1/1",
            "heterozygous": "0/1",
            "compound heterozygous": "0/1",
        }
        # NOTE: mutates the variant's chrom in place to match the template's
        # "chr"-prefixed contig naming.
        if self.vcf_header.chr_status is True and "chr" not in proband_variant_data.variant.chrom:
            proband_variant_data.variant.chrom = "chr" + proband_variant_data.variant.chrom
        return [
            proband_variant_data.variant.chrom,
            str(proband_variant_data.variant.pos),
            ".",
            proband_variant_data.variant.ref,
            # ref == "N" presumably marks a symbolic/structural alt, hence the
            # angle brackets — TODO confirm against upstream data.
            f"<{proband_variant_data.variant.alt}>"
            if proband_variant_data.variant.ref == "N"
            else proband_variant_data.variant.alt,
            "100",
            "PASS",
            proband_variant_data.info
            if proband_variant_data.info is not None
            else "SPIKED_VARIANT_" + proband_variant_data.genotype.upper(),
            "GT",
            genotype_codes[proband_variant_data.genotype.lower()] + "\n",
        ]

    def construct_vcf_records(self):
        """Inserts spiked variant into correct position within VCF."""
        updated_vcf_records = copy(self.vcf_contents)
        for variant in self.proband_causative_variants:
            variant = self.construct_variant_entry(variant)
            # Insert after the last existing record on the same chromosome
            # with a smaller position. Header lines never match the CHROM
            # comparison, so the int() on column 2 is only reached for records.
            # NOTE(review): raises IndexError if no such record exists.
            variant_entry_position = [
                i
                for i, val in enumerate(updated_vcf_records)
                if val.split("\t")[0] == variant[0] and int(val.split("\t")[1]) < int(variant[1])
            ][-1] + 1
            updated_vcf_records.insert(variant_entry_position, "\t".join(variant))
        return updated_vcf_records

    def construct_header(self, updated_vcf_records) -> list[str]:
        """Constructs the header of the VCF.

        Replaces every occurrence of the template's sample ID with the
        proband ID, in every line.
        """
        updated_vcf_file = []
        for line in updated_vcf_records:
            text = line.replace(
                self.vcf_header.sample_id,
                self.proband_causative_variants[0].proband_id,
            )
            updated_vcf_file.append(text)
        return updated_vcf_file

    def construct_vcf(self) -> list[str]:
        """Constructs the entire spiked VCF."""
        return self.construct_header(self.construct_vcf_records())
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class VcfWriter:
    """Writes spiked VCF contents to disk, gzipped or plain text depending on the target path."""

    def __init__(
        self,
        vcf_contents: list[str],
        spiked_vcf_file_path: Path,
    ):
        self.vcf_contents = vcf_contents
        self.spiked_vcf_file_path = spiked_vcf_file_path

    def write_gzip(self) -> None:
        """Writes gzipped vcf file."""
        # writelines with a generator avoids building a second full copy of
        # the contents; the with-block closes the handle (the explicit
        # close() inside the original with-block was redundant).
        with gzip.open(self.spiked_vcf_file_path, "wb") as f:
            f.writelines(line.encode() for line in self.vcf_contents)

    def write_uncompressed(self) -> None:
        """Writes an uncompressed vcf file."""
        with open(self.spiked_vcf_file_path, "w") as file:
            file.writelines(self.vcf_contents)

    def write_vcf_file(self) -> None:
        """Writes spiked vcf file, choosing the format from the output path suffix."""
        # Plain statement instead of a side-effect-only conditional expression.
        if is_gzipped(self.spiked_vcf_file_path):
            self.write_gzip()
        else:
            self.write_uncompressed()
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def spike_vcf_contents(
    phenopacket: Phenopacket or Family,
    phenopacket_path: Path,
    chosen_template_vcf: Path,
) -> tuple[str, list[str]]:
    """Spike the template VCF's records with the phenopacket's causative variants.

    Returns the detected genome assembly and the spiked VCF contents.
    """
    # this is a separate function to a click command as it will fail if annotated with click annotations
    # and referenced from another click command
    causative_variants = PhenopacketUtil(phenopacket).causative_variants()
    template_contents = read_vcf(chosen_template_vcf)
    vcf_header = VcfHeaderParser(template_contents).parse_vcf_header()
    check_variant_assembly(causative_variants, vcf_header, phenopacket_path)
    spiked_contents = VcfSpiker(template_contents, causative_variants, vcf_header).construct_vcf()
    return vcf_header.assembly, spiked_contents
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def generate_spiked_vcf_file(
    output_dir: Path,
    phenopacket: Phenopacket or Family,
    phenopacket_path: Path,
    chosen_template_vcf: Path,
) -> File:
    """Write spiked VCF contents to a new file and return its File message."""
    try:
        output_dir.mkdir()
        info_log.info(f" Created a directory {output_dir}")
    except FileExistsError:
        pass
    vcf_assembly, spiked_vcf = spike_vcf_contents(
        phenopacket, phenopacket_path, chosen_template_vcf
    )
    # Mirror the template's compression in the output file name.
    suffix = ".vcf.gz" if is_gzipped(chosen_template_vcf) else ".vcf"
    spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", suffix))
    VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
    return File(
        uri=urllib.parse.unquote(spiked_vcf_path.as_uri()),
        file_attributes={"fileFormat": "vcf", "genomeAssembly": vcf_assembly},
    )
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def create_spiked_vcf(
    output_dir: Path, phenopacket_path: Path, template_vcf_path: Path, vcf_dir: Path
):
    """Create a spiked VCF for a single phenopacket and record its path in the phenopacket."""
    if template_vcf_path is None and vcf_dir is None:
        raise InputError("Either a template_vcf or vcf_dir must be specified")
    chosen_vcf = VcfPicker(template_vcf_path, vcf_dir).pick_file()
    phenopacket = phenopacket_reader(phenopacket_path)
    vcf_file_message = generate_spiked_vcf_file(
        output_dir, phenopacket, phenopacket_path, chosen_vcf
    )
    rebuilt_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(vcf_file_message)
    write_phenopacket(rebuilt_phenopacket, phenopacket_path)
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def create_spiked_vcfs(
    output_dir: Path, phenopacket_dir: Path, template_vcf_path: Path, vcf_dir: Path
):
    """Create spiked VCFs for every ``.json`` phenopacket in a directory."""
    if template_vcf_path is None and vcf_dir is None:
        raise InputError("Either a template_vcf or vcf_dir must be specified")
    for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
        # A template is picked per phenopacket, so a VCF directory yields varied templates.
        chosen_vcf = VcfPicker(template_vcf_path, vcf_dir).pick_file()
        phenopacket = phenopacket_reader(phenopacket_path)
        vcf_file_message = generate_spiked_vcf_file(
            output_dir, phenopacket, phenopacket_path, chosen_vcf
        )
        rebuilt_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
            vcf_file_message
        )
        write_phenopacket(rebuilt_phenopacket, phenopacket_path)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def spike_vcfs(
    output_dir: Path,
    phenopacket_path: Path,
    phenopacket_dir: Path,
    template_vcf_path: Path,
    vcf_dir: Path,
):
    """Create spiked VCF from either a phenopacket or a phenopacket directory."""
    if phenopacket_path is not None:
        create_spiked_vcf(output_dir, phenopacket_path, template_vcf_path, vcf_dir)
        return
    if phenopacket_dir is not None:
        create_spiked_vcfs(output_dir, phenopacket_dir, template_vcf_path, vcf_dir)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from click import Option, UsageError
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class InputError(Exception):
    """Exception raised for missing required inputs."""

    def __init__(self, file, message="Missing required input"):
        super().__init__(message)
        self.file: str = file
        self.message: str = message

    def __str__(self):
        return f"{self.message} -> {self.file} "
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MutuallyExclusiveOptionError(Option):
    """A click ``Option`` that refuses to be combined with the options named
    in ``mutually_exclusive``.

    Raises:
        UsageError: when this option is supplied together with any of its
            mutually exclusive options.
    """

    def __init__(self, *args, **kwargs):
        self.mutually_exclusive = set(kwargs.pop("mutually_exclusive", []))
        help_ = kwargs.get("help", "")
        if self.mutually_exclusive:
            ex_str = ", ".join(self.mutually_exclusive)
            # Advertise the constraint in the option's help text.
            # (Fixed: the original concatenated "with " + " arguments",
            # producing a double space in the rendered help.)
            kwargs["help"] = help_ + (
                " NOTE: This argument is mutually exclusive with arguments: [" + ex_str + "]."
            )
        super().__init__(*args, **kwargs)

    def handle_parse_result(self, ctx, opts, args):
        if self.mutually_exclusive.intersection(opts) and self.name in opts:
            raise UsageError(
                "Illegal usage: `{}` is mutually exclusive with "
                "arguments `{}`.".format(self.name, ", ".join(self.mutually_exclusive))
            )

        return super().handle_parse_result(ctx, opts, args)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class IncorrectFileFormatError(Exception):
    """Exception raised when a file does not have the expected format."""

    def __init__(self, file, expectation, message="Incorrect File Type"):
        super().__init__(message)
        self.file: str = file
        self.expectation: str = expectation
        self.message: str = message

    def __str__(self):
        return f"{self.message} -> {self.file} (expected {self.expectation})"
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from pheval.utils.file_utils import all_files
|
|
5
|
+
from pheval.utils.phenopacket_utils import (
|
|
6
|
+
GeneIdentifierUpdater,
|
|
7
|
+
PhenopacketRebuilder,
|
|
8
|
+
PhenopacketUtil,
|
|
9
|
+
create_hgnc_dict,
|
|
10
|
+
phenopacket_reader,
|
|
11
|
+
write_phenopacket,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def update_outdated_gene_context(
    phenopacket_path: Path, gene_identifier: str, hgnc_data: defaultdict
):
    """Update the gene context of a phenopacket's interpretations and return the rebuilt phenopacket."""
    phenopacket = phenopacket_reader(phenopacket_path)
    updater = GeneIdentifierUpdater(hgnc_data=hgnc_data, gene_identifier=gene_identifier)
    refreshed_interpretations = updater.update_genomic_interpretations_gene_identifier(
        PhenopacketUtil(phenopacket).interpretations()
    )
    return PhenopacketRebuilder(phenopacket).update_interpretations(refreshed_interpretations)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def create_updated_phenopacket(gene_identifier: str, phenopacket_path: Path, output_dir: Path):
    """Updates the gene context within the interpretations for a phenopacket."""
    refreshed = update_outdated_gene_context(
        phenopacket_path, gene_identifier, create_hgnc_dict()
    )
    write_phenopacket(refreshed, output_dir.joinpath(phenopacket_path.name))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def create_updated_phenopackets(gene_identifier: str, phenopacket_dir: Path, output_dir: Path):
    """Updates the gene context within the interpretations for phenopackets."""
    # The HGNC lookup is built once and shared across all files.
    hgnc_data = create_hgnc_dict()
    for path in all_files(phenopacket_dir):
        refreshed = update_outdated_gene_context(path, gene_identifier, hgnc_data)
        write_phenopacket(refreshed, output_dir.joinpath(path.name))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def update_phenopackets(
    gene_identifier: str, phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path
):
    """Update the gene identifiers in either a single phenopacket or a directory of phenopackets."""
    output_dir.mkdir(exist_ok=True)
    if phenopacket_path is not None:
        create_updated_phenopacket(gene_identifier, phenopacket_path, output_dir)
        return
    if phenopacket_dir is not None:
        create_updated_phenopackets(gene_identifier, phenopacket_dir, output_dir)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
rank gene_id gene_name score
|
|
2
|
+
1 Entrez:368 ABCC6 84.62940470377605
|
|
3
|
+
2 Entrez:5167 ENPP1 69.57813326517741
|
|
4
|
+
3 Entrez:54790 TET2 57.23555533091227
|
|
5
|
+
4 Entrez:64132 XYLT2 57.030126889546715
|
|
6
|
+
5 Entrez:3949 LDLR 55.80375734965006
|
|
7
|
+
6 Entrez:64240 ABCG5 53.74869124094645
|
|
8
|
+
7 Entrez:348 APOE 53.691530545552574
|
|
9
|
+
8 Entrez:462 SERPINC1 51.44988568623861
|
|
10
|
+
9 Entrez:255738 PCSK9 50.51583385467529
|
|
11
|
+
10 Entrez:2162 F13A1 50.0550905863444
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Chr Start Ref Alt GT Gene CADD GWAVA DANN Sim_Score Prediction_Score
|
|
2
|
+
10 100177428 C A 1/1 HPS1 46 0.436666666666666667 0.9972185359365543 0.9648632774654027 0.9213319434699428
|
|
3
|
+
6 32489940 G T,C 1/1 . . 0.41 . . 0.5740168850055932
|
|
4
|
+
6 32489940 G T,C 1/1 . . 0.41 . . 0.5740168850055932
|
|
5
|
+
9 136328657 T C,G 1/1 . . 0.41 . . 0.5740168850055932
|
|
6
|
+
10 113940329 T C,G 1/1 . . 0.4066666666666667 . . 0.5740168850055932
|
|
7
|
+
19 42132273 C T,A 1/1 . . 0.41 . . 0.5740168850055932
|
|
8
|
+
14 21467913 T G,A 1/1 . . 0.42333333333333334 . . 0.5735853467091718
|
|
9
|
+
1 16354590 A T,G 1/1 . . 0.41333333333333333 . . 0.5718396449188846
|
|
10
|
+
12 52681925 A C,T 1/1 . . 0.38000000000000006 . . 0.5711489183536179
|
|
11
|
+
7 34192762 G C,A 1/1 . . 0.37666666666666665 . . 0.5708847601387523
|
|
12
|
+
11 125830970 A T,G 1/1 . . 0.3766666666666667 . . 0.5708847601387523
|
|
13
|
+
11 125830970 A T,G 1/1 . . 0.3766666666666667 . . 0.5708847601387523
|
|
14
|
+
7 156469133 C G,T 1/1 . . 0.4666666666666666 . . 0.5692571159111212
|
|
15
|
+
11 57155288 T C,A 1/1 . . 0.4666666666666666 . . 0.5692571159111212
|
|
16
|
+
2 130832444 C A,T 1/1 . . 0.3833333333333333 . . 0.5689482176042727
|
|
17
|
+
7 106508978 A G,C 1/1 . . 0.38333333333333336 . . 0.5689482176042727
|
|
18
|
+
10 61552692 G T,C 1/1 . . 0.3833333333333333 . . 0.5689482176042727
|
|
19
|
+
5 141336264 G T,A 1/1 . . 0.47666666666666674 . . 0.5686996214945524
|
|
20
|
+
5 141336264 G T,A 1/1 . . 0.47666666666666674 . . 0.5686996214945524
|
|
21
|
+
19 52004795 G T,C 1/1 . . 0.4633333333333334 . . 0.5680430668280317
|
|
22
|
+
11 75572808 G A 1/1 UVRAG 29.8 0.6566666666666666 0.99870875812720883 . 0.5678185751935081
|