pheval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pheval might be problematic.

Files changed (42)
  1. pheval/__init__.py +0 -5
  2. pheval/analyse/__init__.py +0 -0
  3. pheval/analyse/analysis.py +703 -0
  4. pheval/analyse/generate_plots.py +312 -0
  5. pheval/analyse/generate_summary_outputs.py +186 -0
  6. pheval/analyse/rank_stats.py +61 -0
  7. pheval/cli.py +22 -7
  8. pheval/cli_pheval.py +37 -12
  9. pheval/cli_pheval_utils.py +225 -8
  10. pheval/config_parser.py +36 -0
  11. pheval/constants.py +1 -0
  12. pheval/implementations/__init__.py +1 -3
  13. pheval/post_processing/__init__.py +0 -0
  14. pheval/post_processing/post_processing.py +210 -0
  15. pheval/prepare/__init__.py +0 -0
  16. pheval/prepare/create_noisy_phenopackets.py +173 -0
  17. pheval/prepare/create_spiked_vcf.py +366 -0
  18. pheval/prepare/custom_exceptions.py +47 -0
  19. pheval/prepare/update_phenopacket.py +53 -0
  20. pheval/resources/alternate_ouputs/CADA_results.txt +11 -0
  21. pheval/resources/alternate_ouputs/DeepPVP_results.txt +22 -0
  22. pheval/resources/alternate_ouputs/OVA_results.txt +11 -0
  23. pheval/resources/alternate_ouputs/Phen2Gene_results.json +814 -0
  24. pheval/resources/alternate_ouputs/Phenolyzer_results.txt +12 -0
  25. pheval/resources/alternate_ouputs/lirical_results.tsv +152 -0
  26. pheval/resources/alternate_ouputs/svanna_results.tsv +9 -0
  27. pheval/resources/hgnc_complete_set_2022-10-01.txt +43222 -0
  28. pheval/run_metadata.py +27 -0
  29. pheval/runners/runner.py +92 -11
  30. pheval/utils/__init__.py +0 -0
  31. pheval/utils/docs_gen.py +105 -0
  32. pheval/utils/docs_gen.sh +18 -0
  33. pheval/utils/file_utils.py +88 -0
  34. pheval/utils/phenopacket_utils.py +356 -0
  35. pheval/utils/semsim_utils.py +156 -0
  36. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/METADATA +12 -4
  37. pheval-0.2.0.dist-info/RECORD +41 -0
  38. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/WHEEL +1 -1
  39. pheval/utils.py +0 -7
  40. pheval-0.1.0.dist-info/RECORD +0 -13
  41. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/LICENSE +0 -0
  42. {pheval-0.1.0.dist-info → pheval-0.2.0.dist-info}/entry_points.txt +0 -0
pheval/prepare/create_noisy_phenopackets.py
@@ -0,0 +1,173 @@
+ import random
+ from pathlib import Path
+
+ from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
+ from oaklib.resource import OntologyResource
+ from phenopackets import Family, OntologyClass, Phenopacket, PhenotypicFeature
+
+ from pheval.utils.file_utils import files_with_suffix
+ from pheval.utils.phenopacket_utils import (
+     PhenopacketRebuilder,
+     PhenopacketUtil,
+     phenopacket_reader,
+     write_phenopacket,
+ )
+
+
+ def load_ontology():
+     """Loads the human phenotype ontology."""
+     resource = OntologyResource(slug="hp.obo", local=False)
+     return ProntoImplementation(resource)
+
+
+ class HpoRandomiser:
+     """Randomises phenopacket phenotypic features."""
+
+     def __init__(self, hpo_ontology, scramble_factor: float):
+         self.hpo_ontology = hpo_ontology
+         self.phenotypic_abnormalities = set(hpo_ontology.roots(predicates=["HP:0000118"]))
+         self.scramble_factor = scramble_factor
+
+     def scramble_factor_proportions(self, phenotypic_features: list[PhenotypicFeature]):
+         """Calculates the number of HPO terms to scramble from the scramble factor."""
+         if len(phenotypic_features) == 1:
+             return 1
+         else:
+             return int(round(len(phenotypic_features) * self.scramble_factor, 0))
+
+     def retrieve_hpo_term(self, hpo_id: str) -> PhenotypicFeature:
+         """Retrieves the phenotypic feature for an HPO ID."""
+         rels = self.hpo_ontology.entity_alias_map(hpo_id)
+         hpo_term = "".join(rels[(list(rels.keys())[0])])
+         return PhenotypicFeature(type=OntologyClass(id=hpo_id, label=hpo_term))
+
+     @staticmethod
+     def retain_real_patient_terms(
+         phenotypic_features: list[PhenotypicFeature],
+         number_of_scrambled_terms: int,
+     ) -> list[PhenotypicFeature]:
+         """Returns a random sample of the real patient HPO terms to retain."""
+         if len(phenotypic_features) > 1:
+             number_of_real_id = len(phenotypic_features) - number_of_scrambled_terms
+         else:
+             number_of_real_id = 1
+         return random.sample(phenotypic_features, number_of_real_id)
+
+     def convert_patient_terms_to_parent(
+         self,
+         phenotypic_features: list[PhenotypicFeature],
+         retained_phenotypic_features: list[PhenotypicFeature],
+         number_of_scrambled_terms: int,
+     ) -> list[PhenotypicFeature]:
+         """Returns a list of the HPO terms that have been converted to a parent term."""
+         remaining_hpo = [i for i in phenotypic_features if i not in retained_phenotypic_features]
+         if len(remaining_hpo) == 0:
+             number_of_scrambled_terms = 0
+         hpo_terms_to_be_changed = list(random.sample(remaining_hpo, number_of_scrambled_terms))
+         parent_terms = []
+         for term in hpo_terms_to_be_changed:
+             try:
+                 parent_terms.append(
+                     self.retrieve_hpo_term(
+                         self.hpo_ontology.hierararchical_parents(term.type.id)[0]
+                     )
+                 )
+             except IndexError:
+                 obsolete_term = self.hpo_ontology.entity_metadata_map(term.type.id)
+                 updated_term = list(obsolete_term.values())[0][0]
+                 parent_terms.append(
+                     self.retrieve_hpo_term(
+                         self.hpo_ontology.hierararchical_parents(updated_term)[0]
+                     )
+                 )
+         return parent_terms
+
+     def create_random_hpo_terms(self, number_of_scrambled_terms: int) -> list[PhenotypicFeature]:
+         """Returns a list of random HPO terms."""
+         random_ids = list(
+             random.sample(sorted(self.phenotypic_abnormalities), number_of_scrambled_terms)
+         )
+         return [self.retrieve_hpo_term(random_id) for random_id in random_ids]
+
+     def randomise_hpo_terms(
+         self,
+         phenotypic_features: list[PhenotypicFeature],
+     ) -> list[PhenotypicFeature]:
+         """Returns a list of randomised HPO terms."""
+         number_of_scrambled_terms = self.scramble_factor_proportions(phenotypic_features)
+         retained_patient_terms = self.retain_real_patient_terms(
+             phenotypic_features, number_of_scrambled_terms
+         )
+         return (
+             retained_patient_terms
+             + self.convert_patient_terms_to_parent(
+                 phenotypic_features, retained_patient_terms, number_of_scrambled_terms
+             )
+             + self.create_random_hpo_terms(number_of_scrambled_terms)
+         )
+
+
+ def add_noise_to_phenotypic_profile(
+     hpo_randomiser: HpoRandomiser,
+     phenopacket: Phenopacket or Family,
+ ) -> Phenopacket or Family:
+     """Randomises the phenotypic profile of a phenopacket."""
+     phenotypic_features = PhenopacketUtil(phenopacket).observed_phenotypic_features()
+     random_phenotypes = hpo_randomiser.randomise_hpo_terms(phenotypic_features)
+     randomised_phenopacket = PhenopacketRebuilder(phenopacket).add_randomised_hpo(random_phenotypes)
+     return randomised_phenopacket
+
+
+ def create_scrambled_phenopacket(
+     output_dir: Path, phenopacket_path: Path, scramble_factor: float
+ ) -> None:
+     """Creates a scrambled phenopacket."""
+     try:
+         output_dir.mkdir()
+     except FileExistsError:
+         pass
+     ontology = load_ontology()
+     hpo_randomiser = HpoRandomiser(ontology, scramble_factor)
+     phenopacket = phenopacket_reader(phenopacket_path)
+     created_noisy_phenopacket = add_noise_to_phenotypic_profile(
+         hpo_randomiser,
+         phenopacket,
+     )
+     write_phenopacket(
+         created_noisy_phenopacket,
+         output_dir.joinpath(phenopacket_path.name),
+     )
+
+
+ def create_scrambled_phenopackets(
+     output_dir: Path, phenopacket_dir: Path, scramble_factor: float
+ ) -> None:
+     """Creates scrambled phenopackets for a directory of phenopackets."""
+     try:
+         output_dir.mkdir()
+     except FileExistsError:
+         pass
+     ontology = load_ontology()
+     hpo_randomiser = HpoRandomiser(ontology, scramble_factor)
+     phenopacket_files = files_with_suffix(phenopacket_dir, ".json")
+     for phenopacket_path in phenopacket_files:
+         phenopacket = phenopacket_reader(phenopacket_path)
+         created_noisy_phenopacket = add_noise_to_phenotypic_profile(hpo_randomiser, phenopacket)
+         write_phenopacket(
+             created_noisy_phenopacket,
+             output_dir.joinpath(
+                 phenopacket_path.name,
+             ),
+         )
+
+
+ def scramble_phenopackets(
+     output_dir: Path, phenopacket_path: Path, phenopacket_dir: Path, scramble_factor: float
+ ) -> None:
+     """Creates scrambled phenopackets from either a single phenopacket or a directory of phenopackets."""
+     if phenopacket_path is not None:
+         create_scrambled_phenopacket(output_dir, phenopacket_path, scramble_factor)
+     elif phenopacket_dir is not None:
+         create_scrambled_phenopackets(output_dir, phenopacket_dir, scramble_factor)
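The module's top-level entry point can also be called directly from Python (it appears to back the utility CLI in cli_pheval_utils.py). A minimal sketch, with placeholder paths:

from pathlib import Path

from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets

# Scramble half of the observed HPO terms in every phenopacket in a directory;
# pass phenopacket_path (and phenopacket_dir=None) to scramble a single file.
scramble_phenopackets(
    output_dir=Path("noisy_phenopackets"),
    phenopacket_path=None,
    phenopacket_dir=Path("phenopackets"),
    scramble_factor=0.5,
)

With scramble_factor=0.5 and n observed terms, roughly n/2 real terms are retained, the remainder are generalised to parent terms, and n/2 random phenotypic-abnormality terms are appended.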
pheval/prepare/create_spiked_vcf.py
@@ -0,0 +1,366 @@
+ #!/usr/bin/python
+ import gzip
+ import logging
+ import re
+ import secrets
+ import urllib.parse
+ from copy import copy
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from phenopackets import Family, File, Phenopacket
+
+ from pheval.utils.phenopacket_utils import (
+     IncompatibleGenomeAssemblyError,
+     PhenopacketRebuilder,
+     PhenopacketUtil,
+     ProbandCausativeVariant,
+     phenopacket_reader,
+     write_phenopacket,
+ )
+
+ from .custom_exceptions import InputError
+ from ..utils.file_utils import all_files, files_with_suffix, is_gzipped
+
+ info_log = logging.getLogger("info")
+
+ genome_assemblies = {
+     "GRCh38": {
+         "1": 248956422,
+         "2": 242193529,
+         "3": 198295559,
+         "4": 190214555,
+         "5": 181538259,
+         "6": 170805979,
+         "7": 159345973,
+         "8": 145138636,
+         "9": 138394717,
+         "10": 133797422,
+         "11": 135086622,
+         "12": 133275309,
+         "13": 114364328,
+         "14": 107043718,
+         "15": 101991189,
+         "16": 90338345,
+         "17": 83257441,
+         "18": 80373285,
+         "19": 58617616,
+         "20": 64444167,
+         "21": 46709983,
+         "22": 50818468,
+     },
+     "GRCh37": {
+         "1": 249250621,
+         "2": 243199373,
+         "3": 198022430,
+         "4": 191154276,
+         "5": 180915260,
+         "6": 171115067,
+         "7": 159138663,
+         "8": 146364022,
+         "9": 141213431,
+         "10": 135534747,
+         "11": 135006516,
+         "12": 133851895,
+         "13": 115169878,
+         "14": 107349540,
+         "15": 102531392,
+         "16": 90354753,
+         "17": 81195210,
+         "18": 78077248,
+         "19": 59128983,
+         "20": 63025520,
+         "21": 48129895,
+         "22": 51304566,
+     },
+ }
+
+
+ @dataclass
+ class VcfHeader:
+     """Data obtained from a VCF header."""
+
+     sample_id: str
+     assembly: str
+     chr_status: bool
+
+
+ class VcfPicker:
+     """Chooses a VCF file at random from a directory, if provided; otherwise selects the single template."""
+
+     def __init__(self, template_vcf: Path or None, vcf_dir: Path or None):
+         self.template_vcf = template_vcf
+         self.vcf_dir = vcf_dir
+
+     def pick_file_from_dir(self) -> Path:
+         """Selects a file from a directory at random."""
+         return secrets.choice(all_files(self.vcf_dir))
+
+     def pick_file(self) -> Path:
+         """Selects a VCF file at random when given a directory; otherwise the template VCF is used."""
+         return self.pick_file_from_dir() if self.vcf_dir is not None else self.template_vcf
+
+
+ def read_vcf(vcf_file: Path) -> list[str]:
+     """Reads the contents of a VCF file into memory, handling both uncompressed and gzipped files."""
+     open_fn = gzip.open if is_gzipped(vcf_file) else open
+     vcf = open_fn(vcf_file)
+     vcf_contents = (
+         [line.decode() for line in vcf.readlines()] if is_gzipped(vcf_file) else vcf.readlines()
+     )
+     vcf.close()
+     return vcf_contents
+
+
+ class VcfHeaderParser:
+     """Parses the header of a VCF file."""
+
+     def __init__(self, vcf_contents: list[str]):
+         self.vcf_contents = vcf_contents
+
+     def parse_assembly(self) -> tuple[str, bool]:
+         """Parses the genome assembly and "chr" prefix status from the VCF contig lines."""
+         vcf_assembly = {}
+         chr_status = False
+         for line in self.vcf_contents:
+             if line.startswith("##contig=<ID"):
+                 tokens = line.split(",")
+                 chromosome = re.sub(
+                     r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0]
+                 )
+                 if "chr" in chromosome:
+                     chr_status = True
+                     chromosome = chromosome.replace("chr", "")
+                 contig_length = re.sub(
+                     "[^0-9]+",
+                     "",
+                     [token for token in tokens if "length=" in token][0],
+                 )
+                 vcf_assembly[chromosome] = int(contig_length)
+         vcf_assembly = {i: vcf_assembly[i] for i in vcf_assembly if i.isdigit()}
+         assembly = [k for k, v in genome_assemblies.items() if v == vcf_assembly][0]
+         return assembly, chr_status
+
+     def parse_sample_id(self) -> str:
+         """Parses the sample ID of the VCF."""
+         for line in self.vcf_contents:
+             if line.startswith("#CHROM"):
+                 return line.split("\t")[9].rstrip()
+
+     def parse_vcf_header(self) -> VcfHeader:
+         """Parses the header of the VCF."""
+         assembly, chr_status = self.parse_assembly()
+         sample_id = self.parse_sample_id()
+         return VcfHeader(sample_id, assembly, chr_status)
+
+
+ def check_variant_assembly(
+     proband_causative_variants: list[ProbandCausativeVariant],
+     vcf_header: VcfHeader,
+     phenopacket_path: Path,
+ ):
+     """Checks that the genome assembly of the causative variants matches the VCF."""
+     compatible_genome_assembly = {"GRCh37", "hg19", "GRCh38", "hg38"}
+     phenopacket_assembly = list({variant.assembly for variant in proband_causative_variants})
+     if len(phenopacket_assembly) > 1:
+         raise ValueError("Too many genome assemblies!")
+     if phenopacket_assembly[0] not in compatible_genome_assembly:
+         raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
+     if phenopacket_assembly[0] != vcf_header.assembly:
+         raise IncompatibleGenomeAssemblyError(
+             assembly=phenopacket_assembly, phenopacket=phenopacket_path
+         )
+
+
+ class VcfSpiker:
+     """Spikes proband variants into template VCF file contents."""
+
+     def __init__(
+         self,
+         vcf_contents: list[str],
+         proband_causative_variants: list[ProbandCausativeVariant],
+         vcf_header: VcfHeader,
+     ):
+         self.vcf_contents = vcf_contents
+         self.proband_causative_variants = proband_causative_variants
+         self.vcf_header = vcf_header
+
+     def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> list[str]:
+         """Constructs a VCF record for a variant."""
+         genotype_codes = {
+             "hemizygous": "0/1",
+             "homozygous": "1/1",
+             "heterozygous": "0/1",
+             "compound heterozygous": "0/1",
+         }
+         if self.vcf_header.chr_status is True and "chr" not in proband_variant_data.variant.chrom:
+             proband_variant_data.variant.chrom = "chr" + proband_variant_data.variant.chrom
+         return [
+             proband_variant_data.variant.chrom,
+             str(proband_variant_data.variant.pos),
+             ".",
+             proband_variant_data.variant.ref,
+             f"<{proband_variant_data.variant.alt}>"
+             if proband_variant_data.variant.ref == "N"
+             else proband_variant_data.variant.alt,
+             "100",
+             "PASS",
+             proband_variant_data.info
+             if proband_variant_data.info is not None
+             else "SPIKED_VARIANT_" + proband_variant_data.genotype.upper(),
+             "GT",
+             genotype_codes[proband_variant_data.genotype.lower()] + "\n",
+         ]
+
+     def construct_vcf_records(self):
+         """Inserts the spiked variants into the correct positions within the VCF."""
+         updated_vcf_records = copy(self.vcf_contents)
+         for variant in self.proband_causative_variants:
+             variant = self.construct_variant_entry(variant)
+             variant_entry_position = [
+                 i
+                 for i, val in enumerate(updated_vcf_records)
+                 if val.split("\t")[0] == variant[0] and int(val.split("\t")[1]) < int(variant[1])
+             ][-1] + 1
+             updated_vcf_records.insert(variant_entry_position, "\t".join(variant))
+         return updated_vcf_records
+
+     def construct_header(self, updated_vcf_records) -> list[str]:
+         """Renames the template sample ID to the proband ID throughout the VCF."""
+         updated_vcf_file = []
+         for line in updated_vcf_records:
+             text = line.replace(
+                 self.vcf_header.sample_id,
+                 self.proband_causative_variants[0].proband_id,
+             )
+             updated_vcf_file.append(text)
+         return updated_vcf_file
+
+     def construct_vcf(self) -> list[str]:
+         """Constructs the entire spiked VCF."""
+         return self.construct_header(self.construct_vcf_records())
+
+
+ class VcfWriter:
+     def __init__(
+         self,
+         vcf_contents: list[str],
+         spiked_vcf_file_path: Path,
+     ):
+         self.vcf_contents = vcf_contents
+         self.spiked_vcf_file_path = spiked_vcf_file_path
+
+     def write_gzip(self) -> None:
+         """Writes a gzipped VCF file."""
+         encoded_contents = [line.encode() for line in self.vcf_contents]
+         with gzip.open(self.spiked_vcf_file_path, "wb") as f:
+             for line in encoded_contents:
+                 f.write(line)
+         f.close()
+
+     def write_uncompressed(self) -> None:
+         """Writes an uncompressed VCF file."""
+         with open(self.spiked_vcf_file_path, "w") as file:
+             file.writelines(self.vcf_contents)
+         file.close()
+
+     def write_vcf_file(self) -> None:
+         """Writes the spiked VCF file."""
+         self.write_gzip() if is_gzipped(self.spiked_vcf_file_path) else self.write_uncompressed()
+
+
+ def spike_vcf_contents(
+     phenopacket: Phenopacket or Family,
+     phenopacket_path: Path,
+     chosen_template_vcf: Path,
+ ) -> tuple[str, list[str]]:
+     """Spikes VCF records with variants."""
+     # This is kept separate from the click command: it would fail if annotated with
+     # click decorators and then referenced from another click command.
+     phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
+     vcf_contents = read_vcf(chosen_template_vcf)
+     vcf_header = VcfHeaderParser(vcf_contents).parse_vcf_header()
+     check_variant_assembly(phenopacket_causative_variants, vcf_header, phenopacket_path)
+     return (
+         vcf_header.assembly,
+         VcfSpiker(vcf_contents, phenopacket_causative_variants, vcf_header).construct_vcf(),
+     )
+
+
+ def generate_spiked_vcf_file(
+     output_dir: Path,
+     phenopacket: Phenopacket or Family,
+     phenopacket_path: Path,
+     chosen_template_vcf: Path,
+ ) -> File:
+     """Writes the spiked VCF contents to a new file."""
+     try:
+         output_dir.mkdir()
+         info_log.info(f"Created a directory {output_dir}")
+     except FileExistsError:
+         pass
+     vcf_assembly, spiked_vcf = spike_vcf_contents(
+         phenopacket, phenopacket_path, chosen_template_vcf
+     )
+     spiked_vcf_path = (
+         output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
+         if is_gzipped(chosen_template_vcf)
+         else output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf"))
+     )
+     VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
+     return File(
+         uri=urllib.parse.unquote(spiked_vcf_path.as_uri()),
+         file_attributes={"fileFormat": "vcf", "genomeAssembly": vcf_assembly},
+     )
+
+
+ def create_spiked_vcf(
+     output_dir: Path, phenopacket_path: Path, template_vcf_path: Path, vcf_dir: Path
+ ):
+     """Creates a spiked VCF for a phenopacket."""
+     if template_vcf_path is None and vcf_dir is None:
+         raise InputError("Either a template_vcf or vcf_dir must be specified")
+     vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
+     phenopacket = phenopacket_reader(phenopacket_path)
+     spiked_vcf_file_message = generate_spiked_vcf_file(
+         output_dir, phenopacket, phenopacket_path, vcf_file_path
+     )
+     updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
+         spiked_vcf_file_message
+     )
+     write_phenopacket(updated_phenopacket, phenopacket_path)
+
+
+ def create_spiked_vcfs(
+     output_dir: Path, phenopacket_dir: Path, template_vcf_path: Path, vcf_dir: Path
+ ):
+     """Creates spiked VCFs for a directory of phenopackets."""
+     if template_vcf_path is None and vcf_dir is None:
+         raise InputError("Either a template_vcf or vcf_dir must be specified")
+     for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
+         vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
+         phenopacket = phenopacket_reader(phenopacket_path)
+         spiked_vcf_file_message = generate_spiked_vcf_file(
+             output_dir, phenopacket, phenopacket_path, vcf_file_path
+         )
+         updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
+             spiked_vcf_file_message
+         )
+         write_phenopacket(updated_phenopacket, phenopacket_path)
+
+
+ def spike_vcfs(
+     output_dir: Path,
+     phenopacket_path: Path,
+     phenopacket_dir: Path,
+     template_vcf_path: Path,
+     vcf_dir: Path,
+ ):
+     """Creates spiked VCFs from either a single phenopacket or a directory of phenopackets."""
+     if phenopacket_path is not None:
+         create_spiked_vcf(output_dir, phenopacket_path, template_vcf_path, vcf_dir)
+     elif phenopacket_dir is not None:
+         create_spiked_vcfs(output_dir, phenopacket_dir, template_vcf_path, vcf_dir)
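The spiking workflow follows the same single-file/directory pattern. A minimal sketch of driving it directly, with placeholder paths; exactly one of template_vcf_path/vcf_dir and one of phenopacket_path/phenopacket_dir should be supplied:

from pathlib import Path

from pheval.prepare.create_spiked_vcf import spike_vcfs

# Spike each phenopacket's causative variants into a copy of the template VCF.
spike_vcfs(
    output_dir=Path("vcf"),
    phenopacket_path=None,
    phenopacket_dir=Path("phenopackets"),
    template_vcf_path=Path("template.vcf.gz"),
    vcf_dir=None,
)

Note that create_spiked_vcf(s) also writes the spiked VCF's path back into each source phenopacket (write_phenopacket(updated_phenopacket, phenopacket_path)), so the input phenopackets are updated in place.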
pheval/prepare/custom_exceptions.py
@@ -0,0 +1,47 @@
+ from click import Option, UsageError
+
+
+ class InputError(Exception):
+     """Exception raised for missing required inputs."""
+
+     def __init__(self, file, message="Missing required input"):
+         self.file: str = file
+         self.message: str = message
+         super().__init__(self.message)
+
+     def __str__(self):
+         return f"{self.message} -> {self.file}"
+
+
+ class MutuallyExclusiveOptionError(Option):
+     """Click option that is mutually exclusive with other named options."""
+
+     def __init__(self, *args, **kwargs):
+         self.mutually_exclusive = set(kwargs.pop("mutually_exclusive", []))
+         help_ = kwargs.get("help", "")
+         if self.mutually_exclusive:
+             ex_str = ", ".join(self.mutually_exclusive)
+             kwargs["help"] = help_ + (
+                 " NOTE: This argument is mutually exclusive with arguments: [" + ex_str + "]."
+             )
+         super(MutuallyExclusiveOptionError, self).__init__(*args, **kwargs)
+
+     def handle_parse_result(self, ctx, opts, args):
+         if self.mutually_exclusive.intersection(opts) and self.name in opts:
+             raise UsageError(
+                 "Illegal usage: `{}` is mutually exclusive with "
+                 "arguments `{}`.".format(self.name, ", ".join(self.mutually_exclusive))
+             )
+
+         return super(MutuallyExclusiveOptionError, self).handle_parse_result(ctx, opts, args)
+
+
+ class IncorrectFileFormatError(Exception):
+     def __init__(self, file, expectation, message="Incorrect File Type"):
+         self.file: str = file
+         self.expectation: str = expectation
+         self.message: str = message
+         super().__init__(self.message)
+
+     def __str__(self):
+         return f"{self.message} -> {self.file} (expected {self.expectation})"
pheval/prepare/update_phenopacket.py
@@ -0,0 +1,53 @@
+ from collections import defaultdict
+ from pathlib import Path
+
+ from pheval.utils.file_utils import all_files
+ from pheval.utils.phenopacket_utils import (
+     GeneIdentifierUpdater,
+     PhenopacketRebuilder,
+     PhenopacketUtil,
+     create_hgnc_dict,
+     phenopacket_reader,
+     write_phenopacket,
+ )
+
+
+ def update_outdated_gene_context(
+     phenopacket_path: Path, gene_identifier: str, hgnc_data: defaultdict
+ ):
+     """Updates the gene context of the phenopacket."""
+     phenopacket = phenopacket_reader(phenopacket_path)
+     interpretations = PhenopacketUtil(phenopacket).interpretations()
+     updated_interpretations = GeneIdentifierUpdater(
+         hgnc_data=hgnc_data, gene_identifier=gene_identifier
+     ).update_genomic_interpretations_gene_identifier(interpretations)
+
+     return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)
+
+
+ def create_updated_phenopacket(gene_identifier: str, phenopacket_path: Path, output_dir: Path):
+     """Updates the gene context within the interpretations for a phenopacket."""
+     hgnc_data = create_hgnc_dict()
+     updated_phenopacket = update_outdated_gene_context(phenopacket_path, gene_identifier, hgnc_data)
+     write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
+
+
+ def create_updated_phenopackets(gene_identifier: str, phenopacket_dir: Path, output_dir: Path):
+     """Updates the gene context within the interpretations for a directory of phenopackets."""
+     hgnc_data = create_hgnc_dict()
+     for phenopacket_path in all_files(phenopacket_dir):
+         updated_phenopacket = update_outdated_gene_context(
+             phenopacket_path, gene_identifier, hgnc_data
+         )
+         write_phenopacket(updated_phenopacket, output_dir.joinpath(phenopacket_path.name))
+
+
+ def update_phenopackets(
+     gene_identifier: str, phenopacket_path: Path, phenopacket_dir: Path, output_dir: Path
+ ):
+     """Updates the gene identifiers in either a single phenopacket or a directory of phenopackets."""
+     output_dir.mkdir(exist_ok=True)
+     if phenopacket_path is not None:
+         create_updated_phenopacket(gene_identifier, phenopacket_path, output_dir)
+     elif phenopacket_dir is not None:
+         create_updated_phenopackets(gene_identifier, phenopacket_dir, output_dir)
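update_phenopacket.py mirrors the same pattern for gene-identifier migration, using the bundled hgnc_complete_set_2022-10-01.txt via create_hgnc_dict. A minimal sketch with placeholder paths; the gene_identifier value shown is an assumption about the identifier keys GeneIdentifierUpdater accepts:

from pathlib import Path

from pheval.prepare.update_phenopacket import update_phenopackets

# Rewrite the gene identifiers in the interpretations of every phenopacket
# in a directory. "ensembl_id" is an assumed identifier key.
update_phenopackets(
    gene_identifier="ensembl_id",
    phenopacket_path=None,
    phenopacket_dir=Path("phenopackets"),
    output_dir=Path("updated_phenopackets"),
)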
pheval/resources/alternate_ouputs/CADA_results.txt
@@ -0,0 +1,11 @@
+ rank gene_id gene_name score
+ 1 Entrez:368 ABCC6 84.62940470377605
+ 2 Entrez:5167 ENPP1 69.57813326517741
+ 3 Entrez:54790 TET2 57.23555533091227
+ 4 Entrez:64132 XYLT2 57.030126889546715
+ 5 Entrez:3949 LDLR 55.80375734965006
+ 6 Entrez:64240 ABCG5 53.74869124094645
+ 7 Entrez:348 APOE 53.691530545552574
+ 8 Entrez:462 SERPINC1 51.44988568623861
+ 9 Entrez:255738 PCSK9 50.51583385467529
+ 10 Entrez:2162 F13A1 50.0550905863444
pheval/resources/alternate_ouputs/DeepPVP_results.txt
@@ -0,0 +1,22 @@
+ Chr Start Ref Alt GT Gene CADD GWAVA DANN Sim_Score Prediction_Score
+ 10 100177428 C A 1/1 HPS1 46 0.436666666666666667 0.9972185359365543 0.9648632774654027 0.9213319434699428
+ 6 32489940 G T,C 1/1 . . 0.41 . . 0.5740168850055932
+ 6 32489940 G T,C 1/1 . . 0.41 . . 0.5740168850055932
+ 9 136328657 T C,G 1/1 . . 0.41 . . 0.5740168850055932
+ 10 113940329 T C,G 1/1 . . 0.4066666666666667 . . 0.5740168850055932
+ 19 42132273 C T,A 1/1 . . 0.41 . . 0.5740168850055932
+ 14 21467913 T G,A 1/1 . . 0.42333333333333334 . . 0.5735853467091718
+ 1 16354590 A T,G 1/1 . . 0.41333333333333333 . . 0.5718396449188846
+ 12 52681925 A C,T 1/1 . . 0.38000000000000006 . . 0.5711489183536179
+ 7 34192762 G C,A 1/1 . . 0.37666666666666665 . . 0.5708847601387523
+ 11 125830970 A T,G 1/1 . . 0.3766666666666667 . . 0.5708847601387523
+ 11 125830970 A T,G 1/1 . . 0.3766666666666667 . . 0.5708847601387523
+ 7 156469133 C G,T 1/1 . . 0.4666666666666666 . . 0.5692571159111212
+ 11 57155288 T C,A 1/1 . . 0.4666666666666666 . . 0.5692571159111212
+ 2 130832444 C A,T 1/1 . . 0.3833333333333333 . . 0.5689482176042727
+ 7 106508978 A G,C 1/1 . . 0.38333333333333336 . . 0.5689482176042727
+ 10 61552692 G T,C 1/1 . . 0.3833333333333333 . . 0.5689482176042727
+ 5 141336264 G T,A 1/1 . . 0.47666666666666674 . . 0.5686996214945524
+ 5 141336264 G T,A 1/1 . . 0.47666666666666674 . . 0.5686996214945524
+ 19 52004795 G T,C 1/1 . . 0.4633333333333334 . . 0.5680430668280317
+ 11 75572808 G A 1/1 UVRAG 29.8 0.6566666666666666 0.99870875812720883 . 0.5678185751935081