pheval 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/benchmark.py +12 -23
- pheval/analyse/benchmark_output_type.py +3 -5
- pheval/analyse/binary_classification_curves.py +3 -9
- pheval/analyse/binary_classification_stats.py +1 -4
- pheval/analyse/generate_plots.py +8 -18
- pheval/analyse/generate_rank_comparisons.py +1 -2
- pheval/analyse/rank_stats.py +8 -25
- pheval/analyse/run_data_parser.py +15 -9
- pheval/cli.py +1 -1
- pheval/cli_pheval_utils.py +10 -23
- pheval/config_parser.py +1 -1
- pheval/implementations/__init__.py +3 -5
- pheval/infra/exomiserdb.py +7 -15
- pheval/post_processing/phenopacket_truth_set.py +10 -31
- pheval/post_processing/post_processing.py +12 -33
- pheval/post_processing/validate_result_format.py +2 -4
- pheval/prepare/create_noisy_phenopackets.py +18 -29
- pheval/prepare/create_spiked_vcf.py +25 -56
- pheval/prepare/custom_exceptions.py +6 -7
- pheval/prepare/prepare_corpus.py +6 -17
- pheval/prepare/update_phenopacket.py +6 -17
- pheval/utils/docs_gen.py +3 -3
- pheval/utils/file_utils.py +1 -2
- pheval/utils/phenopacket_utils.py +41 -73
- pheval/utils/semsim_utils.py +6 -10
- pheval/utils/utils.py +3 -4
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/METADATA +1 -1
- pheval-0.6.4.dist-info/RECORD +57 -0
- pheval-0.6.2.dist-info/RECORD +0 -57
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/LICENSE +0 -0
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/WHEEL +0 -0
- {pheval-0.6.2.dist-info → pheval-0.6.4.dist-info}/entry_points.txt +0 -0
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from typing import List
|
|
3
2
|
|
|
4
3
|
import polars as pl
|
|
5
4
|
|
|
@@ -56,7 +55,7 @@ class PhenopacketTruthSet:
|
|
|
56
55
|
phenopacket = phenopacket_reader(phenopacket_path)
|
|
57
56
|
return PhenopacketUtil(phenopacket)
|
|
58
57
|
|
|
59
|
-
def _get_causative_genes(self, phenopacket_name: str) -> List[ProbandCausativeGene]:
|
|
58
|
+
def _get_causative_genes(self, phenopacket_name: str) -> list[ProbandCausativeGene]:
|
|
60
59
|
"""
|
|
61
60
|
Get the causative genes for a given phenopacket.
|
|
62
61
|
Args:
|
|
@@ -67,7 +66,7 @@ class PhenopacketTruthSet:
|
|
|
67
66
|
phenopacket_util = self._get_phenopacket_util(phenopacket_name)
|
|
68
67
|
return phenopacket_util.diagnosed_genes()
|
|
69
68
|
|
|
70
|
-
def _get_causative_variants(self, phenopacket_name: str) -> List[GenomicVariant]:
|
|
69
|
+
def _get_causative_variants(self, phenopacket_name: str) -> list[GenomicVariant]:
|
|
71
70
|
"""
|
|
72
71
|
Get the causative variants for a given phenopacket.
|
|
73
72
|
Args:
|
|
@@ -78,7 +77,7 @@ class PhenopacketTruthSet:
|
|
|
78
77
|
phenopacket_util = self._get_phenopacket_util(phenopacket_name)
|
|
79
78
|
return phenopacket_util.diagnosed_variants()
|
|
80
79
|
|
|
81
|
-
def _get_causative_diseases(self, phenopacket_name: str) -> List[ProbandDisease]:
|
|
80
|
+
def _get_causative_diseases(self, phenopacket_name: str) -> list[ProbandDisease]:
|
|
82
81
|
"""
|
|
83
82
|
Get the diseases for a given phenopacket.
|
|
84
83
|
Args:
|
|
@@ -133,11 +132,7 @@ class PhenopacketTruthSet:
|
|
|
133
132
|
)
|
|
134
133
|
.with_columns(pl.col("rank").cast(pl.Int64))
|
|
135
134
|
.select(classified_results.columns)
|
|
136
|
-
.vstack(
|
|
137
|
-
classified_results.filter(
|
|
138
|
-
~pl.col("gene_symbol").is_in(ranked_results["gene_symbol"])
|
|
139
|
-
)
|
|
140
|
-
)
|
|
135
|
+
.vstack(classified_results.filter(~pl.col("gene_symbol").is_in(ranked_results["gene_symbol"])))
|
|
141
136
|
)
|
|
142
137
|
|
|
143
138
|
def classified_variant(self, result_name: str) -> pl.DataFrame:
|
|
@@ -181,11 +176,7 @@ class PhenopacketTruthSet:
|
|
|
181
176
|
ranked_results.with_columns(
|
|
182
177
|
[
|
|
183
178
|
pl.struct(["chrom", "start", "end", "ref", "alt"])
|
|
184
|
-
.is_in(
|
|
185
|
-
classified_results.select(
|
|
186
|
-
pl.struct(["chrom", "start", "end", "ref", "alt"])
|
|
187
|
-
).to_series()
|
|
188
|
-
)
|
|
179
|
+
.is_in(classified_results.select(pl.struct(["chrom", "start", "end", "ref", "alt"])).to_series())
|
|
189
180
|
.alias("true_positive")
|
|
190
181
|
]
|
|
191
182
|
)
|
|
@@ -194,17 +185,13 @@ class PhenopacketTruthSet:
|
|
|
194
185
|
.vstack(
|
|
195
186
|
classified_results.filter(
|
|
196
187
|
~pl.struct(["chrom", "start", "end", "ref", "alt"]).is_in(
|
|
197
|
-
ranked_results.select(
|
|
198
|
-
pl.struct(["chrom", "start", "end", "ref", "alt"])
|
|
199
|
-
).to_series()
|
|
188
|
+
ranked_results.select(pl.struct(["chrom", "start", "end", "ref", "alt"])).to_series()
|
|
200
189
|
)
|
|
201
190
|
)
|
|
202
191
|
)
|
|
203
192
|
)
|
|
204
193
|
|
|
205
|
-
def classified_disease(
|
|
206
|
-
self, result_name: str, mondo_mapping_table: pl.DataFrame
|
|
207
|
-
) -> pl.DataFrame:
|
|
194
|
+
def classified_disease(self, result_name: str, mondo_mapping_table: pl.DataFrame) -> pl.DataFrame:
|
|
208
195
|
"""
|
|
209
196
|
Classify disease results for a given phenopacket.
|
|
210
197
|
Args:
|
|
@@ -225,9 +212,7 @@ class PhenopacketTruthSet:
|
|
|
225
212
|
pl.lit(0).cast(pl.Int64).alias("rank"),
|
|
226
213
|
pl.lit(True).alias("true_positive"),
|
|
227
214
|
pl.col("disease_identifier")
|
|
228
|
-
.map_elements(
|
|
229
|
-
lambda x: map_disease_id(x, mondo_mapping_table), return_dtype=pl.Utf8
|
|
230
|
-
)
|
|
215
|
+
.map_elements(lambda x: map_disease_id(x, mondo_mapping_table), return_dtype=pl.Utf8)
|
|
231
216
|
.alias("mondo_identifier"),
|
|
232
217
|
]
|
|
233
218
|
)
|
|
@@ -260,15 +245,9 @@ class PhenopacketTruthSet:
|
|
|
260
245
|
)
|
|
261
246
|
return (
|
|
262
247
|
ranked_results.with_columns(
|
|
263
|
-
(
|
|
264
|
-
pl.col("disease_identifier").is_in(classified_results["disease_identifier"])
|
|
265
|
-
).alias("true_positive")
|
|
248
|
+
(pl.col("mondo_identifier").is_in(classified_results["mondo_identifier"])).alias("true_positive")
|
|
266
249
|
)
|
|
267
250
|
.with_columns(pl.col("rank").cast(pl.Int64))
|
|
268
251
|
.select(classified_results.columns)
|
|
269
|
-
.vstack(
|
|
270
|
-
classified_results.filter(
|
|
271
|
-
~pl.col("disease_identifier").is_in(ranked_results["disease_identifier"])
|
|
272
|
-
)
|
|
273
|
-
)
|
|
252
|
+
.vstack(classified_results.filter(~pl.col("mondo_identifier").is_in(ranked_results["mondo_identifier"])))
|
|
274
253
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
1
2
|
from enum import Enum
|
|
2
3
|
from pathlib import Path
|
|
3
|
-
from typing import Callable, Tuple
|
|
4
4
|
|
|
5
5
|
import polars as pl
|
|
6
6
|
|
|
@@ -57,7 +57,7 @@ def _rank_results(results: pl.DataFrame, sort_order: SortOrder) -> pl.DataFrame:
|
|
|
57
57
|
results = (
|
|
58
58
|
results.sort("score", descending=sort_descending)
|
|
59
59
|
.with_columns(
|
|
60
|
-
pl.struct(["score"] + group_by)
|
|
60
|
+
pl.struct(["score"] + group_by) # noqa
|
|
61
61
|
.rank(method="dense", descending=sort_descending)
|
|
62
62
|
.cast(pl.Int32)
|
|
63
63
|
.alias("min_rank")
|
|
@@ -89,9 +89,7 @@ def _write_gene_result(ranked_results: pl.DataFrame, output_file: Path) -> None:
|
|
|
89
89
|
ranked_results ([PhEvalResult]): List of ranked PhEval gene results.
|
|
90
90
|
output_file (Path): Path to the output file.
|
|
91
91
|
"""
|
|
92
|
-
gene_output = ranked_results.select(
|
|
93
|
-
["rank", "score", "gene_symbol", "gene_identifier", "true_positive"]
|
|
94
|
-
)
|
|
92
|
+
gene_output = ranked_results.select(["rank", "score", "gene_symbol", "gene_identifier", "true_positive"])
|
|
95
93
|
_write_results_file(output_file, gene_output)
|
|
96
94
|
|
|
97
95
|
|
|
@@ -127,15 +125,11 @@ def _write_disease_result(ranked_results: pl.DataFrame, output_file: Path) -> No
|
|
|
127
125
|
ranked_results ([PhEvalResult]): List of ranked PhEval disease results.
|
|
128
126
|
output_file (Path): Path to the output file.
|
|
129
127
|
"""
|
|
130
|
-
disease_output = ranked_results.select(
|
|
131
|
-
["rank", "score", "disease_identifier", "mondo_identifier", "true_positive"]
|
|
132
|
-
)
|
|
128
|
+
disease_output = ranked_results.select(["rank", "score", "disease_identifier", "mondo_identifier", "true_positive"])
|
|
133
129
|
_write_results_file(output_file, disease_output)
|
|
134
130
|
|
|
135
131
|
|
|
136
|
-
def _get_result_type(
|
|
137
|
-
result_type: ResultType, phenopacket_truth_set: PhenopacketTruthSet
|
|
138
|
-
) -> Tuple[Callable, Callable]:
|
|
132
|
+
def _get_result_type(result_type: ResultType, phenopacket_truth_set: PhenopacketTruthSet) -> tuple[Callable, Callable]:
|
|
139
133
|
"""
|
|
140
134
|
Get the methods for extracting the entity and writing the result for a given result type.
|
|
141
135
|
Args:
|
|
@@ -156,9 +150,7 @@ def _get_result_type(
|
|
|
156
150
|
)
|
|
157
151
|
|
|
158
152
|
|
|
159
|
-
def create_empty_pheval_result(
|
|
160
|
-
phenopacket_dir: Path, output_dir: Path, result_type: ResultType
|
|
161
|
-
) -> None:
|
|
153
|
+
def create_empty_pheval_result(phenopacket_dir: Path, output_dir: Path, result_type: ResultType) -> None:
|
|
162
154
|
"""
|
|
163
155
|
Create an empty PhEval result for a given result type (gene, variant, or disease).
|
|
164
156
|
|
|
@@ -176,10 +168,7 @@ def create_empty_pheval_result(
|
|
|
176
168
|
"""
|
|
177
169
|
if result_type in executed_results:
|
|
178
170
|
return
|
|
179
|
-
logger.info(
|
|
180
|
-
f"Writing classified results for {len(all_files(phenopacket_dir))} "
|
|
181
|
-
f"phenopackets to {output_dir}"
|
|
182
|
-
)
|
|
171
|
+
logger.info(f"Writing classified results for {len(all_files(phenopacket_dir))} phenopackets to {output_dir}")
|
|
183
172
|
executed_results.add(result_type)
|
|
184
173
|
phenopacket_truth_set = PhenopacketTruthSet(phenopacket_dir)
|
|
185
174
|
classify_method, write_method = _get_result_type(result_type, phenopacket_truth_set)
|
|
@@ -209,13 +198,9 @@ def generate_gene_result(
|
|
|
209
198
|
phenopacket_dir (Path): Path to the Phenopacket directory
|
|
210
199
|
"""
|
|
211
200
|
output_file = output_dir.joinpath(f"pheval_gene_results/{result_path.stem}-gene_result.parquet")
|
|
212
|
-
create_empty_pheval_result(
|
|
213
|
-
phenopacket_dir, output_dir.joinpath("pheval_gene_results"), ResultType.GENE
|
|
214
|
-
)
|
|
201
|
+
create_empty_pheval_result(phenopacket_dir, output_dir.joinpath("pheval_gene_results"), ResultType.GENE)
|
|
215
202
|
ranked_results = _rank_results(results, sort_order)
|
|
216
|
-
classified_results = PhenopacketTruthSet(phenopacket_dir).merge_gene_results(
|
|
217
|
-
ranked_results, output_file
|
|
218
|
-
)
|
|
203
|
+
classified_results = PhenopacketTruthSet(phenopacket_dir).merge_gene_results(ranked_results, output_file)
|
|
219
204
|
_write_gene_result(classified_results, output_file)
|
|
220
205
|
|
|
221
206
|
|
|
@@ -236,9 +221,7 @@ def generate_variant_result(
|
|
|
236
221
|
result_path (Path): Path to the tool-specific result file.
|
|
237
222
|
phenopacket_dir (Path): Path to the Phenopacket directory
|
|
238
223
|
"""
|
|
239
|
-
output_file = output_dir.joinpath(
|
|
240
|
-
f"pheval_variant_results/{result_path.stem}-variant_result.parquet"
|
|
241
|
-
)
|
|
224
|
+
output_file = output_dir.joinpath(f"pheval_variant_results/{result_path.stem}-variant_result.parquet")
|
|
242
225
|
create_empty_pheval_result(
|
|
243
226
|
phenopacket_dir,
|
|
244
227
|
output_dir.joinpath("pheval_variant_results"),
|
|
@@ -247,9 +230,7 @@ def generate_variant_result(
|
|
|
247
230
|
ranked_results = _rank_results(results, sort_order).with_columns(
|
|
248
231
|
pl.concat_str(["chrom", "start", "ref", "alt"], separator="-").alias("variant_id")
|
|
249
232
|
)
|
|
250
|
-
classified_results = PhenopacketTruthSet(phenopacket_dir).merge_variant_results(
|
|
251
|
-
ranked_results, output_file
|
|
252
|
-
)
|
|
233
|
+
classified_results = PhenopacketTruthSet(phenopacket_dir).merge_variant_results(ranked_results, output_file)
|
|
253
234
|
_write_variant_result(classified_results, output_file)
|
|
254
235
|
|
|
255
236
|
|
|
@@ -270,9 +251,7 @@ def generate_disease_result(
|
|
|
270
251
|
result_path (Path): Path to the tool-specific result file.
|
|
271
252
|
phenopacket_dir (Path): Path to the Phenopacket directory
|
|
272
253
|
"""
|
|
273
|
-
output_file = output_dir.joinpath(
|
|
274
|
-
f"pheval_disease_results/{result_path.stem}-disease_result.parquet"
|
|
275
|
-
)
|
|
254
|
+
output_file = output_dir.joinpath(f"pheval_disease_results/{result_path.stem}-disease_result.parquet")
|
|
276
255
|
create_empty_pheval_result(
|
|
277
256
|
phenopacket_dir,
|
|
278
257
|
output_dir.joinpath("pheval_disease_results"),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
1
2
|
from enum import Enum
|
|
2
3
|
from functools import wraps
|
|
3
|
-
from typing import Callable
|
|
4
4
|
|
|
5
5
|
import polars as pl
|
|
6
6
|
|
|
@@ -63,9 +63,7 @@ class ResultSchema(Enum):
|
|
|
63
63
|
raise ValueError(f"Missing required column: {col_name}")
|
|
64
64
|
|
|
65
65
|
if results.schema[col_name] != expected_type:
|
|
66
|
-
raise TypeError(
|
|
67
|
-
f"Column '{col_name}' has type {results.schema[col_name]}, expected {expected_type}"
|
|
68
|
-
)
|
|
66
|
+
raise TypeError(f"Column '{col_name}' has type {results.schema[col_name]}, expected {expected_type}")
|
|
69
67
|
|
|
70
68
|
return True
|
|
71
69
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import random
|
|
2
2
|
import time
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import List, Union
|
|
5
4
|
|
|
6
5
|
from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
|
|
7
6
|
from oaklib.resource import OntologyResource
|
|
@@ -19,7 +18,7 @@ from pheval.utils.phenopacket_utils import (
|
|
|
19
18
|
logger = get_logger()
|
|
20
19
|
|
|
21
20
|
|
|
22
|
-
def load_ontology(local_cached_ontology: Path = None) -> ProntoImplementation:
|
|
21
|
+
def load_ontology(local_cached_ontology: Path | None = None) -> ProntoImplementation:
|
|
23
22
|
"""
|
|
24
23
|
Load the Human Phenotype Ontology (HPO).
|
|
25
24
|
Args:
|
|
@@ -78,14 +77,14 @@ class HpoRandomiser:
|
|
|
78
77
|
PhenotypicFeature: The PhenotypicFeature object representing the retrieved HPO term.
|
|
79
78
|
"""
|
|
80
79
|
rels = self.hpo_ontology.entity_alias_map(hpo_id)
|
|
81
|
-
hpo_term = "".join(rels[(
|
|
80
|
+
hpo_term = "".join(rels[next(iter(rels))])
|
|
82
81
|
return PhenotypicFeature(type=OntologyClass(id=hpo_id, label=hpo_term))
|
|
83
82
|
|
|
84
83
|
@staticmethod
|
|
85
84
|
def retain_real_patient_terms(
|
|
86
|
-
phenotypic_features:
|
|
85
|
+
phenotypic_features: list[PhenotypicFeature],
|
|
87
86
|
number_of_scrambled_terms: int,
|
|
88
|
-
) ->
|
|
87
|
+
) -> list[PhenotypicFeature]:
|
|
89
88
|
"""
|
|
90
89
|
Return a list of real patient HPO terms, retaining a specific number of non-scrambled terms.
|
|
91
90
|
|
|
@@ -104,10 +103,10 @@ class HpoRandomiser:
|
|
|
104
103
|
|
|
105
104
|
def convert_patient_terms_to_parent(
|
|
106
105
|
self,
|
|
107
|
-
phenotypic_features:
|
|
108
|
-
retained_phenotypic_features:
|
|
106
|
+
phenotypic_features: list[PhenotypicFeature],
|
|
107
|
+
retained_phenotypic_features: list[PhenotypicFeature],
|
|
109
108
|
number_of_scrambled_terms: int,
|
|
110
|
-
) ->
|
|
109
|
+
) -> list[PhenotypicFeature]:
|
|
111
110
|
"""
|
|
112
111
|
Convert a subset of patient HPO terms to their respective parent terms.
|
|
113
112
|
|
|
@@ -133,7 +132,7 @@ class HpoRandomiser:
|
|
|
133
132
|
for term in hpo_terms_to_be_changed:
|
|
134
133
|
if self.hpo_ontology.label(term.type.id).startswith("obsolete"):
|
|
135
134
|
obsolete_term = self.hpo_ontology.entity_metadata_map(term.type.id)
|
|
136
|
-
updated_term =
|
|
135
|
+
updated_term = next(iter(obsolete_term.values()))[0]
|
|
137
136
|
parents = self.hpo_ontology.hierarchical_parents(updated_term)
|
|
138
137
|
else:
|
|
139
138
|
parents = self.hpo_ontology.hierarchical_parents(term.type.id)
|
|
@@ -143,7 +142,7 @@ class HpoRandomiser:
|
|
|
143
142
|
parent_terms.append(self.retrieve_hpo_term(random.choice(parents)))
|
|
144
143
|
return parent_terms
|
|
145
144
|
|
|
146
|
-
def create_random_hpo_terms(self, number_of_scrambled_terms: int) -> List[PhenotypicFeature]:
|
|
145
|
+
def create_random_hpo_terms(self, number_of_scrambled_terms: int) -> list[PhenotypicFeature]:
|
|
147
146
|
"""
|
|
148
147
|
Generate a list of random HPO terms.
|
|
149
148
|
|
|
@@ -153,15 +152,13 @@ class HpoRandomiser:
|
|
|
153
152
|
Returns:
|
|
154
153
|
List[PhenotypicFeature]: A list of randomly selected HPO terms.
|
|
155
154
|
"""
|
|
156
|
-
random_ids = list(
|
|
157
|
-
random.sample(sorted(self.phenotypic_abnormalities), number_of_scrambled_terms)
|
|
158
|
-
)
|
|
155
|
+
random_ids = list(random.sample(sorted(self.phenotypic_abnormalities), number_of_scrambled_terms))
|
|
159
156
|
return [self.retrieve_hpo_term(random_id) for random_id in random_ids]
|
|
160
157
|
|
|
161
158
|
def randomise_hpo_terms(
|
|
162
159
|
self,
|
|
163
|
-
phenotypic_features:
|
|
164
|
-
) ->
|
|
160
|
+
phenotypic_features: list[PhenotypicFeature],
|
|
161
|
+
) -> list[PhenotypicFeature]:
|
|
165
162
|
"""
|
|
166
163
|
Randomise the provided phenotypic features by combining retained, parent-converted, and random HPO terms.
|
|
167
164
|
|
|
@@ -181,9 +178,7 @@ class HpoRandomiser:
|
|
|
181
178
|
of randomised HPO terms to be used in the phenotypic features.
|
|
182
179
|
"""
|
|
183
180
|
number_of_scrambled_terms = self.scramble_factor_proportions(phenotypic_features)
|
|
184
|
-
retained_patient_terms = self.retain_real_patient_terms(
|
|
185
|
-
phenotypic_features, number_of_scrambled_terms
|
|
186
|
-
)
|
|
181
|
+
retained_patient_terms = self.retain_real_patient_terms(phenotypic_features, number_of_scrambled_terms)
|
|
187
182
|
return (
|
|
188
183
|
retained_patient_terms
|
|
189
184
|
+ self.convert_patient_terms_to_parent(
|
|
@@ -194,8 +189,8 @@ class HpoRandomiser:
|
|
|
194
189
|
|
|
195
190
|
def add_noise_to_phenotypic_profile(
|
|
196
191
|
self,
|
|
197
|
-
phenopacket:
|
|
198
|
-
) ->
|
|
192
|
+
phenopacket: Phenopacket | Family,
|
|
193
|
+
) -> Phenopacket | Family:
|
|
199
194
|
"""
|
|
200
195
|
Randomise the phenotypic profile of a Phenopacket or Family.
|
|
201
196
|
|
|
@@ -207,9 +202,7 @@ class HpoRandomiser:
|
|
|
207
202
|
"""
|
|
208
203
|
phenotypic_features = PhenopacketUtil(phenopacket).observed_phenotypic_features()
|
|
209
204
|
random_phenotypes = self.randomise_hpo_terms(phenotypic_features)
|
|
210
|
-
randomised_phenopacket = PhenopacketRebuilder(phenopacket).add_randomised_hpo(
|
|
211
|
-
random_phenotypes
|
|
212
|
-
)
|
|
205
|
+
randomised_phenopacket = PhenopacketRebuilder(phenopacket).add_randomised_hpo(random_phenotypes)
|
|
213
206
|
return randomised_phenopacket
|
|
214
207
|
|
|
215
208
|
def create_scrambled_phenopacket(
|
|
@@ -283,13 +276,9 @@ def scramble_phenopackets(
|
|
|
283
276
|
ontology = load_ontology(local_cached_ontology)
|
|
284
277
|
if phenopacket_path is not None:
|
|
285
278
|
logger.info(f"Scrambling {phenopacket_path}.")
|
|
286
|
-
HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopacket(
|
|
287
|
-
output_dir, phenopacket_path
|
|
288
|
-
)
|
|
279
|
+
HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopacket(output_dir, phenopacket_path)
|
|
289
280
|
elif phenopacket_dir is not None:
|
|
290
|
-
logger.info(
|
|
291
|
-
f"Scrambling {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}."
|
|
292
|
-
)
|
|
281
|
+
logger.info(f"Scrambling {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}.")
|
|
293
282
|
HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopackets(
|
|
294
283
|
output_dir,
|
|
295
284
|
phenopacket_dir,
|
|
@@ -6,7 +6,6 @@ import urllib.parse
|
|
|
6
6
|
from copy import copy
|
|
7
7
|
from dataclasses import dataclass
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import List, Union
|
|
10
9
|
|
|
11
10
|
from phenopackets import Family, File, Phenopacket
|
|
12
11
|
|
|
@@ -90,7 +89,7 @@ class VcfHeader:
|
|
|
90
89
|
chr_status: bool
|
|
91
90
|
|
|
92
91
|
|
|
93
|
-
def read_vcf(vcf_file: Path) -> List[str]:
|
|
92
|
+
def read_vcf(vcf_file: Path) -> list[str]:
|
|
94
93
|
"""
|
|
95
94
|
Read the contents of a VCF file into memory, handling both uncompressed and gzipped files.
|
|
96
95
|
|
|
@@ -102,9 +101,7 @@ def read_vcf(vcf_file: Path) -> List[str]:
|
|
|
102
101
|
"""
|
|
103
102
|
open_fn = gzip.open if is_gzipped(vcf_file) else open
|
|
104
103
|
vcf = open_fn(vcf_file)
|
|
105
|
-
vcf_contents = (
|
|
106
|
-
[line.decode() for line in vcf.readlines()] if is_gzipped(vcf_file) else vcf.readlines()
|
|
107
|
-
)
|
|
104
|
+
vcf_contents = [line.decode() for line in vcf.readlines()] if is_gzipped(vcf_file) else vcf.readlines()
|
|
108
105
|
vcf.close()
|
|
109
106
|
return vcf_contents
|
|
110
107
|
|
|
@@ -133,20 +130,14 @@ class VcfHeaderParser:
|
|
|
133
130
|
for line in self.vcf_contents:
|
|
134
131
|
if line.startswith("##contig=<ID"):
|
|
135
132
|
tokens = line.split(",")
|
|
136
|
-
chromosome = re.sub(
|
|
137
|
-
r"^.*?ID=", "", [token for token in tokens if "ID=" in token][0]
|
|
138
|
-
)
|
|
133
|
+
chromosome = re.sub(r"^.*?ID=", "", next(token for token in tokens if "ID=" in token))
|
|
139
134
|
if "chr" in chromosome:
|
|
140
135
|
chr_status = True
|
|
141
136
|
chromosome = chromosome.replace("chr", "")
|
|
142
|
-
contig_length = re.sub(
|
|
143
|
-
"[^0-9]+",
|
|
144
|
-
"",
|
|
145
|
-
[token for token in tokens if "length=" in token][0],
|
|
146
|
-
)
|
|
137
|
+
contig_length = re.sub("[^0-9]+", "", next(token for token in tokens if "length=" in token))
|
|
147
138
|
vcf_assembly[chromosome] = int(contig_length)
|
|
148
139
|
vcf_assembly = {i: vcf_assembly[i] for i in vcf_assembly if i.isdigit()}
|
|
149
|
-
assembly =
|
|
140
|
+
assembly = next(k for k, v in genome_assemblies.items() if v == vcf_assembly)
|
|
150
141
|
return assembly, chr_status
|
|
151
142
|
|
|
152
143
|
def parse_sample_id(self) -> str:
|
|
@@ -184,7 +175,7 @@ class VcfFile:
|
|
|
184
175
|
"""
|
|
185
176
|
|
|
186
177
|
vcf_file_name: str = None
|
|
187
|
-
vcf_contents:
|
|
178
|
+
vcf_contents: list[str] = None
|
|
188
179
|
vcf_header: VcfHeader = None
|
|
189
180
|
|
|
190
181
|
@staticmethod
|
|
@@ -205,7 +196,7 @@ class VcfFile:
|
|
|
205
196
|
|
|
206
197
|
def select_vcf_template(
|
|
207
198
|
phenopacket_path: Path,
|
|
208
|
-
proband_causative_variants:
|
|
199
|
+
proband_causative_variants: list[ProbandCausativeVariant],
|
|
209
200
|
hg19_vcf_info: VcfFile,
|
|
210
201
|
hg38_vcf_info: VcfFile,
|
|
211
202
|
hg19_vcf_dir: Path,
|
|
@@ -241,9 +232,7 @@ def select_vcf_template(
|
|
|
241
232
|
else:
|
|
242
233
|
raise InputError("Must specify hg38 template VCF!")
|
|
243
234
|
else:
|
|
244
|
-
raise IncompatibleGenomeAssemblyError(
|
|
245
|
-
proband_causative_variants[0].assembly, phenopacket_path
|
|
246
|
-
)
|
|
235
|
+
raise IncompatibleGenomeAssemblyError(proband_causative_variants[0].assembly, phenopacket_path)
|
|
247
236
|
|
|
248
237
|
|
|
249
238
|
def check_variant_assembly(
|
|
@@ -269,16 +258,10 @@ def check_variant_assembly(
|
|
|
269
258
|
raise ValueError("Too many genome assemblies!")
|
|
270
259
|
if phenopacket_assembly[0] not in compatible_genome_assembly:
|
|
271
260
|
raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
|
|
272
|
-
if (
|
|
273
|
-
phenopacket_assembly[0] in {"hg19", "GRCh37"}
|
|
274
|
-
and vcf_header.assembly not in {"hg19", "GRCh37"}
|
|
275
|
-
) or (
|
|
276
|
-
phenopacket_assembly[0] in {"hg38", "GRCh38"}
|
|
277
|
-
and vcf_header.assembly not in {"hg38", "GRCh38"}
|
|
261
|
+
if (phenopacket_assembly[0] in {"hg19", "GRCh37"} and vcf_header.assembly not in {"hg19", "GRCh37"}) or (
|
|
262
|
+
phenopacket_assembly[0] in {"hg38", "GRCh38"} and vcf_header.assembly not in {"hg38", "GRCh38"}
|
|
278
263
|
):
|
|
279
|
-
raise IncompatibleGenomeAssemblyError(
|
|
280
|
-
assembly=phenopacket_assembly, phenopacket=phenopacket_path
|
|
281
|
-
)
|
|
264
|
+
raise IncompatibleGenomeAssemblyError(assembly=phenopacket_assembly, phenopacket=phenopacket_path)
|
|
282
265
|
|
|
283
266
|
|
|
284
267
|
class VcfSpiker:
|
|
@@ -302,7 +285,7 @@ class VcfSpiker:
|
|
|
302
285
|
self.proband_causative_variants = proband_causative_variants
|
|
303
286
|
self.vcf_header = vcf_header
|
|
304
287
|
|
|
305
|
-
def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> List[str]:
|
|
288
|
+
def construct_variant_entry(self, proband_variant_data: ProbandCausativeVariant) -> list[str]:
|
|
306
289
|
"""
|
|
307
290
|
Construct variant entries.
|
|
308
291
|
|
|
@@ -337,7 +320,7 @@ class VcfSpiker:
|
|
|
337
320
|
genotype_codes[proband_variant_data.genotype.lower()] + "\n",
|
|
338
321
|
]
|
|
339
322
|
|
|
340
|
-
def construct_vcf_records(self, template_vcf_name: str) -> List[str]:
|
|
323
|
+
def construct_vcf_records(self, template_vcf_name: str) -> list[str]:
|
|
341
324
|
"""
|
|
342
325
|
Construct updated VCF records by inserting spiked variants into the correct positions within the VCF.
|
|
343
326
|
|
|
@@ -353,8 +336,7 @@ class VcfSpiker:
|
|
|
353
336
|
matching_indices = [
|
|
354
337
|
i
|
|
355
338
|
for i, val in enumerate(updated_vcf_records)
|
|
356
|
-
if val.split("\t")[0] == variant_entry[0]
|
|
357
|
-
and int(val.split("\t")[1]) < int(variant_entry[1])
|
|
339
|
+
if val.split("\t")[0] == variant_entry[0] and int(val.split("\t")[1]) < int(variant_entry[1])
|
|
358
340
|
]
|
|
359
341
|
if matching_indices:
|
|
360
342
|
logger.info(
|
|
@@ -372,7 +354,7 @@ class VcfSpiker:
|
|
|
372
354
|
updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry))
|
|
373
355
|
return updated_vcf_records
|
|
374
356
|
|
|
375
|
-
def construct_header(self, updated_vcf_records: List[str]) -> List[str]:
|
|
357
|
+
def construct_header(self, updated_vcf_records: list[str]) -> list[str]:
|
|
376
358
|
"""
|
|
377
359
|
Construct the header of the VCF.
|
|
378
360
|
|
|
@@ -394,7 +376,7 @@ class VcfSpiker:
|
|
|
394
376
|
updated_vcf_file.append(text)
|
|
395
377
|
return updated_vcf_file
|
|
396
378
|
|
|
397
|
-
def construct_vcf(self, template_vcf_name: str) -> List[str]:
|
|
379
|
+
def construct_vcf(self, template_vcf_name: str) -> list[str]:
|
|
398
380
|
"""
|
|
399
381
|
Construct the entire spiked VCF file by incorporating the spiked variants into the VCF.
|
|
400
382
|
|
|
@@ -412,7 +394,7 @@ class VcfWriter:
|
|
|
412
394
|
|
|
413
395
|
def __init__(
|
|
414
396
|
self,
|
|
415
|
-
vcf_contents:
|
|
397
|
+
vcf_contents: list[str],
|
|
416
398
|
spiked_vcf_file_path: Path,
|
|
417
399
|
):
|
|
418
400
|
"""
|
|
@@ -454,13 +436,13 @@ class VcfWriter:
|
|
|
454
436
|
|
|
455
437
|
|
|
456
438
|
def spike_vcf_contents(
|
|
457
|
-
phenopacket:
|
|
439
|
+
phenopacket: Phenopacket | Family,
|
|
458
440
|
phenopacket_path: Path,
|
|
459
441
|
hg19_vcf_info: VcfFile,
|
|
460
442
|
hg38_vcf_info: VcfFile,
|
|
461
443
|
hg19_vcf_dir: Path,
|
|
462
444
|
hg38_vcf_dir: Path,
|
|
463
|
-
) -> tuple[str,
|
|
445
|
+
) -> tuple[str, list[str]]:
|
|
464
446
|
"""
|
|
465
447
|
Spike VCF records with variants obtained from a Phenopacket or Family.
|
|
466
448
|
|
|
@@ -486,9 +468,7 @@ def spike_vcf_contents(
|
|
|
486
468
|
hg19_vcf_dir,
|
|
487
469
|
hg38_vcf_dir,
|
|
488
470
|
)
|
|
489
|
-
check_variant_assembly(
|
|
490
|
-
phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
|
|
491
|
-
)
|
|
471
|
+
check_variant_assembly(phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path)
|
|
492
472
|
return (
|
|
493
473
|
chosen_template_vcf.vcf_header.assembly,
|
|
494
474
|
VcfSpiker(
|
|
@@ -501,7 +481,7 @@ def spike_vcf_contents(
|
|
|
501
481
|
|
|
502
482
|
def generate_spiked_vcf_file(
|
|
503
483
|
output_dir: Path,
|
|
504
|
-
phenopacket:
|
|
484
|
+
phenopacket: Phenopacket | Family,
|
|
505
485
|
phenopacket_path: Path,
|
|
506
486
|
hg19_vcf_info: VcfFile,
|
|
507
487
|
hg38_vcf_info: VcfFile,
|
|
@@ -566,9 +546,7 @@ def spike_and_update_phenopacket(
|
|
|
566
546
|
hg19_vcf_dir,
|
|
567
547
|
hg38_vcf_dir,
|
|
568
548
|
)
|
|
569
|
-
updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
|
|
570
|
-
spiked_vcf_file_message
|
|
571
|
-
)
|
|
549
|
+
updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(spiked_vcf_file_message)
|
|
572
550
|
write_phenopacket(updated_phenopacket, phenopacket_path)
|
|
573
551
|
|
|
574
552
|
|
|
@@ -598,9 +576,7 @@ def create_spiked_vcf(
|
|
|
598
576
|
raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
|
|
599
577
|
hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
|
|
600
578
|
hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
|
|
601
|
-
spike_and_update_phenopacket(
|
|
602
|
-
hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
|
|
603
|
-
)
|
|
579
|
+
spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path)
|
|
604
580
|
|
|
605
581
|
|
|
606
582
|
def create_spiked_vcfs(
|
|
@@ -625,12 +601,7 @@ def create_spiked_vcfs(
|
|
|
625
601
|
Raises:
|
|
626
602
|
InputError: If both hg19_template_vcf and hg38_template_vcf are None.
|
|
627
603
|
"""
|
|
628
|
-
if (
|
|
629
|
-
hg19_template_vcf is None
|
|
630
|
-
and hg38_template_vcf is None
|
|
631
|
-
and hg19_vcf_dir is None
|
|
632
|
-
and hg38_vcf_dir is None
|
|
633
|
-
):
|
|
604
|
+
if hg19_template_vcf is None and hg38_template_vcf is None and hg19_vcf_dir is None and hg38_vcf_dir is None:
|
|
634
605
|
raise InputError("Need to specify a VCF!")
|
|
635
606
|
hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
|
|
636
607
|
hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
|
|
@@ -677,9 +648,7 @@ def spike_vcfs(
|
|
|
677
648
|
hg38_vcf_dir,
|
|
678
649
|
)
|
|
679
650
|
elif phenopacket_dir is not None:
|
|
680
|
-
logger.info(
|
|
681
|
-
f"Spiking variants from {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}."
|
|
682
|
-
)
|
|
651
|
+
logger.info(f"Spiking variants from {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}.")
|
|
683
652
|
create_spiked_vcfs(
|
|
684
653
|
output_dir,
|
|
685
654
|
phenopacket_dir,
|
|
@@ -21,19 +21,18 @@ class MutuallyExclusiveOptionError(Option):
|
|
|
21
21
|
help_ = kwargs.get("help", "")
|
|
22
22
|
if self.mutually_exclusive:
|
|
23
23
|
ex_str = ", ".join(self.mutually_exclusive)
|
|
24
|
-
kwargs["help"] = help_ + (
|
|
25
|
-
|
|
26
|
-
)
|
|
27
|
-
super(MutuallyExclusiveOptionError, self).__init__(*args, **kwargs)
|
|
24
|
+
kwargs["help"] = help_ + (" NOTE: This argument is mutually exclusive with arguments: [" + ex_str + "].")
|
|
25
|
+
super().__init__(*args, **kwargs)
|
|
28
26
|
|
|
29
27
|
def handle_parse_result(self, ctx, opts, args):
|
|
30
28
|
if self.mutually_exclusive.intersection(opts) and self.name in opts:
|
|
31
29
|
raise UsageError(
|
|
32
|
-
"Illegal usage: `{}` is mutually exclusive with "
|
|
33
|
-
|
|
30
|
+
"Illegal usage: `{}` is mutually exclusive with arguments `{}`.".format(
|
|
31
|
+
self.name, ", ".join(self.mutually_exclusive)
|
|
32
|
+
)
|
|
34
33
|
)
|
|
35
34
|
|
|
36
|
-
return super(MutuallyExclusiveOptionError, self).handle_parse_result(ctx, opts, args)
|
|
35
|
+
return super().handle_parse_result(ctx, opts, args)
|
|
37
36
|
|
|
38
37
|
|
|
39
38
|
class IncorrectFileFormatError(Exception):
|