pheval 0.3.6__tar.gz → 0.3.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- {pheval-0.3.6 → pheval-0.3.8}/PKG-INFO +1 -1
- {pheval-0.3.6 → pheval-0.3.8}/pyproject.toml +1 -1
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/disease_prioritisation_analysis.py +7 -14
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/gene_prioritisation_analysis.py +42 -16
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/parse_pheval_result.py +8 -1
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/variant_prioritisation_analysis.py +10 -17
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/cli_pheval_utils.py +79 -12
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/post_processing/post_processing.py +8 -7
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/create_spiked_vcf.py +104 -13
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/prepare_corpus.py +13 -1
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/file_utils.py +0 -29
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/phenopacket_utils.py +5 -3
- {pheval-0.3.6 → pheval-0.3.8}/LICENSE +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/README.md +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/analysis.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/benchmark_generator.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/benchmarking_data.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/binary_classification_stats.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/generate_plots.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/generate_summary_outputs.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/parse_benchmark_summary.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/prioritisation_rank_recorder.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/prioritisation_result_types.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/rank_stats.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/run_data_parser.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/cli.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/cli_pheval.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/config_parser.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/constants.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/implementations/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/infra/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/infra/exomiserdb.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/post_processing/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/create_noisy_phenopackets.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/custom_exceptions.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/update_phenopacket.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/resources/hgnc_complete_set.txt +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/run_metadata.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/runners/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/runners/runner.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/__init__.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/docs_gen.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/docs_gen.sh +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/exomiser.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/semsim_utils.py +0 -0
- {pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/utils.py +0 -0
|
@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResu
|
|
|
10
10
|
from pheval.analyse.rank_stats import RankStats
|
|
11
11
|
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
|
|
12
12
|
from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
|
|
13
|
-
from pheval.utils.file_utils import (
|
|
14
|
-
all_files,
|
|
15
|
-
files_with_suffix,
|
|
16
|
-
obtain_phenopacket_path_from_pheval_result,
|
|
17
|
-
)
|
|
13
|
+
from pheval.utils.file_utils import all_files
|
|
18
14
|
from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
|
|
19
15
|
|
|
20
16
|
|
|
@@ -217,7 +213,7 @@ def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
|
|
|
217
213
|
|
|
218
214
|
|
|
219
215
|
def assess_phenopacket_disease_prioritisation(
|
|
220
|
-
|
|
216
|
+
phenopacket_path: Path,
|
|
221
217
|
score_order: str,
|
|
222
218
|
results_dir_and_input: TrackInputOutputDirectories,
|
|
223
219
|
threshold: float,
|
|
@@ -230,7 +226,7 @@ def assess_phenopacket_disease_prioritisation(
|
|
|
230
226
|
against the recorded causative diseases for a proband in the Phenopacket.
|
|
231
227
|
|
|
232
228
|
Args:
|
|
233
|
-
|
|
229
|
+
phenopacket_path (Path): Path to the Phenopacket.
|
|
234
230
|
score_order (str): The order in which scores are arranged, either ascending or descending.
|
|
235
231
|
results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
|
|
236
232
|
threshold (float): Threshold for assessment.
|
|
@@ -238,8 +234,8 @@ def assess_phenopacket_disease_prioritisation(
|
|
|
238
234
|
disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
|
|
239
235
|
disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
240
236
|
"""
|
|
241
|
-
|
|
242
|
-
|
|
237
|
+
standardised_disease_result = results_dir_and_input.results_dir.joinpath(
|
|
238
|
+
f"pheval_disease_results/{phenopacket_path.stem}-pheval_disease_result.tsv"
|
|
243
239
|
)
|
|
244
240
|
pheval_disease_result = read_standardised_result(standardised_disease_result)
|
|
245
241
|
proband_diseases = _obtain_causative_diseases(phenopacket_path)
|
|
@@ -276,12 +272,9 @@ def benchmark_disease_prioritisation(
|
|
|
276
272
|
"""
|
|
277
273
|
disease_rank_stats = RankStats()
|
|
278
274
|
disease_binary_classification_stats = BinaryClassificationStats()
|
|
279
|
-
for
|
|
280
|
-
results_directory_and_input.results_dir.joinpath("pheval_disease_results/"),
|
|
281
|
-
".tsv",
|
|
282
|
-
):
|
|
275
|
+
for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
|
|
283
276
|
assess_phenopacket_disease_prioritisation(
|
|
284
|
-
|
|
277
|
+
phenopacket_path,
|
|
285
278
|
score_order,
|
|
286
279
|
results_directory_and_input,
|
|
287
280
|
threshold,
|
|
@@ -1,6 +1,8 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import re
|
|
1
3
|
from collections import defaultdict
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
from typing import List
|
|
5
|
+
from typing import List, Union
|
|
4
6
|
|
|
5
7
|
from pheval.analyse.benchmarking_data import BenchmarkRunResults
|
|
6
8
|
from pheval.analyse.binary_classification_stats import BinaryClassificationStats
|
|
@@ -10,11 +12,7 @@ from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
|
|
|
10
12
|
from pheval.analyse.rank_stats import RankStats
|
|
11
13
|
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
|
|
12
14
|
from pheval.post_processing.post_processing import RankedPhEvalGeneResult
|
|
13
|
-
from pheval.utils.file_utils import (
|
|
14
|
-
all_files,
|
|
15
|
-
files_with_suffix,
|
|
16
|
-
obtain_phenopacket_path_from_pheval_result,
|
|
17
|
-
)
|
|
15
|
+
from pheval.utils.file_utils import all_files
|
|
18
16
|
from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
|
|
19
17
|
|
|
20
18
|
|
|
@@ -144,6 +142,24 @@ class AssessGenePrioritisation:
|
|
|
144
142
|
)
|
|
145
143
|
)
|
|
146
144
|
|
|
145
|
+
@staticmethod
|
|
146
|
+
def _check_string_representation(entity: str) -> Union[List[str], str]:
|
|
147
|
+
"""
|
|
148
|
+
Check if the input string is a representation of a list and returns the list if true, otherwise the string.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
entity (str): The input entity to check.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
Union[List[str], str]: A list if the input string is a list representation, otherwise
|
|
155
|
+
the original string.
|
|
156
|
+
"""
|
|
157
|
+
list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*\]$")
|
|
158
|
+
if list_pattern.match(entity):
|
|
159
|
+
return ast.literal_eval(entity)
|
|
160
|
+
else:
|
|
161
|
+
return entity
|
|
162
|
+
|
|
147
163
|
def assess_gene_prioritisation(
|
|
148
164
|
self,
|
|
149
165
|
rank_stats: RankStats,
|
|
@@ -165,9 +181,21 @@ class AssessGenePrioritisation:
|
|
|
165
181
|
rank_stats.total += 1
|
|
166
182
|
gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
|
|
167
183
|
for standardised_gene_result in self.standardised_gene_results:
|
|
184
|
+
gene_identifier = self._check_string_representation(
|
|
185
|
+
standardised_gene_result.gene_identifier
|
|
186
|
+
)
|
|
187
|
+
gene_symbol = self._check_string_representation(
|
|
188
|
+
standardised_gene_result.gene_symbol
|
|
189
|
+
)
|
|
168
190
|
if (
|
|
169
|
-
|
|
170
|
-
|
|
191
|
+
isinstance(gene_identifier, list)
|
|
192
|
+
and gene.gene_identifier in gene_identifier
|
|
193
|
+
or isinstance(gene_identifier, str)
|
|
194
|
+
and gene.gene_identifier == str
|
|
195
|
+
or isinstance(gene_symbol, list)
|
|
196
|
+
and gene.gene_symbol in gene_symbol
|
|
197
|
+
or isinstance(gene_symbol, str)
|
|
198
|
+
and gene.gene_symbol == gene_symbol
|
|
171
199
|
):
|
|
172
200
|
gene_match = self._record_matched_gene(
|
|
173
201
|
gene, rank_stats, standardised_gene_result
|
|
@@ -209,7 +237,7 @@ def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene
|
|
|
209
237
|
|
|
210
238
|
|
|
211
239
|
def assess_phenopacket_gene_prioritisation(
|
|
212
|
-
|
|
240
|
+
phenopacket_path: Path,
|
|
213
241
|
score_order: str,
|
|
214
242
|
results_dir_and_input: TrackInputOutputDirectories,
|
|
215
243
|
threshold: float,
|
|
@@ -222,7 +250,7 @@ def assess_phenopacket_gene_prioritisation(
|
|
|
222
250
|
against the recorded causative genes for a proband in the Phenopacket.
|
|
223
251
|
|
|
224
252
|
Args:
|
|
225
|
-
|
|
253
|
+
phenopacket_path (Path): Path to the Phenopacket.
|
|
226
254
|
score_order (str): The order in which scores are arranged, either ascending or descending.
|
|
227
255
|
results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
|
|
228
256
|
threshold (float): Threshold for assessment.
|
|
@@ -230,8 +258,8 @@ def assess_phenopacket_gene_prioritisation(
|
|
|
230
258
|
gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
|
|
231
259
|
gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
232
260
|
"""
|
|
233
|
-
|
|
234
|
-
|
|
261
|
+
standardised_gene_result = results_dir_and_input.results_dir.joinpath(
|
|
262
|
+
f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
|
|
235
263
|
)
|
|
236
264
|
pheval_gene_result = read_standardised_result(standardised_gene_result)
|
|
237
265
|
proband_causative_genes = _obtain_causative_genes(phenopacket_path)
|
|
@@ -266,11 +294,9 @@ def benchmark_gene_prioritisation(
|
|
|
266
294
|
"""
|
|
267
295
|
gene_rank_stats = RankStats()
|
|
268
296
|
gene_binary_classification_stats = BinaryClassificationStats()
|
|
269
|
-
for
|
|
270
|
-
results_directory_and_input.results_dir.joinpath("pheval_gene_results/"), ".tsv"
|
|
271
|
-
):
|
|
297
|
+
for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
|
|
272
298
|
assess_phenopacket_gene_prioritisation(
|
|
273
|
-
|
|
299
|
+
phenopacket_path,
|
|
274
300
|
score_order,
|
|
275
301
|
results_directory_and_input,
|
|
276
302
|
threshold,
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import logging
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
from typing import List
|
|
3
4
|
|
|
@@ -5,6 +6,8 @@ import pandas as pd
|
|
|
5
6
|
|
|
6
7
|
from pheval.post_processing.post_processing import PhEvalResult
|
|
7
8
|
|
|
9
|
+
info_log = logging.getLogger("info")
|
|
10
|
+
|
|
8
11
|
|
|
9
12
|
def read_standardised_result(standardised_result_path: Path) -> List[dict]:
|
|
10
13
|
"""
|
|
@@ -16,7 +19,11 @@ def read_standardised_result(standardised_result_path: Path) -> List[dict]:
|
|
|
16
19
|
Returns:
|
|
17
20
|
List[dict]: A list of dictionaries representing the content of the standardised result file.
|
|
18
21
|
"""
|
|
19
|
-
|
|
22
|
+
if standardised_result_path.is_file():
|
|
23
|
+
return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records")
|
|
24
|
+
else:
|
|
25
|
+
info_log.info(f"Could not find {standardised_result_path}")
|
|
26
|
+
return pd.DataFrame().to_dict("records")
|
|
20
27
|
|
|
21
28
|
|
|
22
29
|
def parse_pheval_result(
|
|
@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import VariantPrioritisationResu
|
|
|
10
10
|
from pheval.analyse.rank_stats import RankStats
|
|
11
11
|
from pheval.analyse.run_data_parser import TrackInputOutputDirectories
|
|
12
12
|
from pheval.post_processing.post_processing import RankedPhEvalVariantResult
|
|
13
|
-
from pheval.utils.file_utils import (
|
|
14
|
-
all_files,
|
|
15
|
-
files_with_suffix,
|
|
16
|
-
obtain_phenopacket_path_from_pheval_result,
|
|
17
|
-
)
|
|
13
|
+
from pheval.utils.file_utils import all_files
|
|
18
14
|
from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader
|
|
19
15
|
|
|
20
16
|
|
|
@@ -166,8 +162,8 @@ class AssessVariantPrioritisation:
|
|
|
166
162
|
variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
|
|
167
163
|
for result in self.standardised_variant_results:
|
|
168
164
|
result_variant = GenomicVariant(
|
|
169
|
-
chrom=result.chromosome,
|
|
170
|
-
pos=result.start,
|
|
165
|
+
chrom=str(result.chromosome),
|
|
166
|
+
pos=int(result.start),
|
|
171
167
|
ref=result.ref,
|
|
172
168
|
alt=result.alt,
|
|
173
169
|
)
|
|
@@ -211,7 +207,7 @@ def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
|
|
|
211
207
|
|
|
212
208
|
|
|
213
209
|
def assess_phenopacket_variant_prioritisation(
|
|
214
|
-
|
|
210
|
+
phenopacket_path: Path,
|
|
215
211
|
score_order: str,
|
|
216
212
|
results_dir_and_input: TrackInputOutputDirectories,
|
|
217
213
|
threshold: float,
|
|
@@ -224,7 +220,7 @@ def assess_phenopacket_variant_prioritisation(
|
|
|
224
220
|
against the recorded causative variants for a proband in the Phenopacket.
|
|
225
221
|
|
|
226
222
|
Args:
|
|
227
|
-
|
|
223
|
+
phenopacket_path (Path): Path to the Phenopacket.
|
|
228
224
|
score_order (str): The order in which scores are arranged, either ascending or descending.
|
|
229
225
|
results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
|
|
230
226
|
threshold (float): Threshold for assessment.
|
|
@@ -232,10 +228,10 @@ def assess_phenopacket_variant_prioritisation(
|
|
|
232
228
|
variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
|
|
233
229
|
variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
|
|
234
230
|
"""
|
|
235
|
-
phenopacket_path = obtain_phenopacket_path_from_pheval_result(
|
|
236
|
-
standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir)
|
|
237
|
-
)
|
|
238
231
|
proband_causative_variants = _obtain_causative_variants(phenopacket_path)
|
|
232
|
+
standardised_variant_result = results_dir_and_input.results_dir.joinpath(
|
|
233
|
+
f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
|
|
234
|
+
)
|
|
239
235
|
pheval_variant_result = read_standardised_result(standardised_variant_result)
|
|
240
236
|
AssessVariantPrioritisation(
|
|
241
237
|
phenopacket_path,
|
|
@@ -270,12 +266,9 @@ def benchmark_variant_prioritisation(
|
|
|
270
266
|
"""
|
|
271
267
|
variant_rank_stats = RankStats()
|
|
272
268
|
variant_binary_classification_stats = BinaryClassificationStats()
|
|
273
|
-
for
|
|
274
|
-
results_directory_and_input.results_dir.joinpath("pheval_variant_results/"),
|
|
275
|
-
".tsv",
|
|
276
|
-
):
|
|
269
|
+
for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
|
|
277
270
|
assess_phenopacket_variant_prioritisation(
|
|
278
|
-
|
|
271
|
+
phenopacket_path,
|
|
279
272
|
score_order,
|
|
280
273
|
results_directory_and_input,
|
|
281
274
|
threshold,
|
|
@@ -260,6 +260,8 @@ def update_phenopackets_command(
|
|
|
260
260
|
required=False,
|
|
261
261
|
help="Template hg19 VCF file",
|
|
262
262
|
type=Path,
|
|
263
|
+
cls=MutuallyExclusiveOptionError,
|
|
264
|
+
mutually_exclusive=["hg19_vcf_dir"],
|
|
263
265
|
)
|
|
264
266
|
@click.option(
|
|
265
267
|
"--hg38-template-vcf",
|
|
@@ -268,6 +270,28 @@ def update_phenopackets_command(
|
|
|
268
270
|
required=False,
|
|
269
271
|
help="Template hg38 VCF file",
|
|
270
272
|
type=Path,
|
|
273
|
+
cls=MutuallyExclusiveOptionError,
|
|
274
|
+
mutually_exclusive=["hg38_vcf_dir"],
|
|
275
|
+
)
|
|
276
|
+
@click.option(
|
|
277
|
+
"--hg19-vcf-dir",
|
|
278
|
+
"-hg19-dir",
|
|
279
|
+
metavar="PATH",
|
|
280
|
+
required=False,
|
|
281
|
+
help="Path to directory containing hg19 VCF templates.",
|
|
282
|
+
type=Path,
|
|
283
|
+
cls=MutuallyExclusiveOptionError,
|
|
284
|
+
mutually_exclusive=["hg19_template_vcf"],
|
|
285
|
+
)
|
|
286
|
+
@click.option(
|
|
287
|
+
"--hg38-vcf-dir",
|
|
288
|
+
"-hg38-dir",
|
|
289
|
+
metavar="PATH",
|
|
290
|
+
required=False,
|
|
291
|
+
help="Path to directory containing hg38 VCF templates.",
|
|
292
|
+
type=Path,
|
|
293
|
+
cls=MutuallyExclusiveOptionError,
|
|
294
|
+
mutually_exclusive=["hg38_template_vcf"],
|
|
271
295
|
)
|
|
272
296
|
@click.option(
|
|
273
297
|
"--output-dir",
|
|
@@ -284,6 +308,8 @@ def create_spiked_vcfs_command(
|
|
|
284
308
|
output_dir: Path,
|
|
285
309
|
hg19_template_vcf: Path = None,
|
|
286
310
|
hg38_template_vcf: Path = None,
|
|
311
|
+
hg19_vcf_dir: Path = None,
|
|
312
|
+
hg38_vcf_dir: Path = None,
|
|
287
313
|
):
|
|
288
314
|
"""
|
|
289
315
|
Create spiked VCF from either a Phenopacket or a Phenopacket directory.
|
|
@@ -294,10 +320,20 @@ def create_spiked_vcfs_command(
|
|
|
294
320
|
output_dir (Path): The directory to store the generated spiked VCF file(s).
|
|
295
321
|
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
|
|
296
322
|
hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
|
|
323
|
+
hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional).
|
|
324
|
+
hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional).
|
|
297
325
|
"""
|
|
298
326
|
if phenopacket_path is None and phenopacket_dir is None:
|
|
299
327
|
raise InputError("Either a phenopacket or phenopacket directory must be specified")
|
|
300
|
-
spike_vcfs(
|
|
328
|
+
spike_vcfs(
|
|
329
|
+
output_dir,
|
|
330
|
+
phenopacket_path,
|
|
331
|
+
phenopacket_dir,
|
|
332
|
+
hg19_template_vcf,
|
|
333
|
+
hg38_template_vcf,
|
|
334
|
+
hg19_vcf_dir,
|
|
335
|
+
hg38_vcf_dir,
|
|
336
|
+
)
|
|
301
337
|
|
|
302
338
|
|
|
303
339
|
@click.command()
|
|
@@ -656,6 +692,8 @@ def generate_stats_plot(
|
|
|
656
692
|
required=False,
|
|
657
693
|
help="Template hg19 VCF file",
|
|
658
694
|
type=Path,
|
|
695
|
+
cls=MutuallyExclusiveOptionError,
|
|
696
|
+
mutually_exclusive=["hg19_vcf_dir"],
|
|
659
697
|
)
|
|
660
698
|
@click.option(
|
|
661
699
|
"--hg38-template-vcf",
|
|
@@ -664,6 +702,28 @@ def generate_stats_plot(
|
|
|
664
702
|
required=False,
|
|
665
703
|
help="Template hg38 VCF file",
|
|
666
704
|
type=Path,
|
|
705
|
+
cls=MutuallyExclusiveOptionError,
|
|
706
|
+
mutually_exclusive=["hg38_vcf_dir"],
|
|
707
|
+
)
|
|
708
|
+
@click.option(
|
|
709
|
+
"--hg19-vcf-dir",
|
|
710
|
+
"-hg19-dir",
|
|
711
|
+
metavar="PATH",
|
|
712
|
+
required=False,
|
|
713
|
+
help="Path to directory containing hg19 VCF templates.",
|
|
714
|
+
type=Path,
|
|
715
|
+
cls=MutuallyExclusiveOptionError,
|
|
716
|
+
mutually_exclusive=["hg19_template_vcf"],
|
|
717
|
+
)
|
|
718
|
+
@click.option(
|
|
719
|
+
"--hg38-vcf-dir",
|
|
720
|
+
"-hg38-dir",
|
|
721
|
+
metavar="PATH",
|
|
722
|
+
required=False,
|
|
723
|
+
help="Path to directory containing hg38 VCF templates.",
|
|
724
|
+
type=Path,
|
|
725
|
+
cls=MutuallyExclusiveOptionError,
|
|
726
|
+
mutually_exclusive=["hg38_template_vcf"],
|
|
667
727
|
)
|
|
668
728
|
@click.option(
|
|
669
729
|
"--output-dir",
|
|
@@ -682,23 +742,28 @@ def prepare_corpus_command(
|
|
|
682
742
|
gene_identifier: str,
|
|
683
743
|
hg19_template_vcf: Path,
|
|
684
744
|
hg38_template_vcf: Path,
|
|
745
|
+
hg19_vcf_dir: Path,
|
|
746
|
+
hg38_vcf_dir: Path,
|
|
685
747
|
output_dir: Path,
|
|
686
748
|
):
|
|
687
749
|
"""
|
|
688
750
|
Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
|
|
689
751
|
gene identifiers.
|
|
690
752
|
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
753
|
+
Args:
|
|
754
|
+
phenopacket_dir (Path): The path to the directory containing Phenopackets.
|
|
755
|
+
variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
|
|
756
|
+
gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
|
|
757
|
+
disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
|
|
758
|
+
gene_identifier (str): Identifier for updating gene identifiers, if applicable.
|
|
759
|
+
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
|
|
760
|
+
hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
|
|
761
|
+
hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional).
|
|
762
|
+
hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional).
|
|
763
|
+
output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
|
|
764
|
+
Notes:
|
|
765
|
+
To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
|
|
766
|
+
hg19_vcf_dir or hg38_vcf_dir is required.
|
|
702
767
|
"""
|
|
703
768
|
prepare_corpus(
|
|
704
769
|
phenopacket_dir,
|
|
@@ -708,5 +773,7 @@ def prepare_corpus_command(
|
|
|
708
773
|
gene_identifier,
|
|
709
774
|
hg19_template_vcf,
|
|
710
775
|
hg38_template_vcf,
|
|
776
|
+
hg19_vcf_dir,
|
|
777
|
+
hg38_vcf_dir,
|
|
711
778
|
output_dir,
|
|
712
779
|
)
|
|
@@ -3,6 +3,7 @@ import operator
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from enum import Enum
|
|
5
5
|
from pathlib import Path
|
|
6
|
+
from typing import List, Union
|
|
6
7
|
|
|
7
8
|
import pandas as pd
|
|
8
9
|
|
|
@@ -30,8 +31,8 @@ class PhEvalResult:
|
|
|
30
31
|
class PhEvalGeneResult(PhEvalResult):
|
|
31
32
|
"""Minimal data required from tool-specific output for gene prioritisation result
|
|
32
33
|
Args:
|
|
33
|
-
gene_symbol (str): The gene symbol for the result entry
|
|
34
|
-
gene_identifier (str): The ENSEMBL gene identifier for the result entry
|
|
34
|
+
gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry
|
|
35
|
+
gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry
|
|
35
36
|
score (float): The score for the gene result entry
|
|
36
37
|
Notes:
|
|
37
38
|
While we recommend providing the gene identifier in the ENSEMBL namespace,
|
|
@@ -39,8 +40,8 @@ class PhEvalGeneResult(PhEvalResult):
|
|
|
39
40
|
in the analysis.
|
|
40
41
|
"""
|
|
41
42
|
|
|
42
|
-
gene_symbol: str
|
|
43
|
-
gene_identifier: str
|
|
43
|
+
gene_symbol: Union[List[str], str]
|
|
44
|
+
gene_identifier: Union[List[str], str]
|
|
44
45
|
score: float
|
|
45
46
|
|
|
46
47
|
|
|
@@ -375,11 +376,11 @@ def generate_pheval_result(
|
|
|
375
376
|
info_log.warning(f"No results found for {tool_result_path.name}")
|
|
376
377
|
return
|
|
377
378
|
ranked_pheval_result = _create_pheval_result(pheval_result, sort_order_str)
|
|
378
|
-
if all(isinstance(result,
|
|
379
|
+
if all(isinstance(result, PhEvalGeneResult) for result in pheval_result):
|
|
379
380
|
_write_pheval_gene_result(ranked_pheval_result, output_dir, tool_result_path)
|
|
380
|
-
elif all(isinstance(result,
|
|
381
|
+
elif all(isinstance(result, PhEvalVariantResult) for result in pheval_result):
|
|
381
382
|
_write_pheval_variant_result(ranked_pheval_result, output_dir, tool_result_path)
|
|
382
|
-
elif all(isinstance(result,
|
|
383
|
+
elif all(isinstance(result, PhEvalDiseaseResult) for result in pheval_result):
|
|
383
384
|
_write_pheval_disease_result(ranked_pheval_result, output_dir, tool_result_path)
|
|
384
385
|
else:
|
|
385
386
|
raise ValueError("Results are not all of the same type.")
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import gzip
|
|
2
2
|
import logging
|
|
3
|
+
import random
|
|
3
4
|
import re
|
|
4
5
|
import urllib.parse
|
|
5
6
|
from copy import copy
|
|
@@ -10,7 +11,7 @@ from typing import List, Union
|
|
|
10
11
|
from phenopackets import Family, File, Phenopacket
|
|
11
12
|
|
|
12
13
|
from pheval.prepare.custom_exceptions import InputError
|
|
13
|
-
from pheval.utils.file_utils import files_with_suffix, is_gzipped
|
|
14
|
+
from pheval.utils.file_utils import all_files, files_with_suffix, is_gzipped
|
|
14
15
|
from pheval.utils.phenopacket_utils import (
|
|
15
16
|
IncompatibleGenomeAssemblyError,
|
|
16
17
|
PhenopacketRebuilder,
|
|
@@ -207,6 +208,8 @@ def select_vcf_template(
|
|
|
207
208
|
proband_causative_variants: List[ProbandCausativeVariant],
|
|
208
209
|
hg19_vcf_info: VcfFile,
|
|
209
210
|
hg38_vcf_info: VcfFile,
|
|
211
|
+
hg19_vcf_dir: Path,
|
|
212
|
+
hg38_vcf_dir: Path,
|
|
210
213
|
) -> VcfFile:
|
|
211
214
|
"""
|
|
212
215
|
Select the appropriate VCF template based on the assembly information of the proband causative variants.
|
|
@@ -216,6 +219,8 @@ def select_vcf_template(
|
|
|
216
219
|
proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
|
|
217
220
|
hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
|
|
218
221
|
hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
|
|
222
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
|
|
223
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
|
|
219
224
|
|
|
220
225
|
Returns:
|
|
221
226
|
VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.
|
|
@@ -224,11 +229,15 @@ def select_vcf_template(
|
|
|
224
229
|
if proband_causative_variants[0].assembly in ["hg19", "GRCh37"]:
|
|
225
230
|
if hg19_vcf_info:
|
|
226
231
|
return hg19_vcf_info
|
|
232
|
+
elif hg19_vcf_dir:
|
|
233
|
+
return VcfFile.populate_fields(random.choice(all_files(hg19_vcf_dir)))
|
|
227
234
|
else:
|
|
228
235
|
raise InputError("Must specify hg19 template VCF!")
|
|
229
236
|
elif proband_causative_variants[0].assembly in ["hg38", "GRCh38"]:
|
|
230
237
|
if hg38_vcf_info:
|
|
231
238
|
return hg38_vcf_info
|
|
239
|
+
elif hg38_vcf_dir:
|
|
240
|
+
return VcfFile.populate_fields(random.choice(all_files(hg38_vcf_dir)))
|
|
232
241
|
else:
|
|
233
242
|
raise InputError("Must specify hg38 template VCF!")
|
|
234
243
|
else:
|
|
@@ -445,6 +454,8 @@ def spike_vcf_contents(
|
|
|
445
454
|
phenopacket_path: Path,
|
|
446
455
|
hg19_vcf_info: VcfFile,
|
|
447
456
|
hg38_vcf_info: VcfFile,
|
|
457
|
+
hg19_vcf_dir: Path,
|
|
458
|
+
hg38_vcf_dir: Path,
|
|
448
459
|
) -> tuple[str, List[str]]:
|
|
449
460
|
"""
|
|
450
461
|
Spike VCF records with variants obtained from a Phenopacket or Family.
|
|
@@ -454,6 +465,8 @@ def spike_vcf_contents(
|
|
|
454
465
|
phenopacket_path (Path): Path to the Phenopacket file.
|
|
455
466
|
hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
|
|
456
467
|
hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
|
|
468
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
|
|
469
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
|
|
457
470
|
|
|
458
471
|
Returns:
|
|
459
472
|
A tuple containing:
|
|
@@ -462,7 +475,12 @@ def spike_vcf_contents(
|
|
|
462
475
|
"""
|
|
463
476
|
phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
|
|
464
477
|
chosen_template_vcf = select_vcf_template(
|
|
465
|
-
phenopacket_path,
|
|
478
|
+
phenopacket_path,
|
|
479
|
+
phenopacket_causative_variants,
|
|
480
|
+
hg19_vcf_info,
|
|
481
|
+
hg38_vcf_info,
|
|
482
|
+
hg19_vcf_dir,
|
|
483
|
+
hg38_vcf_dir,
|
|
466
484
|
)
|
|
467
485
|
check_variant_assembly(
|
|
468
486
|
phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
|
|
@@ -483,6 +501,8 @@ def generate_spiked_vcf_file(
|
|
|
483
501
|
phenopacket_path: Path,
|
|
484
502
|
hg19_vcf_info: VcfFile,
|
|
485
503
|
hg38_vcf_info: VcfFile,
|
|
504
|
+
hg19_vcf_dir: Path,
|
|
505
|
+
hg38_vcf_dir: Path,
|
|
486
506
|
) -> File:
|
|
487
507
|
"""
|
|
488
508
|
Write spiked VCF contents to a new file.
|
|
@@ -493,13 +513,15 @@ def generate_spiked_vcf_file(
|
|
|
493
513
|
phenopacket_path (Path): Path to the Phenopacket file.
|
|
494
514
|
hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
|
|
495
515
|
hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
|
|
516
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
|
|
517
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
|
|
496
518
|
Returns:
|
|
497
519
|
File: The generated File object representing the newly created spiked VCF file.
|
|
498
520
|
"""
|
|
499
521
|
output_dir.mkdir(exist_ok=True)
|
|
500
522
|
info_log.info(f" Created a directory {output_dir}")
|
|
501
523
|
vcf_assembly, spiked_vcf = spike_vcf_contents(
|
|
502
|
-
phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
|
|
524
|
+
phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir
|
|
503
525
|
)
|
|
504
526
|
spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
|
|
505
527
|
VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
|
|
@@ -509,10 +531,38 @@ def generate_spiked_vcf_file(
|
|
|
509
531
|
)
|
|
510
532
|
|
|
511
533
|
|
|
512
|
-
def spike_and_update_phenopacket(
|
|
534
|
+
def spike_and_update_phenopacket(
|
|
535
|
+
hg19_vcf_info: VcfFile,
|
|
536
|
+
hg38_vcf_info: VcfFile,
|
|
537
|
+
hg19_vcf_dir: Path,
|
|
538
|
+
hg38_vcf_dir: Path,
|
|
539
|
+
output_dir: Path,
|
|
540
|
+
phenopacket_path: Path,
|
|
541
|
+
) -> None:
|
|
542
|
+
"""
|
|
543
|
+
Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket
|
|
544
|
+
accordingly, and write the updated Phenopacket to the specified output directory.
|
|
545
|
+
|
|
546
|
+
Args:
|
|
547
|
+
hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
|
|
548
|
+
hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
|
|
549
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
|
|
550
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
|
|
551
|
+
output_dir (Path): Directory where the updated Phenopacket will be saved.
|
|
552
|
+
phenopacket_path (Path): Path to the original Phenopacket file.
|
|
553
|
+
|
|
554
|
+
Returns:
|
|
555
|
+
None
|
|
556
|
+
"""
|
|
513
557
|
phenopacket = phenopacket_reader(phenopacket_path)
|
|
514
558
|
spiked_vcf_file_message = generate_spiked_vcf_file(
|
|
515
|
-
output_dir,
|
|
559
|
+
output_dir,
|
|
560
|
+
phenopacket,
|
|
561
|
+
phenopacket_path,
|
|
562
|
+
hg19_vcf_info,
|
|
563
|
+
hg38_vcf_info,
|
|
564
|
+
hg19_vcf_dir,
|
|
565
|
+
hg38_vcf_dir,
|
|
516
566
|
)
|
|
517
567
|
updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
|
|
518
568
|
spiked_vcf_file_message
|
|
@@ -521,7 +571,12 @@ def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, pheno
|
|
|
521
571
|
|
|
522
572
|
|
|
523
573
|
def create_spiked_vcf(
|
|
524
|
-
output_dir: Path,
|
|
574
|
+
output_dir: Path,
|
|
575
|
+
phenopacket_path: Path,
|
|
576
|
+
hg19_template_vcf: Path,
|
|
577
|
+
hg38_template_vcf: Path,
|
|
578
|
+
hg19_vcf_dir: Path,
|
|
579
|
+
hg38_vcf_dir: Path,
|
|
525
580
|
) -> None:
|
|
526
581
|
"""
|
|
527
582
|
Create a spiked VCF for a Phenopacket.
|
|
@@ -531,6 +586,8 @@ def create_spiked_vcf(
|
|
|
531
586
|
phenopacket_path (Path): Path to the Phenopacket file.
|
|
532
587
|
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
|
|
533
588
|
hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
|
|
589
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
|
|
590
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
|
|
534
591
|
|
|
535
592
|
Raises:
|
|
536
593
|
InputError: If both hg19_template_vcf and hg38_template_vcf are None.
|
|
@@ -539,11 +596,18 @@ def create_spiked_vcf(
|
|
|
539
596
|
raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
|
|
540
597
|
hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
|
|
541
598
|
hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
|
|
542
|
-
spike_and_update_phenopacket(
|
|
599
|
+
spike_and_update_phenopacket(
|
|
600
|
+
hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
|
|
601
|
+
)
|
|
543
602
|
|
|
544
603
|
|
|
545
604
|
def create_spiked_vcfs(
|
|
546
|
-
output_dir: Path,
|
|
605
|
+
output_dir: Path,
|
|
606
|
+
phenopacket_dir: Path,
|
|
607
|
+
hg19_template_vcf: Path,
|
|
608
|
+
hg38_template_vcf: Path,
|
|
609
|
+
hg19_vcf_dir: Path,
|
|
610
|
+
hg38_vcf_dir: Path,
|
|
547
611
|
) -> None:
|
|
548
612
|
"""
|
|
549
613
|
Create a spiked VCF for a directory of Phenopackets.
|
|
@@ -553,16 +617,25 @@ def create_spiked_vcfs(
|
|
|
553
617
|
phenopacket_dir (Path): Path to the Phenopacket directory.
|
|
554
618
|
hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
|
|
555
619
|
hg38_template_vcf (Path): Path to the template hg38 VCF file (optional).
|
|
620
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
|
|
621
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
|
|
556
622
|
|
|
557
623
|
Raises:
|
|
558
624
|
InputError: If hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir and hg38_vcf_dir are all None.
|
|
559
625
|
"""
|
|
560
|
-
if
|
|
561
|
-
|
|
626
|
+
if (
|
|
627
|
+
hg19_template_vcf is None
|
|
628
|
+
and hg38_template_vcf is None
|
|
629
|
+
and hg19_vcf_dir is None
|
|
630
|
+
and hg38_vcf_dir is None
|
|
631
|
+
):
|
|
632
|
+
raise InputError("Need to specify a VCF!")
|
|
562
633
|
hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
|
|
563
634
|
hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
|
|
564
635
|
for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
|
|
565
|
-
spike_and_update_phenopacket(
|
|
636
|
+
spike_and_update_phenopacket(
|
|
637
|
+
hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
|
|
638
|
+
)
|
|
566
639
|
|
|
567
640
|
|
|
568
641
|
def spike_vcfs(
|
|
@@ -571,6 +644,8 @@ def spike_vcfs(
|
|
|
571
644
|
phenopacket_dir: Path,
|
|
572
645
|
hg19_template_vcf: Path,
|
|
573
646
|
hg38_template_vcf: Path,
|
|
647
|
+
hg19_vcf_dir: Path,
|
|
648
|
+
hg38_vcf_dir: Path,
|
|
574
649
|
) -> None:
|
|
575
650
|
"""
|
|
576
651
|
Create spiked VCF from either a Phenopacket or a Phenopacket directory.
|
|
@@ -581,8 +656,24 @@ def spike_vcfs(
|
|
|
581
656
|
phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
|
|
582
657
|
hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
|
|
583
658
|
hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
|
|
659
|
+
hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
|
|
660
|
+
hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
|
|
584
661
|
"""
|
|
585
662
|
if phenopacket_path is not None:
|
|
586
|
-
create_spiked_vcf(
|
|
663
|
+
create_spiked_vcf(
|
|
664
|
+
output_dir,
|
|
665
|
+
phenopacket_path,
|
|
666
|
+
hg19_template_vcf,
|
|
667
|
+
hg38_template_vcf,
|
|
668
|
+
hg19_vcf_dir,
|
|
669
|
+
hg38_vcf_dir,
|
|
670
|
+
)
|
|
587
671
|
elif phenopacket_dir is not None:
|
|
588
|
-
create_spiked_vcfs(
|
|
672
|
+
create_spiked_vcfs(
|
|
673
|
+
output_dir,
|
|
674
|
+
phenopacket_dir,
|
|
675
|
+
hg19_template_vcf,
|
|
676
|
+
hg38_template_vcf,
|
|
677
|
+
hg19_vcf_dir,
|
|
678
|
+
hg38_vcf_dir,
|
|
679
|
+
)
|
|
@@ -18,6 +18,8 @@ def prepare_corpus(
|
|
|
18
18
|
gene_identifier: str,
|
|
19
19
|
hg19_template_vcf: Path,
|
|
20
20
|
hg38_template_vcf: Path,
|
|
21
|
+
hg19_vcf_dir: Path,
|
|
22
|
+
hg38_vcf_dir: Path,
|
|
21
23
|
output_dir: Path,
|
|
22
24
|
) -> None:
|
|
23
25
|
"""
|
|
@@ -34,7 +36,12 @@ def prepare_corpus(
|
|
|
34
36
|
VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
|
|
35
37
|
hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
|
|
36
38
|
VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
|
|
39
|
+
hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional).
|
|
40
|
+
hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional).
|
|
37
41
|
output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
|
|
42
|
+
Notes:
|
|
43
|
+
To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
|
|
44
|
+
hg19_vcf_dir or hg38_vcf_dir is required.
|
|
38
45
|
"""
|
|
39
46
|
output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
|
|
40
47
|
for phenopacket_path in all_files(phenopacket_dir):
|
|
@@ -65,7 +72,12 @@ def prepare_corpus(
|
|
|
65
72
|
if hg19_template_vcf or hg38_template_vcf:
|
|
66
73
|
output_dir.joinpath("vcf").mkdir(exist_ok=True)
|
|
67
74
|
create_spiked_vcf(
|
|
68
|
-
output_dir.joinpath("vcf"),
|
|
75
|
+
output_dir.joinpath("vcf"),
|
|
76
|
+
phenopacket_path,
|
|
77
|
+
hg19_template_vcf,
|
|
78
|
+
hg38_template_vcf,
|
|
79
|
+
hg19_vcf_dir,
|
|
80
|
+
hg38_vcf_dir,
|
|
69
81
|
)
|
|
70
82
|
if gene_identifier:
|
|
71
83
|
create_updated_phenopacket(
|
|
@@ -70,35 +70,6 @@ def normalise_file_name(file_path: Path) -> str:
|
|
|
70
70
|
return re.sub("[\u0300-\u036f]", "", normalised_file_name)
|
|
71
71
|
|
|
72
72
|
|
|
73
|
-
def obtain_phenopacket_path_from_pheval_result(
|
|
74
|
-
pheval_result_path: Path, phenopacket_paths: list[Path]
|
|
75
|
-
) -> Path:
|
|
76
|
-
"""
|
|
77
|
-
Obtains the phenopacket file name when given a pheval result file name
|
|
78
|
-
and a list of full paths of phenopackets to be queried.
|
|
79
|
-
|
|
80
|
-
Args:
|
|
81
|
-
pheval_result_path (Path): The PhEval result.
|
|
82
|
-
phenopacket_paths (list[Path]): List of full paths of phenopackets to be queried.
|
|
83
|
-
|
|
84
|
-
Returns:
|
|
85
|
-
Path: The matching phenopacket file path from the provided list.
|
|
86
|
-
"""
|
|
87
|
-
pheval_result_path_stem_stripped = pheval_result_path.stem.split("-pheval_")[0]
|
|
88
|
-
matching_phenopacket_paths = [
|
|
89
|
-
phenopacket_path
|
|
90
|
-
for phenopacket_path in phenopacket_paths
|
|
91
|
-
if phenopacket_path.stem == pheval_result_path_stem_stripped
|
|
92
|
-
]
|
|
93
|
-
if matching_phenopacket_paths:
|
|
94
|
-
return matching_phenopacket_paths[0]
|
|
95
|
-
else:
|
|
96
|
-
raise FileNotFoundError(
|
|
97
|
-
f"Unable to find matching phenopacket file named "
|
|
98
|
-
f"{pheval_result_path_stem_stripped}.json for {pheval_result_path.name}"
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
73
|
def ensure_file_exists(*files: str):
|
|
103
74
|
"""Ensures the existence of files passed as parameter
|
|
104
75
|
Raises:
|
|
@@ -468,10 +468,12 @@ class PhenopacketUtil:
|
|
|
468
468
|
for i in pheno_interpretation:
|
|
469
469
|
for g in i.diagnosis.genomic_interpretations:
|
|
470
470
|
variant = GenomicVariant(
|
|
471
|
-
chrom=
|
|
472
|
-
|
|
471
|
+
chrom=str(
|
|
472
|
+
g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
|
|
473
|
+
"chr", ""
|
|
474
|
+
)
|
|
473
475
|
),
|
|
474
|
-
pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
|
|
476
|
+
pos=int(g.variant_interpretation.variation_descriptor.vcf_record.pos),
|
|
475
477
|
ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
|
|
476
478
|
alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
|
|
477
479
|
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|