pheval 0.5.1__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pheval-0.5.1 → pheval-0.5.3}/PKG-INFO +4 -4
- {pheval-0.5.1 → pheval-0.5.3}/README.md +3 -3
- {pheval-0.5.1 → pheval-0.5.3}/pyproject.toml +1 -1
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/benchmark.py +37 -17
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/cli_pheval.py +11 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/config_parser.py +5 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/post_processing/post_processing.py +12 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/create_noisy_phenopackets.py +17 -1
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/create_spiked_vcf.py +18 -6
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/prepare_corpus.py +29 -9
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/update_phenopacket.py +20 -2
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/runners/runner.py +7 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/phenopacket_utils.py +10 -5
- {pheval-0.5.1 → pheval-0.5.3}/LICENSE +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/benchmark_db_manager.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/benchmark_output_type.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/binary_classification_curves.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/binary_classification_stats.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/generate_plots.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/generate_rank_comparisons.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/rank_stats.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/run_data_parser.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/cli.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/cli_pheval_utils.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/implementations/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/infra/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/infra/exomiserdb.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/post_processing/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/post_processing/phenopacket_truth_set.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/post_processing/validate_result_format.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/custom_exceptions.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/resources/hgnc_complete_set.txt +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/run_metadata.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/runners/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/__init__.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/docs_gen.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/docs_gen.sh +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/exomiser.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/file_utils.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/logger.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/semsim_utils.py +0 -0
- {pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/utils.py +0 -0
{pheval-0.5.1 → pheval-0.5.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pheval
-Version: 0.5.1
+Version: 0.5.3
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk
@@ -32,10 +32,10 @@ Description-Content-Type: text/markdown
 
 # PhEval - Phenotypic Inference Evaluation Framework
 
-
+[](https://pypi.org/project/pheval/)
 
 
-
 
 
 ## Overview
@@ -53,7 +53,7 @@ For more information please see the full [documentation](https://monarch-initiat
 
 ## Download and Installation
 
-1. Ensure you have Python 3.
+1. Ensure you have Python 3.10 or greater installed.
 2. Install with `pip`:
    ```bash
    pip install pheval
{pheval-0.5.1 → pheval-0.5.3}/README.md

@@ -1,9 +1,9 @@
 # PhEval - Phenotypic Inference Evaluation Framework
 
-
+[](https://pypi.org/project/pheval/)
 
 
-
 
 
 ## Overview
@@ -21,7 +21,7 @@ For more information please see the full [documentation](https://monarch-initiat
 
 ## Download and Installation
 
-1. Ensure you have Python 3.
+1. Ensure you have Python 3.10 or greater installed.
 2. Install with `pip`:
    ```bash
    pip install pheval
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/analyse/benchmark.py

@@ -23,27 +23,32 @@ def scan_directory(run: RunConfig, benchmark_type: BenchmarkOutputType) -> pl.La
         run (RunConfig): RunConfig object.
         benchmark_type (BenchmarkOutputTypeEnum): Benchmark output type.
     Returns:
-        pl.LazyFrame: LazyFrame object containing all the results in the directory
+        pl.LazyFrame: LazyFrame object containing all the results in the directory.
     """
     logger = get_logger()
     logger.info(f"Analysing results in {run.results_dir.joinpath(benchmark_type.result_directory)}")
     return (
-        pl.scan_parquet(
-            run.results_dir.joinpath(benchmark_type.result_directory),
-            include_file_paths="file_path",
-        ).with_columns(
-            pl.col("rank").cast(pl.Int64),
-            pl.col("file_path").str.extract(r"([^/\\]+)$").alias("result_file"),
-            pl.col("true_positive").fill_null(False),
-        )
-    ).filter(
         (
-            pl.
-
-
+            pl.scan_parquet(
+                run.results_dir.joinpath(benchmark_type.result_directory),
+                include_file_paths="file_path",
+            ).with_columns(
+                pl.col("rank").cast(pl.Int64),
+                pl.col("file_path").str.extract(r"([^/\\]+)$").alias("result_file"),
+                pl.col("true_positive").fill_null(False),
+            )
+        )
+        .filter(
+            (
+                pl.col("score") >= run.threshold
+                if run.score_order.lower() == "descending"
+                else pl.col("score") <= run.threshold
+            )
+            if run.threshold is not None
+            else True
         )
-
-
+        .sort("rank")
+        .unique(subset=["file_path", *benchmark_type.columns], keep="first")
     )
 
 
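The reworked scan_directory above now applies an optional score threshold (direction chosen by run.score_order), sorts on rank, and keeps one row per result file and entity with unique(keep="first"). Below is a minimal sketch of that filter, sort, and deduplicate pattern on toy data; the column names, threshold value, and score order here are illustrative stand-ins rather than PhEval's real configuration. The second benchmark.py hunk after the sketch applies matching handling when the per-run statistics are assembled.

```python
# Sketch of the threshold + best-rank deduplication pattern from scan_directory (toy data).
import polars as pl

threshold = 0.5             # assumed example value; PhEval reads this from run.threshold
score_order = "descending"  # assumed example value; PhEval reads this from run.score_order

lf = pl.DataFrame(
    {
        "result_file": ["a.parquet", "a.parquet", "a.parquet", "b.parquet"],
        "gene_symbol": ["BRCA1", "BRCA1", "TP53", "BRCA1"],
        "rank": [3, 1, 2, 1],
        "score": [0.7, 0.9, 0.4, 0.8],
        "true_positive": [True, None, False, True],
    }
).lazy()

result = (
    lf.with_columns(pl.col("true_positive").fill_null(False))
    .filter(
        # Keep rows past the threshold; the comparison direction follows the score order.
        (
            (pl.col("score") >= threshold)
            if score_order.lower() == "descending"
            else (pl.col("score") <= threshold)
        )
        if threshold is not None
        else True
    )
    # Sort by rank so keep="first" retains the best-ranked row per result file and entity.
    .sort("rank")
    .unique(subset=["result_file", "gene_symbol"], keep="first")
)
print(result.collect())
```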
@@ -68,14 +73,29 @@ def process_stats(
         )
         curve_results.append(compute_curves(run.run_identifier, result_scan))
         true_positive_cases.append(
-            result_scan.filter(pl.col("true_positive"))
+            result_scan.filter(pl.col("true_positive"))
+            .select(
                 ["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)]
             )
+            .sort(["result_file", *benchmark_type.columns])
         )
     return (
         pl.concat(stats, how="vertical").collect(),
         pl.concat(curve_results, how="vertical").collect(),
-        pl.concat(
+        pl.concat(
+            [true_positive_cases[0]]
+            + [
+                df.select(
+                    [
+                        col
+                        for col in df.collect_schema().keys()
+                        if col not in ["result_file", *benchmark_type.columns]
+                    ]
+                )
+                for df in true_positive_cases[1:]
+            ],
+            how="horizontal",
+        ).collect(),
     )
 
 
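The process_stats change above sorts each run's true-positive rows on the shared key columns before concatenating the per-run rank columns horizontally, so rows from different runs line up by position. A small sketch of that pattern on eager toy frames follows; it uses df.columns rather than the lazy collect_schema().keys() call in the diff, and the run names and values are invented for illustration.

```python
# Sketch of combining per-run rank columns into one table via horizontal concatenation.
import polars as pl

keys = ["result_file", "gene_symbol"]

# One frame per benchmarked run: true-positive rows plus that run's rank column,
# each sorted on the key columns so the rows align positionally.
run_1 = pl.DataFrame(
    {"result_file": ["a", "b"], "gene_symbol": ["BRCA1", "TP53"], "run_1": [1, 4]}
).sort(keys)
run_2 = pl.DataFrame(
    {"result_file": ["b", "a"], "gene_symbol": ["TP53", "BRCA1"], "run_2": [2, 1]}
).sort(keys)

# Keep the key columns once (from the first run) and append only the rank columns of the rest.
combined = pl.concat(
    [run_1]
    + [df.select([c for c in df.columns if c not in keys]) for df in [run_2]],
    how="horizontal",
)
print(combined)
```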
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/cli_pheval.py

@@ -2,12 +2,16 @@
 Monarch Initiative
 """
 
+import time
 from pathlib import Path
 
 import click
 
 from pheval.implementations import get_implementation_resolver
 from pheval.utils.file_utils import write_metadata
+from pheval.utils.logger import get_logger
+
+logger = get_logger()
 
 
 @click.command()
@@ -84,11 +88,18 @@ def run(
         config (Path): The path of the configuration file (optional e.g., config.yaml)
         version (str): The version of the tool implementation
     """
+    logger.info(f"Executing {runner}.")
+    start_time = time.perf_counter()
     runner_class = get_implementation_resolver().lookup(runner)
     runner_instance = runner_class(input_dir, testdata_dir, tmp_dir, output_dir, config, version)
     runner_instance.build_output_directory_structure()
+    logger.info("Executing prepare phase.")
     runner_instance.prepare()
+    logger.info("Executing run phase.")
     runner_instance.run()
+    logger.info("Executing post-processing phase.")
     runner_instance.post_process()
     run_metadata = runner_instance.construct_meta_data()
+    logger.info(f"Writing metadata for run to {output_dir}.")
     write_metadata(output_dir, run_metadata)
+    logger.info(f"Run completed! Total time: {time.perf_counter() - start_time:.2f} seconds.")
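The run command above now brackets the prepare, run, and post-processing phases with log messages and reports the total wall time measured with time.perf_counter(). A standard-library sketch of that timing pattern follows; run_phases and the plain logging logger are hypothetical stand-ins for PhEval's runner orchestration and its get_logger() helper.

```python
# Sketch of timing a sequence of named phases and logging the elapsed total.
import logging
import time
from typing import Callable

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("pheval.sketch")  # stand-in for pheval.utils.logger.get_logger()


def run_phases(phases: dict[str, Callable[[], None]]) -> None:
    start_time = time.perf_counter()
    for name, phase in phases.items():
        logger.info("Executing %s phase.", name)
        phase()
    logger.info("Run completed! Total time: %.2f seconds.", time.perf_counter() - start_time)


run_phases({"prepare": lambda: None, "run": lambda: None, "post-processing": lambda: None})
```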
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/config_parser.py

@@ -6,6 +6,10 @@ import yaml
 from serde import serde
 from serde.yaml import from_yaml
 
+from pheval.utils.logger import get_logger
+
+logger = get_logger()
+
 
 @serde
 @dataclass
@@ -34,6 +38,7 @@ class InputDirConfig:
 
 def parse_input_dir_config(input_dir: Path) -> InputDirConfig:
     """Reads the config file."""
+    logger.info(f"Parsing config.yaml located in {input_dir}.")
     with open(Path(input_dir).joinpath("config.yaml"), "r") as config_file:
         config = yaml.safe_load(config_file)
         config_file.close()
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/post_processing/post_processing.py

@@ -180,6 +180,10 @@ def generate_gene_result(
         phenopacket_dir (Path): Path to the Phenopacket directory
     """
     output_file = output_dir.joinpath(f"pheval_gene_results/{result_path.stem}-gene_result.parquet")
+    logger.info(
+        f"Writing classified results for {len(all_files(phenopacket_dir))} "
+        f"phenopackets to {output_dir.joinpath('pheval_gene_results')}"
+    )
     create_empty_pheval_result(
         phenopacket_dir, output_dir.joinpath("pheval_gene_results"), ResultType.GENE
     )
@@ -210,6 +214,10 @@ def generate_variant_result(
     output_file = output_dir.joinpath(
         f"pheval_variant_results/{result_path.stem}-variant_result.parquet"
     )
+    logger.info(
+        f"Writing classified results for {len(all_files(phenopacket_dir))} "
+        f"phenopackets to {output_dir.joinpath('pheval_variant_results')}"
+    )
     create_empty_pheval_result(
         phenopacket_dir, output_dir.joinpath("pheval_variant_results"), ResultType.VARIANT
     )
@@ -242,6 +250,10 @@ def generate_disease_result(
     output_file = output_dir.joinpath(
         f"pheval_disease_results/{result_path.stem}-disease_result.parquet"
     )
+    logger.info(
+        f"Writing classified results for {len(all_files(phenopacket_dir))} "
+        f"phenopackets to {output_dir.joinpath('pheval_disease_results')}"
+    )
     create_empty_pheval_result(
         phenopacket_dir, output_dir.joinpath("pheval_disease_results"), ResultType.DISEASE
     )
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/create_noisy_phenopackets.py

@@ -1,4 +1,5 @@
 import random
+import time
 from pathlib import Path
 from typing import List, Union
 
@@ -6,7 +7,8 @@ from oaklib.implementations.pronto.pronto_implementation import ProntoImplementa
 from oaklib.resource import OntologyResource
 from phenopackets import Family, OntologyClass, Phenopacket, PhenotypicFeature
 
-from pheval.utils.file_utils import files_with_suffix
+from pheval.utils.file_utils import all_files, files_with_suffix
+from pheval.utils.logger import get_logger
 from pheval.utils.phenopacket_utils import (
     PhenopacketRebuilder,
     PhenopacketUtil,
@@ -14,6 +16,8 @@ from pheval.utils.phenopacket_utils import (
     write_phenopacket,
 )
 
+logger = get_logger()
+
 
 def load_ontology(local_cached_ontology: Path = None) -> ProntoImplementation:
     """
@@ -24,9 +28,11 @@ def load_ontology(local_cached_ontology: Path = None) -> ProntoImplementation:
         ProntoImplementation: An instance of ProntoImplementation containing the loaded HPO.
     """
     if local_cached_ontology is None:
+        logger.warning("No local cached ontology found, using default ontology.")
         resource = OntologyResource(slug="hp.obo", local=False)
         return ProntoImplementation(resource)
     else:
+        logger.info(f"Loading local ontology from {local_cached_ontology}.")
         resource = OntologyResource(slug=local_cached_ontology, local=True)
         return ProntoImplementation(resource)
 
@@ -241,6 +247,7 @@ class HpoRandomiser:
         """
         phenopacket_files = files_with_suffix(phenopacket_dir, ".json")
         for phenopacket_path in phenopacket_files:
+            logger.info(f"Scrambling {phenopacket_path.name}.")
             phenopacket = phenopacket_reader(phenopacket_path)
             created_noisy_phenopacket = self.add_noise_to_phenotypic_profile(phenopacket)
             write_phenopacket(
@@ -268,14 +275,23 @@ def scramble_phenopackets(
         scramble_factor (float): A factor determining the level of scrambling for phenotypic features.
         local_cached_ontology (Path): The path to the local cached ontology.
     """
+    start_time = time.perf_counter()
+    logger.info("Initiating scrambling.")
+    logger.info(f"Created directory {output_dir}.")
+    logger.info(f"Scramble factor set to {scramble_factor}.")
     output_dir.mkdir(exist_ok=True)
     ontology = load_ontology(local_cached_ontology)
     if phenopacket_path is not None:
+        logger.info(f"Scrambling {phenopacket_path}.")
         HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopacket(
             output_dir, phenopacket_path
         )
     elif phenopacket_dir is not None:
+        logger.info(
+            f"Scrambling {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}."
+        )
         HpoRandomiser(ontology, scramble_factor).create_scrambled_phenopackets(
             output_dir,
             phenopacket_dir,
         )
+    logger.info(f"Finished scrambling! Total time: {time.perf_counter() - start_time:.2f} seconds.")
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/create_spiked_vcf.py

@@ -1,7 +1,7 @@
 import gzip
-import logging
 import random
 import re
+import time
 import urllib.parse
 from copy import copy
 from dataclasses import dataclass
@@ -12,6 +12,7 @@ from phenopackets import Family, File, Phenopacket
 
 from pheval.prepare.custom_exceptions import InputError
 from pheval.utils.file_utils import all_files, files_with_suffix, is_gzipped
+from pheval.utils.logger import get_logger
 from pheval.utils.phenopacket_utils import (
     IncompatibleGenomeAssemblyError,
     PhenopacketRebuilder,
@@ -21,8 +22,7 @@ from pheval.utils.phenopacket_utils import (
     write_phenopacket,
 )
 
-
-
+logger = get_logger()
 genome_assemblies = {
     "GRCh38": {
         "1": 248956422,
@@ -357,9 +357,13 @@ class VcfSpiker:
                 and int(val.split("\t")[1]) < int(variant_entry[1])
             ]
             if matching_indices:
+                logger.info(
+                    f"Successfully spiked variant {variant.variant.chrom}-{variant.variant.pos}-"
+                    f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}"
+                )
                 variant_entry_position = matching_indices[-1] + 1
             else:
-
+                logger.warning(
                     f"Could not find entry position for {variant.variant.chrom}-{variant.variant.pos}-"
                     f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, "
                     "inserting at end of VCF contents."
@@ -518,8 +522,6 @@ def generate_spiked_vcf_file(
     Returns:
         File: The generated File object representing the newly created spiked VCF file.
     """
-    output_dir.mkdir(exist_ok=True)
-    info_log.info(f" Created a directory {output_dir}")
     vcf_assembly, spiked_vcf = spike_vcf_contents(
         phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir
     )
@@ -633,6 +635,7 @@ def create_spiked_vcfs(
     hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
     hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
     for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
+        logger.info(f"Creating spiked VCF for: {phenopacket_path.name}")
         spike_and_update_phenopacket(
             hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
         )
@@ -659,7 +662,12 @@ def spike_vcfs(
         hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
         hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
     """
+    start_time = time.perf_counter()
+    logger.info("Creating spiked VCFs.")
+    output_dir.mkdir(exist_ok=True)
+    logger.info(f" Created output directory: {output_dir}")
     if phenopacket_path is not None:
+        logger.info(f"Spiking variants from {phenopacket_path}.")
         create_spiked_vcf(
             output_dir,
             phenopacket_path,
@@ -669,6 +677,9 @@
             hg38_vcf_dir,
         )
     elif phenopacket_dir is not None:
+        logger.info(
+            f"Spiking variants from {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}."
+        )
         create_spiked_vcfs(
             output_dir,
             phenopacket_dir,
@@ -677,3 +688,4 @@
         hg19_vcf_dir,
         hg38_vcf_dir,
     )
+    logger.info(f"Finished spiking! Total time: {time.perf_counter() - start_time:.2f} seconds.")
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/prepare_corpus.py

@@ -1,13 +1,18 @@
-import logging
 import shutil
+import time
 from pathlib import Path
 
 from pheval.prepare.create_spiked_vcf import create_spiked_vcf
 from pheval.prepare.update_phenopacket import create_updated_phenopacket
 from pheval.utils.file_utils import all_files
-from pheval.utils.
+from pheval.utils.logger import get_logger
+from pheval.utils.phenopacket_utils import (
+    PhenopacketUtil,
+    create_gene_identifier_map,
+    phenopacket_reader,
+)
 
-
+logger = get_logger()
 
 
 def prepare_corpus(
@@ -43,39 +48,46 @@ def prepare_corpus(
     To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
     hg19_vcf_dir or hg38_vcf_dir is required.
     """
+    start_time = time.perf_counter()
+    logger.info(f"Preparing corpus for {phenopacket_dir}")
     output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
+    logger.info(f" Created output directory: {output_dir.joinpath('phenopackets')}")
+    identifier_map = create_gene_identifier_map()
     for phenopacket_path in all_files(phenopacket_dir):
        phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
        if not phenopacket_util.observed_phenotypic_features():
-
+            logger.warning(
                f"Removed {phenopacket_path.name} from the corpus due to no observed phenotypic features."
            )
            continue
        if variant_analysis:
            if phenopacket_util.check_incomplete_variant_record():
-
+                logger.warning(
                    f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
                )
                continue
            elif phenopacket_util.check_variant_alleles():
-
+                logger.warning(
                    f"Removed {phenopacket_path.name} from the corpus due to identical "
                    "reference and alternate allele fields."
                )
        if gene_analysis:
            if phenopacket_util.check_incomplete_gene_record():
-
+                logger.warning(
                    f"Removed {phenopacket_path.name} from the corpus due to missing gene fields."
                )
                continue
        if disease_analysis:
            if phenopacket_util.check_incomplete_disease_record():
-
+                logger.warning(
                    f"Removed {phenopacket_path.name} from the corpus due to missing disease fields."
                )
                continue
+        logger.info(f"{phenopacket_path.name} OK!")
        if hg19_template_vcf or hg38_template_vcf:
            output_dir.joinpath("vcf").mkdir(exist_ok=True)
+            logger.info(f" Created output directory: {output_dir.joinpath('vcf')}")
+            logger.info(f"Spiking VCF for {phenopacket_path}.")
            create_spiked_vcf(
                output_dir.joinpath("vcf"),
                phenopacket_path,
@@ -85,8 +97,12 @@ def prepare_corpus(
                hg38_vcf_dir,
            )
        if gene_identifier:
+            logger.info(f"Updating gene identifiers to {gene_identifier} for {phenopacket_dir}")
            create_updated_phenopacket(
-                gene_identifier,
+                gene_identifier,
+                phenopacket_path,
+                output_dir.joinpath("phenopackets"),
+                identifier_map,
            )
        else:
            # if not updating phenopacket gene identifiers then copy phenopacket as is to output directory
@@ -97,3 +113,7 @@ def prepare_corpus(
            if phenopacket_path != output_dir.joinpath(f"phenopackets/{phenopacket_path.name}")
            else None
        )
+    logger.info(
+        f"Finished preparing corpus for {phenopacket_dir}. "
+        f"Total time: {time.perf_counter() - start_time:.2f} seconds."
+    )
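prepare_corpus above now builds the gene identifier map once, before iterating over the corpus, and hands it to create_updated_phenopacket, which only falls back to rebuilding the map when the caller did not supply one (see the update_phenopacket.py hunks below). A small sketch of that caching pattern follows; build_identifier_map and update_one are hypothetical stand-ins for create_gene_identifier_map and create_updated_phenopacket.

```python
# Sketch: build an expensive lookup table once and pass it into the per-item update.
from typing import Optional

import polars as pl


def build_identifier_map() -> pl.DataFrame:
    # Stand-in for pheval.utils.phenopacket_utils.create_gene_identifier_map(),
    # which derives the mapping from the bundled HGNC complete set.
    return pl.DataFrame({"gene_symbol": ["BRCA1"], "identifier": ["ENSG00000012048"]})


def update_one(path: str, identifier_map: Optional[pl.DataFrame] = None) -> None:
    # Fall back to building the map only when no map was supplied by the caller.
    identifier_map = build_identifier_map() if identifier_map is None else identifier_map
    print(f"updating {path} with {identifier_map.height} mapped symbols")


identifier_map = build_identifier_map()  # built once, outside the loop
for path in ["patient_1.json", "patient_2.json"]:
    update_one(path, identifier_map)
```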
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/prepare/update_phenopacket.py

@@ -1,3 +1,4 @@
+import time
 from pathlib import Path
 from typing import Union
 
@@ -5,6 +6,7 @@ import polars as pl
 from phenopackets import Family, Phenopacket
 
 from pheval.utils.file_utils import all_files
+from pheval.utils.logger import get_logger
 from pheval.utils.phenopacket_utils import (
     GeneIdentifierUpdater,
     PhenopacketRebuilder,
@@ -14,6 +16,8 @@ from pheval.utils.phenopacket_utils import (
     write_phenopacket,
 )
 
+logger = get_logger()
+
 
 def update_outdated_gene_context(
     phenopacket_path: Path, gene_identifier: str, identifier_map: pl.DataFrame
@@ -43,7 +47,10 @@ def update_outdated_gene_context(
 
 
 def create_updated_phenopacket(
-    gene_identifier: str,
+    gene_identifier: str,
+    phenopacket_path: Path,
+    output_dir: Path,
+    identifier_map: pl.DataFrame = None,
 ) -> None:
     """
     Update the gene context within the interpretations for a Phenopacket and writes the updated Phenopacket.
@@ -52,12 +59,13 @@ def create_updated_phenopacket(
         gene_identifier (str): Identifier used to update the gene context.
         phenopacket_path (Path): The path to the input Phenopacket file.
         output_dir (Path): The directory where the updated Phenopacket will be written.
+        identifier_map (pl.DataFrame): The gene identifier map used for updating.
     Notes:
         The gene_identifier parameter should be chosen from ensembl_id, hgnc_id, or entrez_id
         to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace
        to describe the gene identifiers.
     """
-    identifier_map = create_gene_identifier_map()
+    identifier_map = create_gene_identifier_map() if identifier_map is None else identifier_map
     updated_phenopacket = update_outdated_gene_context(
         phenopacket_path, gene_identifier, identifier_map
     )
@@ -82,6 +90,7 @@ def create_updated_phenopackets(
     """
     identifier_map = create_gene_identifier_map()
     for phenopacket_path in all_files(phenopacket_dir):
+        logger.info(f"Updating gene context for: {phenopacket_path.name}")
         updated_phenopacket = update_outdated_gene_context(
             phenopacket_path, gene_identifier, identifier_map
         )
@@ -104,8 +113,17 @@ def update_phenopackets(
         to update to the current gene identifier in the Phenopacket. We recommend using the ENSEMBL namespace
         to describe the gene identifiers.
     """
+    start_time = time.perf_counter()
+    logger.info("Updating phenopackets.")
     output_dir.mkdir(exist_ok=True)
+    logger.info(f"Created directory {output_dir}.")
+    logger.info(f"Gene identifier set to: {gene_identifier}.")
     if phenopacket_path is not None:
+        logger.info(f"Updating {phenopacket_path}.")
         create_updated_phenopacket(gene_identifier, phenopacket_path, output_dir)
     elif phenopacket_dir is not None:
+        logger.info(
+            f"Updating {len(all_files(phenopacket_dir))} phenopackets in {phenopacket_dir}."
+        )
         create_updated_phenopackets(gene_identifier, phenopacket_dir, output_dir)
+    logger.info(f"Updating finished! Total time: {time.perf_counter() - start_time:.2f} seconds.")
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/runners/runner.py

@@ -7,6 +7,9 @@ from pathlib import Path
 
 from pheval.config_parser import parse_input_dir_config
 from pheval.run_metadata import BasicOutputRunMetaData
+from pheval.utils.logger import get_logger
+
+logger = get_logger()
 
 
 @dataclass
@@ -86,6 +89,10 @@ class PhEvalRunner(ABC):
 
     def build_output_directory_structure(self):
         """build output directory structure"""
+        logger.info(
+            f"Building output directory structure for {self.input_dir_config.tool} "
+            f"version {self.input_dir_config.tool_version}"
+        )
         self.tool_input_commands_dir.mkdir(exist_ok=True)
         self.raw_results_dir.mkdir(exist_ok=True)
         if self._get_variant_analysis():
{pheval-0.5.1 → pheval-0.5.3}/src/pheval/utils/phenopacket_utils.py

@@ -1,5 +1,4 @@
 import json
-import logging
 import os
 from copy import copy
 from dataclasses import dataclass
@@ -19,8 +18,9 @@ from phenopackets import (
 )
 
 from pheval.prepare.custom_exceptions import IncorrectFileFormatError
+from pheval.utils.logger import get_logger
 
-
+logger = get_logger()
 
 
 class IncompatibleGenomeAssemblyError(Exception):
@@ -161,6 +161,7 @@ def create_gene_identifier_map() -> pl.DataFrame:
     Returns:
         pl.DataFrame: A mapping of gene identifiers to gene symbols.
     """
+    logger.info("Creating gene identifier map.")
     hgnc_df = parse_hgnc_data()
     return hgnc_df.melt(
         id_vars=["gene_symbol", "prev_symbols"],
@@ -192,6 +193,7 @@ def phenopacket_reader(file: Path) -> Union[Phenopacket, Family]:
     Returns:
         Union[Phenopacket, Family]: Contents of the Phenopacket file as a Phenopacket or Family object
     """
+    logger.info(f"Parsing Phenopacket: {file.name}")
     file = open(file, "r")
     phenopacket = json.load(file)
     file.close()
@@ -593,6 +595,7 @@ class PhenopacketRebuilder:
         Returns:
             - Phenopacket or Family: The Phenopacket or Family object with the added spiked VCF path.
         """
+        logger.info(f"Adding spiked VCF path {spiked_vcf_file_data.uri} to phenopacket.")
         phenopacket = copy(self.phenopacket)
         phenopacket_files = [
             file for file in phenopacket.files if file.file_attributes["fileFormat"] != "vcf"
@@ -627,6 +630,7 @@ def write_phenopacket(phenopacket: Union[Phenopacket, Family], output_file: Path
     Returns:
         None
     """
+    logger.info(f"Writing Phenopacket to {output_file}.")
     phenopacket_json = create_json_message(phenopacket)
     with open(output_file, "w") as outfile:
         outfile.write(phenopacket_json)
@@ -675,6 +679,7 @@ class GeneIdentifierUpdater:
         )
         if prev_symbol_matches.height > 0:
             return prev_symbol_matches["identifier"][0]
+        logger.warn(f"Could not find {self.gene_identifier} for {gene_symbol}.")
         return None
 
     def obtain_gene_symbol_from_identifier(self, query_gene_identifier: str) -> str:
@@ -735,10 +740,10 @@ class GeneIdentifierUpdater:
                 updated_gene_identifier = self.find_identifier(
                     g.variant_interpretation.variation_descriptor.gene_context.symbol
                 )
-
-                    f"Updating gene identifier in {phenopacket_path} from "
+                logger.info(
+                    f"Updating gene identifier in {phenopacket_path.name} from "
                     f"{g.variant_interpretation.variation_descriptor.gene_context.value_id}"
-                    f"to {updated_gene_identifier}"
+                    f" to {updated_gene_identifier}"
                 )
                 g.variant_interpretation.variation_descriptor.gene_context.value_id = (
                     updated_gene_identifier