pheval 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pheval might be problematic. Click here for more details.
- pheval/analyse/benchmark.py +37 -17
- pheval/post_processing/phenopacket_truth_set.py +21 -6
- pheval/post_processing/post_processing.py +3 -17
- pheval/post_processing/validate_result_format.py +9 -10
- {pheval-0.5.0.dist-info → pheval-0.5.2.dist-info}/METADATA +4 -4
- {pheval-0.5.0.dist-info → pheval-0.5.2.dist-info}/RECORD +9 -9
- {pheval-0.5.0.dist-info → pheval-0.5.2.dist-info}/LICENSE +0 -0
- {pheval-0.5.0.dist-info → pheval-0.5.2.dist-info}/WHEEL +0 -0
- {pheval-0.5.0.dist-info → pheval-0.5.2.dist-info}/entry_points.txt +0 -0
pheval/analyse/benchmark.py
CHANGED
|
@@ -23,27 +23,32 @@ def scan_directory(run: RunConfig, benchmark_type: BenchmarkOutputType) -> pl.La
|
|
|
23
23
|
run (RunConfig): RunConfig object.
|
|
24
24
|
benchmark_type (BenchmarkOutputTypeEnum): Benchmark output type.
|
|
25
25
|
Returns:
|
|
26
|
-
pl.LazyFrame: LazyFrame object containing all the results in the directory
|
|
26
|
+
pl.LazyFrame: LazyFrame object containing all the results in the directory.
|
|
27
27
|
"""
|
|
28
28
|
logger = get_logger()
|
|
29
29
|
logger.info(f"Analysing results in {run.results_dir.joinpath(benchmark_type.result_directory)}")
|
|
30
30
|
return (
|
|
31
|
-
pl.scan_parquet(
|
|
32
|
-
run.results_dir.joinpath(benchmark_type.result_directory),
|
|
33
|
-
include_file_paths="file_path",
|
|
34
|
-
).with_columns(
|
|
35
|
-
pl.col("rank").cast(pl.Int64),
|
|
36
|
-
pl.col("file_path").str.extract(r"([^/\\]+)$").alias("result_file"),
|
|
37
|
-
pl.col("true_positive").fill_null(False),
|
|
38
|
-
)
|
|
39
|
-
).filter(
|
|
40
31
|
(
|
|
41
|
-
pl.
|
|
42
|
-
|
|
43
|
-
|
|
32
|
+
pl.scan_parquet(
|
|
33
|
+
run.results_dir.joinpath(benchmark_type.result_directory),
|
|
34
|
+
include_file_paths="file_path",
|
|
35
|
+
).with_columns(
|
|
36
|
+
pl.col("rank").cast(pl.Int64),
|
|
37
|
+
pl.col("file_path").str.extract(r"([^/\\]+)$").alias("result_file"),
|
|
38
|
+
pl.col("true_positive").fill_null(False),
|
|
39
|
+
)
|
|
40
|
+
)
|
|
41
|
+
.filter(
|
|
42
|
+
(
|
|
43
|
+
pl.col("score") >= run.threshold
|
|
44
|
+
if run.score_order.lower() == "descending"
|
|
45
|
+
else pl.col("score") <= run.threshold
|
|
46
|
+
)
|
|
47
|
+
if run.threshold is not None
|
|
48
|
+
else True
|
|
44
49
|
)
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
.sort("rank")
|
|
51
|
+
.unique(subset=["file_path", *benchmark_type.columns], keep="first")
|
|
47
52
|
)
|
|
48
53
|
|
|
49
54
|
|
|
@@ -68,14 +73,29 @@ def process_stats(
|
|
|
68
73
|
)
|
|
69
74
|
curve_results.append(compute_curves(run.run_identifier, result_scan))
|
|
70
75
|
true_positive_cases.append(
|
|
71
|
-
result_scan.filter(pl.col("true_positive"))
|
|
76
|
+
result_scan.filter(pl.col("true_positive"))
|
|
77
|
+
.select(
|
|
72
78
|
["result_file", *benchmark_type.columns, pl.col("rank").alias(run.run_identifier)]
|
|
73
79
|
)
|
|
80
|
+
.sort(["result_file", *benchmark_type.columns])
|
|
74
81
|
)
|
|
75
82
|
return (
|
|
76
83
|
pl.concat(stats, how="vertical").collect(),
|
|
77
84
|
pl.concat(curve_results, how="vertical").collect(),
|
|
78
|
-
pl.concat(
|
|
85
|
+
pl.concat(
|
|
86
|
+
[true_positive_cases[0]]
|
|
87
|
+
+ [
|
|
88
|
+
df.select(
|
|
89
|
+
[
|
|
90
|
+
col
|
|
91
|
+
for col in df.collect_schema().keys()
|
|
92
|
+
if col not in ["result_file", *benchmark_type.columns]
|
|
93
|
+
]
|
|
94
|
+
)
|
|
95
|
+
for df in true_positive_cases[1:]
|
|
96
|
+
],
|
|
97
|
+
how="horizontal",
|
|
98
|
+
).collect(),
|
|
79
99
|
)
|
|
80
100
|
|
|
81
101
|
|
|
@@ -12,6 +12,18 @@ from pheval.utils.phenopacket_utils import (
|
|
|
12
12
|
)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
|
|
16
|
+
"""Calculate the end position for a variant
|
|
17
|
+
Args:
|
|
18
|
+
variant_start (int): The start position of the variant
|
|
19
|
+
variant_ref (str): The reference allele of the variant
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
int: The end position of the variant
|
|
23
|
+
"""
|
|
24
|
+
return variant_start + len(variant_ref) - 1
|
|
25
|
+
|
|
26
|
+
|
|
15
27
|
class PhenopacketTruthSet:
|
|
16
28
|
"""Class for finding the causative gene/disease/variant from a phenopacket"""
|
|
17
29
|
|
|
@@ -139,13 +151,14 @@ class PhenopacketTruthSet:
|
|
|
139
151
|
return pl.DataFrame(
|
|
140
152
|
{
|
|
141
153
|
"chrom": [v.chrom for v in variants],
|
|
142
|
-
"
|
|
154
|
+
"start": [v.pos for v in variants],
|
|
155
|
+
"end": [calculate_end_pos(v.pos, v.ref) for v in variants],
|
|
143
156
|
"ref": [v.ref for v in variants],
|
|
144
157
|
"alt": [v.alt for v in variants],
|
|
145
158
|
}
|
|
146
159
|
).with_columns(
|
|
147
160
|
[
|
|
148
|
-
pl.concat_str(["chrom", "
|
|
161
|
+
pl.concat_str(["chrom", "start", "ref", "alt"], separator="-").alias("variant_id"),
|
|
149
162
|
pl.lit(0.0).cast(pl.Float64).alias("score"),
|
|
150
163
|
pl.lit(0).cast(pl.Int64).alias("rank"),
|
|
151
164
|
pl.lit(True).alias("true_positive"),
|
|
@@ -166,10 +179,10 @@ class PhenopacketTruthSet:
|
|
|
166
179
|
return (
|
|
167
180
|
ranked_results.with_columns(
|
|
168
181
|
[
|
|
169
|
-
pl.struct(["chrom", "
|
|
182
|
+
pl.struct(["chrom", "start", "end", "ref", "alt"])
|
|
170
183
|
.is_in(
|
|
171
184
|
classified_results.select(
|
|
172
|
-
pl.struct(["chrom", "
|
|
185
|
+
pl.struct(["chrom", "start", "end", "ref", "alt"])
|
|
173
186
|
).to_series()
|
|
174
187
|
)
|
|
175
188
|
.alias("true_positive")
|
|
@@ -179,8 +192,10 @@ class PhenopacketTruthSet:
|
|
|
179
192
|
.select(classified_results.columns)
|
|
180
193
|
.vstack(
|
|
181
194
|
classified_results.filter(
|
|
182
|
-
~pl.struct(["chrom", "
|
|
183
|
-
ranked_results.select(
|
|
195
|
+
~pl.struct(["chrom", "start", "end", "ref", "alt"]).is_in(
|
|
196
|
+
ranked_results.select(
|
|
197
|
+
pl.struct(["chrom", "start", "end", "ref", "alt"])
|
|
198
|
+
).to_series()
|
|
184
199
|
)
|
|
185
200
|
)
|
|
186
201
|
)
|
|
@@ -22,18 +22,6 @@ class ResultType(Enum):
|
|
|
22
22
|
VARIANT = "variant"
|
|
23
23
|
|
|
24
24
|
|
|
25
|
-
def calculate_end_pos(variant_start: int, variant_ref: str) -> int:
|
|
26
|
-
"""Calculate the end position for a variant
|
|
27
|
-
Args:
|
|
28
|
-
variant_start (int): The start position of the variant
|
|
29
|
-
variant_ref (str): The reference allele of the variant
|
|
30
|
-
|
|
31
|
-
Returns:
|
|
32
|
-
int: The end position of the variant
|
|
33
|
-
"""
|
|
34
|
-
return variant_start + len(variant_ref) - 1
|
|
35
|
-
|
|
36
|
-
|
|
37
25
|
class SortOrder(Enum):
|
|
38
26
|
"""Enumeration representing sorting orders."""
|
|
39
27
|
|
|
@@ -106,7 +94,7 @@ def _write_variant_result(ranked_results: pl.DataFrame, output_file: Path) -> No
|
|
|
106
94
|
output_file (Path): Path to the output file.
|
|
107
95
|
"""
|
|
108
96
|
variant_output = ranked_results.select(
|
|
109
|
-
["rank", "score", "
|
|
97
|
+
["rank", "score", "chrom", "start", "end", "ref", "alt", "variant_id", "true_positive"]
|
|
110
98
|
)
|
|
111
99
|
_write_results_file(output_file, variant_output)
|
|
112
100
|
|
|
@@ -119,9 +107,7 @@ def _write_disease_result(ranked_results: pl.DataFrame, output_file: Path) -> No
|
|
|
119
107
|
ranked_results ([PhEvalResult]): List of ranked PhEval disease results.
|
|
120
108
|
output_file (Path): Path to the output file.
|
|
121
109
|
"""
|
|
122
|
-
disease_output = ranked_results.select(
|
|
123
|
-
["rank", "score", "disease_name", "disease_identifier", "true_positive"]
|
|
124
|
-
)
|
|
110
|
+
disease_output = ranked_results.select(["rank", "score", "disease_identifier", "true_positive"])
|
|
125
111
|
_write_results_file(output_file, disease_output)
|
|
126
112
|
|
|
127
113
|
|
|
@@ -228,7 +214,7 @@ def generate_variant_result(
|
|
|
228
214
|
phenopacket_dir, output_dir.joinpath("pheval_variant_results"), ResultType.VARIANT
|
|
229
215
|
)
|
|
230
216
|
ranked_results = _rank_results(results, sort_order).with_columns(
|
|
231
|
-
pl.concat_str(["chrom", "
|
|
217
|
+
pl.concat_str(["chrom", "start", "ref", "alt"], separator="-").alias("variant_id")
|
|
232
218
|
)
|
|
233
219
|
classified_results = PhenopacketTruthSet(phenopacket_dir).merge_variant_results(
|
|
234
220
|
ranked_results, output_file
|
|
@@ -35,18 +35,17 @@ class ResultSchema(Enum):
|
|
|
35
35
|
)
|
|
36
36
|
DISEASE_RESULT_SCHEMA = pl.Schema(
|
|
37
37
|
{
|
|
38
|
-
"disease_name": pl.String,
|
|
39
38
|
"disease_identifier": pl.String,
|
|
40
39
|
"score": pl.Float64,
|
|
41
40
|
"grouping_id": pl.Utf8,
|
|
42
41
|
}
|
|
43
42
|
)
|
|
44
43
|
|
|
45
|
-
def validate(self,
|
|
44
|
+
def validate(self, results: pl.DataFrame) -> bool:
|
|
46
45
|
"""
|
|
47
46
|
Validate that a DataFrame follows the expected schema.
|
|
48
47
|
Args:
|
|
49
|
-
|
|
48
|
+
results (pl.DataFrame): The DataFrame to validate.
|
|
50
49
|
Raises:
|
|
51
50
|
ValueError: If a required column is missing or the grouping_id column contains a null value.
|
|
52
51
|
TypeError: If a column exists but has an incorrect data type.
|
|
@@ -55,18 +54,18 @@ class ResultSchema(Enum):
|
|
|
55
54
|
"""
|
|
56
55
|
expected_schema = self.value
|
|
57
56
|
|
|
58
|
-
if "grouping_id" in
|
|
57
|
+
if "grouping_id" in results.columns and results["grouping_id"].null_count() > 0:
|
|
59
58
|
raise ValueError("'grouping_id' column should not contain null values if provided.")
|
|
60
59
|
|
|
61
60
|
for col_name, expected_type in expected_schema.items():
|
|
62
|
-
if col_name not in
|
|
61
|
+
if col_name not in results.schema:
|
|
63
62
|
if col_name == "grouping_id":
|
|
64
63
|
continue
|
|
65
64
|
raise ValueError(f"Missing required column: {col_name}")
|
|
66
65
|
|
|
67
|
-
if
|
|
66
|
+
if results.schema[col_name] != expected_type:
|
|
68
67
|
raise TypeError(
|
|
69
|
-
f"Column '{col_name}' has type {
|
|
68
|
+
f"Column '{col_name}' has type {results.schema[col_name]}, expected {expected_type}"
|
|
70
69
|
)
|
|
71
70
|
|
|
72
71
|
return True
|
|
@@ -83,9 +82,9 @@ def validate_dataframe(schema: ResultSchema) -> Callable:
|
|
|
83
82
|
|
|
84
83
|
def decorator(func: Callable) -> Callable:
|
|
85
84
|
@wraps(func)
|
|
86
|
-
def wrapper(
|
|
87
|
-
schema.validate(
|
|
88
|
-
return func(
|
|
85
|
+
def wrapper(results: pl.DataFrame, *args, **kwargs):
|
|
86
|
+
schema.validate(results)
|
|
87
|
+
return func(results, *args, **kwargs)
|
|
89
88
|
|
|
90
89
|
return wrapper
|
|
91
90
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: pheval
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary:
|
|
5
5
|
Author: Yasemin Bridges
|
|
6
6
|
Author-email: y.bridges@qmul.ac.uk
|
|
@@ -32,10 +32,10 @@ Description-Content-Type: text/markdown
|
|
|
32
32
|
|
|
33
33
|
# PhEval - Phenotypic Inference Evaluation Framework
|
|
34
34
|
|
|
35
|
-

|
|
35
|
+
[](https://pypi.org/project/pheval/)
|
|
36
36
|

|
|
37
37
|

|
|
38
|
-

|
|
39
39
|

|
|
40
40
|
|
|
41
41
|
## Overview
|
|
@@ -53,7 +53,7 @@ For more information please see the full [documentation](https://monarch-initiat
|
|
|
53
53
|
|
|
54
54
|
## Download and Installation
|
|
55
55
|
|
|
56
|
-
1. Ensure you have Python 3.
|
|
56
|
+
1. Ensure you have Python 3.10 or greater installed.
|
|
57
57
|
2. Install with `pip`:
|
|
58
58
|
```bash
|
|
59
59
|
pip install pheval
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
pheval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
pheval/analyse/benchmark.py,sha256=
|
|
3
|
+
pheval/analyse/benchmark.py,sha256=sfjReLmodXwCT9ZyZDE6Oli0j1S5ygicJshb7n4-x4U,6916
|
|
4
4
|
pheval/analyse/benchmark_db_manager.py,sha256=zS1TI76YuV2_YXLipHLSyh-XDR5kTxyOwhRhHRFHfjQ,764
|
|
5
5
|
pheval/analyse/benchmark_output_type.py,sha256=bh-qQvV4AF7BHQyr_bdY8HTTzYZVe7KvoIoUF0D9k-g,1468
|
|
6
6
|
pheval/analyse/binary_classification_curves.py,sha256=Crb45rJWc5rxDdx82sgoHRvYHE2D5pus91fgl39FyRw,5007
|
|
@@ -17,9 +17,9 @@ pheval/implementations/__init__.py,sha256=BMUTotjTdgy5j5xubWCIQgRXrSQ1ZIcjooer7r
|
|
|
17
17
|
pheval/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
pheval/infra/exomiserdb.py,sha256=pM9-TfjrgurtH4OtM1Enk5oVhIxGQN3rKRlrxHuObTM,5080
|
|
19
19
|
pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
-
pheval/post_processing/phenopacket_truth_set.py,sha256=
|
|
21
|
-
pheval/post_processing/post_processing.py,sha256=
|
|
22
|
-
pheval/post_processing/validate_result_format.py,sha256=
|
|
20
|
+
pheval/post_processing/phenopacket_truth_set.py,sha256=EvpfS0NJpcipI1muCtB0PBUghXtktln9vF5PUk57wSM,9412
|
|
21
|
+
pheval/post_processing/post_processing.py,sha256=VadU-tjToEa2auvNpmbIzKuGtRvN4E89pH_GH1RiHm0,9078
|
|
22
|
+
pheval/post_processing/validate_result_format.py,sha256=rRlVVIT5ZtdD_Qi0tQVqRSghCrxEDZCKImtw1ygcbtA,2927
|
|
23
23
|
pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
24
|
pheval/prepare/create_noisy_phenopackets.py,sha256=ydhA4mpqKTDc4hBu8YfvNW2nMubHK3dbO-cv0lA4JFQ,11504
|
|
25
25
|
pheval/prepare/create_spiked_vcf.py,sha256=90A-Mi8QKhvN036vtFEVWAHgzHO37itiLYrqYlG4LiA,23953
|
|
@@ -46,8 +46,8 @@ pheval/utils/logger.py,sha256=5DZl5uMltUDQorhkvg_B7_ZhFwApAmEkWneFIOKfRGQ,1566
|
|
|
46
46
|
pheval/utils/phenopacket_utils.py,sha256=AfV_mWac6n5HCc5zjfH6CGP8T0qI0LR0VBrooaKmgdY,26978
|
|
47
47
|
pheval/utils/semsim_utils.py,sha256=s7ZCR2VfPYnOh7ApX6rv66eGoVSm9QJaVYOWBEhlXpo,6151
|
|
48
48
|
pheval/utils/utils.py,sha256=9V6vCT8l1g4O2-ZATYqsVyd7AYZdWGd-Ksy7_oIC3eE,2343
|
|
49
|
-
pheval-0.5.
|
|
50
|
-
pheval-0.5.
|
|
51
|
-
pheval-0.5.
|
|
52
|
-
pheval-0.5.
|
|
53
|
-
pheval-0.5.0.dist-info/RECORD,,
|
|
49
|
+
pheval-0.5.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
50
|
+
pheval-0.5.2.dist-info/METADATA,sha256=do8ya_Tw3VD-md2rPf83DBmVTyytUrH7tEhQenjN-6o,6494
|
|
51
|
+
pheval-0.5.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
52
|
+
pheval-0.5.2.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
|
|
53
|
+
pheval-0.5.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|