pylocuszoom 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +39 -20
- pylocuszoom/backends/__init__.py +1 -5
- pylocuszoom/backends/base.py +1 -1
- pylocuszoom/backends/bokeh_backend.py +4 -7
- pylocuszoom/backends/matplotlib_backend.py +6 -1
- pylocuszoom/backends/plotly_backend.py +11 -12
- pylocuszoom/colors.py +132 -0
- pylocuszoom/eqtl.py +3 -2
- pylocuszoom/finemapping.py +224 -0
- pylocuszoom/gene_track.py +44 -31
- pylocuszoom/labels.py +32 -33
- pylocuszoom/ld.py +8 -7
- pylocuszoom/plotter.py +381 -66
- pylocuszoom/recombination.py +14 -14
- pylocuszoom/utils.py +3 -1
- {pylocuszoom-0.1.0.dist-info → pylocuszoom-0.2.0.dist-info}/METADATA +20 -25
- pylocuszoom-0.2.0.dist-info/RECORD +21 -0
- pylocuszoom-0.1.0.dist-info/RECORD +0 -20
- {pylocuszoom-0.1.0.dist-info → pylocuszoom-0.2.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.1.0.dist-info → pylocuszoom-0.2.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/__init__.py
CHANGED
|
@@ -3,20 +3,21 @@
|
|
|
3
3
|
This package provides LocusZoom-style regional association plots with:
|
|
4
4
|
- LD coloring based on R² with lead variant
|
|
5
5
|
- Gene and exon tracks
|
|
6
|
-
- Recombination rate overlays (
|
|
6
|
+
- Recombination rate overlays (canine built-in, or user-provided)
|
|
7
7
|
- Automatic SNP labeling
|
|
8
8
|
- Multiple backends: matplotlib (static), plotly (interactive), bokeh (dashboards)
|
|
9
9
|
- eQTL overlay support
|
|
10
|
+
- Fine-mapping/SuSiE visualization (PIP line with credible set coloring)
|
|
10
11
|
- PySpark DataFrame support for large-scale data
|
|
11
12
|
|
|
12
13
|
Example:
|
|
13
14
|
>>> from pylocuszoom import LocusZoomPlotter
|
|
14
|
-
>>> plotter = LocusZoomPlotter(species="
|
|
15
|
+
>>> plotter = LocusZoomPlotter(species="canine")
|
|
15
16
|
>>> fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
|
|
16
17
|
>>> fig.savefig("regional_plot.png", dpi=150)
|
|
17
18
|
|
|
18
19
|
Interactive example:
|
|
19
|
-
>>> plotter = LocusZoomPlotter(species="
|
|
20
|
+
>>> plotter = LocusZoomPlotter(species="canine", backend="plotly")
|
|
20
21
|
>>> fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
|
|
21
22
|
>>> fig.write_html("regional_plot.html")
|
|
22
23
|
|
|
@@ -28,22 +29,42 @@ Stacked plots:
|
|
|
28
29
|
... )
|
|
29
30
|
|
|
30
31
|
Species Support:
|
|
31
|
-
-
|
|
32
|
-
-
|
|
32
|
+
- Canine (Canis lupus familiaris): Full features including built-in recombination maps
|
|
33
|
+
- Feline (Felis catus): LD coloring and gene tracks (user provides recombination data)
|
|
33
34
|
- Custom: User provides all reference data
|
|
34
35
|
"""
|
|
35
36
|
|
|
36
37
|
# Package version string; must match the version of the released distribution
# (this wheel is published as 0.2.0).
__version__ = "0.2.0"
|
|
37
38
|
|
|
38
39
|
# Main plotter class
|
|
39
|
-
from .plotter import LocusZoomPlotter
|
|
40
|
-
|
|
41
40
|
# Backend types
|
|
42
41
|
from .backends import BackendType, get_backend
|
|
43
42
|
|
|
44
43
|
# Colors and LD
|
|
45
44
|
from .colors import LEAD_SNP_COLOR, get_ld_bin, get_ld_color, get_ld_color_palette
|
|
46
45
|
|
|
46
|
+
# eQTL support
|
|
47
|
+
from .eqtl import (
|
|
48
|
+
EQTLValidationError,
|
|
49
|
+
calculate_colocalization_overlap,
|
|
50
|
+
filter_eqtl_by_gene,
|
|
51
|
+
filter_eqtl_by_region,
|
|
52
|
+
get_eqtl_genes,
|
|
53
|
+
prepare_eqtl_for_plotting,
|
|
54
|
+
validate_eqtl_df,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Fine-mapping/SuSiE support
|
|
58
|
+
from .finemapping import (
|
|
59
|
+
FinemappingValidationError,
|
|
60
|
+
filter_by_credible_set,
|
|
61
|
+
filter_finemapping_by_region,
|
|
62
|
+
get_credible_sets,
|
|
63
|
+
get_top_pip_variants,
|
|
64
|
+
prepare_finemapping_for_plotting,
|
|
65
|
+
validate_finemapping_df,
|
|
66
|
+
)
|
|
67
|
+
|
|
47
68
|
# Gene track
|
|
48
69
|
from .gene_track import get_nearest_gene, plot_gene_track
|
|
49
70
|
|
|
@@ -55,26 +76,16 @@ from .ld import calculate_ld
|
|
|
55
76
|
|
|
56
77
|
# Logging configuration
|
|
57
78
|
from .logging import disable_logging, enable_logging
|
|
79
|
+
from .plotter import LocusZoomPlotter
|
|
58
80
|
|
|
59
81
|
# Reference data management
|
|
60
82
|
from .recombination import (
|
|
61
83
|
add_recombination_overlay,
|
|
62
|
-
|
|
84
|
+
download_canine_recombination_maps,
|
|
63
85
|
get_recombination_rate_for_region,
|
|
64
86
|
load_recombination_map,
|
|
65
87
|
)
|
|
66
88
|
|
|
67
|
-
# eQTL support
|
|
68
|
-
from .eqtl import (
|
|
69
|
-
EQTLValidationError,
|
|
70
|
-
calculate_colocalization_overlap,
|
|
71
|
-
filter_eqtl_by_gene,
|
|
72
|
-
filter_eqtl_by_region,
|
|
73
|
-
get_eqtl_genes,
|
|
74
|
-
prepare_eqtl_for_plotting,
|
|
75
|
-
validate_eqtl_df,
|
|
76
|
-
)
|
|
77
|
-
|
|
78
89
|
# Validation utilities
|
|
79
90
|
from .utils import ValidationError, to_pandas
|
|
80
91
|
|
|
@@ -86,7 +97,7 @@ __all__ = [
|
|
|
86
97
|
"BackendType",
|
|
87
98
|
"get_backend",
|
|
88
99
|
# Reference data
|
|
89
|
-
"
|
|
100
|
+
"download_canine_recombination_maps",
|
|
90
101
|
# Colors
|
|
91
102
|
"get_ld_color",
|
|
92
103
|
"get_ld_bin",
|
|
@@ -111,6 +122,14 @@ __all__ = [
|
|
|
111
122
|
"get_eqtl_genes",
|
|
112
123
|
"calculate_colocalization_overlap",
|
|
113
124
|
"EQTLValidationError",
|
|
125
|
+
# Fine-mapping/SuSiE
|
|
126
|
+
"validate_finemapping_df",
|
|
127
|
+
"filter_finemapping_by_region",
|
|
128
|
+
"filter_by_credible_set",
|
|
129
|
+
"get_credible_sets",
|
|
130
|
+
"get_top_pip_variants",
|
|
131
|
+
"prepare_finemapping_for_plotting",
|
|
132
|
+
"FinemappingValidationError",
|
|
114
133
|
# Logging
|
|
115
134
|
"enable_logging",
|
|
116
135
|
"disable_logging",
|
pylocuszoom/backends/__init__.py
CHANGED
|
@@ -3,15 +3,11 @@
|
|
|
3
3
|
Supports matplotlib (default), plotly, and bokeh backends.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Literal
|
|
7
7
|
|
|
8
8
|
from .base import PlotBackend
|
|
9
9
|
from .matplotlib_backend import MatplotlibBackend
|
|
10
10
|
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
from .bokeh_backend import BokehBackend
|
|
13
|
-
from .plotly_backend import PlotlyBackend
|
|
14
|
-
|
|
15
11
|
BackendType = Literal["matplotlib", "plotly", "bokeh"]
|
|
16
12
|
|
|
17
13
|
_BACKENDS: dict[str, type[PlotBackend]] = {
|
pylocuszoom/backends/base.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import Any, List, Optional, Tuple, Union
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from bokeh.io import export_png, export_svgs, output_file, save, show
|
|
10
10
|
from bokeh.layouts import column
|
|
11
|
-
from bokeh.models import ColumnDataSource, HoverTool,
|
|
11
|
+
from bokeh.models import ColumnDataSource, HoverTool, Span
|
|
12
12
|
from bokeh.plotting import figure
|
|
13
13
|
|
|
14
14
|
|
|
@@ -108,10 +108,10 @@ class BokehBackend:
|
|
|
108
108
|
|
|
109
109
|
# Handle sizes (convert from area to diameter)
|
|
110
110
|
if isinstance(sizes, (int, float)):
|
|
111
|
-
bokeh_size = max(6, sizes
|
|
111
|
+
bokeh_size = max(6, sizes**0.5)
|
|
112
112
|
data["size"] = [bokeh_size] * len(x)
|
|
113
113
|
else:
|
|
114
|
-
data["size"] = [max(6, s
|
|
114
|
+
data["size"] = [max(6, s**0.5) for s in sizes]
|
|
115
115
|
|
|
116
116
|
# Add hover data
|
|
117
117
|
tooltips = []
|
|
@@ -289,7 +289,6 @@ class BokehBackend:
|
|
|
289
289
|
zorder: int = 2,
|
|
290
290
|
) -> Any:
|
|
291
291
|
"""Add a rectangle to the figure."""
|
|
292
|
-
from bokeh.models import Rect
|
|
293
292
|
|
|
294
293
|
x_center = xy[0] + width / 2
|
|
295
294
|
y_center = xy[1] + height / 2
|
|
@@ -389,9 +388,7 @@ class BokehBackend:
|
|
|
389
388
|
# For now, assume values are already in bp and need /1e6
|
|
390
389
|
from bokeh.models import FuncTickFormatter
|
|
391
390
|
|
|
392
|
-
ax.xaxis.formatter = FuncTickFormatter(
|
|
393
|
-
code="return (tick / 1e6).toFixed(2);"
|
|
394
|
-
)
|
|
391
|
+
ax.xaxis.formatter = FuncTickFormatter(code="return (tick / 1e6).toFixed(2);")
|
|
395
392
|
|
|
396
393
|
def save(
|
|
397
394
|
self,
|
|
@@ -205,7 +205,12 @@ class MatplotlibBackend:
|
|
|
205
205
|
|
|
206
206
|
def set_title(self, ax: Axes, title: str, fontsize: int = 14) -> None:
|
|
207
207
|
"""Set panel title."""
|
|
208
|
-
ax.set_title(
|
|
208
|
+
ax.set_title(
|
|
209
|
+
title,
|
|
210
|
+
fontsize=fontsize,
|
|
211
|
+
fontweight="bold",
|
|
212
|
+
fontfamily="sans-serif",
|
|
213
|
+
)
|
|
209
214
|
|
|
210
215
|
def create_twin_axis(self, ax: Axes) -> Axes:
|
|
211
216
|
"""Create a secondary y-axis sharing the same x-axis."""
|
|
@@ -100,9 +100,9 @@ class PlotlyBackend:
|
|
|
100
100
|
|
|
101
101
|
# Convert size (matplotlib uses area, plotly uses diameter)
|
|
102
102
|
if isinstance(sizes, (int, float)):
|
|
103
|
-
size = max(6, sizes
|
|
103
|
+
size = max(6, sizes**0.5) # Approximate conversion
|
|
104
104
|
else:
|
|
105
|
-
size = [max(6, s
|
|
105
|
+
size = [max(6, s**0.5) for s in sizes]
|
|
106
106
|
|
|
107
107
|
# Build hover template
|
|
108
108
|
if hover_data is not None:
|
|
@@ -317,7 +317,9 @@ class PlotlyBackend:
|
|
|
317
317
|
"""Set x-axis label."""
|
|
318
318
|
fig, row = ax
|
|
319
319
|
xaxis = f"xaxis{row}" if row > 1 else "xaxis"
|
|
320
|
-
fig.update_layout(
|
|
320
|
+
fig.update_layout(
|
|
321
|
+
**{xaxis: dict(title=dict(text=label, font=dict(size=fontsize)))}
|
|
322
|
+
)
|
|
321
323
|
|
|
322
324
|
def set_ylabel(
|
|
323
325
|
self, ax: Tuple[go.Figure, int], label: str, fontsize: int = 12
|
|
@@ -325,7 +327,9 @@ class PlotlyBackend:
|
|
|
325
327
|
"""Set y-axis label."""
|
|
326
328
|
fig, row = ax
|
|
327
329
|
yaxis = f"yaxis{row}" if row > 1 else "yaxis"
|
|
328
|
-
fig.update_layout(
|
|
330
|
+
fig.update_layout(
|
|
331
|
+
**{yaxis: dict(title=dict(text=label, font=dict(size=fontsize)))}
|
|
332
|
+
)
|
|
329
333
|
|
|
330
334
|
def set_title(
|
|
331
335
|
self, ax: Tuple[go.Figure, int], title: str, fontsize: int = 14
|
|
@@ -395,14 +399,9 @@ class PlotlyBackend:
|
|
|
395
399
|
|
|
396
400
|
Plotly doesn't have spines, but we can hide axis lines.
|
|
397
401
|
"""
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
yaxis = f"yaxis{row}" if row > 1 else "yaxis"
|
|
402
|
-
|
|
403
|
-
if "top" in spines or "right" in spines:
|
|
404
|
-
# Plotly's template "plotly_white" already hides these
|
|
405
|
-
pass
|
|
402
|
+
# Plotly's template "plotly_white" already hides top/right lines
|
|
403
|
+
# No action needed - method exists for API compatibility
|
|
404
|
+
pass
|
|
406
405
|
|
|
407
406
|
def format_xaxis_mb(self, ax: Tuple[go.Figure, int]) -> None:
|
|
408
407
|
"""Format x-axis to show megabase values."""
|
pylocuszoom/colors.py
CHANGED
|
@@ -29,6 +29,101 @@ LD_NA_LABEL = "NA"
|
|
|
29
29
|
# Lead SNP color (purple diamond)
|
|
30
30
|
LEAD_SNP_COLOR = "#7D26CD" # purple3
|
|
31
31
|
|
|
32
|
+
# Fine-mapping/SuSiE credible set colors.
# Colors for up to 10 credible sets, matching locuszoomr style.
# Entry i (0-based) is the color for credible set i + 1, so CS1 is first.
CREDIBLE_SET_COLORS: List[str] = [
    "#FF7F00",  # orange (CS1)
    "#1F78B4",  # blue (CS2)
    "#33A02C",  # green (CS3)
    "#E31A1C",  # red (CS4)
    "#6A3D9A",  # purple (CS5)
    "#B15928",  # brown (CS6)
    "#FB9A99",  # pink (CS7)
    "#A6CEE3",  # light blue (CS8)
    "#B2DF8A",  # light green (CS9)
    "#FDBF6F",  # light orange (CS10)
]

# PIP line color (when not showing credible sets); same hex value as CS1.
PIP_LINE_COLOR = "#FF7F00"  # orange
|
|
49
|
+
|
|
50
|
+
# eQTL effect size bins - matches locuszoomr color scheme.
# Format: (min_threshold, max_threshold, label, color)
#
# Positive effects (upward triangles).
# Listed from the largest-magnitude bin down to the smallest, so the LAST
# entry is the bin closest to zero.
EQTL_POSITIVE_BINS: List[Tuple[float, float, str, str]] = [
    (0.3, 0.4, "0.3 : 0.4", "#8B1A1A"),  # dark red/maroon
    (0.2, 0.3, "0.2 : 0.3", "#FF6600"),  # orange
    (0.1, 0.2, "0.1 : 0.2", "#FFB347"),  # light orange
]
# Negative effects (downward triangles).
# Listed from the smallest-magnitude bin to the largest, so the FIRST entry
# is the bin closest to zero (the opposite ordering to the positive bins).
EQTL_NEGATIVE_BINS: List[Tuple[float, float, str, str]] = [
    (-0.2, -0.1, "-0.2 : -0.1", "#66CDAA"),  # medium aquamarine
    (-0.3, -0.2, "-0.3 : -0.2", "#4682B4"),  # steel blue
    (-0.4, -0.3, "-0.4 : -0.3", "#00008B"),  # dark blue
]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def get_eqtl_color(effect: Optional[float]) -> str:
    """Get color based on eQTL effect size.

    Effects at or beyond the outermost thresholds (>= 0.4 or <= -0.4) are
    assigned to the outermost bin of their sign. Effects whose magnitude is
    below every bin are assigned to the bin closest to zero for their sign.

    Args:
        effect: Effect size (beta coefficient).

    Returns:
        Hex color code string. ``LD_NA_COLOR`` is returned when
        ``_is_missing(effect)`` is true.
    """
    if _is_missing(effect):
        return LD_NA_COLOR

    if effect >= 0:
        for min_t, max_t, _, color in EQTL_POSITIVE_BINS:
            # The top bin is open-ended: anything >= 0.4 still maps to it.
            if min_t <= effect < max_t or (max_t == 0.4 and effect >= max_t):
                return color
        # No bin matched: use the positive bin nearest zero. Selecting it by
        # threshold rather than by list position keeps this independent of
        # how the bin table is ordered.
        return min(EQTL_POSITIVE_BINS, key=lambda b: b[0])[3]

    for min_t, max_t, _, color in EQTL_NEGATIVE_BINS:
        # The bottom bin is open-ended: anything <= -0.4 still maps to it.
        if min_t < effect <= max_t or (min_t == -0.4 and effect <= min_t):
            return color
    # No bin matched: use the negative bin nearest zero. The negative table
    # lists its smallest-magnitude bin first, so indexing [-1] here would
    # wrongly pick the largest-magnitude bin.
    return max(EQTL_NEGATIVE_BINS, key=lambda b: b[1])[3]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_eqtl_bin(effect: Optional[float]) -> str:
    """Get eQTL effect bin label.

    Effects at or beyond the outermost thresholds (>= 0.4 or <= -0.4) are
    assigned to the outermost bin of their sign. Effects whose magnitude is
    below every bin are assigned to the bin closest to zero for their sign.

    Args:
        effect: Effect size (beta coefficient).

    Returns:
        Bin label string. ``LD_NA_LABEL`` is returned when
        ``_is_missing(effect)`` is true.
    """
    if _is_missing(effect):
        return LD_NA_LABEL

    if effect >= 0:
        for min_t, max_t, label, _ in EQTL_POSITIVE_BINS:
            # The top bin is open-ended: anything >= 0.4 still maps to it.
            if min_t <= effect < max_t or (max_t == 0.4 and effect >= max_t):
                return label
        # No bin matched: use the positive bin nearest zero, chosen by
        # threshold so the result does not depend on table ordering.
        return min(EQTL_POSITIVE_BINS, key=lambda b: b[0])[2]

    for min_t, max_t, label, _ in EQTL_NEGATIVE_BINS:
        # The bottom bin is open-ended: anything <= -0.4 still maps to it.
        if min_t < effect <= max_t or (min_t == -0.4 and effect <= min_t):
            return label
    # No bin matched: use the negative bin nearest zero. The negative table
    # lists its smallest-magnitude bin first, so indexing [-1] here would
    # wrongly pick the largest-magnitude bin.
    return max(EQTL_NEGATIVE_BINS, key=lambda b: b[1])[2]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_eqtl_color_palette() -> dict[str, str]:
    """Build the label-to-color lookup for all eQTL effect bins.

    Returns:
        Dictionary mapping bin labels to hex colors, with the positive bins
        inserted before the negative bins.
    """
    all_bins = (*EQTL_POSITIVE_BINS, *EQTL_NEGATIVE_BINS)
    return {bin_label: bin_color for _, _, bin_label, bin_color in all_bins}
|
|
126
|
+
|
|
32
127
|
|
|
33
128
|
def get_ld_color(r2: Optional[float]) -> str:
|
|
34
129
|
"""Get LocusZoom-style color based on LD R² value.
|
|
@@ -105,3 +200,40 @@ def get_ld_color_palette() -> dict[str, str]:
|
|
|
105
200
|
palette = {label: color for _, label, color in LD_BINS}
|
|
106
201
|
palette[LD_NA_LABEL] = LD_NA_COLOR
|
|
107
202
|
return palette
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def get_credible_set_color(cs_id: int) -> str:
    """Look up the display color for a credible set.

    Args:
        cs_id: Credible set ID (1-indexed).

    Returns:
        Hex color code string. IDs below 1 yield ``LD_NA_COLOR``; IDs past
        the end of ``CREDIBLE_SET_COLORS`` wrap around to the start.

    Example:
        >>> get_credible_set_color(1)
        '#FF7F00'
    """
    if cs_id < 1:
        return LD_NA_COLOR
    # Wrap with modulo so any number of credible sets gets a color.
    palette_size = len(CREDIBLE_SET_COLORS)
    slot = (cs_id - 1) % palette_size
    return CREDIBLE_SET_COLORS[slot]
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def get_credible_set_color_palette(n_sets: int = 10) -> dict[int, str]:
    """Build a color lookup for the first ``n_sets`` credible sets.

    Args:
        n_sets: Number of credible sets to include.

    Returns:
        Dictionary mapping credible set IDs (1-indexed) to hex colors.
        Colors repeat cyclically once ``CREDIBLE_SET_COLORS`` is exhausted.

    Example:
        >>> palette = get_credible_set_color_palette(3)
        >>> palette[1]
        '#FF7F00'
    """
    palette: dict[int, str] = {}
    for set_id in range(1, n_sets + 1):
        slot = (set_id - 1) % len(CREDIBLE_SET_COLORS)
        palette[set_id] = CREDIBLE_SET_COLORS[slot]
    return palette
|
pylocuszoom/eqtl.py
CHANGED
|
@@ -11,7 +11,6 @@ import pandas as pd
|
|
|
11
11
|
|
|
12
12
|
from .logging import logger
|
|
13
13
|
|
|
14
|
-
|
|
15
14
|
REQUIRED_EQTL_COLS = ["pos", "p_value"]
|
|
16
15
|
OPTIONAL_EQTL_COLS = ["gene", "effect_size", "rs", "se"]
|
|
17
16
|
|
|
@@ -109,7 +108,9 @@ def filter_eqtl_by_region(
|
|
|
109
108
|
mask = mask & (df_chrom == chrom_str)
|
|
110
109
|
|
|
111
110
|
filtered = df[mask].copy()
|
|
112
|
-
logger.debug(
|
|
111
|
+
logger.debug(
|
|
112
|
+
f"Filtered eQTL data to {len(filtered)} variants in region chr{chrom}:{start}-{end}"
|
|
113
|
+
)
|
|
113
114
|
return filtered
|
|
114
115
|
|
|
115
116
|
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Fine-mapping/SuSiE data handling for pyLocusZoom.
|
|
2
|
+
|
|
3
|
+
Provides utilities for loading, validating, and preparing statistical
|
|
4
|
+
fine-mapping results (SuSiE, FINEMAP, etc.) for visualization.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from .logging import logger
|
|
13
|
+
|
|
14
|
+
# Required columns for fine-mapping data
|
|
15
|
+
REQUIRED_FINEMAPPING_COLS = ["pos", "pip"]
|
|
16
|
+
OPTIONAL_FINEMAPPING_COLS = ["rs", "cs", "cs_id", "effect", "se"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FinemappingValidationError(ValueError):
    """Signal that a fine-mapping DataFrame did not pass validation.

    Subclasses ``ValueError`` and adds no behavior of its own.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def validate_finemapping_df(
    df: pd.DataFrame,
    pos_col: str = "pos",
    pip_col: str = "pip",
) -> None:
    """Check that a fine-mapping DataFrame is usable.

    Two checks run in order: both named columns must exist, and every value
    in the PIP column must satisfy ``Series.between(0, 1)``.

    Args:
        df: Fine-mapping DataFrame to validate.
        pos_col: Column name for genomic position.
        pip_col: Column name for posterior inclusion probability.

    Raises:
        FinemappingValidationError: If a required column is missing or any
            PIP value is not between 0 and 1.
    """
    absent = [name for name in (pos_col, pip_col) if name not in df.columns]
    if absent:
        raise FinemappingValidationError(
            f"Fine-mapping DataFrame missing required columns: {absent}. "
            f"Required: {pos_col} (position), {pip_col} (posterior inclusion probability)"
        )

    # Validate PIP values are in [0, 1]
    within_unit_interval = df[pip_col].between(0, 1)
    if within_unit_interval.all():
        return

    invalid_count = (~within_unit_interval).sum()
    raise FinemappingValidationError(
        f"PIP values must be between 0 and 1. Found {invalid_count} invalid values."
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def filter_finemapping_by_region(
    df: pd.DataFrame,
    chrom: int,
    start: int,
    end: int,
    pos_col: str = "pos",
    chrom_col: Optional[str] = "chr",
) -> pd.DataFrame:
    """Restrict fine-mapping data to one genomic region.

    Args:
        df: Fine-mapping DataFrame.
        chrom: Chromosome number.
        start: Start position (inclusive).
        end: End position (inclusive).
        pos_col: Column name for position.
        chrom_col: Column name for chromosome. Chromosome filtering is
            skipped when this is falsy or the column is not in ``df``.

    Returns:
        A copy of the rows whose position lies in ``[start, end]`` and, when
        a chromosome column is available, whose chromosome matches ``chrom``.
    """
    positions = df[pos_col]
    keep = (positions >= start) & (positions <= end)

    if chrom_col and chrom_col in df.columns:
        # Compare as strings with any "chr" text removed from both sides, so
        # "chr1", "1" and 1 are all treated as the same chromosome.
        wanted = str(chrom).replace("chr", "")
        observed = df[chrom_col].astype(str).str.replace("chr", "", regex=False)
        keep = keep & (observed == wanted)

    region_df = df[keep].copy()
    logger.debug(
        f"Filtered fine-mapping data to {len(region_df)} variants in region "
        f"chr{chrom}:{start}-{end}"
    )
    return region_df
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_credible_sets(
    df: pd.DataFrame,
    cs_col: str = "cs",
) -> List[int]:
    """Get list of unique credible set IDs.

    Args:
        df: Fine-mapping DataFrame.
        cs_col: Column containing credible set assignments.

    Returns:
        Sorted list of unique credible set IDs (excluding 0/NA). Returns an
        empty list when ``cs_col`` is not a column of ``df``.
    """
    if cs_col not in df.columns:
        return []
    # Filter out variants not in a credible set (typically cs=0 or NA)
    cs_values = df[cs_col].dropna()
    cs_values = cs_values[cs_values != 0]
    # A column that contains NA is stored as float, so IDs would come back as
    # 1.0, 2.0, ... Convert whole-number floats to int so the result honors
    # the List[int] contract and the IDs can be used as list indices.
    unique_ids = {
        int(value) if isinstance(value, float) and value.is_integer() else value
        for value in cs_values.unique().tolist()
    }
    return sorted(unique_ids)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def filter_by_credible_set(
    df: pd.DataFrame,
    cs_id: int,
    cs_col: str = "cs",
) -> pd.DataFrame:
    """Select the variants assigned to one credible set.

    Args:
        df: Fine-mapping DataFrame.
        cs_id: Credible set ID to filter for.
        cs_col: Column containing credible set assignments.

    Returns:
        A copy of the rows whose ``cs_col`` value equals ``cs_id``.

    Raises:
        FinemappingValidationError: If ``cs_col`` is not a column of ``df``.
    """
    if cs_col in df.columns:
        in_requested_set = df[cs_col] == cs_id
        return df[in_requested_set].copy()

    raise FinemappingValidationError(
        f"Cannot filter by credible set: column '{cs_col}' not found. "
        f"Available columns: {list(df.columns)}"
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def prepare_finemapping_for_plotting(
    df: pd.DataFrame,
    pos_col: str = "pos",
    pip_col: str = "pip",
    chrom: Optional[int] = None,
    start: Optional[int] = None,
    end: Optional[int] = None,
) -> pd.DataFrame:
    """Validate, optionally region-filter, and position-sort fine-mapping data.

    Region filtering is applied only when ``chrom``, ``start`` and ``end``
    are all provided. The input DataFrame is not modified.

    Args:
        df: Raw fine-mapping DataFrame.
        pos_col: Column name for position.
        pip_col: Column name for PIP.
        chrom: Optional chromosome for region filtering.
        start: Optional start position for region filtering.
        end: Optional end position for region filtering.

    Returns:
        Prepared DataFrame sorted by position.
    """
    validate_finemapping_df(df, pos_col=pos_col, pip_col=pip_col)

    prepared = df.copy()

    region_given = all(bound is not None for bound in (chrom, start, end))
    if region_given:
        prepared = filter_finemapping_by_region(
            prepared, chrom, start, end, pos_col=pos_col
        )

    # Ascending position order is what a line plot needs.
    return prepared.sort_values(pos_col)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def get_top_pip_variants(
    df: pd.DataFrame,
    n: int = 5,
    pip_col: str = "pip",
    pip_threshold: float = 0.0,
) -> pd.DataFrame:
    """Return the highest-PIP variants that meet a minimum threshold.

    Args:
        df: Fine-mapping DataFrame.
        n: Number of top variants to return.
        pip_col: Column containing PIP values.
        pip_threshold: Minimum PIP threshold (inclusive).

    Returns:
        DataFrame with up to ``n`` rows, chosen by largest ``pip_col`` value
        among rows at or above ``pip_threshold``.
    """
    meets_threshold = df[pip_col] >= pip_threshold
    candidates = df[meets_threshold]
    return candidates.nlargest(n, pip_col)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def calculate_credible_set_coverage(
    df: pd.DataFrame,
    cs_col: str = "cs",
    pip_col: str = "pip",
) -> dict:
    """Sum the PIP values within each credible set.

    Args:
        df: Fine-mapping DataFrame.
        cs_col: Column containing credible set assignments.
        pip_col: Column containing PIP values.

    Returns:
        Dictionary mapping credible set ID to cumulative PIP. Empty when
        ``cs_col`` is not a column of ``df``.
    """
    if cs_col not in df.columns:
        return {}

    return {
        set_id: filter_by_credible_set(df, set_id, cs_col)[pip_col].sum()
        for set_id in get_credible_sets(df, cs_col)
    }
|