pylocuszoom 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +9 -1
- pylocuszoom/_plotter_utils.py +66 -0
- pylocuszoom/backends/base.py +56 -0
- pylocuszoom/backends/bokeh_backend.py +141 -29
- pylocuszoom/backends/matplotlib_backend.py +60 -0
- pylocuszoom/backends/plotly_backend.py +297 -88
- pylocuszoom/ensembl.py +6 -11
- pylocuszoom/gene_track.py +2 -24
- pylocuszoom/labels.py +6 -2
- pylocuszoom/manhattan.py +246 -0
- pylocuszoom/manhattan_plotter.py +760 -0
- pylocuszoom/plotter.py +236 -270
- pylocuszoom/qq.py +123 -0
- pylocuszoom/recombination.py +7 -7
- pylocuszoom/stats_plotter.py +319 -0
- {pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.0.dist-info}/METADATA +124 -14
- pylocuszoom-1.1.0.dist-info/RECORD +36 -0
- pylocuszoom-1.0.0.dist-info/RECORD +0 -31
- {pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/qq.py
ADDED
@@ -0,0 +1,123 @@
+"""QQ plot data preparation and statistics."""
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+
+def calculate_lambda_gc(p_values: np.ndarray) -> float:
+    """Calculate genomic inflation factor (lambda GC).
+
+    Lambda is the ratio of the median observed chi-squared statistic
+    to the expected median under the null hypothesis.
+
+    Args:
+        p_values: Array of p-values.
+
+    Returns:
+        Genomic inflation factor (lambda). Returns NaN if no valid p-values.
+    """
+    # Remove NaN and zero/negative values
+    p_clean = p_values[~np.isnan(p_values) & (p_values > 0)]
+    if len(p_clean) == 0:
+        return np.nan
+
+    # Convert to chi-squared statistics (1 df)
+    chi2 = stats.chi2.ppf(1 - p_clean, df=1)
+
+    # Expected median for chi-squared with 1 df
+    expected_median = stats.chi2.ppf(0.5, df=1)
+
+    # Lambda = observed median / expected median
+    return np.median(chi2) / expected_median
+
+
+def calculate_confidence_band(
+    n_points: int, confidence: float = 0.95
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """Calculate confidence band for QQ plot.
+
+    Uses order statistics to compute expected distribution of p-values
+    under the null hypothesis.
+
+    Args:
+        n_points: Number of p-values.
+        confidence: Confidence level (default 0.95 for 95% CI).
+
+    Returns:
+        Tuple of (expected, lower_bound, upper_bound) arrays in -log10 scale.
+    """
+    # Expected quantiles
+    expected = -np.log10((np.arange(1, n_points + 1)) / (n_points + 1))
+
+    # Confidence interval using beta distribution
+    alpha = 1 - confidence
+    ranks = np.arange(1, n_points + 1)
+    n_minus_rank = n_points - ranks + 1
+
+    lower_p = stats.beta.ppf(alpha / 2, ranks, n_minus_rank)
+    upper_p = stats.beta.ppf(1 - alpha / 2, ranks, n_minus_rank)
+
+    # Convert to -log10 scale (swap because -log10 reverses order)
+    lower_bound = -np.log10(upper_p)
+    upper_bound = -np.log10(lower_p)
+
+    return expected, lower_bound, upper_bound
+
+
+def prepare_qq_data(
+    df: pd.DataFrame,
+    p_col: str = "p",
+) -> pd.DataFrame:
+    """Prepare DataFrame for QQ plot rendering.
+
+    Args:
+        df: DataFrame with p-values.
+        p_col: Column name for p-value.
+
+    Returns:
+        DataFrame with columns for QQ plotting:
+        - _expected: Expected -log10(p) under null
+        - _observed: Observed -log10(p)
+        - _ci_lower: Lower confidence bound
+        - _ci_upper: Upper confidence bound
+
+    Attributes stored in DataFrame.attrs:
+        - lambda_gc: Genomic inflation factor
+        - n_variants: Number of valid p-values
+    """
+    if p_col not in df.columns:
+        raise ValueError(f"Column '{p_col}' not found in DataFrame")
+
+    # Get p-values and filter invalid
+    p_values = df[p_col].values
+    valid_mask = ~np.isnan(p_values) & (p_values > 0) & (p_values <= 1)
+    p_valid = p_values[valid_mask]
+
+    if len(p_valid) == 0:
+        raise ValueError("No valid p-values found (must be > 0 and <= 1)")
+
+    # Sort p-values (smallest first -> largest -log10 last)
+    p_sorted = np.sort(p_valid)
+
+    # Calculate observed -log10(p)
+    observed = -np.log10(p_sorted)
+
+    # Calculate expected and confidence bands
+    expected, ci_lower, ci_upper = calculate_confidence_band(len(p_sorted))
+
+    # Create result DataFrame
+    result = pd.DataFrame(
+        {
+            "_expected": expected,
+            "_observed": observed,
+            "_ci_lower": ci_lower,
+            "_ci_upper": ci_upper,
+        }
+    )
+
+    # Store statistics in attrs
+    result.attrs["lambda_gc"] = calculate_lambda_gc(p_valid)
+    result.attrs["n_variants"] = len(p_valid)
+
+    return result
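For orientation, a minimal usage sketch of the new module (editor's illustration, not from the package docs; the simulated DataFrame is an assumption):

```python
import numpy as np
import pandas as pd

from pylocuszoom.qq import calculate_lambda_gc, prepare_qq_data

# Simulated null GWAS: uniform p-values, so lambda GC should be near 1.0.
rng = np.random.default_rng(42)
gwas_df = pd.DataFrame({"p": rng.uniform(size=10_000)})  # illustrative data

qq_df = prepare_qq_data(gwas_df, p_col="p")
print(qq_df[["_expected", "_observed", "_ci_lower", "_ci_upper"]].head())
print(f"lambda GC: {qq_df.attrs['lambda_gc']:.3f}")  # stored in .attrs
print(f"n variants: {qq_df.attrs['n_variants']}")

# The inflation factor can also be computed directly from the raw array.
lambda_gc = calculate_lambda_gc(gwas_df["p"].to_numpy())
```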
pylocuszoom/recombination.py
CHANGED
@@ -432,8 +432,8 @@ def add_recombination_overlay(
         region_recomb["pos"],
         region_recomb["rate"],
         color=RECOMB_COLOR,
-        linewidth=
-        alpha=0.
+        linewidth=2.5,
+        alpha=0.8,
         zorder=0,  # Behind scatter points
     )
 
@@ -447,14 +447,14 @@ def add_recombination_overlay(
         zorder=0,
     )
 
-    # Format secondary axis
-    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color=
-    recomb_ax.tick_params(axis="y", labelcolor=
+    # Format secondary axis - use black for label text (more readable)
+    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color="black", fontsize=9)
+    recomb_ax.tick_params(axis="y", labelcolor="black", labelsize=8)
     recomb_ax.set_ylim(bottom=0)
 
-    #
+    # Scale to fit data with headroom
     max_rate = region_recomb["rate"].max()
-    recomb_ax.set_ylim(0, max(max_rate * 1.
+    recomb_ax.set_ylim(0, max(max_rate * 1.3, 10))
 
     # Remove top spine for cleaner look
     recomb_ax.spines["top"].set_visible(False)
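For context, the two hunks above tune a matplotlib secondary-axis overlay. A standalone sketch of that pattern (editor's illustration, not the package's actual `add_recombination_overlay`; `RECOMB_COLOR` here is a stand-in for the module's constant):

```python
import matplotlib.pyplot as plt
import pandas as pd

RECOMB_COLOR = "#6EB5FF"  # stand-in; the real constant lives in the module


def overlay_recombination(ax, region_recomb: pd.DataFrame):
    """Draw recombination rate on a twin y-axis behind the scatter."""
    recomb_ax = ax.twinx()
    recomb_ax.plot(
        region_recomb["pos"],
        region_recomb["rate"],
        color=RECOMB_COLOR,
        linewidth=2.5,  # new 1.1.0 value
        alpha=0.8,      # new 1.1.0 value
        zorder=0,       # behind scatter points
    )
    # 1.1.0 switches label text to black for readability
    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color="black", fontsize=9)
    recomb_ax.tick_params(axis="y", labelcolor="black", labelsize=8)
    # Scale to fit the data with headroom, with a floor of 10 cM/Mb
    max_rate = region_recomb["rate"].max()
    recomb_ax.set_ylim(0, max(max_rate * 1.3, 10))
    recomb_ax.spines["top"].set_visible(False)
    return recomb_ax


# Usage: draw the association scatter on `ax` first, then overlay.
fig, ax = plt.subplots(figsize=(8, 4))
recomb_df = pd.DataFrame({"pos": [1e6, 1.5e6, 2e6], "rate": [0.2, 3.1, 0.8]})
overlay_recombination(ax, recomb_df)
```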
pylocuszoom/stats_plotter.py
ADDED
@@ -0,0 +1,319 @@
+"""Statistical visualization plotter for PheWAS and forest plots.
+
+Provides variant-centric visualizations:
+- PheWAS plots showing associations across phenotypes
+- Forest plots showing effect sizes with confidence intervals
+"""
+
+from typing import Any, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from ._plotter_utils import DEFAULT_GENOMEWIDE_THRESHOLD, transform_pvalues
+from .backends import BackendType, get_backend
+from .colors import get_phewas_category_palette
+from .forest import validate_forest_df
+from .phewas import validate_phewas_df
+
+
+class StatsPlotter:
+    """Statistical visualization plotter for PheWAS and forest plots.
+
+    Creates variant-centric visualizations for phenome-wide associations
+    and meta-analysis forest plots.
+
+    Args:
+        backend: Plotting backend ('matplotlib', 'plotly', or 'bokeh').
+        genomewide_threshold: P-value threshold for significance line.
+
+    Example:
+        >>> plotter = StatsPlotter()
+        >>> fig = plotter.plot_phewas(phewas_df, variant_id="rs12345")
+        >>> fig.savefig("phewas.png", dpi=150)
+    """
+
+    def __init__(
+        self,
+        backend: BackendType = "matplotlib",
+        genomewide_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
+    ):
+        """Initialize the stats plotter."""
+        self._backend = get_backend(backend)
+        self.genomewide_threshold = genomewide_threshold
+
+    def plot_phewas(
+        self,
+        phewas_df: pd.DataFrame,
+        variant_id: str,
+        phenotype_col: str = "phenotype",
+        p_col: str = "p_value",
+        category_col: str = "category",
+        effect_col: Optional[str] = None,
+        significance_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
+        figsize: Tuple[float, float] = (10, 8),
+    ) -> Any:
+        """Create a PheWAS (Phenome-Wide Association Study) plot.
+
+        Shows associations of a single variant across multiple phenotypes,
+        with phenotypes grouped by category and colored accordingly.
+
+        Args:
+            phewas_df: DataFrame with phenotype associations.
+            variant_id: Variant identifier (e.g., "rs12345") for plot title.
+            phenotype_col: Column name for phenotype names.
+            p_col: Column name for p-values.
+            category_col: Column name for phenotype categories.
+            effect_col: Optional column name for effect direction (beta/OR).
+            significance_threshold: P-value threshold for significance line.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_phewas(
+            ...     phewas_df,
+            ...     variant_id="rs12345",
+            ...     category_col="category",
+            ... )
+        """
+        validate_phewas_df(phewas_df, phenotype_col, p_col, category_col)
+
+        df = phewas_df.copy()
+        df = transform_pvalues(df, p_col)
+
+        # Sort by category then by p-value for consistent ordering
+        if category_col in df.columns:
+            df = df.sort_values([category_col, p_col])
+            categories = df[category_col].unique().tolist()
+            palette = get_phewas_category_palette(categories)
+        else:
+            df = df.sort_values(p_col)
+            categories = []
+            palette = {}
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (one per phenotype)
+        df["y_pos"] = range(len(df))
+
+        # Plot points by category
+        if categories:
+            for cat in categories:
+                # Handle NaN category: NaN == NaN is False in pandas
+                if pd.isna(cat):
+                    cat_data = df[df[category_col].isna()]
+                else:
+                    cat_data = df[df[category_col] == cat]
+                # Use upward triangles for positive effects, circles otherwise
+                if effect_col and effect_col in cat_data.columns:
+                    # Vectorized: split by effect sign, 2 scatter calls per category
+                    pos_data = cat_data[cat_data[effect_col] >= 0]
+                    neg_data = cat_data[cat_data[effect_col] < 0]
+
+                    if not pos_data.empty:
+                        self._backend.scatter(
+                            ax,
+                            pos_data["neglog10p"],
+                            pos_data["y_pos"],
+                            colors=palette[cat],
+                            sizes=60,
+                            marker="^",
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                    if not neg_data.empty:
+                        self._backend.scatter(
+                            ax,
+                            neg_data["neglog10p"],
+                            neg_data["y_pos"],
+                            colors=palette[cat],
+                            sizes=60,
+                            marker="v",
+                            edgecolor="black",
+                            linewidth=0.5,
+                            zorder=2,
+                        )
+                else:
+                    self._backend.scatter(
+                        ax,
+                        cat_data["neglog10p"],
+                        cat_data["y_pos"],
+                        colors=palette[cat],
+                        sizes=60,
+                        marker="o",
+                        edgecolor="black",
+                        linewidth=0.5,
+                        zorder=2,
+                    )
+        else:
+            self._backend.scatter(
+                ax,
+                df["neglog10p"],
+                df["y_pos"],
+                colors="#4169E1",
+                sizes=60,
+                edgecolor="black",
+                linewidth=0.5,
+                zorder=2,
+            )
+
+        # Add significance threshold line
+        sig_line = -np.log10(significance_threshold)
+        self._backend.axvline(
+            ax, x=sig_line, color="red", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, r"$-\log_{10}$ P")
+        self._backend.set_ylabel(ax, "Phenotype")
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Set y-tick labels to phenotype names
+        self._backend.set_yticks(
+            ax,
+            positions=df["y_pos"].tolist(),
+            labels=df[phenotype_col].tolist(),
+            fontsize=8,
+        )
+
+        self._backend.set_title(ax, f"PheWAS: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
+
+    def plot_forest(
+        self,
+        forest_df: pd.DataFrame,
+        variant_id: str,
+        study_col: str = "study",
+        effect_col: str = "effect",
+        ci_lower_col: str = "ci_lower",
+        ci_upper_col: str = "ci_upper",
+        weight_col: Optional[str] = None,
+        null_value: float = 0.0,
+        effect_label: str = "Effect Size",
+        figsize: Tuple[float, float] = (8, 6),
+    ) -> Any:
+        """Create a forest plot showing effect sizes with confidence intervals.
+
+        Args:
+            forest_df: DataFrame with effect sizes and confidence intervals.
+            variant_id: Variant identifier for plot title.
+            study_col: Column name for study/phenotype names.
+            effect_col: Column name for effect sizes.
+            ci_lower_col: Column name for lower confidence interval.
+            ci_upper_col: Column name for upper confidence interval.
+            weight_col: Optional column for study weights (affects marker size).
+            null_value: Reference value for null effect (0 for beta, 1 for OR).
+            effect_label: X-axis label.
+            figsize: Figure size as (width, height).
+
+        Returns:
+            Figure object (type depends on backend).
+
+        Example:
+            >>> fig = plotter.plot_forest(
+            ...     forest_df,
+            ...     variant_id="rs12345",
+            ...     effect_label="Odds Ratio",
+            ...     null_value=1.0,
+            ... )
+        """
+        validate_forest_df(forest_df, study_col, effect_col, ci_lower_col, ci_upper_col)
+
+        df = forest_df.copy()
+
+        # Create figure
+        fig, axes = self._backend.create_figure(
+            n_panels=1,
+            height_ratios=[1.0],
+            figsize=figsize,
+        )
+        ax = axes[0]
+
+        # Assign y-positions (reverse so first study is at top)
+        df["y_pos"] = range(len(df) - 1, -1, -1)
+
+        # Calculate marker sizes from weights
+        if weight_col and weight_col in df.columns:
+            # Scale weights to marker sizes (min 40, max 200)
+            weights = df[weight_col]
+            min_size, max_size = 40, 200
+            weight_range = weights.max() - weights.min()
+            if weight_range > 0:
+                sizes = min_size + (weights - weights.min()) / weight_range * (
+                    max_size - min_size
+                )
+            else:
+                sizes = (min_size + max_size) / 2
+        else:
+            sizes = 80
+
+        # Calculate error bar extents
+        xerr_lower = df[effect_col] - df[ci_lower_col]
+        xerr_upper = df[ci_upper_col] - df[effect_col]
+
+        # Plot error bars (confidence intervals)
+        self._backend.errorbar_h(
+            ax,
+            x=df[effect_col],
+            y=df["y_pos"],
+            xerr_lower=xerr_lower,
+            xerr_upper=xerr_upper,
+            color="black",
+            linewidth=1.5,
+            capsize=3,
+            zorder=2,
+        )
+
+        # Plot effect size markers
+        self._backend.scatter(
+            ax,
+            df[effect_col],
+            df["y_pos"],
+            colors="#4169E1",
+            sizes=sizes,
+            marker="s",  # square markers typical for forest plots
+            edgecolor="black",
+            linewidth=0.5,
+            zorder=3,
+        )
+
+        # Add null effect line
+        self._backend.axvline(
+            ax, x=null_value, color="grey", linestyle="--", linewidth=1, alpha=0.7
+        )
+
+        # Set axis labels and limits
+        self._backend.set_xlabel(ax, effect_label)
+        self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
+
+        # Ensure x-axis includes the null value with some padding
+        x_min = min(df[ci_lower_col].min(), null_value)
+        x_max = max(df[ci_upper_col].max(), null_value)
+        x_padding = (x_max - x_min) * 0.1
+        self._backend.set_xlim(ax, x_min - x_padding, x_max + x_padding)
+
+        # Set y-tick labels to study names
+        self._backend.set_yticks(
+            ax,
+            positions=df["y_pos"].tolist(),
+            labels=df[study_col].tolist(),
+            fontsize=10,
+        )
+
+        self._backend.set_title(ax, f"Forest Plot: {variant_id}")
+        self._backend.hide_spines(ax, ["top", "right"])
+        self._backend.finalize_layout(fig)
+
+        return fig
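For orientation, a minimal usage sketch of the new class (editor's illustration; the toy DataFrames are assumptions, while column names follow the defaults in the signatures above):

```python
import pandas as pd

from pylocuszoom.stats_plotter import StatsPlotter

# Toy PheWAS results for a single variant (illustrative data).
phewas_df = pd.DataFrame({
    "phenotype": ["Hip dysplasia", "Elbow dysplasia", "Body mass"],
    "p_value": [3e-9, 2e-4, 0.3],
    "category": ["skeletal", "skeletal", "metabolic"],
    "beta": [0.42, -0.18, 0.05],
})

plotter = StatsPlotter(backend="matplotlib")
fig = plotter.plot_phewas(
    phewas_df,
    variant_id="rs12345",
    effect_col="beta",  # triangles encode effect direction
)
fig.savefig("phewas.png", dpi=150)

# Toy meta-analysis on the odds-ratio scale (null effect at 1.0).
forest_df = pd.DataFrame({
    "study": ["Cohort A", "Cohort B", "Meta"],
    "effect": [1.4, 1.2, 1.3],
    "ci_lower": [1.1, 0.9, 1.1],
    "ci_upper": [1.8, 1.6, 1.5],
})
fig = plotter.plot_forest(
    forest_df, variant_id="rs12345", effect_label="Odds Ratio", null_value=1.0
)
```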
{pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pylocuszoom
-Version: 1.0.0
+Version: 1.1.0
 Summary: Publication-ready regional association plots with LD coloring, gene tracks, and recombination overlays
 Project-URL: Homepage, https://github.com/michael-denyer/pylocuszoom
 Project-URL: Documentation, https://github.com/michael-denyer/pylocuszoom#readme
@@ -21,6 +21,7 @@ Classifier: Topic :: Scientific/Engineering :: Visualization
 Requires-Python: >=3.10
 Requires-Dist: adjusttext>=0.8
 Requires-Dist: bokeh>=3.8.2
+Requires-Dist: colorcet>=3.0.0
 Requires-Dist: kaleido>=0.2.0
 Requires-Dist: loguru>=0.7.0
 Requires-Dist: matplotlib>=3.5.0
@@ -66,22 +67,24 @@ Inspired by [LocusZoom](http://locuszoom.org/) and [locuszoomr](https://github.c
 - **Multi-species support**: Built-in reference data for *Canis lupus familiaris* (CanFam3.1/CanFam4) and *Felis catus* (FelCat9), or optionally provide your own for any species
 - **LD coloring**: SNPs colored by linkage disequilibrium (R²) with lead variant
 - **Gene tracks**: Annotated gene/exon positions below the association plot
-- **Recombination rate**:
+- **Recombination rate**: Overlay across region (*Canis lupus familiaris* built-in, or user-provided)
 - **SNP labels (matplotlib)**: Automatic labeling of top SNPs by p-value (RS IDs)
 - **Hover tooltips (Plotly and Bokeh)**: Detailed SNP data on hover
 
-![Regional plot]
+*Regional association plot with LD coloring, gene/exon track, recombination rate overlay (blue line), and top SNP labels.*
 
 2. **Stacked plots**: Compare multiple GWAS/phenotypes vertically
-3. **
-4. **
-5. **
-6. **
-7. **
-8. **
-9. **
-10. **
+3. **Manhattan plots**: Genome-wide association visualization with chromosome coloring
+4. **QQ plots**: Quantile-quantile plots with confidence bands and genomic inflation factor
+5. **eQTL plot**: Expression QTL data aligned with association plots and gene tracks
+6. **Fine-mapping plots**: Visualize SuSiE credible sets with posterior inclusion probabilities
+7. **PheWAS plots**: Phenome-wide association study visualization across multiple phenotypes
+8. **Forest plots**: Meta-analysis effect size visualization with confidence intervals
+9. **Multiple backends**: matplotlib (publication-ready), plotly (interactive), bokeh (dashboard integration)
+10. **Pandas and PySpark support**: Works with both Pandas and PySpark DataFrames for large-scale genomics data
+11. **Convenience data file loaders**: Load and validate common GWAS, eQTL and fine-mapping file formats
+12. **Automatic gene annotations**: Fetch gene/exon data from Ensembl REST API with caching (human, mouse, rat, canine, feline, and any Ensembl species)
 
 ## Installation
 
@@ -107,15 +110,16 @@ conda install -c bioconda pylocuszoom
 from pylocuszoom import LocusZoomPlotter
 
 # Initialize plotter (loads reference data for canine)
-plotter = LocusZoomPlotter(species="canine")
+plotter = LocusZoomPlotter(species="canine", auto_genes=True)
 
 # Plot with parameters passed directly
 fig = plotter.plot(
-    gwas_df,  # DataFrame with
+    gwas_df,  # DataFrame with pos, p_value, rs columns
     chrom=1,
     start=1000000,
     end=2000000,
     lead_pos=1500000,  # Highlight lead SNP
+    show_recombination=True,  # Overlay recombination rate
 )
 fig.savefig("regional_plot.png", dpi=150)
 ```
@@ -355,6 +359,112 @@ fig = plotter.plot_forest(
 ![Forest plot]
 *Forest plot with effect sizes, confidence intervals, and weight-proportional markers.*
 
+## Manhattan Plots
+
+Create genome-wide Manhattan plots showing associations across all chromosomes:
+
+```python
+from pylocuszoom import LocusZoomPlotter
+
+plotter = LocusZoomPlotter(species="human")
+
+fig = plotter.plot_manhattan(
+    gwas_df,
+    chrom_col="chrom",
+    pos_col="pos",
+    p_col="p",
+    significance_threshold=5e-8,  # Genome-wide significance line
+    figsize=(12, 5),
+)
+fig.savefig("manhattan.png", dpi=150)
+```
+
+![Manhattan plot]
+*Manhattan plot showing genome-wide associations with chromosome coloring and significance threshold.*
+
+Categorical Manhattan plots (PheWAS-style) are also supported:
+
+```python
+fig = plotter.plot_manhattan(
+    phewas_df,
+    category_col="phenotype_category",
+    p_col="pvalue",
+)
+```
+
+## QQ Plots
+
+Create quantile-quantile plots to assess p-value distribution:
+
+```python
+from pylocuszoom import LocusZoomPlotter
+
+plotter = LocusZoomPlotter()
+
+fig = plotter.plot_qq(
+    gwas_df,
+    p_col="p",
+    show_confidence_band=True,  # 95% confidence band
+    show_lambda=True,  # Genomic inflation factor in title
+    figsize=(6, 6),
+)
+fig.savefig("qq_plot.png", dpi=150)
+```
+
+![QQ plot]
+*QQ plot with 95% confidence band and genomic inflation factor (λ).*
+
+## Stacked Manhattan Plots
+
+Compare multiple GWAS results in vertically stacked Manhattan plots:
+
+```python
+from pylocuszoom import LocusZoomPlotter
+
+plotter = LocusZoomPlotter()
+
+fig = plotter.plot_manhattan_stacked(
+    [gwas_study1, gwas_study2, gwas_study3],
+    chrom_col="chrom",
+    pos_col="pos",
+    p_col="p",
+    panel_labels=["Study 1", "Study 2", "Study 3"],
+    significance_threshold=5e-8,
+    figsize=(12, 8),
+    title="Multi-study GWAS Comparison",
+)
+fig.savefig("manhattan_stacked.png", dpi=150)
+```
+
+![Stacked Manhattan plots]
+*Stacked Manhattan plots comparing three GWAS studies with shared chromosome axis.*
+
+## Manhattan and QQ Side-by-Side
+
+Create combined Manhattan and QQ plots in a single figure:
+
+```python
+from pylocuszoom import LocusZoomPlotter
+
+plotter = LocusZoomPlotter()
+
+fig = plotter.plot_manhattan_qq(
+    gwas_df,
+    chrom_col="chrom",
+    pos_col="pos",
+    p_col="p",
+    significance_threshold=5e-8,
+    show_confidence_band=True,
+    show_lambda=True,
+    figsize=(14, 5),
+    title="GWAS Results",
+)
+fig.savefig("manhattan_qq.png", dpi=150)
+```
+
+![Combined Manhattan and QQ plot]
+*Combined Manhattan and QQ plot showing genome-wide associations and p-value distribution.*
+
 ## PySpark Support
 
 For large-scale genomics data, convert PySpark DataFrames with `to_pandas()` before plotting:
|