pylocuszoom 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +120 -0
- pylocuszoom/backends/__init__.py +52 -0
- pylocuszoom/backends/base.py +341 -0
- pylocuszoom/backends/bokeh_backend.py +441 -0
- pylocuszoom/backends/matplotlib_backend.py +288 -0
- pylocuszoom/backends/plotly_backend.py +474 -0
- pylocuszoom/colors.py +107 -0
- pylocuszoom/eqtl.py +218 -0
- pylocuszoom/gene_track.py +311 -0
- pylocuszoom/labels.py +118 -0
- pylocuszoom/ld.py +209 -0
- pylocuszoom/logging.py +153 -0
- pylocuszoom/plotter.py +733 -0
- pylocuszoom/recombination.py +432 -0
- pylocuszoom/reference_data/__init__.py +4 -0
- pylocuszoom/utils.py +194 -0
- pylocuszoom-0.1.0.dist-info/METADATA +367 -0
- pylocuszoom-0.1.0.dist-info/RECORD +20 -0
- pylocuszoom-0.1.0.dist-info/WHEEL +4 -0
- pylocuszoom-0.1.0.dist-info/licenses/LICENSE.md +17 -0
pylocuszoom/plotter.py
ADDED
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
"""Main LocusZoomPlotter class for regional association plots.
|
|
2
|
+
|
|
3
|
+
Orchestrates all components (LD coloring, gene track, recombination overlay,
|
|
4
|
+
SNP labels) into a unified plotting interface.
|
|
5
|
+
|
|
6
|
+
Supports multiple backends:
|
|
7
|
+
- matplotlib (default): Static publication-quality plots
|
|
8
|
+
- plotly: Interactive HTML with hover tooltips
|
|
9
|
+
- bokeh: Interactive HTML for dashboards
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, List, Optional, Tuple, Union
|
|
14
|
+
|
|
15
|
+
import matplotlib.pyplot as plt
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
|
+
from matplotlib.axes import Axes
|
|
19
|
+
from matplotlib.figure import Figure
|
|
20
|
+
from matplotlib.lines import Line2D
|
|
21
|
+
from matplotlib.patches import Patch
|
|
22
|
+
from matplotlib.ticker import FuncFormatter, MaxNLocator
|
|
23
|
+
|
|
24
|
+
from .backends import BackendType, PlotBackend, get_backend
|
|
25
|
+
|
|
26
|
+
from .colors import (
|
|
27
|
+
LD_BINS,
|
|
28
|
+
LEAD_SNP_COLOR,
|
|
29
|
+
get_ld_bin,
|
|
30
|
+
get_ld_color_palette,
|
|
31
|
+
)
|
|
32
|
+
from .gene_track import assign_gene_positions, plot_gene_track
|
|
33
|
+
from .labels import add_snp_labels
|
|
34
|
+
from .ld import calculate_ld, find_plink
|
|
35
|
+
from .logging import enable_logging, logger
|
|
36
|
+
from .recombination import (
|
|
37
|
+
add_recombination_overlay,
|
|
38
|
+
download_dog_recombination_maps,
|
|
39
|
+
get_default_data_dir,
|
|
40
|
+
get_recombination_rate_for_region,
|
|
41
|
+
)
|
|
42
|
+
from .utils import normalize_chrom, validate_genes_df, validate_gwas_df
|
|
43
|
+
|
|
44
|
+
# Genome-wide significance cutoffs differ by species (5e-8 for human,
# 5e-7 for dog); the dog value is the one used as the module default.
DEFAULT_GENOMEWIDE_THRESHOLD = 5e-7
# The same cutoff expressed on the -log10(p) scale.
DEFAULT_GENOMEWIDE_LINE = np.negative(np.log10(DEFAULT_GENOMEWIDE_THRESHOLD))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class LocusZoomPlotter:
|
|
50
|
+
"""Regional association plot generator with LD coloring and annotations.
|
|
51
|
+
|
|
52
|
+
Creates LocusZoom-style regional plots with:
|
|
53
|
+
- LD coloring based on R² with lead variant
|
|
54
|
+
- Gene and exon tracks
|
|
55
|
+
- Recombination rate overlays (dog built-in, or user-provided)
|
|
56
|
+
- Automatic SNP labeling
|
|
57
|
+
|
|
58
|
+
Supports multiple rendering backends:
|
|
59
|
+
- matplotlib (default): Static publication-quality plots
|
|
60
|
+
- plotly: Interactive HTML with hover tooltips
|
|
61
|
+
- bokeh: Interactive HTML for dashboards
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
species: Species name ('dog', 'cat', or None for custom).
|
|
65
|
+
Dog has built-in recombination maps.
|
|
66
|
+
genome_build: Genome build for coordinate system. For dog:
|
|
67
|
+
"canfam3.1" (default) or "canfam4". If "canfam4", recombination
|
|
68
|
+
maps are automatically lifted over from CanFam3.1.
|
|
69
|
+
backend: Plotting backend ('matplotlib', 'plotly', or 'bokeh').
|
|
70
|
+
Defaults to 'matplotlib' for static plots.
|
|
71
|
+
plink_path: Path to PLINK executable for LD calculation.
|
|
72
|
+
Auto-detects if None.
|
|
73
|
+
recomb_data_dir: Directory containing recombination maps.
|
|
74
|
+
Uses platform cache if None.
|
|
75
|
+
genomewide_threshold: P-value threshold for significance line.
|
|
76
|
+
log_level: Logging level ("DEBUG", "INFO", "WARNING", "ERROR", or None
|
|
77
|
+
to disable). Defaults to "INFO".
|
|
78
|
+
|
|
79
|
+
Example:
|
|
80
|
+
>>> # Static plot (default)
|
|
81
|
+
>>> plotter = LocusZoomPlotter(species="dog")
|
|
82
|
+
>>>
|
|
83
|
+
>>> # Interactive plot with plotly
|
|
84
|
+
>>> plotter = LocusZoomPlotter(species="dog", backend="plotly")
|
|
85
|
+
>>>
|
|
86
|
+
>>> fig = plotter.plot(
|
|
87
|
+
... gwas_df,
|
|
88
|
+
... chrom=1,
|
|
89
|
+
... start=1000000,
|
|
90
|
+
... end=2000000,
|
|
91
|
+
... lead_pos=1500000,
|
|
92
|
+
... )
|
|
93
|
+
>>> fig.savefig("regional_plot.png", dpi=150) # matplotlib
|
|
94
|
+
>>> # or fig.save("plot.html") # plotly/bokeh
|
|
95
|
+
"""
|
|
96
|
+
|
|
97
|
+
def __init__(
    self,
    species: Optional[str] = "dog",
    genome_build: Optional[str] = None,
    backend: BackendType = "matplotlib",
    plink_path: Optional[str] = None,
    recomb_data_dir: Optional[str] = None,
    genomewide_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
    log_level: Optional[str] = "INFO",
):
    """Initialize the plotter.

    Args:
        species: Species name; "dog" and "cat" have a default genome
            build, any other value (including None) has none.
        genome_build: Genome build to use. When empty or None, the
            species default from ``_default_build`` is used.
        backend: Backend name handed to ``get_backend``.
        plink_path: Path to the PLINK executable. When empty or None,
            ``find_plink()`` is used to locate one.
        recomb_data_dir: Directory containing recombination maps, or None.
        genomewide_threshold: P-value threshold for the significance line.
            Must be greater than zero.
        log_level: Level passed to ``enable_logging``; None leaves the
            logging configuration untouched.

    Raises:
        ValueError: If ``genomewide_threshold`` is not a positive number.
    """
    # -log10 of zero, a negative number or NaN gives inf/NaN, which would
    # silently produce a significance line that can never be drawn.
    if not genomewide_threshold > 0:
        raise ValueError(
            f"genomewide_threshold must be a positive p-value, "
            f"got {genomewide_threshold!r}"
        )

    # Configure logging
    if log_level is not None:
        enable_logging(log_level)

    self.species = species
    self.genome_build = (
        genome_build if genome_build else self._default_build(species)
    )
    self.backend_name = backend
    self._backend = get_backend(backend)
    self.plink_path = plink_path or find_plink()
    self.recomb_data_dir = recomb_data_dir
    self.genomewide_threshold = genomewide_threshold
    # Threshold on the -log10(p) scale, i.e. the y-position of the line.
    self._genomewide_line = -np.log10(genomewide_threshold)

    # Cache of recombination data keyed by (chrom, start, end, genome_build)
    self._recomb_cache = {}
|
|
125
|
+
|
|
126
|
+
@staticmethod
|
|
127
|
+
def _default_build(species: str) -> Optional[str]:
|
|
128
|
+
"""Get default genome build for species."""
|
|
129
|
+
if species == "dog":
|
|
130
|
+
return "canfam3.1"
|
|
131
|
+
if species == "cat":
|
|
132
|
+
return "felCat9"
|
|
133
|
+
return None
|
|
134
|
+
|
|
135
|
+
def _ensure_recomb_maps(self) -> Optional[Path]:
|
|
136
|
+
"""Ensure recombination maps are downloaded.
|
|
137
|
+
|
|
138
|
+
Returns path to recombination map directory, or None if not available.
|
|
139
|
+
"""
|
|
140
|
+
if self.species == "dog":
|
|
141
|
+
if self.recomb_data_dir:
|
|
142
|
+
return Path(self.recomb_data_dir)
|
|
143
|
+
# Check if already downloaded
|
|
144
|
+
default_dir = get_default_data_dir()
|
|
145
|
+
if (
|
|
146
|
+
default_dir.exists()
|
|
147
|
+
and len(list(default_dir.glob("chr*_recomb.tsv"))) >= 38
|
|
148
|
+
):
|
|
149
|
+
return default_dir
|
|
150
|
+
# Download
|
|
151
|
+
try:
|
|
152
|
+
return download_dog_recombination_maps()
|
|
153
|
+
except Exception as e:
|
|
154
|
+
logger.warning(f"Could not download recombination maps: {e}")
|
|
155
|
+
return None
|
|
156
|
+
elif self.recomb_data_dir:
|
|
157
|
+
return Path(self.recomb_data_dir)
|
|
158
|
+
return None
|
|
159
|
+
|
|
160
|
+
def _get_recomb_for_region(
|
|
161
|
+
self, chrom: int, start: int, end: int
|
|
162
|
+
) -> Optional[pd.DataFrame]:
|
|
163
|
+
"""Get recombination rate data for a region, with caching."""
|
|
164
|
+
cache_key = (chrom, start, end, self.genome_build)
|
|
165
|
+
if cache_key in self._recomb_cache:
|
|
166
|
+
return self._recomb_cache[cache_key]
|
|
167
|
+
|
|
168
|
+
recomb_dir = self._ensure_recomb_maps()
|
|
169
|
+
if recomb_dir is None:
|
|
170
|
+
return None
|
|
171
|
+
|
|
172
|
+
try:
|
|
173
|
+
recomb_df = get_recombination_rate_for_region(
|
|
174
|
+
chrom=chrom,
|
|
175
|
+
start=start,
|
|
176
|
+
end=end,
|
|
177
|
+
species=self.species,
|
|
178
|
+
data_dir=str(recomb_dir),
|
|
179
|
+
genome_build=self.genome_build,
|
|
180
|
+
)
|
|
181
|
+
self._recomb_cache[cache_key] = recomb_df
|
|
182
|
+
return recomb_df
|
|
183
|
+
except FileNotFoundError:
|
|
184
|
+
return None
|
|
185
|
+
|
|
186
|
+
def plot(
    self,
    gwas_df: pd.DataFrame,
    chrom: int,
    start: int,
    end: int,
    lead_pos: Optional[int] = None,
    ld_reference_file: Optional[str] = None,
    ld_col: Optional[str] = None,
    genes_df: Optional[pd.DataFrame] = None,
    exons_df: Optional[pd.DataFrame] = None,
    recomb_df: Optional[pd.DataFrame] = None,
    show_recombination: bool = True,
    snp_labels: bool = True,
    label_top_n: int = 5,
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: str = "rs",
    figsize: Tuple[int, int] = (12, 8),
) -> Figure:
    """Create a regional association plot.

    The figure is drawn with matplotlib directly. The caller's matplotlib
    interactive mode is switched off while the figure is assembled and
    restored to its previous state afterwards, including when an error
    is raised.

    Args:
        gwas_df: GWAS results DataFrame. It is copied, not modified.
        chrom: Chromosome number.
        start: Start position of the region.
        end: End position of the region.
        lead_pos: Position of the lead/index SNP to highlight.
        ld_reference_file: PLINK binary fileset for LD calculation.
            Used only when ``lead_pos`` is given, ``ld_col`` is None,
            ``rs_col`` exists in ``gwas_df`` and a row at ``lead_pos``
            is found.
        ld_col: Column name for pre-computed LD (R²) values.
            Use this if LD was calculated externally.
        genes_df: Gene annotations with chr, start, end, gene_name.
        exons_df: Exon annotations with chr, start, end, gene_name.
        recomb_df: Pre-loaded recombination rate data.
            If None and show_recombination=True, loads from species default.
        show_recombination: Whether to load recombination data when
            ``recomb_df`` is None. A supplied ``recomb_df`` is always drawn.
        snp_labels: Whether to label top SNPs.
        label_top_n: Number of top SNPs to label.
        pos_col: Column name for position.
        p_col: Column name for p-value.
        rs_col: Column name for SNP ID.
        figsize: Figure size (width, height) in inches. The final height
            is derived from it and from the gene track, if any.

    Returns:
        Matplotlib Figure object.

    Raises:
        ValidationError: If required DataFrame columns are missing.
    """
    # Validate inputs
    validate_gwas_df(gwas_df, pos_col=pos_col, p_col=p_col)
    if genes_df is not None:
        validate_genes_df(genes_df)

    logger.debug(f"Creating plot for chr{chrom}:{start}-{end}")

    # Prevent auto-display in interactive environments, remembering the
    # caller's setting so it is restored rather than forced on.
    was_interactive = plt.isinteractive()
    plt.ioff()
    try:
        # Prepare data; clip so that p == 0 does not become infinite.
        df = gwas_df.copy()
        df["neglog10p"] = -np.log10(df[p_col].clip(lower=1e-300))

        # Calculate LD if a reference file was provided and no LD column
        # was supplied. The SNP ID column is needed to name the lead SNP.
        if (
            ld_reference_file
            and lead_pos is not None
            and ld_col is None
            and rs_col in df.columns
        ):
            lead_snp_row = df[df[pos_col] == lead_pos]
            if not lead_snp_row.empty:
                lead_snp_id = lead_snp_row[rs_col].iloc[0]
                logger.debug(f"Calculating LD for lead SNP {lead_snp_id}")
                ld_df = calculate_ld(
                    bfile_path=ld_reference_file,
                    lead_snp=lead_snp_id,
                    # At least 500 kb, or the region width if larger.
                    window_kb=max((end - start) // 1000, 500),
                    plink_path=self.plink_path,
                    species=self.species,
                )
                if not ld_df.empty:
                    df = df.merge(
                        ld_df, left_on=rs_col, right_on="SNP", how="left"
                    )
                    ld_col = "R2"

        # Load recombination data if needed
        if show_recombination and recomb_df is None:
            recomb_df = self._get_recomb_for_region(chrom, start, end)

        # Create figure layout
        fig, ax, gene_ax = self._create_figure(
            genes_df, chrom, start, end, figsize
        )

        # Plot association data
        self._plot_association(ax, df, pos_col, ld_col, lead_pos)

        # Add significance line
        ax.axhline(
            y=self._genomewide_line,
            color="grey",
            linestyle="--",
            linewidth=1,
            zorder=1,
        )

        # Add SNP labels
        if (
            snp_labels
            and rs_col in df.columns
            and label_top_n > 0
            and not df.empty
        ):
            add_snp_labels(
                ax,
                df,
                pos_col=pos_col,
                neglog10p_col="neglog10p",
                rs_col=rs_col,
                label_top_n=label_top_n,
                genes_df=genes_df,
                chrom=chrom,
            )

        # Add recombination overlay
        if recomb_df is not None and not recomb_df.empty:
            add_recombination_overlay(ax, recomb_df, start, end)

        # Format axes
        ax.set_ylabel(r"$-\log_{10}$ P")
        ax.set_xlim(start, end)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

        # Add LD legend
        if ld_col is not None and ld_col in df.columns:
            self._add_ld_legend(ax)

        # Plot gene track; the x-label goes on whichever axes is lowest.
        if genes_df is not None and gene_ax is not None:
            plot_gene_track(gene_ax, genes_df, chrom, start, end, exons_df)
            gene_ax.set_xlabel(f"Chromosome {chrom} (Mb)")
            gene_ax.spines["top"].set_visible(False)
            gene_ax.spines["right"].set_visible(False)
            gene_ax.spines["left"].set_visible(False)
        else:
            ax.set_xlabel(f"Chromosome {chrom} (Mb)")

        # Format x-axis with Mb labels (positions are in base pairs)
        ax.xaxis.set_major_formatter(
            FuncFormatter(lambda x, _: f"{x / 1e6:.2f}")
        )
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6))

        # Adjust layout
        fig.subplots_adjust(
            left=0.08, right=0.95, top=0.95, bottom=0.1, hspace=0.08
        )

        return fig
    finally:
        if was_interactive:
            plt.ion()
|
|
332
|
+
|
|
333
|
+
def _create_figure(
    self,
    genes_df: Optional[pd.DataFrame],
    chrom: int,
    start: int,
    end: int,
    figsize: Tuple[int, int],
) -> Tuple[Figure, Axes, Optional[Axes]]:
    """Build the figure, adding a gene-track axes when genes are supplied.

    Args:
        genes_df: Gene annotations, or None for an association-only figure.
        chrom: Chromosome used to select genes.
        start: Region start position.
        end: Region end position.
        figsize: Requested (width, height) in inches.

    Returns:
        ``(fig, association_axes, gene_axes)``; ``gene_axes`` is None when
        ``genes_df`` is None.
    """
    width = figsize[0]

    # Without genes there is a single panel at 75% of the requested height.
    if genes_df is None:
        fig, ax = plt.subplots(figsize=(width, figsize[1] * 0.75))
        return fig, ax, None

    # Select genes on this chromosome that overlap the plotted window.
    target_chrom = normalize_chrom(chrom)
    chrom_labels = genes_df["chr"].astype(str).str.replace(
        "chr", "", regex=False
    )
    in_view = (
        (chrom_labels == target_chrom)
        & (genes_df["end"] >= start)
        & (genes_df["start"] <= end)
    )
    visible_genes = genes_df[in_view]

    # The number of stacked gene rows sets the height of the gene track.
    row_count = 1
    if not visible_genes.empty:
        row_assignment = assign_gene_positions(
            visible_genes.sort_values("start"), start, end
        )
        if row_assignment:
            row_count = max(row_assignment) + 1

    # 1.0 inch for the first row plus 0.5 inch for each extra row.
    track_height = 1.0 + (row_count - 1) * 0.5
    assoc_height = figsize[1] * 0.6

    fig, (assoc_ax, gene_ax) = plt.subplots(
        2,
        1,
        figsize=(width, assoc_height + track_height),
        height_ratios=[assoc_height, track_height],
        sharex=True,
        gridspec_kw={"hspace": 0},
    )
    return fig, assoc_ax, gene_ax
|
|
379
|
+
|
|
380
|
+
def _plot_association(
    self,
    ax: Axes,
    df: pd.DataFrame,
    pos_col: str,
    ld_col: Optional[str],
    lead_pos: Optional[int],
) -> None:
    """Draw the association scatter, coloured by LD bin when available.

    Args:
        ax: Axes to draw on.
        df: Data with ``pos_col`` and a ``neglog10p`` column. When LD
            colouring is used, an ``ld_bin`` column is added to it in place.
        pos_col: Column name for position.
        ld_col: Column holding LD values, or None for plain grey points.
        lead_pos: Position of the lead SNP to highlight, or None.
    """
    marker_style = dict(s=60, edgecolor="black", linewidth=0.5, zorder=2)
    use_ld = ld_col is not None and ld_col in df.columns

    if not use_ld:
        # Default: grey points
        ax.scatter(df[pos_col], df["neglog10p"], c="#BEBEBE", **marker_style)
    else:
        df["ld_bin"] = df[ld_col].apply(get_ld_bin)
        # Ascending sort (missing LD first) so bins are drawn low to high.
        df = df.sort_values(ld_col, ascending=True, na_position="first")

        bin_colors = get_ld_color_palette()
        for current_bin in df["ld_bin"].unique():
            members = df[df["ld_bin"] == current_bin]
            ax.scatter(
                members[pos_col],
                members["neglog10p"],
                c=bin_colors.get(current_bin, "#BEBEBE"),
                **marker_style,
            )

    # Highlight lead SNP with a larger diamond on top of everything else.
    if lead_pos is None:
        return
    lead_rows = df[df[pos_col] == lead_pos]
    if lead_rows.empty:
        return
    ax.scatter(
        lead_rows[pos_col],
        lead_rows["neglog10p"],
        c=LEAD_SNP_COLOR,
        s=120,
        marker="D",
        edgecolors="black",
        linewidths=1,
        zorder=10,
    )
|
|
432
|
+
|
|
433
|
+
def _add_ld_legend(self, ax: Axes) -> None:
    """Attach the LD colour legend to ``ax``.

    The legend lists the index SNP marker first, followed by one colour
    patch per entry of ``LD_BINS``.

    Args:
        ax: Axes that receives the legend.
    """
    bin_colors = get_ld_color_palette()

    index_handle = Line2D(
        [0],
        [0],
        marker="D",
        color="w",
        markerfacecolor=LEAD_SNP_COLOR,
        markeredgecolor="black",
        markersize=8,
        label="Index SNP",
    )
    # Only the label (middle item) of each LD_BINS entry is needed here.
    bin_handles = [
        Patch(facecolor=bin_colors[bin_name], edgecolor="black", label=bin_name)
        for _, bin_name, _ in LD_BINS
    ]

    ax.legend(
        handles=[index_handle, *bin_handles],
        loc="upper left",
        fontsize=9,
        frameon=True,
        framealpha=0.9,
        title=r"$r^2$",
        title_fontsize=10,
        handlelength=1.5,
        handleheight=1.0,
        labelspacing=0.4,
    )
|
|
470
|
+
|
|
471
|
+
def plot_stacked(
|
|
472
|
+
self,
|
|
473
|
+
gwas_dfs: List[pd.DataFrame],
|
|
474
|
+
chrom: int,
|
|
475
|
+
start: int,
|
|
476
|
+
end: int,
|
|
477
|
+
lead_positions: Optional[List[int]] = None,
|
|
478
|
+
panel_labels: Optional[List[str]] = None,
|
|
479
|
+
ld_reference_file: Optional[str] = None,
|
|
480
|
+
ld_reference_files: Optional[List[str]] = None,
|
|
481
|
+
genes_df: Optional[pd.DataFrame] = None,
|
|
482
|
+
exons_df: Optional[pd.DataFrame] = None,
|
|
483
|
+
eqtl_df: Optional[pd.DataFrame] = None,
|
|
484
|
+
eqtl_gene: Optional[str] = None,
|
|
485
|
+
recomb_df: Optional[pd.DataFrame] = None,
|
|
486
|
+
show_recombination: bool = True,
|
|
487
|
+
snp_labels: bool = True,
|
|
488
|
+
label_top_n: int = 3,
|
|
489
|
+
pos_col: str = "ps",
|
|
490
|
+
p_col: str = "p_wald",
|
|
491
|
+
rs_col: str = "rs",
|
|
492
|
+
figsize: Tuple[float, Optional[float]] = (12, None),
|
|
493
|
+
) -> Any:
|
|
494
|
+
"""Create stacked regional association plots for multiple GWAS.
|
|
495
|
+
|
|
496
|
+
Vertically stacks multiple GWAS results for comparison, with shared
|
|
497
|
+
x-axis and optional gene track at the bottom.
|
|
498
|
+
|
|
499
|
+
Args:
|
|
500
|
+
gwas_dfs: List of GWAS results DataFrames to stack.
|
|
501
|
+
chrom: Chromosome number.
|
|
502
|
+
start: Start position of the region.
|
|
503
|
+
end: End position of the region.
|
|
504
|
+
lead_positions: List of lead SNP positions (one per GWAS).
|
|
505
|
+
If None, auto-detects from lowest p-value.
|
|
506
|
+
panel_labels: Labels for each panel (e.g., phenotype names).
|
|
507
|
+
ld_reference_file: Single PLINK fileset for all panels.
|
|
508
|
+
ld_reference_files: List of PLINK filesets (one per panel).
|
|
509
|
+
genes_df: Gene annotations for bottom track.
|
|
510
|
+
exons_df: Exon annotations for gene track.
|
|
511
|
+
eqtl_df: eQTL data to display as additional panel.
|
|
512
|
+
eqtl_gene: Filter eQTL data to this target gene.
|
|
513
|
+
recomb_df: Pre-loaded recombination rate data.
|
|
514
|
+
show_recombination: Whether to show recombination overlay.
|
|
515
|
+
snp_labels: Whether to label top SNPs.
|
|
516
|
+
label_top_n: Number of top SNPs to label per panel.
|
|
517
|
+
pos_col: Column name for position.
|
|
518
|
+
p_col: Column name for p-value.
|
|
519
|
+
rs_col: Column name for SNP ID.
|
|
520
|
+
figsize: Figure size (width, height). If height is None, auto-calculates.
|
|
521
|
+
|
|
522
|
+
Returns:
|
|
523
|
+
Figure object (type depends on backend).
|
|
524
|
+
|
|
525
|
+
Example:
|
|
526
|
+
>>> fig = plotter.plot_stacked(
|
|
527
|
+
... [gwas_height, gwas_bmi, gwas_whr],
|
|
528
|
+
... chrom=1, start=1000000, end=2000000,
|
|
529
|
+
... panel_labels=["Height", "BMI", "WHR"],
|
|
530
|
+
... genes_df=genes_df,
|
|
531
|
+
... )
|
|
532
|
+
"""
|
|
533
|
+
n_gwas = len(gwas_dfs)
|
|
534
|
+
if n_gwas == 0:
|
|
535
|
+
raise ValueError("At least one GWAS DataFrame required")
|
|
536
|
+
|
|
537
|
+
# Validate inputs
|
|
538
|
+
for i, df in enumerate(gwas_dfs):
|
|
539
|
+
validate_gwas_df(df, pos_col=pos_col, p_col=p_col)
|
|
540
|
+
if genes_df is not None:
|
|
541
|
+
validate_genes_df(genes_df)
|
|
542
|
+
|
|
543
|
+
# Handle lead positions
|
|
544
|
+
if lead_positions is None:
|
|
545
|
+
lead_positions = []
|
|
546
|
+
for df in gwas_dfs:
|
|
547
|
+
region_df = df[(df[pos_col] >= start) & (df[pos_col] <= end)]
|
|
548
|
+
if not region_df.empty:
|
|
549
|
+
lead_idx = region_df[p_col].idxmin()
|
|
550
|
+
lead_positions.append(int(region_df.loc[lead_idx, pos_col]))
|
|
551
|
+
else:
|
|
552
|
+
lead_positions.append(None)
|
|
553
|
+
|
|
554
|
+
# Handle LD reference files
|
|
555
|
+
if ld_reference_files is None and ld_reference_file is not None:
|
|
556
|
+
ld_reference_files = [ld_reference_file] * n_gwas
|
|
557
|
+
|
|
558
|
+
# Calculate panel layout
|
|
559
|
+
panel_height = 2.5 # inches per GWAS panel
|
|
560
|
+
eqtl_height = 2.0 if eqtl_df is not None else 0
|
|
561
|
+
|
|
562
|
+
# Gene track height
|
|
563
|
+
if genes_df is not None:
|
|
564
|
+
chrom_str = normalize_chrom(chrom)
|
|
565
|
+
region_genes = genes_df[
|
|
566
|
+
(genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
|
|
567
|
+
& (genes_df["end"] >= start)
|
|
568
|
+
& (genes_df["start"] <= end)
|
|
569
|
+
]
|
|
570
|
+
if not region_genes.empty:
|
|
571
|
+
temp_positions = assign_gene_positions(
|
|
572
|
+
region_genes.sort_values("start"), start, end
|
|
573
|
+
)
|
|
574
|
+
n_gene_rows = max(temp_positions) + 1 if temp_positions else 1
|
|
575
|
+
else:
|
|
576
|
+
n_gene_rows = 1
|
|
577
|
+
gene_track_height = 1.0 + (n_gene_rows - 1) * 0.5
|
|
578
|
+
else:
|
|
579
|
+
gene_track_height = 0
|
|
580
|
+
|
|
581
|
+
# Calculate total panels and heights
|
|
582
|
+
n_panels = n_gwas + (1 if eqtl_df is not None else 0) + (1 if genes_df is not None else 0)
|
|
583
|
+
height_ratios = [panel_height] * n_gwas
|
|
584
|
+
if eqtl_df is not None:
|
|
585
|
+
height_ratios.append(eqtl_height)
|
|
586
|
+
if genes_df is not None:
|
|
587
|
+
height_ratios.append(gene_track_height)
|
|
588
|
+
|
|
589
|
+
# Calculate figure height
|
|
590
|
+
total_height = figsize[1] if figsize[1] else sum(height_ratios)
|
|
591
|
+
actual_figsize = (figsize[0], total_height)
|
|
592
|
+
|
|
593
|
+
logger.debug(f"Creating stacked plot with {n_panels} panels for chr{chrom}:{start}-{end}")
|
|
594
|
+
|
|
595
|
+
# Prevent auto-display in interactive environments
|
|
596
|
+
plt.ioff()
|
|
597
|
+
|
|
598
|
+
# Load recombination data if needed
|
|
599
|
+
if show_recombination and recomb_df is None:
|
|
600
|
+
recomb_df = self._get_recomb_for_region(chrom, start, end)
|
|
601
|
+
|
|
602
|
+
# Create figure
|
|
603
|
+
fig, axes = plt.subplots(
|
|
604
|
+
n_panels,
|
|
605
|
+
1,
|
|
606
|
+
figsize=actual_figsize,
|
|
607
|
+
height_ratios=height_ratios,
|
|
608
|
+
sharex=True,
|
|
609
|
+
gridspec_kw={"hspace": 0.05},
|
|
610
|
+
)
|
|
611
|
+
if n_panels == 1:
|
|
612
|
+
axes = [axes]
|
|
613
|
+
|
|
614
|
+
# Plot each GWAS panel
|
|
615
|
+
for i, (gwas_df, lead_pos) in enumerate(zip(gwas_dfs, lead_positions)):
|
|
616
|
+
ax = axes[i]
|
|
617
|
+
df = gwas_df.copy()
|
|
618
|
+
df["neglog10p"] = -np.log10(df[p_col].clip(lower=1e-300))
|
|
619
|
+
|
|
620
|
+
# Calculate LD if reference provided
|
|
621
|
+
ld_col = None
|
|
622
|
+
if ld_reference_files and ld_reference_files[i] and lead_pos:
|
|
623
|
+
lead_snp_row = df[df[pos_col] == lead_pos]
|
|
624
|
+
if not lead_snp_row.empty and rs_col in df.columns:
|
|
625
|
+
lead_snp_id = lead_snp_row[rs_col].iloc[0]
|
|
626
|
+
ld_df = calculate_ld(
|
|
627
|
+
bfile_path=ld_reference_files[i],
|
|
628
|
+
lead_snp=lead_snp_id,
|
|
629
|
+
window_kb=max((end - start) // 1000, 500),
|
|
630
|
+
plink_path=self.plink_path,
|
|
631
|
+
species=self.species,
|
|
632
|
+
)
|
|
633
|
+
if not ld_df.empty:
|
|
634
|
+
df = df.merge(ld_df, left_on=rs_col, right_on="SNP", how="left")
|
|
635
|
+
ld_col = "R2"
|
|
636
|
+
|
|
637
|
+
# Plot association
|
|
638
|
+
self._plot_association(ax, df, pos_col, ld_col, lead_pos)
|
|
639
|
+
|
|
640
|
+
# Add significance line
|
|
641
|
+
ax.axhline(y=self._genomewide_line, color="grey", linestyle="--", linewidth=1, zorder=1)
|
|
642
|
+
|
|
643
|
+
# Add SNP labels
|
|
644
|
+
if snp_labels and rs_col in df.columns and label_top_n > 0 and not df.empty:
|
|
645
|
+
add_snp_labels(
|
|
646
|
+
ax, df, pos_col=pos_col, neglog10p_col="neglog10p",
|
|
647
|
+
rs_col=rs_col, label_top_n=label_top_n, genes_df=genes_df, chrom=chrom,
|
|
648
|
+
)
|
|
649
|
+
|
|
650
|
+
# Add recombination overlay (only on first panel)
|
|
651
|
+
if i == 0 and recomb_df is not None and not recomb_df.empty:
|
|
652
|
+
add_recombination_overlay(ax, recomb_df, start, end)
|
|
653
|
+
|
|
654
|
+
# Format axes
|
|
655
|
+
ax.set_ylabel(r"$-\log_{10}$ P")
|
|
656
|
+
ax.set_xlim(start, end)
|
|
657
|
+
ax.spines["top"].set_visible(False)
|
|
658
|
+
ax.spines["right"].set_visible(False)
|
|
659
|
+
|
|
660
|
+
# Add panel label
|
|
661
|
+
if panel_labels and i < len(panel_labels):
|
|
662
|
+
ax.annotate(
|
|
663
|
+
panel_labels[i],
|
|
664
|
+
xy=(0.02, 0.95),
|
|
665
|
+
xycoords="axes fraction",
|
|
666
|
+
fontsize=11,
|
|
667
|
+
fontweight="bold",
|
|
668
|
+
va="top",
|
|
669
|
+
ha="left",
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
# Add LD legend (only on first panel)
|
|
673
|
+
if i == 0 and ld_col is not None and ld_col in df.columns:
|
|
674
|
+
self._add_ld_legend(ax)
|
|
675
|
+
|
|
676
|
+
# Plot eQTL panel if provided
|
|
677
|
+
panel_idx = n_gwas
|
|
678
|
+
if eqtl_df is not None:
|
|
679
|
+
ax = axes[panel_idx]
|
|
680
|
+
eqtl_data = eqtl_df.copy()
|
|
681
|
+
|
|
682
|
+
# Filter by gene if specified
|
|
683
|
+
if eqtl_gene and "gene" in eqtl_data.columns:
|
|
684
|
+
eqtl_data = eqtl_data[eqtl_data["gene"] == eqtl_gene]
|
|
685
|
+
|
|
686
|
+
# Filter by region
|
|
687
|
+
if "pos" in eqtl_data.columns:
|
|
688
|
+
eqtl_data = eqtl_data[(eqtl_data["pos"] >= start) & (eqtl_data["pos"] <= end)]
|
|
689
|
+
|
|
690
|
+
if not eqtl_data.empty:
|
|
691
|
+
eqtl_data["neglog10p"] = -np.log10(eqtl_data["p_value"].clip(lower=1e-300))
|
|
692
|
+
|
|
693
|
+
# Plot as diamonds (different from GWAS circles)
|
|
694
|
+
ax.scatter(
|
|
695
|
+
eqtl_data["pos"],
|
|
696
|
+
eqtl_data["neglog10p"],
|
|
697
|
+
c="#FF6B6B",
|
|
698
|
+
s=60,
|
|
699
|
+
marker="D",
|
|
700
|
+
edgecolor="black",
|
|
701
|
+
linewidth=0.5,
|
|
702
|
+
zorder=2,
|
|
703
|
+
label=f"eQTL ({eqtl_gene})" if eqtl_gene else "eQTL",
|
|
704
|
+
)
|
|
705
|
+
ax.legend(loc="upper left", fontsize=9)
|
|
706
|
+
|
|
707
|
+
ax.set_ylabel(r"$-\log_{10}$ P (eQTL)")
|
|
708
|
+
ax.axhline(y=self._genomewide_line, color="grey", linestyle="--", linewidth=1)
|
|
709
|
+
ax.spines["top"].set_visible(False)
|
|
710
|
+
ax.spines["right"].set_visible(False)
|
|
711
|
+
panel_idx += 1
|
|
712
|
+
|
|
713
|
+
# Plot gene track
|
|
714
|
+
if genes_df is not None:
|
|
715
|
+
gene_ax = axes[panel_idx]
|
|
716
|
+
plot_gene_track(gene_ax, genes_df, chrom, start, end, exons_df)
|
|
717
|
+
gene_ax.set_xlabel(f"Chromosome {chrom} (Mb)")
|
|
718
|
+
gene_ax.spines["top"].set_visible(False)
|
|
719
|
+
gene_ax.spines["right"].set_visible(False)
|
|
720
|
+
gene_ax.spines["left"].set_visible(False)
|
|
721
|
+
else:
|
|
722
|
+
# Set x-label on bottom panel
|
|
723
|
+
axes[-1].set_xlabel(f"Chromosome {chrom} (Mb)")
|
|
724
|
+
|
|
725
|
+
# Format x-axis
|
|
726
|
+
axes[0].xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{x / 1e6:.2f}"))
|
|
727
|
+
axes[0].xaxis.set_major_locator(MaxNLocator(nbins=6))
|
|
728
|
+
|
|
729
|
+
# Adjust layout
|
|
730
|
+
fig.subplots_adjust(left=0.08, right=0.95, top=0.95, bottom=0.08, hspace=0.05)
|
|
731
|
+
plt.ion()
|
|
732
|
+
|
|
733
|
+
return fig
|