pylocuszoom 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +120 -0
- pylocuszoom/backends/__init__.py +52 -0
- pylocuszoom/backends/base.py +341 -0
- pylocuszoom/backends/bokeh_backend.py +441 -0
- pylocuszoom/backends/matplotlib_backend.py +288 -0
- pylocuszoom/backends/plotly_backend.py +474 -0
- pylocuszoom/colors.py +107 -0
- pylocuszoom/eqtl.py +218 -0
- pylocuszoom/gene_track.py +311 -0
- pylocuszoom/labels.py +118 -0
- pylocuszoom/ld.py +209 -0
- pylocuszoom/logging.py +153 -0
- pylocuszoom/plotter.py +733 -0
- pylocuszoom/recombination.py +432 -0
- pylocuszoom/reference_data/__init__.py +4 -0
- pylocuszoom/utils.py +194 -0
- pylocuszoom-0.1.0.dist-info/METADATA +367 -0
- pylocuszoom-0.1.0.dist-info/RECORD +20 -0
- pylocuszoom-0.1.0.dist-info/WHEEL +4 -0
- pylocuszoom-0.1.0.dist-info/licenses/LICENSE.md +17 -0
pylocuszoom/eqtl.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""eQTL data handling and validation for pyLocusZoom.
|
|
2
|
+
|
|
3
|
+
Provides utilities for loading, validating, and preparing expression
|
|
4
|
+
quantitative trait loci (eQTL) data for overlay on regional plots.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from .logging import logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Column names an eQTL DataFrame is expected to provide. These match the
# default ``pos_col`` / ``p_col`` arguments of the functions in this module.
REQUIRED_EQTL_COLS = ["pos", "p_value"]
# Additional column names an eQTL DataFrame may carry.
# NOTE(review): not referenced in the visible part of this module - confirm
# where (or whether) this list is consumed.
OPTIONAL_EQTL_COLS = ["gene", "effect_size", "rs", "se"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EQTLValidationError(ValueError):
    """Signal that an eQTL DataFrame failed validation.

    Derives from ``ValueError`` so that callers catching ``ValueError``
    also catch this exception.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def validate_eqtl_df(
    df: pd.DataFrame,
    pos_col: str = "pos",
    p_col: str = "p_value",
) -> None:
    """Check that an eQTL DataFrame carries the position and p-value columns.

    Args:
        df: eQTL DataFrame to inspect.
        pos_col: Name of the genomic position column.
        p_col: Name of the p-value column.

    Raises:
        EQTLValidationError: If ``pos_col`` and/or ``p_col`` is not a column
            of ``df``.
    """
    # Position is checked first so the reported list keeps a stable order.
    absent = [name for name in (pos_col, p_col) if name not in df.columns]
    if not absent:
        return

    raise EQTLValidationError(
        f"eQTL DataFrame missing required columns: {absent}. "
        f"Required: {pos_col} (position), {p_col} (p-value)"
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def filter_eqtl_by_gene(
    df: pd.DataFrame,
    gene: str,
    gene_col: str = "gene",
) -> pd.DataFrame:
    """Keep only the eQTL rows whose gene column equals the target gene.

    Args:
        df: eQTL DataFrame.
        gene: Target gene name; rows are kept when ``df[gene_col] == gene``.
        gene_col: Column containing gene names.

    Returns:
        A copy of the matching rows (the input frame is not modified).

    Raises:
        EQTLValidationError: If ``gene_col`` is not a column of ``df``.
    """
    if gene_col in df.columns:
        is_target = df[gene_col] == gene
        subset = df[is_target].copy()
        logger.debug(f"Filtered eQTL data to {len(subset)} variants for gene {gene}")
        return subset

    raise EQTLValidationError(
        f"Cannot filter by gene: column '{gene_col}' not found. "
        f"Available columns: {list(df.columns)}"
    )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def filter_eqtl_by_region(
    df: pd.DataFrame,
    chrom: int,
    start: int,
    end: int,
    pos_col: str = "pos",
    chrom_col: Optional[str] = "chr",
) -> pd.DataFrame:
    """Restrict eQTL rows to a genomic window.

    Args:
        df: eQTL DataFrame.
        chrom: Chromosome of the region.
        start: Region start (inclusive).
        end: Region end (inclusive).
        pos_col: Column name for position.
        chrom_col: Column name for chromosome. The chromosome filter is only
            applied when this is non-empty and present in ``df``.

    Returns:
        A copy of the rows that fall inside the region.
    """
    positions = df[pos_col]
    in_region = (positions >= start) & (positions <= end)

    has_chrom_column = bool(chrom_col) and chrom_col in df.columns
    if has_chrom_column:
        # Compare as strings with any "chr" text removed on both sides so
        # that e.g. 1, "1" and "chr1" are treated as the same chromosome.
        target = str(chrom).replace("chr", "")
        observed = df[chrom_col].astype(str).str.replace("chr", "", regex=False)
        in_region = in_region & (observed == target)

    subset = df[in_region].copy()
    logger.debug(f"Filtered eQTL data to {len(subset)} variants in region chr{chrom}:{start}-{end}")
    return subset
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def prepare_eqtl_for_plotting(
    df: pd.DataFrame,
    pos_col: str = "pos",
    p_col: str = "p_value",
    gene: Optional[str] = None,
    chrom: Optional[int] = None,
    start: Optional[int] = None,
    end: Optional[int] = None,
) -> pd.DataFrame:
    """Validate, filter, and annotate eQTL data so it is ready to plot.

    Args:
        df: Raw eQTL DataFrame.
        pos_col: Column name for position.
        p_col: Column name for p-value.
        gene: If truthy, keep only rows for this gene (uses the default
            ``"gene"`` column of ``filter_eqtl_by_gene``).
        chrom: Chromosome for region filtering.
        start: Start position for region filtering.
        end: End position for region filtering.

    Returns:
        A new DataFrame with a ``neglog10p`` column added. The region filter
        is applied only when ``chrom``, ``start`` and ``end`` are all given.

    Raises:
        EQTLValidationError: If required columns are missing, or if ``gene``
            is given and the gene column does not exist.
    """
    validate_eqtl_df(df, pos_col=pos_col, p_col=p_col)

    # Work on a copy so the caller's frame never gains the extra column.
    prepared = df.copy()

    if gene:
        prepared = filter_eqtl_by_gene(prepared, gene)

    region_given = not (chrom is None or start is None or end is None)
    if region_given:
        prepared = filter_eqtl_by_region(prepared, chrom, start, end, pos_col=pos_col)

    # Floor p-values at 1e-300 so that p == 0 does not become infinity.
    floored_p = prepared[p_col].clip(lower=1e-300)
    prepared["neglog10p"] = -np.log10(floored_p)

    return prepared
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_eqtl_genes(df: pd.DataFrame, gene_col: str = "gene") -> List[str]:
    """List the distinct gene names present in eQTL data.

    Args:
        df: eQTL DataFrame.
        gene_col: Column containing gene names.

    Returns:
        Sorted list of unique, non-missing gene names; an empty list when
        ``gene_col`` is not a column of ``df``.
    """
    if gene_col in df.columns:
        names = df[gene_col].dropna().unique().tolist()
        names.sort()
        return names
    return []
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def calculate_colocalization_overlap(
    gwas_df: pd.DataFrame,
    eqtl_df: pd.DataFrame,
    gwas_pos_col: str = "ps",
    eqtl_pos_col: str = "pos",
    gwas_p_col: str = "p_wald",
    eqtl_p_col: str = "p_value",
    p_threshold: float = 1e-5,
) -> pd.DataFrame:
    """Report positions that pass the p-value threshold in both datasets.

    This is a plain positional overlap, not a statistical colocalization
    test; use dedicated tools such as coloc or eCAVIAR for that.

    Args:
        gwas_df: GWAS results DataFrame.
        eqtl_df: eQTL results DataFrame.
        gwas_pos_col: Position column in GWAS data.
        eqtl_pos_col: Position column in eQTL data.
        gwas_p_col: P-value column in GWAS data.
        eqtl_p_col: P-value column in eQTL data.
        p_threshold: Rows are kept when their p-value is strictly below this.

    Returns:
        Inner join (on position) of the significant rows from both inputs,
        holding only the position and p-value columns. Clashing column names
        receive the ``_gwas`` / ``_eqtl`` suffixes.
    """
    # Reduce each table to its significant rows and the two relevant columns.
    gwas_is_sig = gwas_df[gwas_p_col] < p_threshold
    gwas_hits = gwas_df[gwas_is_sig][[gwas_pos_col, gwas_p_col]]

    eqtl_is_sig = eqtl_df[eqtl_p_col] < p_threshold
    eqtl_hits = eqtl_df[eqtl_is_sig][[eqtl_pos_col, eqtl_p_col]]

    shared = pd.merge(
        gwas_hits,
        eqtl_hits,
        how="inner",
        left_on=gwas_pos_col,
        right_on=eqtl_pos_col,
        suffixes=("_gwas", "_eqtl"),
    )

    logger.info(
        f"Found {len(shared)} SNPs significant in both GWAS and eQTL "
        f"(p < {p_threshold})"
    )

    return shared
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""Gene track visualization for regional association plots.
|
|
2
|
+
|
|
3
|
+
Provides LocusZoom-style gene track plotting with:
|
|
4
|
+
- Thin horizontal lines for introns
|
|
5
|
+
- Thick rectangles for exons
|
|
6
|
+
- Arrows indicating strand direction
|
|
7
|
+
- Gene name labels
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import List, Optional, Union
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from matplotlib.axes import Axes
|
|
14
|
+
from matplotlib.patches import Polygon, Rectangle
|
|
15
|
+
|
|
16
|
+
from .utils import normalize_chrom
|
|
17
|
+
|
|
18
|
+
# Strand-specific colors (bold, distinct).
# Looked up by a gene's strand value; the ``None`` entry is the fallback
# used for any strand value that is not a key of this mapping.
STRAND_COLORS: dict[Optional[str], str] = {
    "+": "#6A3D9A",  # Bold purple for forward strand
    "-": "#1F78B4",  # Bold blue for reverse strand
    None: "#666666",  # Grey if no strand info
}
|
|
24
|
+
|
|
25
|
+
# Layout constants, expressed in y-axis data units of the gene track axes.
ROW_HEIGHT = 0.40  # Total height per row (vertical distance between stacked rows)
GENE_AREA = 0.28  # Bottom portion for gene drawing (enters the upper y-limit)
EXON_HEIGHT = 0.22  # Exon rectangle height (also used for the no-exon fallback box)
INTRON_HEIGHT = 0.02  # Thin intron line
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def assign_gene_positions(genes_df: pd.DataFrame, start: int, end: int) -> List[int]:
    """Assign row indices to genes so that genes sharing a row do not collide.

    Uses a greedy first-fit strategy: each gene (clipped to the region) is
    placed in the lowest row whose right-most occupied position lies at least
    ``label_buffer`` to the left of the gene's start. A new row is opened
    when no existing row qualifies.

    Args:
        genes_df: Gene annotations DataFrame with ``start`` and ``end``
            columns, sorted by start position.
        start: Region start position.
        end: Region end position.

    Returns:
        List of integer row indices (0, 1, 2, ...), one per row of
        ``genes_df`` in iteration order.
    """
    # Extra horizontal clearance (8% of the region) reserved for labels.
    # It is constant for the whole call, so compute it once.
    label_buffer = (end - start) * 0.08

    positions: List[int] = []
    # row_ends[r] is the right-most clipped gene end already placed in row r.
    # Tracking one value per row guarantees every candidate row is checked
    # against everything in it, which a single pass over a flat list of
    # (end, row) pairs does not.
    row_ends: list = []

    for _, gene in genes_df.iterrows():
        gene_start = max(gene["start"], start)
        gene_end = min(gene["end"], end)

        for row, row_end in enumerate(row_ends):
            if row_end <= gene_start - label_buffer:
                # Row is free: occupy it and extend its right edge.
                row_ends[row] = max(row_end, gene_end)
                break
        else:
            # Every existing row is blocked; open a new one on top.
            row = len(row_ends)
            row_ends.append(gene_end)

        positions.append(row)

    return positions
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_nearest_gene(
    genes_df: pd.DataFrame,
    chrom: Union[int, str],
    pos: int,
    window: int = 50000,
) -> Optional[str]:
    """Get the nearest gene name for a genomic position.

    Considers genes on ``chrom`` whose span, widened by ``window`` on both
    sides, contains ``pos``, and returns the one whose midpoint is closest
    to ``pos``.

    Args:
        genes_df: Gene annotations DataFrame with chr, start, end, gene_name.
        chrom: Chromosome number or string.
        pos: Position in base pairs.
        window: Window size in bp for searching nearby genes.

    Returns:
        Gene name, or None if no gene on the chromosome lies within the
        window.

    Example:
        >>> gene = get_nearest_gene(genes_df, chrom=1, pos=1500000)
        >>> gene
        'BRCA1'
    """
    # NOTE(review): normalize_chrom is defined in .utils; presumably it yields
    # the chromosome as a string without a "chr" prefix - confirm.
    chrom_str = normalize_chrom(chrom)
    gene_chroms = genes_df["chr"].astype(str).str.replace("chr", "", regex=False)
    chrom_genes = genes_df[gene_chroms == chrom_str]

    if chrom_genes.empty:
        return None

    # Genes that overlap the position or lie within `window` bp of it.
    within_window = (chrom_genes["start"] - window <= pos) & (
        chrom_genes["end"] + window >= pos
    )
    nearby = chrom_genes[within_window]

    if nearby.empty:
        return None

    # Pick the closest gene by midpoint distance. Selection is positional
    # (argmin + iloc) rather than label-based: with duplicate index labels a
    # `.loc[label, "gene_name"]` lookup returns a Series instead of one name.
    midpoint_dist = ((nearby["start"] + nearby["end"]) / 2 - pos).abs()
    closest = int(midpoint_dist.argmin())
    return nearby["gene_name"].iloc[closest]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def plot_gene_track(
    ax: Axes,
    genes_df: pd.DataFrame,
    chrom: Union[int, str],
    start: int,
    end: int,
    exons_df: Optional[pd.DataFrame] = None,
) -> None:
    """Plot gene annotations as a LocusZoom-style track.

    Creates a gene track with:
    - Thin horizontal lines for introns (gene body)
    - Thick rectangles for exons
    - Arrows indicating strand direction (only for genes whose strand is
      "+" or "-"; other or missing strand values get no arrow)
    - Gene name labels

    Genes are clipped to ``[start, end]`` and stacked into rows so that
    overlapping genes do not share a row. When no gene overlaps the region
    a centered "No genes" note is drawn instead.

    Args:
        ax: Matplotlib axes for gene track. Its limits, y-label, ticks and
            spines are reconfigured by this function.
        genes_df: Gene annotations with chr, start, end, gene_name,
            and optionally strand (+/-) column.
        chrom: Chromosome number or string.
        start: Region start position.
        end: Region end position.
        exons_df: Exon annotations with chr, start, end, gene_name
            columns for drawing exon structure. Optional.
    """
    chrom_str = normalize_chrom(chrom)
    region_genes = genes_df[
        (genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
        & (genes_df["end"] >= start)
        & (genes_df["start"] <= end)
    ].copy()

    ax.set_xlim(start, end)
    ax.set_ylabel("Genes", fontsize=10)
    ax.set_yticks([])

    # theme_classic: only bottom spine
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_visible(False)
    ax.spines["bottom"].set_linewidth(0.5)

    if region_genes.empty:
        ax.set_ylim(0, 1)
        ax.text(
            (start + end) / 2,
            0.5,
            "No genes",
            ha="center",
            va="center",
            fontsize=9,
            color="grey",
            style="italic",
        )
        return

    # Assign vertical positions to avoid overlap
    region_genes = region_genes.sort_values("start")
    positions = assign_gene_positions(region_genes, start, end)

    # Set y-axis limits - small bottom margin for gene body, tight top
    max_row = max(positions) if positions else 0
    bottom_margin = EXON_HEIGHT / 2 + 0.02  # Room for bottom gene
    top_margin = 0.15  # Small space above top label
    ax.set_ylim(
        -bottom_margin,
        max_row * ROW_HEIGHT + GENE_AREA + top_margin,
    )

    # Filter exons for this region if available
    region_exons = None
    if exons_df is not None and not exons_df.empty:
        region_exons = exons_df[
            (
                exons_df["chr"].astype(str).str.replace("chr", "", regex=False)
                == chrom_str
            )
            & (exons_df["end"] >= start)
            & (exons_df["start"] <= end)
        ].copy()
    have_exons = region_exons is not None and not region_exons.empty

    has_strand = "strand" in region_genes.columns

    # Strand-arrow dimensions depend only on the region, not on the gene.
    tri_height = EXON_HEIGHT * 0.35
    tri_width = (end - start) * 0.006

    for row, (_, gene) in zip(positions, region_genes.iterrows()):
        gene_start = max(int(gene["start"]), start)
        gene_end = min(int(gene["end"]), end)
        gene_name = gene.get("gene_name", "")

        # Get strand-specific color (grey fallback for unknown strand values)
        strand = gene.get("strand") if has_strand else None
        gene_col = STRAND_COLORS.get(strand, STRAND_COLORS[None])

        # Y position: bottom of row + offset for gene area
        y_gene = row * ROW_HEIGHT + 0.05
        y_label = y_gene + EXON_HEIGHT / 2 + 0.01  # Just above gene top

        # Check if we have exon data for this gene
        gene_exons = None
        if have_exons and gene_name:
            gene_exons = region_exons[region_exons["gene_name"] == gene_name].copy()

        if gene_exons is not None and not gene_exons.empty:
            # Draw intron line (thin horizontal line spanning gene)
            ax.add_patch(
                Rectangle(
                    (gene_start, y_gene - INTRON_HEIGHT / 2),
                    gene_end - gene_start,
                    INTRON_HEIGHT,
                    facecolor=gene_col,
                    edgecolor=gene_col,
                    linewidth=0.5,
                    zorder=1,
                )
            )

            # Draw exons (thick rectangles), clipped to the region
            for _, exon in gene_exons.iterrows():
                exon_start = max(int(exon["start"]), start)
                exon_end = min(int(exon["end"]), end)
                ax.add_patch(
                    Rectangle(
                        (exon_start, y_gene - EXON_HEIGHT / 2),
                        exon_end - exon_start,
                        EXON_HEIGHT,
                        facecolor=gene_col,
                        edgecolor=gene_col,
                        linewidth=0.5,
                        zorder=2,
                    )
                )
        else:
            # No exon data - draw full gene body as rectangle (fallback)
            ax.add_patch(
                Rectangle(
                    (gene_start, y_gene - EXON_HEIGHT / 2),
                    gene_end - gene_start,
                    EXON_HEIGHT,
                    facecolor=gene_col,
                    edgecolor=gene_col,
                    linewidth=0.5,
                    zorder=2,
                )
            )

        # Add strand direction triangle at gene tip. Only a definite "+" or
        # "-" gets an arrow: a missing/unknown strand must not be drawn as
        # if it were the reverse strand.
        if strand in ("+", "-"):
            if strand == "+":  # Forward strand: arrow starts at gene end
                base_x = gene_end
                tip_x = base_x + tri_width
            else:  # Reverse strand: arrow starts at gene start
                base_x = gene_start
                tip_x = base_x - tri_width

            # Triangle sits entirely past the gene tip
            triangle = Polygon(
                [
                    [tip_x, y_gene],
                    [base_x, y_gene + tri_height],
                    [base_x, y_gene - tri_height],
                ],
                closed=True,
                facecolor="black",
                edgecolor="black",
                linewidth=0.5,
                zorder=5,
            )
            ax.add_patch(triangle)

        # Add gene name label in the gap above gene
        if gene_name:
            label_pos = (gene_start + gene_end) / 2
            ax.text(
                label_pos,
                y_label,
                gene_name,
                ha="center",
                va="bottom",
                fontsize=5.5,
                color="#000000",
                fontweight="medium",
                style="italic",
                zorder=4,
                clip_on=True,
            )
|
pylocuszoom/labels.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""SNP label placement for regional association plots.
|
|
2
|
+
|
|
3
|
+
Provides automatic labeling of top significant SNPs with:
|
|
4
|
+
- SNP ID (rs number)
|
|
5
|
+
- Nearest gene name (if gene annotations provided)
|
|
6
|
+
- Automatic overlap avoidance (if adjustText installed)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import List, Optional, Union
|
|
10
|
+
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from matplotlib.axes import Axes
|
|
13
|
+
from matplotlib.text import Annotation
|
|
14
|
+
|
|
15
|
+
from .gene_track import get_nearest_gene
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def add_snp_labels(
    ax: Axes,
    df: pd.DataFrame,
    pos_col: str = "ps",
    neglog10p_col: str = "neglog10p",
    rs_col: str = "rs",
    label_top_n: int = 5,
    genes_df: Optional[pd.DataFrame] = None,
    chrom: Optional[Union[int, str]] = None,
    max_label_length: int = 15,
) -> List[Annotation]:
    """Annotate the most significant SNPs of a regional plot.

    Each of the top ``label_top_n`` SNPs (ranked by ``neglog10p_col``) gets a
    text label: the nearest gene name when both ``genes_df`` and ``chrom``
    are supplied and a gene is found, otherwise the SNP ID.

    Args:
        ax: Matplotlib axes object.
        df: DataFrame with SNP data. Must have the specified position,
            neglog10p, and rs columns.
        pos_col: Column name for position.
        neglog10p_col: Column name for -log10(p-value).
        rs_col: Column name for SNP ID.
        label_top_n: Number of top SNPs to label.
        genes_df: Optional gene annotations for gene-based labels.
        chrom: Chromosome number. Required if genes_df is provided.
        max_label_length: Labels longer than this are cut and end in "...".

    Returns:
        List of matplotlib text annotation objects, one per labeled SNP.

    Raises:
        ValueError: If ``neglog10p_col`` is not a column of ``df``.

    Example:
        >>> fig, ax = plt.subplots()
        >>> # ... plot your data ...
        >>> texts = add_snp_labels(ax, df, label_top_n=5)
    """
    if neglog10p_col not in df.columns:
        raise ValueError(
            f"Column '{neglog10p_col}' not found in DataFrame. "
            "Ensure -log10(p) values are calculated before calling add_snp_labels."
        )

    use_gene_names = genes_df is not None and chrom is not None

    annotations = []
    for _, row in df.nlargest(label_top_n, neglog10p_col).iterrows():
        x_pos = row[pos_col]
        y_val = row[neglog10p_col]

        # Default to the SNP ID; a nearby gene name takes precedence.
        caption = str(row[rs_col])
        if use_gene_names:
            gene_name = get_nearest_gene(genes_df, chrom, int(x_pos))
            if gene_name:
                caption = gene_name

        # Shorten over-long captions, keeping room for the ellipsis.
        if len(caption) > max_label_length:
            caption = caption[: max_label_length - 3] + "..."

        # Place the label slightly up and to the right of the point.
        annotations.append(
            ax.annotate(
                caption,
                xy=(x_pos, y_val),
                xytext=(5, 5),
                textcoords="offset points",
                fontsize=8,
                fontweight="bold",
                color="#333333",
                ha="left",
                va="bottom",
                zorder=15,
                bbox=dict(
                    boxstyle="round,pad=0.2",
                    facecolor="white",
                    edgecolor="none",
                    alpha=0.8,
                ),
            )
        )

    # Optional de-overlapping: used only when adjustText is importable.
    try:
        from adjustText import adjust_text

        adjust_text(
            annotations,
            ax=ax,
            arrowprops=dict(arrowstyle="-", color="gray", lw=0.5),
            expand_points=(1.5, 1.5),
        )
    except ImportError:
        # adjustText not installed, labels may overlap
        pass

    return annotations
|