pylocuszoom 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/eqtl.py ADDED
@@ -0,0 +1,218 @@
1
+ """eQTL data handling and validation for pyLocusZoom.
2
+
3
+ Provides utilities for loading, validating, and preparing expression
4
+ quantitative trait loci (eQTL) data for overlay on regional plots.
5
+ """
6
+
7
+ from typing import List, Optional
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from .logging import logger
13
+
14
+
15
# Default column names treated as mandatory for an eQTL table.
REQUIRED_EQTL_COLS = [
    "pos",
    "p_value",
]

# Default column names that are recognised but not mandatory.
OPTIONAL_EQTL_COLS = [
    "gene",
    "effect_size",
    "rs",
    "se",
]
17
+
18
+
19
class EQTLValidationError(ValueError):
    """Signal that an eQTL DataFrame did not pass validation.

    Derives from ``ValueError``, so handlers written for ``ValueError``
    also catch this exception.
    """
23
+
24
+
25
def validate_eqtl_df(
    df: pd.DataFrame,
    pos_col: str = "pos",
    p_col: str = "p_value",
) -> None:
    """Check that an eQTL DataFrame carries its position and p-value columns.

    Args:
        df: eQTL DataFrame to check.
        pos_col: Name of the genomic position column.
        p_col: Name of the p-value column.

    Raises:
        EQTLValidationError: If either named column is absent from ``df``.
    """
    # Position is tested before p-value so the reported order is stable.
    absent = [name for name in (pos_col, p_col) if name not in df.columns]
    if not absent:
        return

    raise EQTLValidationError(
        f"eQTL DataFrame missing required columns: {absent}. "
        f"Required: {pos_col} (position), {p_col} (p-value)"
    )
51
+
52
+
53
def filter_eqtl_by_gene(
    df: pd.DataFrame,
    gene: str,
    gene_col: str = "gene",
) -> pd.DataFrame:
    """Keep only the eQTL rows whose gene column equals ``gene``.

    Args:
        df: eQTL DataFrame.
        gene: Target gene name to keep.
        gene_col: Name of the column holding gene names.

    Returns:
        An independent copy of the rows where ``df[gene_col] == gene``.

    Raises:
        EQTLValidationError: If ``gene_col`` is not a column of ``df``.
    """
    if gene_col in df.columns:
        is_target = df[gene_col] == gene
        # Copy so callers can mutate the result without touching ``df``.
        subset = df[is_target].copy()
        logger.debug(f"Filtered eQTL data to {len(subset)} variants for gene {gene}")
        return subset

    raise EQTLValidationError(
        f"Cannot filter by gene: column '{gene_col}' not found. "
        f"Available columns: {list(df.columns)}"
    )
80
+
81
+
82
def filter_eqtl_by_region(
    df: pd.DataFrame,
    chrom: int,
    start: int,
    end: int,
    pos_col: str = "pos",
    chrom_col: Optional[str] = "chr",
) -> pd.DataFrame:
    """Keep only the eQTL rows that fall inside a genomic window.

    Args:
        df: eQTL DataFrame.
        chrom: Chromosome to keep; a leading "chr" is ignored when comparing.
        start: First position of the window (inclusive).
        end: Last position of the window (inclusive).
        pos_col: Name of the position column.
        chrom_col: Name of the chromosome column. The chromosome test is
            skipped when this is falsy or the column is not in ``df``.

    Returns:
        An independent copy of the rows inside the window.
    """
    coords = df[pos_col]
    keep = (coords >= start) & (coords <= end)

    has_chrom_column = bool(chrom_col) and chrom_col in df.columns
    if has_chrom_column:
        # Compare as strings with "chr" removed so 1, "1" and "chr1" agree.
        wanted = str(chrom).replace("chr", "")
        observed = df[chrom_col].astype(str).str.replace("chr", "", regex=False)
        keep = keep & (observed == wanted)

    subset = df[keep].copy()
    logger.debug(f"Filtered eQTL data to {len(subset)} variants in region chr{chrom}:{start}-{end}")
    return subset
114
+
115
+
116
def prepare_eqtl_for_plotting(
    df: pd.DataFrame,
    pos_col: str = "pos",
    p_col: str = "p_value",
    gene: Optional[str] = None,
    chrom: Optional[int] = None,
    start: Optional[int] = None,
    end: Optional[int] = None,
) -> pd.DataFrame:
    """Validate, optionally filter, and annotate eQTL data for plotting.

    Args:
        df: Raw eQTL DataFrame; it is not modified.
        pos_col: Name of the position column.
        p_col: Name of the p-value column.
        gene: Gene to restrict to; skipped when falsy.
        chrom: Chromosome for region filtering.
        start: Region start for region filtering.
        end: Region end for region filtering.

    Returns:
        A new DataFrame with a ``neglog10p`` column added. Region filtering
        is applied only when ``chrom``, ``start`` and ``end`` are all given.

    Raises:
        EQTLValidationError: If the position or p-value column is missing,
            or if ``gene`` is given and the gene column is missing.
    """
    validate_eqtl_df(df, pos_col=pos_col, p_col=p_col)

    prepared = df.copy()

    if gene:
        prepared = filter_eqtl_by_gene(prepared, gene)

    region_given = all(bound is not None for bound in (chrom, start, end))
    if region_given:
        prepared = filter_eqtl_by_region(prepared, chrom, start, end, pos_col=pos_col)

    # Floor p-values at 1e-300 so a p-value of 0 does not become infinite.
    floored_p = prepared[p_col].clip(lower=1e-300)
    prepared["neglog10p"] = -np.log10(floored_p)
    return prepared
157
+
158
+
159
def get_eqtl_genes(df: pd.DataFrame, gene_col: str = "gene") -> List[str]:
    """List the distinct gene names present in eQTL data.

    Args:
        df: eQTL DataFrame.
        gene_col: Name of the column holding gene names.

    Returns:
        Sorted unique non-null values of ``gene_col``, or an empty list
        when that column is not in ``df``.
    """
    if gene_col in df.columns:
        names = df[gene_col].dropna().unique().tolist()
        names.sort()
        return names
    return []
172
+
173
+
174
def calculate_colocalization_overlap(
    gwas_df: pd.DataFrame,
    eqtl_df: pd.DataFrame,
    gwas_pos_col: str = "ps",
    eqtl_pos_col: str = "pos",
    gwas_p_col: str = "p_wald",
    eqtl_p_col: str = "p_value",
    p_threshold: float = 1e-5,
) -> pd.DataFrame:
    """Return positions that pass the p-value threshold in both datasets.

    This is a plain position overlap, not a statistical colocalization
    test; use dedicated tools such as coloc or eCAVIAR for that.

    Args:
        gwas_df: GWAS results DataFrame.
        eqtl_df: eQTL results DataFrame.
        gwas_pos_col: Position column in the GWAS data.
        eqtl_pos_col: Position column in the eQTL data.
        gwas_p_col: P-value column in the GWAS data.
        eqtl_p_col: P-value column in the eQTL data.
        p_threshold: Rows are kept when their p-value is strictly below this.

    Returns:
        Inner join of the significant rows of both inputs on position,
        carrying only the position and p-value columns. Clashing column
        names receive the ``_gwas`` / ``_eqtl`` suffixes.
    """
    gwas_is_sig = gwas_df[gwas_p_col] < p_threshold
    eqtl_is_sig = eqtl_df[eqtl_p_col] < p_threshold

    # Only position and p-value are carried into the join.
    gwas_hits = gwas_df[gwas_is_sig][[gwas_pos_col, gwas_p_col]]
    eqtl_hits = eqtl_df[eqtl_is_sig][[eqtl_pos_col, eqtl_p_col]]

    shared = gwas_hits.merge(
        eqtl_hits,
        left_on=gwas_pos_col,
        right_on=eqtl_pos_col,
        how="inner",
        suffixes=("_gwas", "_eqtl"),
    )

    logger.info(
        f"Found {len(shared)} SNPs significant in both GWAS and eQTL "
        f"(p < {p_threshold})"
    )
    return shared
pylocuszoom/gene_track.py ADDED
@@ -0,0 +1,311 @@
1
+ """Gene track visualization for regional association plots.
2
+
3
+ Provides LocusZoom-style gene track plotting with:
4
+ - Thin horizontal lines for introns
5
+ - Thick rectangles for exons
6
+ - Arrows indicating strand direction
7
+ - Gene name labels
8
+ """
9
+
10
+ from typing import List, Optional, Union
11
+
12
+ import pandas as pd
13
+ from matplotlib.axes import Axes
14
+ from matplotlib.patches import Polygon, Rectangle
15
+
16
+ from .utils import normalize_chrom
17
+
18
# Fill colour per strand; the ``None`` entry is the fallback used when a
# gene carries no strand information.
STRAND_COLORS: dict[Optional[str], str] = {
    # Forward strand: bold purple.
    "+": "#6A3D9A",
    # Reverse strand: bold teal/blue.
    "-": "#1F78B4",
    # No strand info: grey.
    None: "#666666",
}

# Vertical layout of the gene track, in y-axis data units.
ROW_HEIGHT = 0.40  # total height allotted to one row of genes
GENE_AREA = 0.28  # bottom portion of a row used for drawing the gene
EXON_HEIGHT = 0.22  # height of an exon rectangle
INTRON_HEIGHT = 0.02  # height of the thin intron line
30
+
31
+
32
def assign_gene_positions(genes_df: pd.DataFrame, start: int, end: int) -> List[int]:
    """Assign a row index to each gene so drawn genes do not collide.

    Each gene is clipped to ``[start, end]`` and placed greedily in the
    lowest row whose already-placed genes all end at least one label
    buffer (8% of the region width) before the gene begins.

    Args:
        genes_df: Gene annotations with ``start`` and ``end`` columns,
            expected to be sorted by start position.
        start: Region start position.
        end: Region end position.

    Returns:
        List of integer row indices (0, 1, 2, ...), one per row of
        ``genes_df`` in order. Empty when ``genes_df`` has no rows.
    """
    # Extra horizontal room reserved for gene-name labels.
    label_buffer = (end - start) * 0.08

    # row_ends[r] is the furthest clipped end among genes placed in row r.
    # Keeping one value per row means every candidate row is checked
    # against all genes in it. A single pass over a flat (end, row) list
    # could skip earlier entries of a row it had only just moved up to.
    row_ends: List[float] = []
    positions: List[int] = []

    for raw_start, raw_end in zip(genes_df["start"], genes_df["end"]):
        gene_start = max(raw_start, start)
        gene_end = min(raw_end, end)
        earliest_free = gene_start - label_buffer

        for row, row_end in enumerate(row_ends):
            if row_end <= earliest_free:
                row_ends[row] = max(row_end, gene_end)
                break
        else:
            # Every existing row is blocked, so open a new one.
            row = len(row_ends)
            row_ends.append(gene_end)

        positions.append(row)

    return positions
65
+
66
+
67
def get_nearest_gene(
    genes_df: pd.DataFrame,
    chrom: Union[int, str],
    pos: int,
    window: int = 50000,
) -> Optional[str]:
    """Look up the gene nearest to a genomic position.

    Candidates are genes on ``chrom`` whose span, widened by ``window`` on
    both sides, contains ``pos``. Among them, the gene whose midpoint is
    closest to ``pos`` is returned.

    Args:
        genes_df: Gene annotations with chr, start, end, gene_name columns.
        chrom: Chromosome number or string.
        pos: Position in base pairs.
        window: Search margin in bp added to each side of every gene.

    Returns:
        The ``gene_name`` of the closest candidate, or None when no gene
        on the chromosome lies within the window.

    Example:
        >>> gene = get_nearest_gene(genes_df, chrom=1, pos=1500000)
        >>> gene
        'BRCA1'
    """
    # NOTE(review): normalize_chrom presumably yields the chromosome without
    # a "chr" prefix, matching the stripped column below -- confirm in utils.
    chrom_str = normalize_chrom(chrom)
    gene_chroms = genes_df["chr"].astype(str).str.replace("chr", "", regex=False)
    on_chrom = genes_df[gene_chroms == chrom_str]
    if on_chrom.empty:
        return None

    reaches_pos = (on_chrom["start"] - window <= pos) & (on_chrom["end"] + window >= pos)
    candidates = on_chrom[reaches_pos]
    if candidates.empty:
        return None

    # Rank candidates by how far their midpoint lies from the query position.
    midpoints = (candidates["start"] + candidates["end"]) / 2
    distances = abs(midpoints - pos)
    return candidates.loc[distances.idxmin(), "gene_name"]
112
+
113
+
114
def plot_gene_track(
    ax: Axes,
    genes_df: pd.DataFrame,
    chrom: Union[int, str],
    start: int,
    end: int,
    exons_df: Optional[pd.DataFrame] = None,
) -> None:
    """Plot gene annotations as a LocusZoom-style track.

    Each gene overlapping the region is drawn on ``ax`` as:
    - a thin intron line plus thick exon rectangles when exon data exists
      for it, otherwise one thick rectangle spanning the gene body;
    - a small black triangle just past the gene tip showing strand
      direction, drawn only when the strand is "+" or "-";
    - an italic gene name label above the gene.

    When no gene overlaps the region, a "No genes" placeholder is drawn.

    Args:
        ax: Matplotlib axes for the gene track; modified in place.
        genes_df: Gene annotations with chr, start, end, gene_name,
            and optionally strand (+/-) columns.
        chrom: Chromosome number or string.
        start: Region start position.
        end: Region end position.
        exons_df: Exon annotations with chr, start, end, gene_name
            columns for drawing exon structure. Optional.
    """
    chrom_str = normalize_chrom(chrom)
    region_genes = genes_df[
        (genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
        & (genes_df["end"] >= start)
        & (genes_df["start"] <= end)
    ].copy()

    ax.set_xlim(start, end)
    ax.set_ylabel("Genes", fontsize=10)
    ax.set_yticks([])

    # theme_classic: only bottom spine
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_visible(False)
    ax.spines["bottom"].set_linewidth(0.5)

    if region_genes.empty:
        ax.set_ylim(0, 1)
        ax.text(
            (start + end) / 2,
            0.5,
            "No genes",
            ha="center",
            va="center",
            fontsize=9,
            color="grey",
            style="italic",
        )
        return

    # Assign vertical positions to avoid overlap
    region_genes = region_genes.sort_values("start")
    positions = assign_gene_positions(region_genes, start, end)

    # Set y-axis limits - small bottom margin for gene body, tight top
    max_row = max(positions) if positions else 0
    bottom_margin = EXON_HEIGHT / 2 + 0.02  # Room for bottom gene
    top_margin = 0.15  # Small space above top label
    ax.set_ylim(
        -bottom_margin,
        (max_row + 1) * ROW_HEIGHT - ROW_HEIGHT + GENE_AREA + top_margin,
    )

    # Filter exons for this region if available
    region_exons = None
    if exons_df is not None and not exons_df.empty:
        region_exons = exons_df[
            (
                exons_df["chr"].astype(str).str.replace("chr", "", regex=False)
                == chrom_str
            )
            & (exons_df["end"] >= start)
            & (exons_df["start"] <= end)
        ].copy()

    # Strand-arrow size depends only on the region, so compute it once.
    tri_height = EXON_HEIGHT * 0.35
    tri_width = (end - start) * 0.006

    for row, (_, gene) in zip(positions, region_genes.iterrows()):
        # Clip the gene to the plotted region.
        gene_start = max(int(gene["start"]), start)
        gene_end = min(int(gene["end"]), end)
        gene_name = gene.get("gene_name", "")

        # Strand-specific color; anything other than "+"/"-" falls back to grey.
        strand = gene.get("strand")
        gene_col = STRAND_COLORS.get(strand, STRAND_COLORS[None])

        # Y position: bottom of row + offset for gene area
        y_gene = row * ROW_HEIGHT + 0.05
        y_label = y_gene + EXON_HEIGHT / 2 + 0.01  # Just above gene top

        # Check if we have exon data for this gene
        gene_exons = None
        if region_exons is not None and not region_exons.empty and gene_name:
            gene_exons = region_exons[region_exons["gene_name"] == gene_name].copy()

        if gene_exons is not None and not gene_exons.empty:
            # Draw intron line (thin horizontal line spanning gene)
            ax.add_patch(
                Rectangle(
                    (gene_start, y_gene - INTRON_HEIGHT / 2),
                    gene_end - gene_start,
                    INTRON_HEIGHT,
                    facecolor=gene_col,
                    edgecolor=gene_col,
                    linewidth=0.5,
                    zorder=1,
                )
            )

            # Draw exons (thick rectangles), clipped to the region
            for _, exon in gene_exons.iterrows():
                exon_start = max(int(exon["start"]), start)
                exon_end = min(int(exon["end"]), end)
                ax.add_patch(
                    Rectangle(
                        (exon_start, y_gene - EXON_HEIGHT / 2),
                        exon_end - exon_start,
                        EXON_HEIGHT,
                        facecolor=gene_col,
                        edgecolor=gene_col,
                        linewidth=0.5,
                        zorder=2,
                    )
                )
        else:
            # No exon data - draw full gene body as rectangle (fallback)
            ax.add_patch(
                Rectangle(
                    (gene_start, y_gene - EXON_HEIGHT / 2),
                    gene_end - gene_start,
                    EXON_HEIGHT,
                    facecolor=gene_col,
                    edgecolor=gene_col,
                    linewidth=0.5,
                    zorder=2,
                )
            )

        # Add strand direction triangle at gene tip. A missing or
        # unrecognised strand gets no arrow, so it is not shown as if it
        # were on the reverse strand.
        if strand in ("+", "-"):
            forward = strand == "+"
            # The whole triangle sits past the gene tip: right of the end
            # for the forward strand, left of the start for the reverse.
            base_x = gene_end if forward else gene_start
            tip_x = base_x + tri_width if forward else base_x - tri_width
            tri_points = [
                [tip_x, y_gene],
                [base_x, y_gene + tri_height],
                [base_x, y_gene - tri_height],
            ]
            ax.add_patch(
                Polygon(
                    tri_points,
                    closed=True,
                    facecolor="black",
                    edgecolor="black",
                    linewidth=0.5,
                    zorder=5,
                )
            )

        # Add gene name label in the gap above gene
        if gene_name:
            label_pos = (gene_start + gene_end) / 2
            ax.text(
                label_pos,
                y_label,
                gene_name,
                ha="center",
                va="bottom",
                fontsize=5.5,
                color="#000000",
                fontweight="medium",
                style="italic",
                zorder=4,
                clip_on=True,
            )
pylocuszoom/labels.py ADDED
@@ -0,0 +1,118 @@
1
+ """SNP label placement for regional association plots.
2
+
3
+ Provides automatic labeling of top significant SNPs with:
4
+ - SNP ID (rs number)
5
+ - Nearest gene name (if gene annotations provided)
6
+ - Automatic overlap avoidance (if adjustText installed)
7
+ """
8
+
9
+ from typing import List, Optional, Union
10
+
11
+ import pandas as pd
12
+ from matplotlib.axes import Axes
13
+ from matplotlib.text import Annotation
14
+
15
+ from .gene_track import get_nearest_gene
16
+
17
+
18
def add_snp_labels(
    ax: Axes,
    df: pd.DataFrame,
    pos_col: str = "ps",
    neglog10p_col: str = "neglog10p",
    rs_col: str = "rs",
    label_top_n: int = 5,
    genes_df: Optional[pd.DataFrame] = None,
    chrom: Optional[Union[int, str]] = None,
    max_label_length: int = 15,
) -> List[Annotation]:
    """Annotate the most significant SNPs on a regional plot.

    The ``label_top_n`` rows with the largest ``neglog10p_col`` values are
    labelled. A label is the SNP ID, or the nearest gene name when both
    ``genes_df`` and ``chrom`` are given and a gene is found.

    Args:
        ax: Matplotlib axes to annotate.
        df: SNP DataFrame holding the position, -log10(p) and SNP ID
            columns named below.
        pos_col: Column name for position.
        neglog10p_col: Column name for -log10(p-value).
        rs_col: Column name for SNP ID.
        label_top_n: Number of top SNPs to label.
        genes_df: Optional gene annotations for gene-based labels.
        chrom: Chromosome; needed together with ``genes_df`` for
            gene-based labels.
        max_label_length: Labels longer than this are cut and end in "...".

    Returns:
        The created annotation objects, in the order they were added.

    Raises:
        ValueError: If ``neglog10p_col`` is not a column of ``df``.

    Example:
        >>> fig, ax = plt.subplots()
        >>> # ... plot your data ...
        >>> texts = add_snp_labels(ax, df, label_top_n=5)
    """
    if neglog10p_col not in df.columns:
        raise ValueError(
            f"Column '{neglog10p_col}' not found in DataFrame. "
            "Ensure -log10(p) values are calculated before calling add_snp_labels."
        )

    use_gene_names = genes_df is not None and chrom is not None
    strongest = df.nlargest(label_top_n, neglog10p_col)

    annotations = []
    for _, row in strongest.iterrows():
        x_pos = row[pos_col]
        y_val = row[neglog10p_col]

        # Default to the SNP ID; a nearby gene name replaces it when found.
        caption = str(row[rs_col])
        if use_gene_names:
            gene_name = get_nearest_gene(genes_df, chrom, int(x_pos))
            if gene_name:
                caption = gene_name

        # Shorten long captions, leaving room for the ellipsis.
        if len(caption) > max_label_length:
            caption = caption[: max_label_length - 3] + "..."

        annotation = ax.annotate(
            caption,
            xy=(x_pos, y_val),
            xytext=(5, 5),
            textcoords="offset points",
            fontsize=8,
            fontweight="bold",
            color="#333333",
            ha="left",
            va="bottom",
            zorder=15,
            bbox=dict(
                boxstyle="round,pad=0.2",
                facecolor="white",
                edgecolor="none",
                alpha=0.8,
            ),
        )
        annotations.append(annotation)

    # adjustText is an optional dependency: without it the labels are
    # left where they were placed and may overlap.
    try:
        from adjustText import adjust_text
    except ImportError:
        return annotations

    adjust_text(
        annotations,
        ax=ax,
        arrowprops=dict(arrowstyle="-", color="gray", lw=0.5),
        expand_points=(1.5, 1.5),
    )
    return annotations