pylocuszoom 0.5.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +38 -2
- pylocuszoom/backends/__init__.py +116 -17
- pylocuszoom/backends/base.py +424 -35
- pylocuszoom/backends/bokeh_backend.py +192 -34
- pylocuszoom/backends/hover.py +198 -0
- pylocuszoom/backends/matplotlib_backend.py +332 -3
- pylocuszoom/backends/plotly_backend.py +187 -38
- pylocuszoom/colors.py +41 -0
- pylocuszoom/ensembl.py +476 -0
- pylocuszoom/eqtl.py +15 -19
- pylocuszoom/finemapping.py +17 -26
- pylocuszoom/forest.py +35 -0
- pylocuszoom/gene_track.py +161 -135
- pylocuszoom/loaders.py +38 -18
- pylocuszoom/phewas.py +34 -0
- pylocuszoom/plotter.py +370 -190
- pylocuszoom/recombination.py +64 -34
- pylocuszoom/schemas.py +37 -26
- pylocuszoom/utils.py +52 -0
- pylocuszoom/validation.py +172 -0
- {pylocuszoom-0.5.0.dist-info → pylocuszoom-0.8.0.dist-info}/METADATA +97 -28
- pylocuszoom-0.8.0.dist-info/RECORD +29 -0
- pylocuszoom-0.5.0.dist-info/RECORD +0 -24
- {pylocuszoom-0.5.0.dist-info → pylocuszoom-0.8.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.5.0.dist-info → pylocuszoom-0.8.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/gene_track.py
CHANGED
|
@@ -28,6 +28,10 @@ GENE_AREA = 0.25 # Bottom portion for gene drawing
|
|
|
28
28
|
EXON_HEIGHT = 0.20 # Exon rectangle height
|
|
29
29
|
INTRON_HEIGHT = 0.02 # Thin intron line
|
|
30
30
|
|
|
31
|
+
# Arrow dimensions (pre-computed for clarity)
|
|
32
|
+
ARROW_HEIGHT_RATIO = 0.2625 # Fraction of EXON_HEIGHT used as the arrow half-height: 0.35 * 0.75 (75% of the original 0.35 ratio)
|
|
33
|
+
ARROW_WIDTH_RATIO = 0.0066 # Fraction of the plotted region width used as the arrow width: 0.006 * 1.1 (10% wider than the original 0.006 ratio)
|
|
34
|
+
|
|
31
35
|
|
|
32
36
|
def assign_gene_positions(genes_df: pd.DataFrame, start: int, end: int) -> List[int]:
|
|
33
37
|
"""Assign row indices to genes to minimize overlap.
|
|
@@ -111,6 +115,147 @@ def get_nearest_gene(
|
|
|
111
115
|
return nearby.loc[nearby["dist"].idxmin(), "gene_name"]
|
|
112
116
|
|
|
113
117
|
|
|
118
|
+
def _filter_genes_by_region(
    df: pd.DataFrame, chrom: Union[int, str], start: int, end: int
) -> pd.DataFrame:
    """Return a copy of the rows of ``df`` overlapping a genomic window.

    Args:
        df: Gene or exon table with ``chr``, ``start`` and ``end`` columns.
        chrom: Chromosome to keep; passed through ``normalize_chrom`` before
            it is compared against the ``chr`` column.
        start: Left edge of the window.
        end: Right edge of the window.

    Returns:
        An independent copy of the matching rows.
    """
    target_chrom = normalize_chrom(chrom)

    # Compare on the string form of the column with any "chr" text removed.
    row_chrom = df["chr"].astype(str).str.replace("chr", "", regex=False)
    on_chrom = row_chrom == target_chrom

    # A feature overlaps the window when it ends at or after the window start
    # and begins at or before the window end.
    ends_after_start = df["end"] >= start
    begins_before_end = df["start"] <= end

    return df[on_chrom & ends_after_start & begins_before_end].copy()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _compute_arrow_geometry(
    gene_start: int, gene_end: int, region_width: int, strand: str
) -> tuple[list[float], float, float, str]:
    """Work out where the three strand arrows of a gene go and how big they are.

    Args:
        gene_start: Left x coordinate of the gene.
        gene_end: Right x coordinate of the gene.
        region_width: Width of the plotted region; the arrow width scales
            with it.
        strand: ``"+"`` selects the forward layout; any other value selects
            the reverse layout.

    Returns:
        Tuple of (arrow_tip_positions, tri_height, tri_width, arrow_color).
        The tip positions are ordered tail, middle, head along the direction
        of the strand.
    """
    height = EXON_HEIGHT * ARROW_HEIGHT_RATIO
    width = region_width * ARROW_WIDTH_RATIO

    half_width = width / 2  # keeps the leading tip just inside the gene
    tail_inset = width * 1.5  # distance of the trailing tip from the gene edge
    midpoint = (gene_start + gene_end) / 2

    if strand == "+":
        # Forward strand: tips run left to right, drawn in black.
        forward_tips = [
            gene_start + tail_inset,
            midpoint + half_width,
            gene_end - half_width,
        ]
        return forward_tips, height, width, "#000000"

    # Reverse layout: tips run right to left, drawn in dark grey.
    reverse_tips = [
        gene_end - tail_inset,
        midpoint - half_width,
        gene_start + half_width,
    ]
    return reverse_tips, height, width, "#333333"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _draw_strand_arrows_matplotlib(
    ax: Axes,
    gene: pd.Series,
    gene_start: int,
    gene_end: int,
    y_gene: float,
    region_width: int,
) -> None:
    """Draw the strand direction arrows of one gene on a matplotlib axes.

    Args:
        ax: Axes that receives the line and the triangle patches.
        gene: Gene record; its ``strand`` entry selects the arrow direction.
        gene_start: Left x coordinate of the gene.
        gene_end: Right x coordinate of the gene.
        y_gene: Vertical centre of the gene row.
        region_width: Width of the plotted region, used to size the arrows.
    """
    strand = gene["strand"]
    tips, half_height, width, color = _compute_arrow_geometry(
        gene_start, gene_end, region_width, strand
    )

    # Thin line joining the first and last arrow tips.
    if len(tips) > 1:
        line_xs = [tips[0], tips[-1]]
        line_ys = [y_gene, y_gene]
        ax.plot(
            line_xs,
            line_ys,
            color=color,
            linewidth=1.0,
            zorder=4,
            solid_capstyle="butt",
        )

    # The triangle base sits behind the tip: to the left on "+", to the right
    # otherwise.
    base_shift = -width if strand == "+" else width
    upper_y = y_gene + half_height
    lower_y = y_gene - half_height

    for tip_x in tips:
        base_x = tip_x + base_shift
        corners = [[tip_x, y_gene], [base_x, upper_y], [base_x, lower_y]]
        ax.add_patch(
            Polygon(
                corners,
                closed=True,
                facecolor=color,
                edgecolor=color,
                linewidth=0.5,
                zorder=5,
            )
        )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _draw_strand_arrows_generic(
    ax: Any,
    backend: Any,
    gene: pd.Series,
    gene_start: int,
    gene_end: int,
    y_gene: float,
    region_width: int,
) -> None:
    """Draw the strand direction arrows of one gene through a plotting backend.

    Args:
        ax: Backend-specific axes/figure handle, passed straight through to
            the backend calls.
        backend: Object providing ``line`` and ``add_polygon`` methods.
        gene: Gene record; its ``strand`` entry selects the arrow direction.
        gene_start: Left x coordinate of the gene.
        gene_end: Right x coordinate of the gene.
        y_gene: Vertical centre of the gene row.
        region_width: Width of the plotted region, used to size the arrows.
    """
    strand = gene["strand"]
    tips, half_height, width, color = _compute_arrow_geometry(
        gene_start, gene_end, region_width, strand
    )

    # Thin line joining the first and last arrow tips.
    if len(tips) > 1:
        line_xs = pd.Series([tips[0], tips[-1]])
        line_ys = pd.Series([y_gene, y_gene])
        backend.line(
            ax,
            x=line_xs,
            y=line_ys,
            color=color,
            linewidth=1.0,
            zorder=4,
        )

    # The triangle base sits behind the tip: to the left on "+", to the right
    # otherwise.
    base_shift = -width if strand == "+" else width
    upper_y = y_gene + half_height
    lower_y = y_gene - half_height

    for tip_x in tips:
        base_x = tip_x + base_shift
        corners = [[tip_x, y_gene], [base_x, upper_y], [base_x, lower_y]]
        backend.add_polygon(
            ax,
            corners,
            facecolor=color,
            edgecolor=color,
            linewidth=0.5,
            zorder=5,
        )
|
|
257
|
+
|
|
258
|
+
|
|
114
259
|
def plot_gene_track(
|
|
115
260
|
ax: Axes,
|
|
116
261
|
genes_df: pd.DataFrame,
|
|
@@ -137,12 +282,7 @@ def plot_gene_track(
|
|
|
137
282
|
exons_df: Exon annotations with chr, start, end, gene_name
|
|
138
283
|
columns for drawing exon structure. Optional.
|
|
139
284
|
"""
|
|
140
|
-
|
|
141
|
-
region_genes = genes_df[
|
|
142
|
-
(genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
|
|
143
|
-
& (genes_df["end"] >= start)
|
|
144
|
-
& (genes_df["start"] <= end)
|
|
145
|
-
].copy()
|
|
285
|
+
region_genes = _filter_genes_by_region(genes_df, chrom, start, end)
|
|
146
286
|
|
|
147
287
|
ax.set_xlim(start, end)
|
|
148
288
|
ax.set_ylabel("")
|
|
@@ -178,20 +318,13 @@ def plot_gene_track(
|
|
|
178
318
|
top_margin = 0.05 # Minimal space above top label
|
|
179
319
|
ax.set_ylim(
|
|
180
320
|
-bottom_margin,
|
|
181
|
-
|
|
321
|
+
max_row * ROW_HEIGHT + GENE_AREA + top_margin,
|
|
182
322
|
)
|
|
183
323
|
|
|
184
324
|
# Filter exons for this region if available
|
|
185
325
|
region_exons = None
|
|
186
326
|
if exons_df is not None and not exons_df.empty:
|
|
187
|
-
region_exons = exons_df
|
|
188
|
-
(
|
|
189
|
-
exons_df["chr"].astype(str).str.replace("chr", "", regex=False)
|
|
190
|
-
== chrom_str
|
|
191
|
-
)
|
|
192
|
-
& (exons_df["end"] >= start)
|
|
193
|
-
& (exons_df["start"] <= end)
|
|
194
|
-
].copy()
|
|
327
|
+
region_exons = _filter_genes_by_region(exons_df, chrom, start, end)
|
|
195
328
|
|
|
196
329
|
region_width = end - start
|
|
197
330
|
|
|
@@ -257,59 +390,11 @@ def plot_gene_track(
|
|
|
257
390
|
)
|
|
258
391
|
)
|
|
259
392
|
|
|
260
|
-
# Add strand direction triangles
|
|
393
|
+
# Add strand direction triangles
|
|
261
394
|
if "strand" in gene.index:
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
# Triangle dimensions
|
|
266
|
-
tri_height = EXON_HEIGHT * 0.35
|
|
267
|
-
tri_width = region_width * 0.006
|
|
268
|
-
|
|
269
|
-
# Arrow positions: front, middle, back (tip positions)
|
|
270
|
-
tip_offset = tri_width / 2 # Tiny offset to keep tip inside gene
|
|
271
|
-
tail_offset = tri_width * 1.5 # Offset for tail arrow from gene start/end
|
|
272
|
-
gene_center = (gene_start + gene_end) / 2
|
|
273
|
-
if arrow_dir == 1: # Forward strand
|
|
274
|
-
arrow_tip_positions = [
|
|
275
|
-
gene_start + tail_offset, # Tail (tip inside gene)
|
|
276
|
-
gene_center + tri_width / 2, # Middle (arrow center at gene center)
|
|
277
|
-
gene_end - tip_offset, # Tip (near gene end)
|
|
278
|
-
]
|
|
279
|
-
arrow_color = "#000000" # Black for forward
|
|
280
|
-
else: # Reverse strand
|
|
281
|
-
arrow_tip_positions = [
|
|
282
|
-
gene_end - tail_offset, # Tail (tip inside gene)
|
|
283
|
-
gene_center - tri_width / 2, # Middle (arrow center at gene center)
|
|
284
|
-
gene_start + tip_offset, # Tip (near gene start)
|
|
285
|
-
]
|
|
286
|
-
arrow_color = "#333333" # Dark grey for reverse
|
|
287
|
-
|
|
288
|
-
for tip_x in arrow_tip_positions:
|
|
289
|
-
if arrow_dir == 1:
|
|
290
|
-
base_x = tip_x - tri_width
|
|
291
|
-
tri_points = [
|
|
292
|
-
[tip_x, y_gene], # Tip pointing right
|
|
293
|
-
[base_x, y_gene + tri_height],
|
|
294
|
-
[base_x, y_gene - tri_height],
|
|
295
|
-
]
|
|
296
|
-
else:
|
|
297
|
-
base_x = tip_x + tri_width
|
|
298
|
-
tri_points = [
|
|
299
|
-
[tip_x, y_gene], # Tip pointing left
|
|
300
|
-
[base_x, y_gene + tri_height],
|
|
301
|
-
[base_x, y_gene - tri_height],
|
|
302
|
-
]
|
|
303
|
-
|
|
304
|
-
triangle = Polygon(
|
|
305
|
-
tri_points,
|
|
306
|
-
closed=True,
|
|
307
|
-
facecolor=arrow_color,
|
|
308
|
-
edgecolor=arrow_color,
|
|
309
|
-
linewidth=0.5,
|
|
310
|
-
zorder=5,
|
|
311
|
-
)
|
|
312
|
-
ax.add_patch(triangle)
|
|
395
|
+
_draw_strand_arrows_matplotlib(
|
|
396
|
+
ax, gene, gene_start, gene_end, y_gene, region_width
|
|
397
|
+
)
|
|
313
398
|
|
|
314
399
|
# Add gene name label in the gap above gene
|
|
315
400
|
if gene_name:
|
|
@@ -320,7 +405,7 @@ def plot_gene_track(
|
|
|
320
405
|
gene_name,
|
|
321
406
|
ha="center",
|
|
322
407
|
va="bottom",
|
|
323
|
-
fontsize=
|
|
408
|
+
fontsize=7,
|
|
324
409
|
color="#000000",
|
|
325
410
|
fontweight="medium",
|
|
326
411
|
style="italic",
|
|
@@ -353,12 +438,7 @@ def plot_gene_track_generic(
|
|
|
353
438
|
exons_df: Exon annotations with chr, start, end, gene_name
|
|
354
439
|
columns for drawing exon structure. Optional.
|
|
355
440
|
"""
|
|
356
|
-
|
|
357
|
-
region_genes = genes_df[
|
|
358
|
-
(genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
|
|
359
|
-
& (genes_df["end"] >= start)
|
|
360
|
-
& (genes_df["start"] <= end)
|
|
361
|
-
].copy()
|
|
441
|
+
region_genes = _filter_genes_by_region(genes_df, chrom, start, end)
|
|
362
442
|
|
|
363
443
|
backend.set_xlim(ax, start, end)
|
|
364
444
|
backend.set_ylabel(ax, "", fontsize=10)
|
|
@@ -389,20 +469,13 @@ def plot_gene_track_generic(
|
|
|
389
469
|
backend.set_ylim(
|
|
390
470
|
ax,
|
|
391
471
|
-bottom_margin,
|
|
392
|
-
|
|
472
|
+
max_row * ROW_HEIGHT + GENE_AREA + top_margin,
|
|
393
473
|
)
|
|
394
474
|
|
|
395
475
|
# Filter exons for this region if available
|
|
396
476
|
region_exons = None
|
|
397
477
|
if exons_df is not None and not exons_df.empty:
|
|
398
|
-
region_exons = exons_df
|
|
399
|
-
(
|
|
400
|
-
exons_df["chr"].astype(str).str.replace("chr", "", regex=False)
|
|
401
|
-
== chrom_str
|
|
402
|
-
)
|
|
403
|
-
& (exons_df["end"] >= start)
|
|
404
|
-
& (exons_df["start"] <= end)
|
|
405
|
-
].copy()
|
|
478
|
+
region_exons = _filter_genes_by_region(exons_df, chrom, start, end)
|
|
406
479
|
|
|
407
480
|
region_width = end - start
|
|
408
481
|
|
|
@@ -465,58 +538,11 @@ def plot_gene_track_generic(
|
|
|
465
538
|
zorder=2,
|
|
466
539
|
)
|
|
467
540
|
|
|
468
|
-
# Add strand direction triangles
|
|
541
|
+
# Add strand direction triangles
|
|
469
542
|
if "strand" in gene.index:
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
# Triangle dimensions
|
|
474
|
-
tri_height = EXON_HEIGHT * 0.35
|
|
475
|
-
tri_width = region_width * 0.006
|
|
476
|
-
|
|
477
|
-
# Arrow positions: front, middle, back (tip positions)
|
|
478
|
-
tip_offset = tri_width / 2 # Tiny offset to keep tip inside gene
|
|
479
|
-
tail_offset = tri_width * 1.5 # Offset for tail arrow from gene start/end
|
|
480
|
-
gene_center = (gene_start + gene_end) / 2
|
|
481
|
-
if arrow_dir == 1: # Forward strand
|
|
482
|
-
arrow_tip_positions = [
|
|
483
|
-
gene_start + tail_offset, # Tail (tip inside gene)
|
|
484
|
-
gene_center + tri_width / 2, # Middle (arrow center at gene center)
|
|
485
|
-
gene_end - tip_offset, # Tip (near gene end)
|
|
486
|
-
]
|
|
487
|
-
arrow_color = "#000000" # Black for forward
|
|
488
|
-
else: # Reverse strand
|
|
489
|
-
arrow_tip_positions = [
|
|
490
|
-
gene_end - tail_offset, # Tail (tip inside gene)
|
|
491
|
-
gene_center - tri_width / 2, # Middle (arrow center at gene center)
|
|
492
|
-
gene_start + tip_offset, # Tip (near gene start)
|
|
493
|
-
]
|
|
494
|
-
arrow_color = "#333333" # Dark grey for reverse
|
|
495
|
-
|
|
496
|
-
for tip_x in arrow_tip_positions:
|
|
497
|
-
if arrow_dir == 1:
|
|
498
|
-
base_x = tip_x - tri_width
|
|
499
|
-
tri_points = [
|
|
500
|
-
[tip_x, y_gene], # Tip pointing right
|
|
501
|
-
[base_x, y_gene + tri_height],
|
|
502
|
-
[base_x, y_gene - tri_height],
|
|
503
|
-
]
|
|
504
|
-
else:
|
|
505
|
-
base_x = tip_x + tri_width
|
|
506
|
-
tri_points = [
|
|
507
|
-
[tip_x, y_gene], # Tip pointing left
|
|
508
|
-
[base_x, y_gene + tri_height],
|
|
509
|
-
[base_x, y_gene - tri_height],
|
|
510
|
-
]
|
|
511
|
-
|
|
512
|
-
backend.add_polygon(
|
|
513
|
-
ax,
|
|
514
|
-
tri_points,
|
|
515
|
-
facecolor=arrow_color,
|
|
516
|
-
edgecolor=arrow_color,
|
|
517
|
-
linewidth=0.5,
|
|
518
|
-
zorder=5,
|
|
519
|
-
)
|
|
543
|
+
_draw_strand_arrows_generic(
|
|
544
|
+
ax, backend, gene, gene_start, gene_end, y_gene, region_width
|
|
545
|
+
)
|
|
520
546
|
|
|
521
547
|
# Add gene name label in the gap above gene
|
|
522
548
|
if gene_name:
|
|
@@ -526,7 +552,7 @@ def plot_gene_track_generic(
|
|
|
526
552
|
label_pos,
|
|
527
553
|
y_label,
|
|
528
554
|
gene_name,
|
|
529
|
-
fontsize=
|
|
555
|
+
fontsize=7,
|
|
530
556
|
ha="center",
|
|
531
557
|
va="bottom",
|
|
532
558
|
color="#000000",
|
pylocuszoom/loaders.py
CHANGED
|
@@ -260,10 +260,14 @@ def load_saige(
|
|
|
260
260
|
"POS": pos_col,
|
|
261
261
|
"MarkerID": rs_col,
|
|
262
262
|
"CHR": "chr",
|
|
263
|
-
"p.value": p_col,
|
|
264
|
-
"p.value.NA": p_col, # SPA-adjusted
|
|
265
263
|
}
|
|
266
264
|
|
|
265
|
+
# Prefer SPA-adjusted p-value (p.value.NA) over raw p.value when both present
|
|
266
|
+
if "p.value.NA" in df.columns:
|
|
267
|
+
col_map["p.value.NA"] = p_col
|
|
268
|
+
elif "p.value" in df.columns:
|
|
269
|
+
col_map["p.value"] = p_col
|
|
270
|
+
|
|
267
271
|
df = df.rename(columns=col_map)
|
|
268
272
|
logger.debug(f"Loaded SAIGE file with {len(df)} variants")
|
|
269
273
|
validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
|
|
@@ -318,7 +322,7 @@ def load_gtex_eqtl(
|
|
|
318
322
|
gene: Optional gene to filter to (ENSG ID or gene symbol).
|
|
319
323
|
|
|
320
324
|
Returns:
|
|
321
|
-
DataFrame with columns: pos, p_value, gene,
|
|
325
|
+
DataFrame with columns: pos, p_value, gene, effect_size.
|
|
322
326
|
|
|
323
327
|
Example:
|
|
324
328
|
>>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
|
|
@@ -351,10 +355,10 @@ def load_gtex_eqtl(
|
|
|
351
355
|
col_map[col] = "gene"
|
|
352
356
|
break
|
|
353
357
|
|
|
354
|
-
# Effect size (slope)
|
|
358
|
+
# Effect size (slope) - standardize to effect_size for plotting compatibility
|
|
355
359
|
for col in ["slope", "beta", "effect_size"]:
|
|
356
360
|
if col in df.columns:
|
|
357
|
-
col_map[col] = "
|
|
361
|
+
col_map[col] = "effect_size"
|
|
358
362
|
break
|
|
359
363
|
|
|
360
364
|
df = df.rename(columns=col_map)
|
|
@@ -385,7 +389,7 @@ def load_eqtl_catalogue(
|
|
|
385
389
|
gene: Optional gene to filter to.
|
|
386
390
|
|
|
387
391
|
Returns:
|
|
388
|
-
DataFrame with columns: pos, p_value, gene,
|
|
392
|
+
DataFrame with columns: pos, p_value, gene, effect_size.
|
|
389
393
|
"""
|
|
390
394
|
df = pd.read_csv(filepath, sep="\t")
|
|
391
395
|
|
|
@@ -393,7 +397,7 @@ def load_eqtl_catalogue(
|
|
|
393
397
|
"position": "pos",
|
|
394
398
|
"pvalue": "p_value",
|
|
395
399
|
"gene_id": "gene",
|
|
396
|
-
"beta": "
|
|
400
|
+
"beta": "effect_size", # Standardize to effect_size for plotter
|
|
397
401
|
"chromosome": "chr",
|
|
398
402
|
}
|
|
399
403
|
|
|
@@ -422,7 +426,7 @@ def load_matrixeqtl(
|
|
|
422
426
|
gene: Optional gene to filter to.
|
|
423
427
|
|
|
424
428
|
Returns:
|
|
425
|
-
DataFrame with columns: pos, p_value, gene,
|
|
429
|
+
DataFrame with columns: pos, p_value, gene, effect_size.
|
|
426
430
|
|
|
427
431
|
Note:
|
|
428
432
|
MatrixEQTL output doesn't include position by default.
|
|
@@ -435,7 +439,7 @@ def load_matrixeqtl(
|
|
|
435
439
|
"gene": "gene",
|
|
436
440
|
"p-value": "p_value",
|
|
437
441
|
"pvalue": "p_value",
|
|
438
|
-
"beta": "
|
|
442
|
+
"beta": "effect_size", # Standardize to effect_size for plotter
|
|
439
443
|
"t-stat": "t_stat",
|
|
440
444
|
}
|
|
441
445
|
|
|
@@ -725,14 +729,28 @@ def load_bed(
|
|
|
725
729
|
# Assign column names if no header
|
|
726
730
|
if not has_header:
|
|
727
731
|
n_cols = len(df.columns)
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
732
|
+
# Standard BED column names (up to BED12)
|
|
733
|
+
bed_col_names = [
|
|
734
|
+
"chr",
|
|
735
|
+
"start",
|
|
736
|
+
"end",
|
|
737
|
+
"gene_name",
|
|
738
|
+
"score",
|
|
739
|
+
"strand",
|
|
740
|
+
"thickStart",
|
|
741
|
+
"thickEnd",
|
|
742
|
+
"itemRgb",
|
|
743
|
+
"blockCount",
|
|
744
|
+
"blockSizes",
|
|
745
|
+
"blockStarts",
|
|
746
|
+
]
|
|
747
|
+
# Use standard names for known columns, generic for extras
|
|
748
|
+
if n_cols <= len(bed_col_names):
|
|
749
|
+
df.columns = bed_col_names[:n_cols]
|
|
750
|
+
else:
|
|
751
|
+
# More columns than BED12 - use known names + generic
|
|
752
|
+
extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
|
|
753
|
+
df.columns = bed_col_names + extra_cols
|
|
736
754
|
|
|
737
755
|
# Standardize column names if header was present
|
|
738
756
|
col_map = {
|
|
@@ -859,4 +877,6 @@ def load_gwas(
|
|
|
859
877
|
if format not in loaders:
|
|
860
878
|
raise ValueError(f"Unknown format '{format}'. Options: {list(loaders.keys())}")
|
|
861
879
|
|
|
862
|
-
return loaders[format](
|
|
880
|
+
return loaders[format](
|
|
881
|
+
filepath, pos_col=pos_col, p_col=p_col, rs_col=rs_col, **kwargs
|
|
882
|
+
)
|
pylocuszoom/phewas.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""PheWAS data validation and preparation.
|
|
2
|
+
|
|
3
|
+
Validates and prepares phenome-wide association study data for plotting.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from .validation import DataFrameValidator
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def validate_phewas_df(
    df: pd.DataFrame,
    phenotype_col: str = "phenotype",
    p_col: str = "p_value",
    category_col: str = "category",
) -> None:
    """Check that a PheWAS results table is usable for plotting.

    The table must contain the phenotype and p-value columns, the p-value
    column must be numeric, and it is range-checked against the bounds 0 and
    1 with the lower bound exclusive.

    Args:
        df: PheWAS results DataFrame.
        phenotype_col: Column name for phenotype names.
        p_col: Column name for p-values.
        category_col: Column name for phenotype categories (optional). It is
            not read by this function, so the column is not validated here.

    Raises:
        ValidationError: If required columns are missing or have invalid types.
    """
    required = [phenotype_col, p_col]

    # Each step's result is carried forward, mirroring a fluent call chain.
    checks = DataFrameValidator(df, "PheWAS DataFrame")
    checks = checks.require_columns(required)
    checks = checks.require_numeric([p_col])
    checks = checks.require_range(p_col, min_val=0, max_val=1, exclusive_min=True)
    checks.validate()
|