pylocuszoom 0.5.0__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/gene_track.py CHANGED
@@ -28,6 +28,10 @@ GENE_AREA = 0.25 # Bottom portion for gene drawing
28
28
  EXON_HEIGHT = 0.20 # Exon rectangle height
29
29
  INTRON_HEIGHT = 0.02 # Thin intron line
30
30
 
31
+ # Arrow dimensions (pre-computed for clarity)
32
+ ARROW_HEIGHT_RATIO = 0.2625 # 0.35 * 0.75; multiplied by EXON_HEIGHT at use (75% of original height)
33
+ ARROW_WIDTH_RATIO = 0.0066 # 0.006 * 1.1; multiplied by region_width at use (10% wider than original)
34
+
31
35
 
32
36
  def assign_gene_positions(genes_df: pd.DataFrame, start: int, end: int) -> List[int]:
33
37
  """Assign row indices to genes to minimize overlap.
@@ -111,6 +115,147 @@ def get_nearest_gene(
111
115
  return nearby.loc[nearby["dist"].idxmin(), "gene_name"]
112
116
 
113
117
 
118
def _filter_genes_by_region(
    df: pd.DataFrame, chrom: Union[int, str], start: int, end: int
) -> pd.DataFrame:
    """Return a copy of the rows of ``df`` on ``chrom`` that overlap a region.

    Args:
        df: Table with ``chr``, ``start`` and ``end`` columns (gene or exon
            annotations).
        chrom: Chromosome to keep; passed through ``normalize_chrom`` before
            comparison.
        start: Left bound of the region (inclusive).
        end: Right bound of the region (inclusive).

    Returns:
        A new DataFrame holding only the rows whose ``chr`` matches and whose
        ``[start, end]`` interval touches the requested region.
    """
    wanted_chrom = normalize_chrom(chrom)

    # Compare chromosomes as text with every literal "chr" removed, so that
    # "chr1", "1" and 1 in the table are all compared as "1".
    row_chrom = df["chr"].astype(str).str.replace("chr", "", regex=False)
    on_chrom = row_chrom == wanted_chrom

    # Interval overlap: the row ends at/after the region start and begins
    # at/before the region end.
    ends_after_start = df["end"] >= start
    starts_before_end = df["start"] <= end

    keep = on_chrom & ends_after_start & starts_before_end
    return df[keep].copy()
128
+
129
+
130
def _compute_arrow_geometry(
    gene_start: int, gene_end: int, region_width: int, strand: str
) -> tuple[list[float], float, float, str]:
    """Work out where strand arrows go on a gene and how big they are.

    Three arrows are placed per gene: one near the tail, one around the gene
    centre and one near the leading end. The returned positions are the x
    coordinates of the arrow *tips*, ordered tail, middle, head.

    Args:
        gene_start: Left x coordinate of the gene.
        gene_end: Right x coordinate of the gene.
        region_width: Width of the plotted region; arrow width scales with it.
        strand: ``"+"`` selects the forward layout; any other value selects
            the reverse layout.

    Returns:
        Tuple of (arrow_tip_positions, tri_height, tri_width, arrow_color).
        ``tri_height`` is ``EXON_HEIGHT * ARROW_HEIGHT_RATIO`` and
        ``tri_width`` is ``region_width * ARROW_WIDTH_RATIO``. The colour is
        ``"#000000"`` for the forward layout and ``"#333333"`` otherwise.
    """
    height = EXON_HEIGHT * ARROW_HEIGHT_RATIO
    width = region_width * ARROW_WIDTH_RATIO

    half_width = width / 2  # inset of the leading tip; also centres the middle arrow
    tail_inset = width * 1.5  # inset of the trailing arrow's tip from the gene edge
    midpoint = (gene_start + gene_end) / 2

    if strand == "+":
        forward_tips = [
            gene_start + tail_inset,
            midpoint + half_width,
            gene_end - half_width,
        ]
        return forward_tips, height, width, "#000000"

    # Everything that is not "+" is laid out mirrored, pointing left.
    reverse_tips = [
        gene_end - tail_inset,
        midpoint - half_width,
        gene_start + half_width,
    ]
    return reverse_tips, height, width, "#333333"
161
+
162
+
163
def _draw_strand_arrows_matplotlib(
    ax: Axes,
    gene: pd.Series,
    gene_start: int,
    gene_end: int,
    y_gene: float,
    region_width: int,
) -> None:
    """Draw the strand-direction arrows for one gene on a matplotlib axes.

    Args:
        ax: Axes to draw on.
        gene: Gene record; its ``"strand"`` entry picks the arrow direction.
        gene_start: Left x coordinate of the gene.
        gene_end: Right x coordinate of the gene.
        y_gene: Vertical centre line of the gene.
        region_width: Width of the plotted region, used to scale the arrows.
    """
    strand = gene["strand"]
    tips, half_height, width, color = _compute_arrow_geometry(
        gene_start, gene_end, region_width, strand
    )

    # Thin horizontal line joining the first and last arrow tips.
    if len(tips) > 1:
        ax.plot(
            [tips[0], tips[-1]],
            [y_gene, y_gene],
            color=color,
            linewidth=1.0,
            zorder=4,
            solid_capstyle="butt",
        )

    upper_y = y_gene + half_height
    lower_y = y_gene - half_height
    patch_style = {
        "closed": True,
        "facecolor": color,
        "edgecolor": color,
        "linewidth": 0.5,
        "zorder": 5,
    }

    for tip_x in tips:
        # The triangle's base sits behind the tip: to the left for "+",
        # to the right for anything else.
        base_x = tip_x - width if strand == "+" else tip_x + width
        corners = [
            [tip_x, y_gene],
            [base_x, upper_y],
            [base_x, lower_y],
        ]
        ax.add_patch(Polygon(corners, **patch_style))
209
+
210
+
211
def _draw_strand_arrows_generic(
    ax: Any,
    backend: Any,
    gene: pd.Series,
    gene_start: int,
    gene_end: int,
    y_gene: float,
    region_width: int,
) -> None:
    """Draw the strand-direction arrows for one gene through a plotting backend.

    Args:
        ax: Backend-specific axes/figure handle, passed straight to ``backend``.
        backend: Object providing ``line`` and ``add_polygon`` drawing calls.
        gene: Gene record; its ``"strand"`` entry picks the arrow direction.
        gene_start: Left x coordinate of the gene.
        gene_end: Right x coordinate of the gene.
        y_gene: Vertical centre line of the gene.
        region_width: Width of the plotted region, used to scale the arrows.
    """
    strand = gene["strand"]
    tips, half_height, width, color = _compute_arrow_geometry(
        gene_start, gene_end, region_width, strand
    )

    # Thin horizontal line joining the first and last arrow tips.
    if len(tips) > 1:
        spine_x = pd.Series([tips[0], tips[-1]])
        spine_y = pd.Series([y_gene, y_gene])
        backend.line(
            ax,
            x=spine_x,
            y=spine_y,
            color=color,
            linewidth=1.0,
            zorder=4,
        )

    upper_y = y_gene + half_height
    lower_y = y_gene - half_height

    for tip_x in tips:
        # The triangle's base sits behind the tip: to the left for "+",
        # to the right for anything else.
        base_x = tip_x - width if strand == "+" else tip_x + width
        corners = [
            [tip_x, y_gene],
            [base_x, upper_y],
            [base_x, lower_y],
        ]
        backend.add_polygon(
            ax,
            corners,
            facecolor=color,
            edgecolor=color,
            linewidth=0.5,
            zorder=5,
        )
257
+
258
+
114
259
  def plot_gene_track(
115
260
  ax: Axes,
116
261
  genes_df: pd.DataFrame,
@@ -137,12 +282,7 @@ def plot_gene_track(
137
282
  exons_df: Exon annotations with chr, start, end, gene_name
138
283
  columns for drawing exon structure. Optional.
139
284
  """
140
- chrom_str = normalize_chrom(chrom)
141
- region_genes = genes_df[
142
- (genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
143
- & (genes_df["end"] >= start)
144
- & (genes_df["start"] <= end)
145
- ].copy()
285
+ region_genes = _filter_genes_by_region(genes_df, chrom, start, end)
146
286
 
147
287
  ax.set_xlim(start, end)
148
288
  ax.set_ylabel("")
@@ -178,20 +318,13 @@ def plot_gene_track(
178
318
  top_margin = 0.05 # Minimal space above top label
179
319
  ax.set_ylim(
180
320
  -bottom_margin,
181
- (max_row + 1) * ROW_HEIGHT - ROW_HEIGHT + GENE_AREA + top_margin,
321
+ max_row * ROW_HEIGHT + GENE_AREA + top_margin,
182
322
  )
183
323
 
184
324
  # Filter exons for this region if available
185
325
  region_exons = None
186
326
  if exons_df is not None and not exons_df.empty:
187
- region_exons = exons_df[
188
- (
189
- exons_df["chr"].astype(str).str.replace("chr", "", regex=False)
190
- == chrom_str
191
- )
192
- & (exons_df["end"] >= start)
193
- & (exons_df["start"] <= end)
194
- ].copy()
327
+ region_exons = _filter_genes_by_region(exons_df, chrom, start, end)
195
328
 
196
329
  region_width = end - start
197
330
 
@@ -257,59 +390,11 @@ def plot_gene_track(
257
390
  )
258
391
  )
259
392
 
260
- # Add strand direction triangles (tip, center, tail)
393
+ # Add strand direction triangles
261
394
  if "strand" in gene.index:
262
- strand = gene["strand"]
263
- arrow_dir = 1 if strand == "+" else -1
264
-
265
- # Triangle dimensions
266
- tri_height = EXON_HEIGHT * 0.35
267
- tri_width = region_width * 0.006
268
-
269
- # Arrow positions: front, middle, back (tip positions)
270
- tip_offset = tri_width / 2 # Tiny offset to keep tip inside gene
271
- tail_offset = tri_width * 1.5 # Offset for tail arrow from gene start/end
272
- gene_center = (gene_start + gene_end) / 2
273
- if arrow_dir == 1: # Forward strand
274
- arrow_tip_positions = [
275
- gene_start + tail_offset, # Tail (tip inside gene)
276
- gene_center + tri_width / 2, # Middle (arrow center at gene center)
277
- gene_end - tip_offset, # Tip (near gene end)
278
- ]
279
- arrow_color = "#000000" # Black for forward
280
- else: # Reverse strand
281
- arrow_tip_positions = [
282
- gene_end - tail_offset, # Tail (tip inside gene)
283
- gene_center - tri_width / 2, # Middle (arrow center at gene center)
284
- gene_start + tip_offset, # Tip (near gene start)
285
- ]
286
- arrow_color = "#333333" # Dark grey for reverse
287
-
288
- for tip_x in arrow_tip_positions:
289
- if arrow_dir == 1:
290
- base_x = tip_x - tri_width
291
- tri_points = [
292
- [tip_x, y_gene], # Tip pointing right
293
- [base_x, y_gene + tri_height],
294
- [base_x, y_gene - tri_height],
295
- ]
296
- else:
297
- base_x = tip_x + tri_width
298
- tri_points = [
299
- [tip_x, y_gene], # Tip pointing left
300
- [base_x, y_gene + tri_height],
301
- [base_x, y_gene - tri_height],
302
- ]
303
-
304
- triangle = Polygon(
305
- tri_points,
306
- closed=True,
307
- facecolor=arrow_color,
308
- edgecolor=arrow_color,
309
- linewidth=0.5,
310
- zorder=5,
311
- )
312
- ax.add_patch(triangle)
395
+ _draw_strand_arrows_matplotlib(
396
+ ax, gene, gene_start, gene_end, y_gene, region_width
397
+ )
313
398
 
314
399
  # Add gene name label in the gap above gene
315
400
  if gene_name:
@@ -320,7 +405,7 @@ def plot_gene_track(
320
405
  gene_name,
321
406
  ha="center",
322
407
  va="bottom",
323
- fontsize=5.5,
408
+ fontsize=7,
324
409
  color="#000000",
325
410
  fontweight="medium",
326
411
  style="italic",
@@ -353,12 +438,7 @@ def plot_gene_track_generic(
353
438
  exons_df: Exon annotations with chr, start, end, gene_name
354
439
  columns for drawing exon structure. Optional.
355
440
  """
356
- chrom_str = normalize_chrom(chrom)
357
- region_genes = genes_df[
358
- (genes_df["chr"].astype(str).str.replace("chr", "", regex=False) == chrom_str)
359
- & (genes_df["end"] >= start)
360
- & (genes_df["start"] <= end)
361
- ].copy()
441
+ region_genes = _filter_genes_by_region(genes_df, chrom, start, end)
362
442
 
363
443
  backend.set_xlim(ax, start, end)
364
444
  backend.set_ylabel(ax, "", fontsize=10)
@@ -389,20 +469,13 @@ def plot_gene_track_generic(
389
469
  backend.set_ylim(
390
470
  ax,
391
471
  -bottom_margin,
392
- (max_row + 1) * ROW_HEIGHT - ROW_HEIGHT + GENE_AREA + top_margin,
472
+ max_row * ROW_HEIGHT + GENE_AREA + top_margin,
393
473
  )
394
474
 
395
475
  # Filter exons for this region if available
396
476
  region_exons = None
397
477
  if exons_df is not None and not exons_df.empty:
398
- region_exons = exons_df[
399
- (
400
- exons_df["chr"].astype(str).str.replace("chr", "", regex=False)
401
- == chrom_str
402
- )
403
- & (exons_df["end"] >= start)
404
- & (exons_df["start"] <= end)
405
- ].copy()
478
+ region_exons = _filter_genes_by_region(exons_df, chrom, start, end)
406
479
 
407
480
  region_width = end - start
408
481
 
@@ -465,58 +538,11 @@ def plot_gene_track_generic(
465
538
  zorder=2,
466
539
  )
467
540
 
468
- # Add strand direction triangles (tip, center, tail)
541
+ # Add strand direction triangles
469
542
  if "strand" in gene.index:
470
- strand = gene["strand"]
471
- arrow_dir = 1 if strand == "+" else -1
472
-
473
- # Triangle dimensions
474
- tri_height = EXON_HEIGHT * 0.35
475
- tri_width = region_width * 0.006
476
-
477
- # Arrow positions: front, middle, back (tip positions)
478
- tip_offset = tri_width / 2 # Tiny offset to keep tip inside gene
479
- tail_offset = tri_width * 1.5 # Offset for tail arrow from gene start/end
480
- gene_center = (gene_start + gene_end) / 2
481
- if arrow_dir == 1: # Forward strand
482
- arrow_tip_positions = [
483
- gene_start + tail_offset, # Tail (tip inside gene)
484
- gene_center + tri_width / 2, # Middle (arrow center at gene center)
485
- gene_end - tip_offset, # Tip (near gene end)
486
- ]
487
- arrow_color = "#000000" # Black for forward
488
- else: # Reverse strand
489
- arrow_tip_positions = [
490
- gene_end - tail_offset, # Tail (tip inside gene)
491
- gene_center - tri_width / 2, # Middle (arrow center at gene center)
492
- gene_start + tip_offset, # Tip (near gene start)
493
- ]
494
- arrow_color = "#333333" # Dark grey for reverse
495
-
496
- for tip_x in arrow_tip_positions:
497
- if arrow_dir == 1:
498
- base_x = tip_x - tri_width
499
- tri_points = [
500
- [tip_x, y_gene], # Tip pointing right
501
- [base_x, y_gene + tri_height],
502
- [base_x, y_gene - tri_height],
503
- ]
504
- else:
505
- base_x = tip_x + tri_width
506
- tri_points = [
507
- [tip_x, y_gene], # Tip pointing left
508
- [base_x, y_gene + tri_height],
509
- [base_x, y_gene - tri_height],
510
- ]
511
-
512
- backend.add_polygon(
513
- ax,
514
- tri_points,
515
- facecolor=arrow_color,
516
- edgecolor=arrow_color,
517
- linewidth=0.5,
518
- zorder=5,
519
- )
543
+ _draw_strand_arrows_generic(
544
+ ax, backend, gene, gene_start, gene_end, y_gene, region_width
545
+ )
520
546
 
521
547
  # Add gene name label in the gap above gene
522
548
  if gene_name:
@@ -526,7 +552,7 @@ def plot_gene_track_generic(
526
552
  label_pos,
527
553
  y_label,
528
554
  gene_name,
529
- fontsize=6,
555
+ fontsize=7,
530
556
  ha="center",
531
557
  va="bottom",
532
558
  color="#000000",
pylocuszoom/loaders.py CHANGED
@@ -260,10 +260,14 @@ def load_saige(
260
260
  "POS": pos_col,
261
261
  "MarkerID": rs_col,
262
262
  "CHR": "chr",
263
- "p.value": p_col,
264
- "p.value.NA": p_col, # SPA-adjusted
265
263
  }
266
264
 
265
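+ # Prefer p.value.NA over p.value when both are present. NOTE(review): SAIGE documents p.value.NA as the p-value without SPA applied and p.value as the SPA-adjusted one - confirm this preference is intended.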
266
+ if "p.value.NA" in df.columns:
267
+ col_map["p.value.NA"] = p_col
268
+ elif "p.value" in df.columns:
269
+ col_map["p.value"] = p_col
270
+
267
271
  df = df.rename(columns=col_map)
268
272
  logger.debug(f"Loaded SAIGE file with {len(df)} variants")
269
273
  validate_gwas_dataframe(df, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
@@ -318,7 +322,7 @@ def load_gtex_eqtl(
318
322
  gene: Optional gene to filter to (ENSG ID or gene symbol).
319
323
 
320
324
  Returns:
321
- DataFrame with columns: pos, p_value, gene, effect.
325
+ DataFrame with columns: pos, p_value, gene, effect_size.
322
326
 
323
327
  Example:
324
328
  >>> eqtl_df = load_gtex_eqtl("GTEx_Analysis.signif_pairs.txt.gz", gene="BRCA1")
@@ -351,10 +355,10 @@ def load_gtex_eqtl(
351
355
  col_map[col] = "gene"
352
356
  break
353
357
 
354
- # Effect size (slope)
358
+ # Effect size (slope) - standardize to effect_size for plotting compatibility
355
359
  for col in ["slope", "beta", "effect_size"]:
356
360
  if col in df.columns:
357
- col_map[col] = "effect"
361
+ col_map[col] = "effect_size"
358
362
  break
359
363
 
360
364
  df = df.rename(columns=col_map)
@@ -385,7 +389,7 @@ def load_eqtl_catalogue(
385
389
  gene: Optional gene to filter to.
386
390
 
387
391
  Returns:
388
- DataFrame with columns: pos, p_value, gene, effect.
392
+ DataFrame with columns: pos, p_value, gene, effect_size.
389
393
  """
390
394
  df = pd.read_csv(filepath, sep="\t")
391
395
 
@@ -393,7 +397,7 @@ def load_eqtl_catalogue(
393
397
  "position": "pos",
394
398
  "pvalue": "p_value",
395
399
  "gene_id": "gene",
396
- "beta": "effect",
400
+ "beta": "effect_size", # Standardize to effect_size for plotter
397
401
  "chromosome": "chr",
398
402
  }
399
403
 
@@ -422,7 +426,7 @@ def load_matrixeqtl(
422
426
  gene: Optional gene to filter to.
423
427
 
424
428
  Returns:
425
- DataFrame with columns: pos, p_value, gene, effect.
429
+ DataFrame with columns: pos, p_value, gene, effect_size.
426
430
 
427
431
  Note:
428
432
  MatrixEQTL output doesn't include position by default.
@@ -435,7 +439,7 @@ def load_matrixeqtl(
435
439
  "gene": "gene",
436
440
  "p-value": "p_value",
437
441
  "pvalue": "p_value",
438
- "beta": "effect",
442
+ "beta": "effect_size", # Standardize to effect_size for plotter
439
443
  "t-stat": "t_stat",
440
444
  }
441
445
 
@@ -725,14 +729,28 @@ def load_bed(
725
729
  # Assign column names if no header
726
730
  if not has_header:
727
731
  n_cols = len(df.columns)
728
- col_names = ["chr", "start", "end"]
729
- if n_cols >= 4:
730
- col_names.append("gene_name")
731
- if n_cols >= 5:
732
- col_names.append("score")
733
- if n_cols >= 6:
734
- col_names.append("strand")
735
- df.columns = col_names[:n_cols]
732
+ # Standard BED column names (up to BED12)
733
+ bed_col_names = [
734
+ "chr",
735
+ "start",
736
+ "end",
737
+ "gene_name",
738
+ "score",
739
+ "strand",
740
+ "thickStart",
741
+ "thickEnd",
742
+ "itemRgb",
743
+ "blockCount",
744
+ "blockSizes",
745
+ "blockStarts",
746
+ ]
747
+ # Use standard names for known columns, generic for extras
748
+ if n_cols <= len(bed_col_names):
749
+ df.columns = bed_col_names[:n_cols]
750
+ else:
751
+ # More columns than BED12 - use known names + generic
752
+ extra_cols = [f"col{i}" for i in range(len(bed_col_names), n_cols)]
753
+ df.columns = bed_col_names + extra_cols
736
754
 
737
755
  # Standardize column names if header was present
738
756
  col_map = {
@@ -859,4 +877,6 @@ def load_gwas(
859
877
  if format not in loaders:
860
878
  raise ValueError(f"Unknown format '{format}'. Options: {list(loaders.keys())}")
861
879
 
862
- return loaders[format](filepath, pos_col=pos_col, p_col=p_col, rs_col=rs_col)
880
+ return loaders[format](
881
+ filepath, pos_col=pos_col, p_col=p_col, rs_col=rs_col, **kwargs
882
+ )
pylocuszoom/phewas.py ADDED
@@ -0,0 +1,34 @@
1
+ """PheWAS data validation and preparation.
2
+
3
+ Validates and prepares phenome-wide association study data for plotting.
4
+ """
5
+
6
+ import pandas as pd
7
+
8
+ from .validation import DataFrameValidator
9
+
10
+
11
def validate_phewas_df(
    df: pd.DataFrame,
    phenotype_col: str = "phenotype",
    p_col: str = "p_value",
    category_col: str = "category",
) -> None:
    """Check that a PheWAS results table is usable for plotting.

    The checks are delegated to ``DataFrameValidator``: the phenotype and
    p-value columns must exist, the p-value column must be numeric, and a
    range check is applied to it with ``min_val=0``, ``max_val=1`` and
    ``exclusive_min=True`` (presumably 0 < p <= 1 - confirm against the
    validator).

    Args:
        df: PheWAS results DataFrame.
        phenotype_col: Column name for phenotype names.
        p_col: Column name for p-values.
        category_col: Column name for phenotype categories. Accepted for
            interface symmetry; it is not checked here.

    Raises:
        Whatever ``DataFrameValidator.validate`` raises when a check fails
        (expected to be ``ValidationError`` - confirm in the validation
        module).
    """
    required_columns = [phenotype_col, p_col]

    # Queue the checks one at a time; each call's return value is carried
    # forward so this matches a fluent chain exactly.
    checker = DataFrameValidator(df, "PheWAS DataFrame")
    checker = checker.require_columns(required_columns)
    checker = checker.require_numeric([p_col])
    checker = checker.require_range(
        p_col, min_val=0, max_val=1, exclusive_min=True
    )
    checker.validate()