pylocuszoom-0.6.0-py3-none-any.whl → pylocuszoom-0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/plotter.py CHANGED
@@ -15,12 +15,9 @@ from typing import Any, List, Optional, Tuple
15
15
  import matplotlib.pyplot as plt
16
16
  import numpy as np
17
17
  import pandas as pd
18
- from matplotlib.axes import Axes
19
- from matplotlib.figure import Figure
20
- from matplotlib.lines import Line2D
21
- from matplotlib.patches import Patch
22
18
 
23
19
  from .backends import BackendType, get_backend
20
+ from .backends.hover import HoverConfig, HoverDataBuilder
24
21
  from .colors import (
25
22
  EQTL_NEGATIVE_BINS,
26
23
  EQTL_POSITIVE_BINS,
@@ -33,6 +30,7 @@ from .colors import (
33
30
  get_ld_color_palette,
34
31
  get_phewas_category_palette,
35
32
  )
33
+ from .ensembl import get_genes_for_region
36
34
  from .eqtl import validate_eqtl_df
37
35
  from .finemapping import (
38
36
  get_credible_sets,
@@ -41,16 +39,13 @@ from .finemapping import (
41
39
  from .forest import validate_forest_df
42
40
  from .gene_track import (
43
41
  assign_gene_positions,
44
- plot_gene_track,
45
42
  plot_gene_track_generic,
46
43
  )
47
- from .labels import add_snp_labels
48
44
  from .ld import calculate_ld, find_plink
49
45
  from .logging import enable_logging, logger
50
46
  from .phewas import validate_phewas_df
51
47
  from .recombination import (
52
48
  RECOMB_COLOR,
53
- add_recombination_overlay,
54
49
  download_canine_recombination_maps,
55
50
  get_default_data_dir,
56
51
  get_recombination_rate_for_region,
@@ -119,8 +114,21 @@ class LocusZoomPlotter:
119
114
  recomb_data_dir: Optional[str] = None,
120
115
  genomewide_threshold: float = DEFAULT_GENOMEWIDE_THRESHOLD,
121
116
  log_level: Optional[str] = "INFO",
117
+ auto_genes: bool = False,
122
118
  ):
123
- """Initialize the plotter."""
119
+ """Initialize the plotter.
120
+
121
+ Args:
122
+ species: Species name ('canine', 'feline', or None for custom).
123
+ genome_build: Genome build for coordinate system.
124
+ backend: Plotting backend ('matplotlib', 'plotly', or 'bokeh').
125
+ plink_path: Path to PLINK executable for LD calculation.
126
+ recomb_data_dir: Directory containing recombination maps.
127
+ genomewide_threshold: P-value threshold for significance line.
128
+ log_level: Logging level.
129
+ auto_genes: If True, automatically fetch genes from Ensembl when
130
+ genes_df is not provided. Default False for backward compatibility.
131
+ """
124
132
  # Configure logging
125
133
  if log_level is not None:
126
134
  enable_logging(log_level)
@@ -129,12 +137,12 @@ class LocusZoomPlotter:
129
137
  self.genome_build = (
130
138
  genome_build if genome_build else self._default_build(species)
131
139
  )
132
- self.backend_name = backend
133
140
  self._backend = get_backend(backend)
134
141
  self.plink_path = plink_path or find_plink()
135
142
  self.recomb_data_dir = recomb_data_dir
136
143
  self.genomewide_threshold = genomewide_threshold
137
144
  self._genomewide_line = -np.log10(genomewide_threshold)
145
+ self._auto_genes = auto_genes
138
146
 
139
147
  # Cache for loaded data
140
148
  self._recomb_cache = {}
@@ -248,6 +256,22 @@ class LocusZoomPlotter:
248
256
  """
249
257
  # Validate inputs
250
258
  validate_gwas_df(gwas_df, pos_col=pos_col, p_col=p_col)
259
+
260
+ # Auto-fetch genes if enabled and not provided
261
+ if genes_df is None and self._auto_genes:
262
+ logger.debug(
263
+ f"auto_genes enabled, fetching genes for chr{chrom}:{start}-{end}"
264
+ )
265
+ genes_df = get_genes_for_region(
266
+ species=self.species,
267
+ chrom=chrom,
268
+ start=start,
269
+ end=end,
270
+ )
271
+ if genes_df.empty:
272
+ logger.debug("No genes found in region from Ensembl")
273
+ genes_df = None
274
+
251
275
  if genes_df is not None:
252
276
  validate_genes_df(genes_df)
253
277
 
@@ -305,10 +329,10 @@ class LocusZoomPlotter:
305
329
  zorder=1,
306
330
  )
307
331
 
308
- # Add SNP labels (matplotlib only - interactive backends use hover tooltips)
332
+ # Add SNP labels (capability check - interactive backends use hover tooltips)
309
333
  if snp_labels and rs_col in df.columns and label_top_n > 0 and not df.empty:
310
- if self.backend_name == "matplotlib":
311
- add_snp_labels(
334
+ if self._backend.supports_snp_labels:
335
+ self._backend.add_snp_labels(
312
336
  ax,
313
337
  df,
314
338
  pos_col=pos_col,
@@ -319,12 +343,10 @@ class LocusZoomPlotter:
319
343
  chrom=chrom,
320
344
  )
321
345
 
322
- # Add recombination overlay (all backends)
346
+ # Add recombination overlay (all backends with secondary axis support)
323
347
  if recomb_df is not None and not recomb_df.empty:
324
- if self.backend_name == "matplotlib":
325
- add_recombination_overlay(ax, recomb_df, start, end)
326
- else:
327
- self._add_recombination_overlay_generic(ax, recomb_df, start, end)
348
+ if self._backend.supports_secondary_axis:
349
+ self._add_recombination_overlay(ax, recomb_df, start, end)
328
350
 
329
351
  # Format axes
330
352
  self._backend.set_ylabel(ax, r"$-\log_{10}$ P")
@@ -333,19 +355,13 @@ class LocusZoomPlotter:
333
355
 
334
356
  # Add LD legend (all backends)
335
357
  if ld_col is not None and ld_col in df.columns:
336
- if self.backend_name == "matplotlib":
337
- self._add_ld_legend(ax)
338
- else:
339
- self._backend.add_ld_legend(ax, LD_BINS, LEAD_SNP_COLOR)
358
+ self._backend.add_ld_legend(ax, LD_BINS, LEAD_SNP_COLOR)
340
359
 
341
- # Plot gene track (all backends)
360
+ # Plot gene track (all backends use generic function)
342
361
  if genes_df is not None and gene_ax is not None:
343
- if self.backend_name == "matplotlib":
344
- plot_gene_track(gene_ax, genes_df, chrom, start, end, exons_df)
345
- else:
346
- plot_gene_track_generic(
347
- gene_ax, self._backend, genes_df, chrom, start, end, exons_df
348
- )
362
+ plot_gene_track_generic(
363
+ gene_ax, self._backend, genes_df, chrom, start, end, exons_df
364
+ )
349
365
  self._backend.set_xlabel(gene_ax, f"Chromosome {chrom} (Mb)")
350
366
  self._backend.hide_spines(gene_ax, ["top", "right", "left"])
351
367
  else:
@@ -366,7 +382,7 @@ class LocusZoomPlotter:
366
382
  start: int,
367
383
  end: int,
368
384
  figsize: Tuple[int, int],
369
- ) -> Tuple[Figure, Axes, Optional[Axes]]:
385
+ ) -> Tuple[Any, Any, Optional[Any]]:
370
386
  """Create figure with optional gene track."""
371
387
  if genes_df is not None:
372
388
  # Calculate dynamic height based on gene rows
@@ -410,7 +426,7 @@ class LocusZoomPlotter:
410
426
 
411
427
  def _plot_association(
412
428
  self,
413
- ax: Axes,
429
+ ax: Any,
414
430
  df: pd.DataFrame,
415
431
  pos_col: str,
416
432
  ld_col: Optional[str],
@@ -419,23 +435,14 @@ class LocusZoomPlotter:
419
435
  p_col: Optional[str] = None,
420
436
  ) -> None:
421
437
  """Plot association scatter with LD coloring."""
422
-
423
- def _build_hover_data(subset_df: pd.DataFrame) -> Optional[pd.DataFrame]:
424
- """Build hover data for interactive backends."""
425
- hover_cols = {}
426
- # RS ID first (will be bold in hover)
427
- if rs_col and rs_col in subset_df.columns:
428
- hover_cols["SNP"] = subset_df[rs_col].values
429
- # Position
430
- if pos_col in subset_df.columns:
431
- hover_cols["Position"] = subset_df[pos_col].values
432
- # P-value
433
- if p_col and p_col in subset_df.columns:
434
- hover_cols["P-value"] = subset_df[p_col].values
435
- # LD
436
- if ld_col and ld_col in subset_df.columns:
437
- hover_cols["R²"] = subset_df[ld_col].values
438
- return pd.DataFrame(hover_cols) if hover_cols else None
438
+ # Build hover data using HoverDataBuilder
439
+ hover_config = HoverConfig(
440
+ snp_col=rs_col if rs_col and rs_col in df.columns else None,
441
+ pos_col=pos_col if pos_col in df.columns else None,
442
+ p_col=p_col if p_col and p_col in df.columns else None,
443
+ ld_col=ld_col if ld_col and ld_col in df.columns else None,
444
+ )
445
+ hover_builder = HoverDataBuilder(hover_config)
439
446
 
440
447
  # LD-based coloring
441
448
  if ld_col is not None and ld_col in df.columns:
@@ -454,7 +461,7 @@ class LocusZoomPlotter:
454
461
  edgecolor="black",
455
462
  linewidth=0.5,
456
463
  zorder=2,
457
- hover_data=_build_hover_data(bin_data),
464
+ hover_data=hover_builder.build_dataframe(bin_data),
458
465
  )
459
466
  else:
460
467
  # Default: grey points
@@ -467,7 +474,7 @@ class LocusZoomPlotter:
467
474
  edgecolor="black",
468
475
  linewidth=0.5,
469
476
  zorder=2,
470
- hover_data=_build_hover_data(df),
477
+ hover_data=hover_builder.build_dataframe(df),
471
478
  )
472
479
 
473
480
  # Highlight lead SNP with larger, more prominent marker
@@ -484,57 +491,21 @@ class LocusZoomPlotter:
484
491
  edgecolor="black",
485
492
  linewidth=1.5,
486
493
  zorder=10,
487
- hover_data=_build_hover_data(lead_snp),
494
+ hover_data=hover_builder.build_dataframe(lead_snp),
488
495
  )
489
496
 
490
- def _add_ld_legend(self, ax: Axes) -> None:
491
- """Add LD color legend to plot."""
492
- palette = get_ld_color_palette()
493
- legend_elements = [
494
- Line2D(
495
- [0],
496
- [0],
497
- marker="D",
498
- color="w",
499
- markerfacecolor=LEAD_SNP_COLOR,
500
- markeredgecolor="black",
501
- markersize=6,
502
- label="Lead SNP",
503
- ),
504
- ]
505
-
506
- for threshold, label, _ in LD_BINS:
507
- legend_elements.append(
508
- Patch(
509
- facecolor=palette[label],
510
- edgecolor="black",
511
- label=label,
512
- )
513
- )
514
-
515
- ax.legend(
516
- handles=legend_elements,
517
- loc="upper right",
518
- fontsize=9,
519
- frameon=True,
520
- framealpha=0.9,
521
- title=r"$r^2$",
522
- title_fontsize=10,
523
- handlelength=1.5,
524
- handleheight=1.0,
525
- labelspacing=0.4,
526
- )
527
-
528
- def _add_recombination_overlay_generic(
497
+ def _add_recombination_overlay(
529
498
  self,
530
499
  ax: Any,
531
500
  recomb_df: pd.DataFrame,
532
501
  start: int,
533
502
  end: int,
534
503
  ) -> None:
535
- """Add recombination overlay for interactive backends (plotly/bokeh).
504
+ """Add recombination overlay for all backends.
536
505
 
537
506
  Creates a secondary y-axis with recombination rate line and fill.
507
+ Uses backend-agnostic secondary axis methods that work across
508
+ matplotlib, plotly, and bokeh.
538
509
  """
539
510
  # Filter to region
540
511
  region_recomb = recomb_df[
@@ -591,7 +562,7 @@ class LocusZoomPlotter:
591
562
 
592
563
  def _plot_finemapping(
593
564
  self,
594
- ax: Axes,
565
+ ax: Any,
595
566
  df: pd.DataFrame,
596
567
  pos_col: str = "pos",
597
568
  pip_col: str = "pip",
@@ -610,22 +581,15 @@ class LocusZoomPlotter:
610
581
  show_credible_sets: Whether to color points by credible set.
611
582
  pip_threshold: Minimum PIP to display as scatter point.
612
583
  """
613
-
614
- def _build_finemapping_hover_data(
615
- subset_df: pd.DataFrame,
616
- ) -> Optional[pd.DataFrame]:
617
- """Build hover data for interactive backends."""
618
- hover_cols = {}
619
- # Position
620
- if pos_col in subset_df.columns:
621
- hover_cols["Position"] = subset_df[pos_col].values
622
- # PIP
623
- if pip_col in subset_df.columns:
624
- hover_cols["PIP"] = subset_df[pip_col].values
625
- # Credible set
626
- if cs_col and cs_col in subset_df.columns:
627
- hover_cols["Credible Set"] = subset_df[cs_col].values
628
- return pd.DataFrame(hover_cols) if hover_cols else None
584
+ # Build hover data using HoverDataBuilder
585
+ extra_cols = {pip_col: "PIP"}
586
+ if cs_col and cs_col in df.columns:
587
+ extra_cols[cs_col] = "Credible Set"
588
+ hover_config = HoverConfig(
589
+ pos_col=pos_col if pos_col in df.columns else None,
590
+ extra_cols=extra_cols,
591
+ )
592
+ hover_builder = HoverDataBuilder(hover_config)
629
593
 
630
594
  # Sort by position for line plotting
631
595
  df = df.sort_values(pos_col)
@@ -660,7 +624,7 @@ class LocusZoomPlotter:
660
624
  edgecolor="black",
661
625
  linewidth=0.5,
662
626
  zorder=3,
663
- hover_data=_build_finemapping_hover_data(cs_data),
627
+ hover_data=hover_builder.build_dataframe(cs_data),
664
628
  )
665
629
  # Plot variants not in any credible set
666
630
  non_cs_data = df[(df[cs_col].isna()) | (df[cs_col] == 0)]
@@ -677,7 +641,7 @@ class LocusZoomPlotter:
677
641
  edgecolor="black",
678
642
  linewidth=0.3,
679
643
  zorder=2,
680
- hover_data=_build_finemapping_hover_data(non_cs_data),
644
+ hover_data=hover_builder.build_dataframe(non_cs_data),
681
645
  )
682
646
  else:
683
647
  # No credible sets - show all points above threshold
@@ -694,7 +658,7 @@ class LocusZoomPlotter:
694
658
  edgecolor="black",
695
659
  linewidth=0.5,
696
660
  zorder=3,
697
- hover_data=_build_finemapping_hover_data(high_pip),
661
+ hover_data=hover_builder.build_dataframe(high_pip),
698
662
  )
699
663
 
700
664
  def plot_stacked(
@@ -912,10 +876,10 @@ class LocusZoomPlotter:
912
876
  zorder=1,
913
877
  )
914
878
 
915
- # Add SNP labels (matplotlib only - interactive backends use hover tooltips)
879
+ # Add SNP labels (capability check - interactive backends use hover tooltips)
916
880
  if snp_labels and rs_col in df.columns and label_top_n > 0 and not df.empty:
917
- if self.backend_name == "matplotlib":
918
- add_snp_labels(
881
+ if self._backend.supports_snp_labels:
882
+ self._backend.add_snp_labels(
919
883
  ax,
920
884
  df,
921
885
  pos_col=pos_col,
@@ -928,10 +892,8 @@ class LocusZoomPlotter:
928
892
 
929
893
  # Add recombination overlay (only on first panel, all backends)
930
894
  if i == 0 and recomb_df is not None and not recomb_df.empty:
931
- if self.backend_name == "matplotlib":
932
- add_recombination_overlay(ax, recomb_df, start, end)
933
- else:
934
- self._add_recombination_overlay_generic(ax, recomb_df, start, end)
895
+ if self._backend.supports_secondary_axis:
896
+ self._add_recombination_overlay(ax, recomb_df, start, end)
935
897
 
936
898
  # Format axes
937
899
  self._backend.set_ylabel(ax, r"$-\log_{10}$ P")
@@ -940,50 +902,11 @@ class LocusZoomPlotter:
940
902
 
941
903
  # Add panel label
942
904
  if panel_labels and i < len(panel_labels):
943
- if self.backend_name == "matplotlib":
944
- ax.annotate(
945
- panel_labels[i],
946
- xy=(0.02, 0.95),
947
- xycoords="axes fraction",
948
- fontsize=11,
949
- fontweight="bold",
950
- va="top",
951
- ha="left",
952
- )
953
- elif self.backend_name == "plotly":
954
- fig, row = ax
955
- fig.add_annotation(
956
- text=f"<b>{panel_labels[i]}</b>",
957
- xref=f"x{row} domain" if row > 1 else "x domain",
958
- yref=f"y{row} domain" if row > 1 else "y domain",
959
- x=0.02,
960
- y=0.95,
961
- showarrow=False,
962
- font=dict(size=11),
963
- xanchor="left",
964
- yanchor="top",
965
- )
966
- elif self.backend_name == "bokeh":
967
- from bokeh.models import Label
968
-
969
- # Get y-axis range for positioning
970
- y_max = ax.y_range.end if ax.y_range.end else 10
971
- x_min = ax.x_range.start if ax.x_range.start else start
972
- label = Label(
973
- x=x_min + (end - start) * 0.02,
974
- y=y_max * 0.95,
975
- text=panel_labels[i],
976
- text_font_size="11pt",
977
- text_font_style="bold",
978
- )
979
- ax.add_layout(label)
905
+ self._backend.add_panel_label(ax, panel_labels[i])
980
906
 
981
907
  # Add LD legend (only on first panel, all backends)
982
908
  if i == 0 and panel_ld_col is not None and panel_ld_col in df.columns:
983
- if self.backend_name == "matplotlib":
984
- self._add_ld_legend(ax)
985
- else:
986
- self._backend.add_ld_legend(ax, LD_BINS, LEAD_SNP_COLOR)
909
+ self._backend.add_ld_legend(ax, LD_BINS, LEAD_SNP_COLOR)
987
910
 
988
911
  # Track current panel index
989
912
  panel_idx = n_gwas
@@ -1050,24 +973,18 @@ class LocusZoomPlotter:
1050
973
  eqtl_data["p_value"].clip(lower=1e-300)
1051
974
  )
1052
975
 
1053
- def _build_eqtl_hover_data(
1054
- subset_df: pd.DataFrame,
1055
- ) -> Optional[pd.DataFrame]:
1056
- """Build hover data for eQTL interactive backends."""
1057
- hover_cols = {}
1058
- # Position
1059
- if "pos" in subset_df.columns:
1060
- hover_cols["Position"] = subset_df["pos"].values
1061
- # P-value
1062
- if "p_value" in subset_df.columns:
1063
- hover_cols["P-value"] = subset_df["p_value"].values
1064
- # Effect size
1065
- if "effect_size" in subset_df.columns:
1066
- hover_cols["Effect"] = subset_df["effect_size"].values
1067
- # Gene
1068
- if "gene" in subset_df.columns:
1069
- hover_cols["Gene"] = subset_df["gene"].values
1070
- return pd.DataFrame(hover_cols) if hover_cols else None
976
+ # Build hover data using HoverDataBuilder
977
+ eqtl_extra_cols = {}
978
+ if "effect_size" in eqtl_data.columns:
979
+ eqtl_extra_cols["effect_size"] = "Effect"
980
+ if "gene" in eqtl_data.columns:
981
+ eqtl_extra_cols["gene"] = "Gene"
982
+ eqtl_hover_config = HoverConfig(
983
+ pos_col="pos" if "pos" in eqtl_data.columns else None,
984
+ p_col="p_value" if "p_value" in eqtl_data.columns else None,
985
+ extra_cols=eqtl_extra_cols,
986
+ )
987
+ eqtl_hover_builder = HoverDataBuilder(eqtl_hover_config)
1071
988
 
1072
989
  # Check if effect_size column exists for directional coloring
1073
990
  has_effect = "effect_size" in eqtl_data.columns
@@ -1090,7 +1007,7 @@ class LocusZoomPlotter:
1090
1007
  edgecolor="black",
1091
1008
  linewidth=0.5,
1092
1009
  zorder=2,
1093
- hover_data=_build_eqtl_hover_data(row_df),
1010
+ hover_data=eqtl_hover_builder.build_dataframe(row_df),
1094
1011
  )
1095
1012
  # Plot negative effects (down triangles)
1096
1013
  for _, row in neg_effects.iterrows():
@@ -1105,7 +1022,7 @@ class LocusZoomPlotter:
1105
1022
  edgecolor="black",
1106
1023
  linewidth=0.5,
1107
1024
  zorder=2,
1108
- hover_data=_build_eqtl_hover_data(row_df),
1025
+ hover_data=eqtl_hover_builder.build_dataframe(row_df),
1109
1026
  )
1110
1027
  # Add eQTL effect legend (all backends)
1111
1028
  self._backend.add_eqtl_legend(
@@ -1125,7 +1042,7 @@ class LocusZoomPlotter:
1125
1042
  linewidth=0.5,
1126
1043
  zorder=2,
1127
1044
  label=label,
1128
- hover_data=_build_eqtl_hover_data(eqtl_data),
1045
+ hover_data=eqtl_hover_builder.build_dataframe(eqtl_data),
1129
1046
  )
1130
1047
  self._backend.add_simple_legend(ax, label, loc="upper right")
1131
1048
 
@@ -1141,15 +1058,12 @@ class LocusZoomPlotter:
1141
1058
  self._backend.hide_spines(ax, ["top", "right"])
1142
1059
  panel_idx += 1
1143
1060
 
1144
- # Plot gene track (all backends)
1061
+ # Plot gene track (all backends use generic function)
1145
1062
  if genes_df is not None:
1146
1063
  gene_ax = axes[panel_idx]
1147
- if self.backend_name == "matplotlib":
1148
- plot_gene_track(gene_ax, genes_df, chrom, start, end, exons_df)
1149
- else:
1150
- plot_gene_track_generic(
1151
- gene_ax, self._backend, genes_df, chrom, start, end, exons_df
1152
- )
1064
+ plot_gene_track_generic(
1065
+ gene_ax, self._backend, genes_df, chrom, start, end, exons_df
1066
+ )
1153
1067
  self._backend.set_xlabel(gene_ax, f"Chromosome {chrom} (Mb)")
1154
1068
  self._backend.hide_spines(gene_ax, ["top", "right", "left"])
1155
1069
  else:
@@ -1281,10 +1195,13 @@ class LocusZoomPlotter:
1281
1195
  self._backend.set_ylabel(ax, "Phenotype")
1282
1196
  self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
1283
1197
 
1284
- # Set y-tick labels to phenotype names (matplotlib only)
1285
- if self.backend_name == "matplotlib":
1286
- ax.set_yticks(df["y_pos"])
1287
- ax.set_yticklabels(df[phenotype_col], fontsize=8)
1198
+ # Set y-tick labels to phenotype names
1199
+ self._backend.set_yticks(
1200
+ ax,
1201
+ positions=df["y_pos"].tolist(),
1202
+ labels=df[phenotype_col].tolist(),
1203
+ fontsize=8,
1204
+ )
1288
1205
 
1289
1206
  self._backend.set_title(ax, f"PheWAS: {variant_id}")
1290
1207
  self._backend.hide_spines(ax, ["top", "right"])
@@ -1399,10 +1316,19 @@ class LocusZoomPlotter:
1399
1316
  self._backend.set_xlabel(ax, effect_label)
1400
1317
  self._backend.set_ylim(ax, -0.5, len(df) - 0.5)
1401
1318
 
1402
- # Set y-tick labels to study names (matplotlib only)
1403
- if self.backend_name == "matplotlib":
1404
- ax.set_yticks(df["y_pos"])
1405
- ax.set_yticklabels(df[study_col], fontsize=10)
1319
+ # Ensure x-axis includes the null value with some padding
1320
+ x_min = min(df[ci_lower_col].min(), null_value)
1321
+ x_max = max(df[ci_upper_col].max(), null_value)
1322
+ x_padding = (x_max - x_min) * 0.1
1323
+ self._backend.set_xlim(ax, x_min - x_padding, x_max + x_padding)
1324
+
1325
+ # Set y-tick labels to study names
1326
+ self._backend.set_yticks(
1327
+ ax,
1328
+ positions=df["y_pos"].tolist(),
1329
+ labels=df[study_col].tolist(),
1330
+ fontsize=10,
1331
+ )
1406
1332
 
1407
1333
  self._backend.set_title(ax, f"Forest Plot: {variant_id}")
1408
1334
  self._backend.hide_spines(ax, ["top", "right"])
pylocuszoom/recombination.py CHANGED
@@ -18,6 +18,7 @@ from matplotlib.axes import Axes
18
18
  from tqdm import tqdm
19
19
 
20
20
  from .logging import logger
21
+ from .utils import filter_by_region
21
22
 
22
23
  # Recombination overlay color
23
24
  RECOMB_COLOR = "#7FCDFF" # Light blue
@@ -252,10 +253,20 @@ def download_canine_recombination_maps(
252
253
 
253
254
  logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
254
255
 
255
- # Extract tar.gz
256
+ # Extract tar.gz with path traversal protection
256
257
  logger.debug("Extracting genetic maps...")
257
258
  with tarfile.open(tar_path, "r:gz") as tar:
258
- tar.extractall(tmpdir)
259
+ # Filter to prevent path traversal attacks
260
+ safe_members = []
261
+ for member in tar.getmembers():
262
+ # Resolve the path and ensure it stays within tmpdir
263
+ member_path = Path(tmpdir) / member.name
264
+ try:
265
+ member_path.resolve().relative_to(Path(tmpdir).resolve())
266
+ safe_members.append(member)
267
+ except ValueError:
268
+ logger.warning(f"Skipping unsafe path in archive: {member.name}")
269
+ tar.extractall(tmpdir, members=safe_members)
259
270
 
260
271
  # Find and process the extracted files
261
272
  extracted_dir = Path(tmpdir)
@@ -374,7 +385,12 @@ def get_recombination_rate_for_region(
374
385
  )
375
386
 
376
387
  # Filter to region
377
- region_df = df[(df["pos"] >= start) & (df["pos"] <= end)].copy()
388
+ region_df = filter_by_region(
389
+ df,
390
+ region=(chrom, start, end),
391
+ chrom_col="", # Recomb maps don't have chromosome column
392
+ pos_col="pos",
393
+ )
378
394
 
379
395
  return region_df[["pos", "rate"]]
380
396
 
pylocuszoom/utils.py CHANGED
@@ -106,6 +106,58 @@ def normalize_chrom(chrom: Union[int, str]) -> str:
106
106
  return str(chrom).replace("chr", "")
107
107
 
108
108
 
109
+ def filter_by_region(
110
+ df: pd.DataFrame,
111
+ region: tuple,
112
+ chrom_col: str = "chrom",
113
+ pos_col: str = "pos",
114
+ ) -> pd.DataFrame:
115
+ """Filter DataFrame to genomic region with inclusive bounds.
116
+
117
+ Filters rows where position is within [start, end] (inclusive).
118
+ If chrom_col exists in DataFrame, also filters by chromosome.
119
+ Chromosome comparison normalizes types (int/str, chr prefix).
120
+
121
+ Args:
122
+ df: DataFrame to filter.
123
+ region: Tuple of (chrom, start, end) defining the region.
124
+ chrom_col: Column name for chromosome (default: "chrom").
125
+ If column doesn't exist, filters by position only.
126
+ pos_col: Column name for position (default: "pos").
127
+
128
+ Returns:
129
+ Filtered DataFrame (copy, not view).
130
+
131
+ Raises:
132
+ KeyError: If pos_col is not found in DataFrame.
133
+
134
+ Example:
135
+ >>> filtered = filter_by_region(df, region=(1, 1000000, 2000000))
136
+ >>> filtered = filter_by_region(df, region=("chr1", 1e6, 2e6), pos_col="position")
137
+ """
138
+ chrom, start, end = region
139
+
140
+ # Validate position column exists
141
+ if pos_col not in df.columns:
142
+ raise KeyError(
143
+ f"Position column '{pos_col}' not found in DataFrame. "
144
+ f"Available columns: {list(df.columns)}"
145
+ )
146
+
147
+ # Position filtering (inclusive bounds)
148
+ mask = (df[pos_col] >= start) & (df[pos_col] <= end)
149
+
150
+ # Chromosome filtering (if column exists)
151
+ if chrom_col in df.columns:
152
+ chrom_normalized = normalize_chrom(chrom)
153
+ df_chrom_normalized = (
154
+ df[chrom_col].astype(str).str.replace("chr", "", regex=False)
155
+ )
156
+ mask = mask & (df_chrom_normalized == chrom_normalized)
157
+
158
+ return df[mask].copy()
159
+
160
+
109
161
  def validate_dataframe(
110
162
  df: pd.DataFrame,
111
163
  required_cols: List[str],