pylocuszoom 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ from matplotlib.axes import Axes
  from tqdm import tqdm

  from .logging import logger
+ from .utils import filter_by_region

  # Recombination overlay color
  RECOMB_COLOR = "#7FCDFF"  # Light blue
@@ -252,10 +253,20 @@ def download_canine_recombination_maps(

      logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")

-     # Extract tar.gz
+     # Extract tar.gz with path traversal protection
      logger.debug("Extracting genetic maps...")
      with tarfile.open(tar_path, "r:gz") as tar:
-         tar.extractall(tmpdir)
+         # Filter to prevent path traversal attacks
+         safe_members = []
+         for member in tar.getmembers():
+             # Resolve the path and ensure it stays within tmpdir
+             member_path = Path(tmpdir) / member.name
+             try:
+                 member_path.resolve().relative_to(Path(tmpdir).resolve())
+                 safe_members.append(member)
+             except ValueError:
+                 logger.warning(f"Skipping unsafe path in archive: {member.name}")
+         tar.extractall(tmpdir, members=safe_members)

      # Find and process the extracted files
      extracted_dir = Path(tmpdir)
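Taken on its own, the extraction change above is the standard resolve-and-check guard: every archive member is resolved against the destination directory and skipped if it would land outside it. A minimal standalone sketch of the same pattern, assuming nothing beyond what the hunk shows (the `safe_extract` name and the `print` fallback are illustrative, not part of the package):

```python
import tarfile
from pathlib import Path


def safe_extract(tar_path: Path, dest: Path) -> None:
    """Extract a .tar.gz, skipping members that would escape dest."""
    dest = dest.resolve()
    with tarfile.open(tar_path, "r:gz") as tar:
        safe_members = []
        for member in tar.getmembers():
            target = (dest / member.name).resolve()
            try:
                target.relative_to(dest)  # raises ValueError if target is outside dest
                safe_members.append(member)
            except ValueError:
                print(f"Skipping unsafe path in archive: {member.name}")
        tar.extractall(dest, members=safe_members)
```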
@@ -374,7 +385,12 @@ def get_recombination_rate_for_region(
      )

      # Filter to region
-     region_df = df[(df["pos"] >= start) & (df["pos"] <= end)].copy()
+     region_df = filter_by_region(
+         df,
+         region=(chrom, start, end),
+         chrom_col="",  # Recomb maps don't have chromosome column
+         pos_col="pos",
+     )

      return region_df[["pos", "rate"]]

pylocuszoom/schemas.py CHANGED
@@ -10,12 +10,7 @@ from typing import Optional, Union
  import pandas as pd
  from pydantic import BaseModel, ConfigDict, field_validator, model_validator

-
- class LoaderValidationError(Exception):
-     """Raised when loaded data fails validation."""
-
-     pass
-
+ from .exceptions import LoaderValidationError

  # =============================================================================
  # GWAS Validation
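Both custom exceptions now live in a shared `.exceptions` module rather than being defined beside their call sites (the `utils.py` hunks below make the matching move for `ValidationError`). A hedged sketch of how downstream code might catch them from one place; the direct submodule import is an assumption based on the `.exceptions` references in this diff:

```python
from pylocuszoom.exceptions import LoaderValidationError, ValidationError

try:
    ...  # call a pylocuszoom loader / plotting function with user data here
except (LoaderValidationError, ValidationError) as exc:
    # Both error types are now importable from a single module
    print(f"Input rejected: {exc}")
```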
pylocuszoom/utils.py CHANGED
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union

  import pandas as pd

+ from .exceptions import ValidationError
+
  if TYPE_CHECKING:
      from pyspark.sql import DataFrame as SparkDataFrame

@@ -15,10 +17,6 @@ if TYPE_CHECKING:
  DataFrameLike = Union[pd.DataFrame, "SparkDataFrame", Any]


- class ValidationError(ValueError):
-     """Raised when input validation fails."""
-
-
  def is_spark_dataframe(df: Any) -> bool:
      """Check if object is a PySpark DataFrame.

@@ -106,6 +104,58 @@ def normalize_chrom(chrom: Union[int, str]) -> str:
      return str(chrom).replace("chr", "")


+ def filter_by_region(
+     df: pd.DataFrame,
+     region: tuple,
+     chrom_col: str = "chrom",
+     pos_col: str = "pos",
+ ) -> pd.DataFrame:
+     """Filter DataFrame to genomic region with inclusive bounds.
+
+     Filters rows where position is within [start, end] (inclusive).
+     If chrom_col exists in DataFrame, also filters by chromosome.
+     Chromosome comparison normalizes types (int/str, chr prefix).
+
+     Args:
+         df: DataFrame to filter.
+         region: Tuple of (chrom, start, end) defining the region.
+         chrom_col: Column name for chromosome (default: "chrom").
+             If column doesn't exist, filters by position only.
+         pos_col: Column name for position (default: "pos").
+
+     Returns:
+         Filtered DataFrame (copy, not view).
+
+     Raises:
+         KeyError: If pos_col is not found in DataFrame.
+
+     Example:
+         >>> filtered = filter_by_region(df, region=(1, 1000000, 2000000))
+         >>> filtered = filter_by_region(df, region=("chr1", 1e6, 2e6), pos_col="position")
+     """
+     chrom, start, end = region
+
+     # Validate position column exists
+     if pos_col not in df.columns:
+         raise KeyError(
+             f"Position column '{pos_col}' not found in DataFrame. "
+             f"Available columns: {list(df.columns)}"
+         )
+
+     # Position filtering (inclusive bounds)
+     mask = (df[pos_col] >= start) & (df[pos_col] <= end)
+
+     # Chromosome filtering (if column exists)
+     if chrom_col in df.columns:
+         chrom_normalized = normalize_chrom(chrom)
+         df_chrom_normalized = (
+             df[chrom_col].astype(str).str.replace("chr", "", regex=False)
+         )
+         mask = mask & (df_chrom_normalized == chrom_normalized)
+
+     return df[mask].copy()
+
+
  def validate_dataframe(
      df: pd.DataFrame,
      required_cols: List[str],
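The new `filter_by_region` helper centralizes the inclusive-bounds region filtering that the recombination and plotting code previously inlined. A small usage sketch on synthetic data, using the docstring's default column names:

```python
import pandas as pd
from pylocuszoom.utils import filter_by_region

df = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr2"],
    "pos": [1_200_000, 2_500_000, 1_300_000],
    "p": [1e-8, 0.2, 0.03],
})

# Chromosome normalization means int 1 matches "chr1"; bounds are inclusive.
region_hits = filter_by_region(df, region=(1, 1_000_000, 2_000_000))
print(region_hits)  # only the chr1 row at 1,200,000

# Passing a chrom_col that is absent (as the recombination loader does with "")
# skips the chromosome check and filters on position alone.
pos_only = filter_by_region(df, region=(1, 1_000_000, 2_000_000), chrom_col="")
print(len(pos_only))  # 2 rows: both positions fall within the bounds
```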
@@ -0,0 +1,223 @@
+ """DataFrame validation builder for pyLocusZoom.
+
+ Provides a fluent API for validating pandas DataFrames with composable
+ validation rules. Accumulates all validation errors before raising.
+ """
+
+ from typing import List, Optional
+
+ import pandas as pd
+ from pandas.api.types import is_numeric_dtype
+
+ from .utils import ValidationError
+
+
+ class DataFrameValidator:
+     """Builder for composable DataFrame validation.
+
+     Validates DataFrames with method chaining and accumulates all errors
+     before raising. This enables clear, readable validation code with
+     comprehensive error messages.
+
+     Example:
+         >>> validator = DataFrameValidator(df, name="gwas_df")
+         >>> validator.require_columns(["chr", "pos", "p"])
+         ...     .require_numeric(["pos", "p"])
+         ...     .require_range("p", min_val=0, max_val=1)
+         ...     .validate()
+     """
+
+     def __init__(self, df: pd.DataFrame, name: str = "DataFrame"):
+         """Initialize validator.
+
+         Args:
+             df: DataFrame to validate.
+             name: Name for error messages (e.g., "gwas_df", "genes_df").
+         """
+         self._df = df
+         self._name = name
+         self._errors: List[str] = []
+
+     def require_columns(self, columns: List[str]) -> "DataFrameValidator":
+         """Check that required columns exist in DataFrame.
+
+         Args:
+             columns: List of required column names.
+
+         Returns:
+             Self for method chaining.
+         """
+         if not columns:
+             return self
+
+         missing = [col for col in columns if col not in self._df.columns]
+         if missing:
+             available = list(self._df.columns)
+             self._errors.append(f"Missing columns: {missing}. Available: {available}")
+
+         return self
+
+     def require_numeric(self, columns: List[str]) -> "DataFrameValidator":
+         """Check that columns have numeric dtype.
+
+         Skips columns that don't exist (checked separately by require_columns).
+
+         Args:
+             columns: List of column names that should be numeric.
+
+         Returns:
+             Self for method chaining.
+         """
+         for col in columns:
+             # Skip missing columns - let require_columns handle that
+             if col not in self._df.columns:
+                 continue
+
+             if not is_numeric_dtype(self._df[col]):
+                 actual_dtype = self._df[col].dtype
+                 self._errors.append(
+                     f"Column '{col}' must be numeric, got {actual_dtype}"
+                 )
+
+         return self
+
+     def require_range(
+         self,
+         column: str,
+         min_val: Optional[float] = None,
+         max_val: Optional[float] = None,
+         exclusive_min: bool = False,
+         exclusive_max: bool = False,
+     ) -> "DataFrameValidator":
+         """Check that column values are within specified range.
+
+         Args:
+             column: Column name to check.
+             min_val: Minimum allowed value (inclusive by default).
+             max_val: Maximum allowed value (inclusive by default).
+             exclusive_min: If True, minimum is exclusive (values must be > min_val).
+             exclusive_max: If True, maximum is exclusive (values must be < max_val).
+
+         Returns:
+             Self for method chaining.
+         """
+         # Skip missing columns
+         if column not in self._df.columns:
+             return self
+
+         col_data = self._df[column]
+
+         # Check minimum bound
+         if min_val is not None:
+             if exclusive_min:
+                 invalid_count = (col_data <= min_val).sum()
+                 if invalid_count > 0:
+                     self._errors.append(
+                         f"Column '{column}': {invalid_count} values <= {min_val}"
+                     )
+             else:
+                 invalid_count = (col_data < min_val).sum()
+                 if invalid_count > 0:
+                     self._errors.append(
+                         f"Column '{column}': {invalid_count} values < {min_val}"
+                     )
+
+         # Check maximum bound
+         if max_val is not None:
+             if exclusive_max:
+                 invalid_count = (col_data >= max_val).sum()
+                 if invalid_count > 0:
+                     self._errors.append(
+                         f"Column '{column}': {invalid_count} values >= {max_val}"
+                     )
+             else:
+                 invalid_count = (col_data > max_val).sum()
+                 if invalid_count > 0:
+                     self._errors.append(
+                         f"Column '{column}': {invalid_count} values > {max_val}"
+                     )
+
+         return self
+
+     def require_not_null(self, columns: List[str]) -> "DataFrameValidator":
+         """Check that columns have no null (NaN or None) values.
+
+         Args:
+             columns: List of column names to check for nulls.
+
+         Returns:
+             Self for method chaining.
+         """
+         for col in columns:
+             # Skip missing columns
+             if col not in self._df.columns:
+                 continue
+
+             null_count = self._df[col].isna().sum()
+             if null_count > 0:
+                 self._errors.append(f"Column '{col}' has {null_count} null values")
+
+         return self
+
+     def require_ci_ordering(
+         self,
+         ci_lower_col: str,
+         effect_col: str,
+         ci_upper_col: str,
+     ) -> "DataFrameValidator":
+         """Check that confidence intervals are properly ordered.
+
+         Validates that ci_lower <= effect <= ci_upper for all rows.
+         Invalid ordering would produce negative error bar lengths.
+
+         Args:
+             ci_lower_col: Column name for lower CI bound.
+             effect_col: Column name for effect size (point estimate).
+             ci_upper_col: Column name for upper CI bound.
+
+         Returns:
+             Self for method chaining.
+         """
+         # Skip if any column is missing
+         for col in [ci_lower_col, effect_col, ci_upper_col]:
+             if col not in self._df.columns:
+                 return self
+
+         lower = self._df[ci_lower_col]
+         effect = self._df[effect_col]
+         upper = self._df[ci_upper_col]
+
+         # Check ci_lower <= effect
+         lower_gt_effect = (lower > effect).sum()
+         if lower_gt_effect > 0:
+             self._errors.append(
+                 f"{lower_gt_effect} rows have {ci_lower_col} > {effect_col}"
+             )
+
+         # Check effect <= ci_upper
+         effect_gt_upper = (effect > upper).sum()
+         if effect_gt_upper > 0:
+             self._errors.append(
+                 f"{effect_gt_upper} rows have {effect_col} > {ci_upper_col}"
+             )
+
+         # Check ci_lower <= ci_upper (implicit from above, but explicit is clearer)
+         lower_gt_upper = (lower > upper).sum()
+         if lower_gt_upper > 0:
+             self._errors.append(
+                 f"{lower_gt_upper} rows have {ci_lower_col} > {ci_upper_col}"
+             )
+
+         return self
+
+     def validate(self) -> None:
+         """Raise ValidationError if any validation rules failed.
+
+         Raises:
+             ValidationError: If any validation errors were accumulated.
+                 Error message includes all accumulated errors.
+         """
+         if self._errors:
+             error_msg = f"{self._name} validation failed:\n"
+             error_msg += "\n".join(f" - {error}" for error in self._errors)
+             raise ValidationError(error_msg)
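Because the validator accumulates failures, a single raised exception reports every problem found. A hedged usage sketch based on the class above; the new file's path is not shown in this diff, so the `pylocuszoom.validators` import below is an assumption:

```python
import pandas as pd

from pylocuszoom.exceptions import ValidationError
# Module path is an assumption -- the diff omits the new file's name.
from pylocuszoom.validators import DataFrameValidator

gwas_df = pd.DataFrame({
    "chr": ["1", "1"],
    "pos": [1_000_500, 1_002_000],
    "p": [1e-6, 1.5],  # second value is deliberately out of range
})

try:
    (
        DataFrameValidator(gwas_df, name="gwas_df")
        .require_columns(["chr", "pos", "p"])
        .require_numeric(["pos", "p"])
        .require_range("p", min_val=0, max_val=1)
        .require_not_null(["chr", "pos", "p"])
        .validate()
    )
except ValidationError as exc:
    print(exc)  # "gwas_df validation failed:" followed by one line per problem
```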
@@ -1,15 +1,15 @@
  Metadata-Version: 2.4
  Name: pylocuszoom
- Version: 0.6.0
+ Version: 1.0.0
  Summary: Publication-ready regional association plots with LD coloring, gene tracks, and recombination overlays
  Project-URL: Homepage, https://github.com/michael-denyer/pylocuszoom
  Project-URL: Documentation, https://github.com/michael-denyer/pylocuszoom#readme
  Project-URL: Repository, https://github.com/michael-denyer/pylocuszoom
- Author: Michael Denyer
+ Author-email: Michael Denyer <code.denyer@gmail.com>
  License-Expression: GPL-3.0-or-later
  License-File: LICENSE.md
  Keywords: genetics,gwas,locus-zoom,locuszoom,regional-plot,visualization
- Classifier: Development Status :: 3 - Alpha
+ Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
  Classifier: Programming Language :: Python :: 3
@@ -44,20 +44,18 @@ Requires-Dist: pyspark>=3.0.0; extra == 'spark'
  Description-Content-Type: text/markdown

  [![CI](https://github.com/michael-denyer/pyLocusZoom/actions/workflows/ci.yml/badge.svg)](https://github.com/michael-denyer/pyLocusZoom/actions/workflows/ci.yml)
- [![codecov](https://codecov.io/gh/michael-denyer/pyLocusZoom/graph/badge.svg)](https://codecov.io/gh/michael-denyer/pyLocusZoom)
  [![PyPI](https://img.shields.io/pypi/v/pylocuszoom)](https://pypi.org/project/pylocuszoom/)
- [![Bioconda](https://img.shields.io/conda/vn/bioconda/pylocuszoom)](https://anaconda.org/bioconda/pylocuszoom)
  [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-red.svg)](https://www.gnu.org/licenses/gpl-3.0)
  [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
  [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
  [![Matplotlib](https://img.shields.io/badge/Matplotlib-3.5+-11557c.svg)](https://matplotlib.org/)
- [![Plotly](https://img.shields.io/badge/Plotly-5.0+-3F4F75.svg)](https://plotly.com/python/)
+ [![Plotly](https://img.shields.io/badge/Plotly-5.15+-3F4F75.svg)](https://plotly.com/python/)
  [![Bokeh](https://img.shields.io/badge/Bokeh-3.8+-E6526F.svg)](https://bokeh.org/)
  [![Pandas](https://img.shields.io/badge/Pandas-1.4+-150458.svg)](https://pandas.pydata.org/)
  <img src="logo.svg" alt="pyLocusZoom logo" width="120" align="right">
  # pyLocusZoom

- Publication-ready regional association plots with LD coloring, gene tracks, and recombination overlays.
+ Designed for publication-ready GWAS visualization with regional association plots, gene tracks, eQTL, PheWAS, fine-mapping, and forest plots.

  Inspired by [LocusZoom](http://locuszoom.org/) and [locuszoomr](https://github.com/myles-lewis/locuszoomr).

@@ -68,20 +66,22 @@ Inspired by [LocusZoom](http://locuszoom.org/) and [locuszoomr](https://github.c
  - **Multi-species support**: Built-in reference data for *Canis lupus familiaris* (CanFam3.1/CanFam4) and *Felis catus* (FelCat9), or optionally provide your own for any species
  - **LD coloring**: SNPs colored by linkage disequilibrium (R²) with lead variant
  - **Gene tracks**: Annotated gene/exon positions below the association plot
- - **Recombination rate**: Overlay showing recombination rate across region (*Canis lupus familiaris* only)
- - **SNP labels (matplotlib)**: Automatic labeling of lead SNPs with RS ID
- - **Tooltips (Bokeh and Plotly)**: Mouseover for detailed SNP data
+ - **Recombination rate**: Optional overlay across region (*Canis lupus familiaris* built-in, not shown in example image)
+ - **SNP labels (matplotlib)**: Automatic labeling of top SNPs by p-value (RS IDs)
+ - **Hover tooltips (Plotly and Bokeh)**: Detailed SNP data on hover

- ![Example regional association plot](examples/regional_plot.png)
+ ![Example regional association plot with LD coloring and gene track](examples/regional_plot.png)
+ *Regional association plot with LD coloring, gene/exon track, and top SNP labels (recombination overlay disabled in example).*

  2. **Stacked plots**: Compare multiple GWAS/phenotypes vertically
  3. **eQTL plot**: Expression QTL data aligned with association plots and gene tracks
  4. **Fine-mapping plots**: Visualize SuSiE credible sets with posterior inclusion probabilities
  5. **PheWAS plots**: Phenome-wide association study visualization across multiple phenotypes
  6. **Forest plots**: Meta-analysis effect size visualization with confidence intervals
- 7. **Multiple charting libraries**: matplotlib (static), plotly (interactive), bokeh (dashboards)
+ 7. **Multiple backends**: matplotlib (publication-ready), plotly (interactive), bokeh (dashboard integration)
  8. **Pandas and PySpark support**: Works with both Pandas and PySpark DataFrames for large-scale genomics data
  9. **Convenience data file loaders**: Load and validate common GWAS, eQTL and fine-mapping file formats
+ 10. **Automatic gene annotations**: Fetch gene/exon data from Ensembl REST API with caching (human, mouse, rat, canine, feline, and any Ensembl species)

  ## Installation

@@ -109,15 +109,14 @@ from pylocuszoom import LocusZoomPlotter
  # Initialize plotter (loads reference data for canine)
  plotter = LocusZoomPlotter(species="canine")

- # Create regional plot
+ # Plot with parameters passed directly
  fig = plotter.plot(
-     gwas_df, # DataFrame with ps, p_wald, rs columns
+     gwas_df,  # DataFrame with ps, p_wald, rs columns
      chrom=1,
      start=1000000,
      end=2000000,
-     lead_pos=1500000, # Highlight lead SNP
+     lead_pos=1500000,  # Highlight lead SNP
  )
-
  fig.savefig("regional_plot.png", dpi=150)
  ```

@@ -137,9 +136,7 @@ fig = plotter.plot(
      start=1000000,
      end=2000000,
      lead_pos=1500000,
-     ld_reference_file="genotypes.bed",  # For LD calculation
-     genes_df=genes_df,  # Gene annotations
-     exons_df=exons_df,  # Exon annotations
+     ld_reference_file="genotypes",  # PLINK fileset (without extension)
      show_recombination=True,  # Overlay recombination rate
      snp_labels=True,  # Label top SNPs
      label_top_n=5,  # How many to label
@@ -147,6 +144,8 @@ fig = plotter.plot(
      p_col="p_wald",  # Column name for p-value
      rs_col="rs",  # Column name for SNP ID
      figsize=(12, 8),
+     genes_df=genes_df,  # Gene annotations
+     exons_df=exons_df,  # Exon annotations
  )
  ```

@@ -163,6 +162,8 @@ Recombination maps are automatically lifted over from CanFam3.1 to CanFam4 coordinates
  ## Using with Other Species

  ```python
+ from pylocuszoom import LocusZoomPlotter
+
  # Feline (LD and gene tracks, user provides recombination data)
  plotter = LocusZoomPlotter(species="feline")

@@ -172,37 +173,61 @@ plotter = LocusZoomPlotter(
      recomb_data_dir="/path/to/recomb_maps/",
  )

- # Or provide data per-plot
+ # Provide data per-plot
  fig = plotter.plot(
      gwas_df,
-     chrom=1, start=1000000, end=2000000,
+     chrom=1,
+     start=1000000,
+     end=2000000,
      recomb_df=my_recomb_dataframe,
      genes_df=my_genes_df,
  )
  ```

+ ## Automatic Gene Annotations
+
+ pyLocusZoom can automatically fetch gene annotations from Ensembl for any species:
+
+ ```python
+ from pylocuszoom import LocusZoomPlotter
+
+ # Enable automatic gene fetching
+ plotter = LocusZoomPlotter(species="human", auto_genes=True)
+
+ # No need to provide genes_df - fetched automatically
+ fig = plotter.plot(gwas_df, chrom=13, start=32000000, end=33000000)
+ ```
+
+ Supported species aliases: `human`, `mouse`, `rat`, `canine`/`dog`, `feline`/`cat`, or any Ensembl species name.
+ Data is cached locally for fast subsequent plots. Maximum region size is 5Mb (Ensembl API limit).
+
  ## Backends

- pyLocusZoom supports multiple rendering backends:
+ pyLocusZoom supports multiple rendering backends (set at initialization):

  ```python
+ from pylocuszoom import LocusZoomPlotter
+
  # Static publication-quality plot (default)
- fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="matplotlib")
+ plotter = LocusZoomPlotter(species="canine", backend="matplotlib")
+ fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
  fig.savefig("plot.png", dpi=150)

  # Interactive Plotly (hover tooltips, pan/zoom)
- fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="plotly")
+ plotter = LocusZoomPlotter(species="canine", backend="plotly")
+ fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
  fig.write_html("plot.html")

  # Interactive Bokeh (dashboard-ready)
- fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="bokeh")
+ plotter = LocusZoomPlotter(species="canine", backend="bokeh")
+ fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
  ```

  | Backend | Output | Best For | Features |
  |---------|--------|----------|----------|
- | `matplotlib` | Static PNG/PDF/SVG | Publications, presentations | Full feature set with SNP labels |
- | `plotly` | Interactive HTML | Web reports, data exploration | Hover tooltips, pan/zoom |
- | `bokeh` | Interactive HTML | Dashboards, web apps | Hover tooltips, pan/zoom |
+ | `matplotlib` | Static PNG/PDF/SVG | Publication-ready figures | Full feature set with SNP labels |
+ | `plotly` | Interactive HTML | Web reports, exploration | Hover tooltips, pan/zoom |
+ | `bokeh` | Interactive HTML | Dashboard integration | Hover tooltips, pan/zoom |

  > **Note:** All backends support scatter plots, gene tracks, recombination overlay, and LD legend. SNP labels (auto-positioned with adjustText) are matplotlib-only; interactive backends use hover tooltips instead.

@@ -211,6 +236,10 @@ fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="bokeh"
  Compare multiple GWAS results vertically with shared x-axis:

  ```python
+ from pylocuszoom import LocusZoomPlotter
+
+ plotter = LocusZoomPlotter(species="canine")
+
  fig = plotter.plot_stacked(
      [gwas_height, gwas_bmi, gwas_whr],
      chrom=1,
@@ -221,22 +250,29 @@ fig = plotter.plot_stacked(
  )
  ```

- ![Example stacked plot](examples/stacked_plot.png)
+ ![Example stacked plot comparing two phenotypes](examples/stacked_plot.png)
+ *Stacked plot comparing two phenotypes with LD coloring and shared gene track.*

  ## eQTL Overlay

  Add expression QTL data as a separate panel:

  ```python
+ from pylocuszoom import LocusZoomPlotter
+
  eqtl_df = pd.DataFrame({
      "pos": [1000500, 1001200, 1002000],
      "p_value": [1e-6, 1e-4, 0.01],
      "gene": ["BRCA1", "BRCA1", "BRCA1"],
  })

+ plotter = LocusZoomPlotter(species="canine")
+
  fig = plotter.plot_stacked(
      [gwas_df],
-     chrom=1, start=1000000, end=2000000,
+     chrom=1,
+     start=1000000,
+     end=2000000,
      eqtl_df=eqtl_df,
      eqtl_gene="BRCA1",
      genes_df=genes_df,
@@ -244,21 +280,28 @@ fig = plotter.plot_stacked(
  )
  ```
  ![Example eQTL overlay plot](examples/eqtl_overlay.png)
+ *eQTL overlay with effect direction (up/down triangles) and magnitude binning.*

  ## Fine-mapping Visualization

  Visualize SuSiE or other fine-mapping results with credible set coloring:

  ```python
+ from pylocuszoom import LocusZoomPlotter
+
  finemapping_df = pd.DataFrame({
      "pos": [1000500, 1001200, 1002000, 1003500],
      "pip": [0.85, 0.12, 0.02, 0.45],  # Posterior inclusion probability
      "cs": [1, 1, 0, 2],  # Credible set assignment (0 = not in CS)
  })

+ plotter = LocusZoomPlotter(species="canine")
+
  fig = plotter.plot_stacked(
      [gwas_df],
-     chrom=1, start=1000000, end=2000000,
+     chrom=1,
+     start=1000000,
+     end=2000000,
      finemapping_df=finemapping_df,
      finemapping_cs_col="cs",
      genes_df=genes_df,
@@ -266,6 +309,7 @@ fig = plotter.plot_stacked(
  )
  ```
  ![Example fine-mapping plot](examples/finemapping_plot.png)
+ *Fine-mapping visualization with PIP line and credible set coloring (CS1/CS2).*

  ## PheWAS Plots

@@ -286,6 +330,7 @@ fig = plotter.plot_phewas(
  ```

  ![Example PheWAS plot](examples/phewas_plot.png)
+ *PheWAS plot showing associations across phenotype categories with significance threshold.*

  ## Forest Plots

@@ -308,19 +353,18 @@ fig = plotter.plot_forest(
  )
  ```
  ![Example forest plot](examples/forest_plot.png)
+ *Forest plot with effect sizes, confidence intervals, and weight-proportional markers.*

  ## PySpark Support

- For large-scale genomics data, pass PySpark DataFrames directly:
+ For large-scale genomics data, convert PySpark DataFrames with `to_pandas()` before plotting:

  ```python
  from pylocuszoom import LocusZoomPlotter, to_pandas

- # PySpark DataFrame (automatically converted)
- fig = plotter.plot(spark_gwas_df, chrom=1, start=1000000, end=2000000)
-
- # Or convert manually with sampling for very large data
+ # Convert PySpark DataFrame (optionally sampled for very large data)
  pandas_df = to_pandas(spark_gwas_df, sample_size=100000)
+ fig = plotter.plot(pandas_df, chrom=1, start=1000000, end=2000000)
  ```

  Install PySpark support: `uv add pylocuszoom[spark]`
@@ -393,7 +437,7 @@ gwas_df = pd.DataFrame({
  |--------|------|----------|-------------|
  | `chr` | str or int | Yes | Chromosome identifier. Accepts "1", "chr1", or 1. The "chr" prefix is stripped for matching. |
  | `start` | int | Yes | Gene start position (bp, 1-based). Transcript start for strand-aware genes. |
- | `end` | int | Yes | Gene end position (bp, 1-based). Must be start. |
+ | `end` | int | Yes | Gene end position (bp, 1-based). Must be >= start. |
  | `gene_name` | str | Yes | Gene symbol displayed in track (e.g., "BRCA1", "TP53"). Keep short for readability. |

  Example:
@@ -495,6 +539,7 @@ Optional:
  ## Documentation

  - [User Guide](docs/USER_GUIDE.md) - Comprehensive documentation with API reference
+ - [Code Map](docs/CODEMAP.md) - Architecture diagram with source code links
  - [Architecture](docs/ARCHITECTURE.md) - Design decisions and component overview
  - [Example Notebook](examples/getting_started.ipynb) - Interactive tutorial
  - [CHANGELOG](CHANGELOG.md) - Version history