pylocuszoom 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +34 -7
- pylocuszoom/backends/__init__.py +116 -17
- pylocuszoom/backends/base.py +363 -60
- pylocuszoom/backends/bokeh_backend.py +77 -15
- pylocuszoom/backends/hover.py +198 -0
- pylocuszoom/backends/matplotlib_backend.py +263 -3
- pylocuszoom/backends/plotly_backend.py +73 -16
- pylocuszoom/config.py +365 -0
- pylocuszoom/ensembl.py +476 -0
- pylocuszoom/eqtl.py +17 -25
- pylocuszoom/exceptions.py +33 -0
- pylocuszoom/finemapping.py +18 -32
- pylocuszoom/forest.py +10 -11
- pylocuszoom/gene_track.py +169 -142
- pylocuszoom/loaders.py +3 -1
- pylocuszoom/phewas.py +10 -11
- pylocuszoom/plotter.py +311 -277
- pylocuszoom/recombination.py +19 -3
- pylocuszoom/schemas.py +1 -6
- pylocuszoom/utils.py +54 -4
- pylocuszoom/validation.py +223 -0
- {pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/METADATA +82 -37
- pylocuszoom-1.0.0.dist-info/RECORD +31 -0
- pylocuszoom-0.6.0.dist-info/RECORD +0 -26
- {pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/WHEEL +0 -0
- {pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/recombination.py
CHANGED

@@ -18,6 +18,7 @@ from matplotlib.axes import Axes
 from tqdm import tqdm

 from .logging import logger
+from .utils import filter_by_region

 # Recombination overlay color
 RECOMB_COLOR = "#7FCDFF"  # Light blue

@@ -252,10 +253,20 @@ def download_canine_recombination_maps(

     logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")

-    # Extract tar.gz
+    # Extract tar.gz with path traversal protection
     logger.debug("Extracting genetic maps...")
     with tarfile.open(tar_path, "r:gz") as tar:
-        tar.extractall(tmpdir)
+        # Filter to prevent path traversal attacks
+        safe_members = []
+        for member in tar.getmembers():
+            # Resolve the path and ensure it stays within tmpdir
+            member_path = Path(tmpdir) / member.name
+            try:
+                member_path.resolve().relative_to(Path(tmpdir).resolve())
+                safe_members.append(member)
+            except ValueError:
+                logger.warning(f"Skipping unsafe path in archive: {member.name}")
+        tar.extractall(tmpdir, members=safe_members)

     # Find and process the extracted files
     extracted_dir = Path(tmpdir)

@@ -374,7 +385,12 @@ def get_recombination_rate_for_region(
     )

     # Filter to region
-    region_df =
+    region_df = filter_by_region(
+        df,
+        region=(chrom, start, end),
+        chrom_col="",  # Recomb maps don't have chromosome column
+        pos_col="pos",
+    )

     return region_df[["pos", "rate"]]

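The guard added above generalizes beyond this downloader. A minimal standalone sketch of the same pattern (the `extract_safely` wrapper is hypothetical, and `print` stands in for the package's logger):

```python
import tarfile
from pathlib import Path


def extract_safely(tar_path: str, dest_dir: str) -> None:
    """Extract a .tar.gz, skipping members that would escape dest_dir."""
    dest = Path(dest_dir).resolve()
    with tarfile.open(tar_path, "r:gz") as tar:
        safe_members = []
        for member in tar.getmembers():
            # relative_to() raises ValueError if the resolved path would
            # land outside dest (e.g. a "../../etc/passwd" member name)
            try:
                (Path(dest_dir) / member.name).resolve().relative_to(dest)
                safe_members.append(member)
            except ValueError:
                print(f"Skipping unsafe path in archive: {member.name}")
        tar.extractall(dest_dir, members=safe_members)
```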
pylocuszoom/schemas.py
CHANGED

@@ -10,12 +10,7 @@ from typing import Optional, Union
 import pandas as pd
 from pydantic import BaseModel, ConfigDict, field_validator, model_validator

-
-class LoaderValidationError(Exception):
-    """Raised when loaded data fails validation."""
-
-    pass
-
+from .exceptions import LoaderValidationError

 # =============================================================================
 # GWAS Validation
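Both this hunk and the utils.py hunk below move exception classes into the new pylocuszoom/exceptions.py (+33 lines, contents not shown in this diff). Judging only from the relocated definitions, a plausible sketch of the consolidated module (the real file may define more):

```python
"""Sketch of pylocuszoom/exceptions.py inferred from the relocated classes.

Docstrings are copied from the definitions removed in schemas.py and
utils.py; everything else about the module is an assumption.
"""


class ValidationError(ValueError):
    """Raised when input validation fails."""


class LoaderValidationError(Exception):
    """Raised when loaded data fails validation."""
```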
pylocuszoom/utils.py
CHANGED

@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union

 import pandas as pd

+from .exceptions import ValidationError
+
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame as SparkDataFrame


@@ -15,10 +17,6 @@ if TYPE_CHECKING:
 DataFrameLike = Union[pd.DataFrame, "SparkDataFrame", Any]


-class ValidationError(ValueError):
-    """Raised when input validation fails."""
-
-
 def is_spark_dataframe(df: Any) -> bool:
     """Check if object is a PySpark DataFrame.


@@ -106,6 +104,58 @@ def normalize_chrom(chrom: Union[int, str]) -> str:
     return str(chrom).replace("chr", "")


+def filter_by_region(
+    df: pd.DataFrame,
+    region: tuple,
+    chrom_col: str = "chrom",
+    pos_col: str = "pos",
+) -> pd.DataFrame:
+    """Filter DataFrame to genomic region with inclusive bounds.
+
+    Filters rows where position is within [start, end] (inclusive).
+    If chrom_col exists in DataFrame, also filters by chromosome.
+    Chromosome comparison normalizes types (int/str, chr prefix).
+
+    Args:
+        df: DataFrame to filter.
+        region: Tuple of (chrom, start, end) defining the region.
+        chrom_col: Column name for chromosome (default: "chrom").
+            If column doesn't exist, filters by position only.
+        pos_col: Column name for position (default: "pos").
+
+    Returns:
+        Filtered DataFrame (copy, not view).
+
+    Raises:
+        KeyError: If pos_col is not found in DataFrame.
+
+    Example:
+        >>> filtered = filter_by_region(df, region=(1, 1000000, 2000000))
+        >>> filtered = filter_by_region(df, region=("chr1", 1e6, 2e6), pos_col="position")
+    """
+    chrom, start, end = region
+
+    # Validate position column exists
+    if pos_col not in df.columns:
+        raise KeyError(
+            f"Position column '{pos_col}' not found in DataFrame. "
+            f"Available columns: {list(df.columns)}"
+        )
+
+    # Position filtering (inclusive bounds)
+    mask = (df[pos_col] >= start) & (df[pos_col] <= end)
+
+    # Chromosome filtering (if column exists)
+    if chrom_col in df.columns:
+        chrom_normalized = normalize_chrom(chrom)
+        df_chrom_normalized = (
+            df[chrom_col].astype(str).str.replace("chr", "", regex=False)
+        )
+        mask = mask & (df_chrom_normalized == chrom_normalized)
+
+    return df[mask].copy()
+
+
 def validate_dataframe(
     df: pd.DataFrame,
     required_cols: List[str],
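A quick usage sketch of the new helper on toy data (column values here are illustrative only):

```python
import pandas as pd

from pylocuszoom.utils import filter_by_region

df = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr2"],
    "pos": [1_200_000, 2_500_000, 1_500_000],
    "p": [1e-8, 0.03, 1e-4],
})

# Keeps only the chr1 row at 1,200,000: bounds are inclusive, the "chr"
# prefix is normalized away, and the result is a copy rather than a view.
region_df = filter_by_region(df, region=(1, 1_000_000, 2_000_000))
print(region_df)
```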
pylocuszoom/validation.py
ADDED

@@ -0,0 +1,223 @@
+"""DataFrame validation builder for pyLocusZoom.
+
+Provides a fluent API for validating pandas DataFrames with composable
+validation rules. Accumulates all validation errors before raising.
+"""
+
+from typing import List, Optional
+
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+
+from .utils import ValidationError
+
+
+class DataFrameValidator:
+    """Builder for composable DataFrame validation.
+
+    Validates DataFrames with method chaining and accumulates all errors
+    before raising. This enables clear, readable validation code with
+    comprehensive error messages.
+
+    Example:
+        >>> validator = DataFrameValidator(df, name="gwas_df")
+        >>> validator.require_columns(["chr", "pos", "p"])
+        ...     .require_numeric(["pos", "p"])
+        ...     .require_range("p", min_val=0, max_val=1)
+        ...     .validate()
+    """
+
+    def __init__(self, df: pd.DataFrame, name: str = "DataFrame"):
+        """Initialize validator.
+
+        Args:
+            df: DataFrame to validate.
+            name: Name for error messages (e.g., "gwas_df", "genes_df").
+        """
+        self._df = df
+        self._name = name
+        self._errors: List[str] = []
+
+    def require_columns(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that required columns exist in DataFrame.
+
+        Args:
+            columns: List of required column names.
+
+        Returns:
+            Self for method chaining.
+        """
+        if not columns:
+            return self
+
+        missing = [col for col in columns if col not in self._df.columns]
+        if missing:
+            available = list(self._df.columns)
+            self._errors.append(f"Missing columns: {missing}. Available: {available}")
+
+        return self
+
+    def require_numeric(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that columns have numeric dtype.
+
+        Skips columns that don't exist (checked separately by require_columns).
+
+        Args:
+            columns: List of column names that should be numeric.
+
+        Returns:
+            Self for method chaining.
+        """
+        for col in columns:
+            # Skip missing columns - let require_columns handle that
+            if col not in self._df.columns:
+                continue
+
+            if not is_numeric_dtype(self._df[col]):
+                actual_dtype = self._df[col].dtype
+                self._errors.append(
+                    f"Column '{col}' must be numeric, got {actual_dtype}"
+                )
+
+        return self
+
+    def require_range(
+        self,
+        column: str,
+        min_val: Optional[float] = None,
+        max_val: Optional[float] = None,
+        exclusive_min: bool = False,
+        exclusive_max: bool = False,
+    ) -> "DataFrameValidator":
+        """Check that column values are within specified range.
+
+        Args:
+            column: Column name to check.
+            min_val: Minimum allowed value (inclusive by default).
+            max_val: Maximum allowed value (inclusive by default).
+            exclusive_min: If True, minimum is exclusive (values must be > min_val).
+            exclusive_max: If True, maximum is exclusive (values must be < max_val).
+
+        Returns:
+            Self for method chaining.
+        """
+        # Skip missing columns
+        if column not in self._df.columns:
+            return self
+
+        col_data = self._df[column]
+
+        # Check minimum bound
+        if min_val is not None:
+            if exclusive_min:
+                invalid_count = (col_data <= min_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values <= {min_val}"
+                    )
+            else:
+                invalid_count = (col_data < min_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values < {min_val}"
+                    )
+
+        # Check maximum bound
+        if max_val is not None:
+            if exclusive_max:
+                invalid_count = (col_data >= max_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values >= {max_val}"
+                    )
+            else:
+                invalid_count = (col_data > max_val).sum()
+                if invalid_count > 0:
+                    self._errors.append(
+                        f"Column '{column}': {invalid_count} values > {max_val}"
+                    )
+
+        return self
+
+    def require_not_null(self, columns: List[str]) -> "DataFrameValidator":
+        """Check that columns have no null (NaN or None) values.
+
+        Args:
+            columns: List of column names to check for nulls.
+
+        Returns:
+            Self for method chaining.
+        """
+        for col in columns:
+            # Skip missing columns
+            if col not in self._df.columns:
+                continue
+
+            null_count = self._df[col].isna().sum()
+            if null_count > 0:
+                self._errors.append(f"Column '{col}' has {null_count} null values")
+
+        return self
+
+    def require_ci_ordering(
+        self,
+        ci_lower_col: str,
+        effect_col: str,
+        ci_upper_col: str,
+    ) -> "DataFrameValidator":
+        """Check that confidence intervals are properly ordered.
+
+        Validates that ci_lower <= effect <= ci_upper for all rows.
+        Invalid ordering would produce negative error bar lengths.
+
+        Args:
+            ci_lower_col: Column name for lower CI bound.
+            effect_col: Column name for effect size (point estimate).
+            ci_upper_col: Column name for upper CI bound.
+
+        Returns:
+            Self for method chaining.
+        """
+        # Skip if any column is missing
+        for col in [ci_lower_col, effect_col, ci_upper_col]:
+            if col not in self._df.columns:
+                return self
+
+        lower = self._df[ci_lower_col]
+        effect = self._df[effect_col]
+        upper = self._df[ci_upper_col]
+
+        # Check ci_lower <= effect
+        lower_gt_effect = (lower > effect).sum()
+        if lower_gt_effect > 0:
+            self._errors.append(
+                f"{lower_gt_effect} rows have {ci_lower_col} > {effect_col}"
+            )
+
+        # Check effect <= ci_upper
+        effect_gt_upper = (effect > upper).sum()
+        if effect_gt_upper > 0:
+            self._errors.append(
+                f"{effect_gt_upper} rows have {effect_col} > {ci_upper_col}"
+            )
+
+        # Check ci_lower <= ci_upper (implicit from above, but explicit is clearer)
+        lower_gt_upper = (lower > upper).sum()
+        if lower_gt_upper > 0:
+            self._errors.append(
+                f"{lower_gt_upper} rows have {ci_lower_col} > {ci_upper_col}"
+            )
+
+        return self
+
+    def validate(self) -> None:
+        """Raise ValidationError if any validation rules failed.
+
+        Raises:
+            ValidationError: If any validation errors were accumulated.
+                Error message includes all accumulated errors.
+        """
+        if self._errors:
+            error_msg = f"{self._name} validation failed:\n"
+            error_msg += "\n".join(f" - {error}" for error in self._errors)
+            raise ValidationError(error_msg)
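A usage sketch of the new builder on toy data (note that chaining across lines needs wrapping parentheses, unlike the doctest formatting in the class docstring):

```python
import pandas as pd

from pylocuszoom.validation import DataFrameValidator

gwas_df = pd.DataFrame({
    "chr": [1, 1, 2],
    "pos": [100, 200, 300],
    "p": [1e-8, 0.5, 1.5],  # 1.5 is outside [0, 1] and will be reported
})

# Rules accumulate errors; validate() raises a single ValidationError
# listing every failure, e.g. "Column 'p': 1 values > 1".
(
    DataFrameValidator(gwas_df, name="gwas_df")
    .require_columns(["chr", "pos", "p"])
    .require_numeric(["pos", "p"])
    .require_range("p", min_val=0, max_val=1)
    .validate()
)
```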
{pylocuszoom-0.6.0.dist-info → pylocuszoom-1.0.0.dist-info}/METADATA
CHANGED

@@ -1,15 +1,15 @@
 Metadata-Version: 2.4
 Name: pylocuszoom
-Version: 0.6.0
+Version: 1.0.0
 Summary: Publication-ready regional association plots with LD coloring, gene tracks, and recombination overlays
 Project-URL: Homepage, https://github.com/michael-denyer/pylocuszoom
 Project-URL: Documentation, https://github.com/michael-denyer/pylocuszoom#readme
 Project-URL: Repository, https://github.com/michael-denyer/pylocuszoom
-Author: Michael Denyer
+Author-email: Michael Denyer <code.denyer@gmail.com>
 License-Expression: GPL-3.0-or-later
 License-File: LICENSE.md
 Keywords: genetics,gwas,locus-zoom,locuszoom,regional-plot,visualization
-Classifier: Development Status ::
+Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
 Classifier: Programming Language :: Python :: 3

@@ -44,20 +44,18 @@ Requires-Dist: pyspark>=3.0.0; extra == 'spark'
 Description-Content-Type: text/markdown

 [](https://github.com/michael-denyer/pyLocusZoom/actions/workflows/ci.yml)
-[](https://codecov.io/gh/michael-denyer/pyLocusZoom)
 [](https://pypi.org/project/pylocuszoom/)
-[](https://anaconda.org/bioconda/pylocuszoom)
 [](https://www.gnu.org/licenses/gpl-3.0)
 [](https://www.python.org/downloads/)
 [](https://github.com/astral-sh/ruff)
 [](https://matplotlib.org/)
-[](https://plotly.com/python/)
 [](https://bokeh.org/)
 [](https://pandas.pydata.org/)
 <img src="logo.svg" alt="pyLocusZoom logo" width="120" align="right">
 # pyLocusZoom

-
+Designed for publication-ready GWAS visualization with regional association plots, gene tracks, eQTL, PheWAS, fine-mapping, and forest plots.

 Inspired by [LocusZoom](http://locuszoom.org/) and [locuszoomr](https://github.com/myles-lewis/locuszoomr).

@@ -68,20 +66,22 @@ Inspired by [LocusZoom](http://locuszoom.org/) and [locuszoomr](https://github.c
 - **Multi-species support**: Built-in reference data for *Canis lupus familiaris* (CanFam3.1/CanFam4) and *Felis catus* (FelCat9), or optionally provide your own for any species
 - **LD coloring**: SNPs colored by linkage disequilibrium (R²) with lead variant
 - **Gene tracks**: Annotated gene/exon positions below the association plot
-- **Recombination rate**:
-- **SNP labels (matplotlib)**: Automatic labeling of
-- **
+- **Recombination rate**: Optional overlay across region (*Canis lupus familiaris* built-in, not shown in example image)
+- **SNP labels (matplotlib)**: Automatic labeling of top SNPs by p-value (RS IDs)
+- **Hover tooltips (Plotly and Bokeh)**: Detailed SNP data on hover

-
+
+*Regional association plot with LD coloring, gene/exon track, and top SNP labels (recombination overlay disabled in example).*

 2. **Stacked plots**: Compare multiple GWAS/phenotypes vertically
 3. **eQTL plot**: Expression QTL data aligned with association plots and gene tracks
 4. **Fine-mapping plots**: Visualize SuSiE credible sets with posterior inclusion probabilities
 5. **PheWAS plots**: Phenome-wide association study visualization across multiple phenotypes
 6. **Forest plots**: Meta-analysis effect size visualization with confidence intervals
-7. **Multiple
+7. **Multiple backends**: matplotlib (publication-ready), plotly (interactive), bokeh (dashboard integration)
 8. **Pandas and PySpark support**: Works with both Pandas and PySpark DataFrames for large-scale genomics data
 9. **Convenience data file loaders**: Load and validate common GWAS, eQTL and fine-mapping file formats
+10. **Automatic gene annotations**: Fetch gene/exon data from Ensembl REST API with caching (human, mouse, rat, canine, feline, and any Ensembl species)

 ## Installation

@@ -109,15 +109,14 @@ from pylocuszoom import LocusZoomPlotter
 # Initialize plotter (loads reference data for canine)
 plotter = LocusZoomPlotter(species="canine")

-#
+# Plot with parameters passed directly
 fig = plotter.plot(
-    gwas_df,
+    gwas_df,  # DataFrame with ps, p_wald, rs columns
     chrom=1,
     start=1000000,
     end=2000000,
-    lead_pos=1500000,
+    lead_pos=1500000,  # Highlight lead SNP
 )
-
 fig.savefig("regional_plot.png", dpi=150)
 ```

@@ -137,9 +136,7 @@ fig = plotter.plot(
     start=1000000,
     end=2000000,
     lead_pos=1500000,
-    ld_reference_file="genotypes
-    genes_df=genes_df,  # Gene annotations
-    exons_df=exons_df,  # Exon annotations
+    ld_reference_file="genotypes",  # PLINK fileset (without extension)
     show_recombination=True,  # Overlay recombination rate
     snp_labels=True,  # Label top SNPs
     label_top_n=5,  # How many to label

@@ -147,6 +144,8 @@ fig = plotter.plot(
     p_col="p_wald",  # Column name for p-value
     rs_col="rs",  # Column name for SNP ID
     figsize=(12, 8),
+    genes_df=genes_df,  # Gene annotations
+    exons_df=exons_df,  # Exon annotations
 )
 ```

@@ -163,6 +162,8 @@ Recombination maps are automatically lifted over from CanFam3.1 to CanFam4 coordinates
 ## Using with Other Species

 ```python
+from pylocuszoom import LocusZoomPlotter
+
 # Feline (LD and gene tracks, user provides recombination data)
 plotter = LocusZoomPlotter(species="feline")

@@ -172,37 +173,61 @@ plotter = LocusZoomPlotter(
     recomb_data_dir="/path/to/recomb_maps/",
 )

-#
+# Provide data per-plot
 fig = plotter.plot(
     gwas_df,
-    chrom=1,
+    chrom=1,
+    start=1000000,
+    end=2000000,
     recomb_df=my_recomb_dataframe,
     genes_df=my_genes_df,
 )
 ```

+## Automatic Gene Annotations
+
+pyLocusZoom can automatically fetch gene annotations from Ensembl for any species:
+
+```python
+from pylocuszoom import LocusZoomPlotter
+
+# Enable automatic gene fetching
+plotter = LocusZoomPlotter(species="human", auto_genes=True)
+
+# No need to provide genes_df - fetched automatically
+fig = plotter.plot(gwas_df, chrom=13, start=32000000, end=33000000)
+```
+
+Supported species aliases: `human`, `mouse`, `rat`, `canine`/`dog`, `feline`/`cat`, or any Ensembl species name.
+Data is cached locally for fast subsequent plots. Maximum region size is 5Mb (Ensembl API limit).
+
 ## Backends

-pyLocusZoom supports multiple rendering backends:
+pyLocusZoom supports multiple rendering backends (set at initialization):

 ```python
+from pylocuszoom import LocusZoomPlotter
+
 # Static publication-quality plot (default)
-fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="matplotlib")
+plotter = LocusZoomPlotter(species="canine", backend="matplotlib")
+fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
 fig.savefig("plot.png", dpi=150)

 # Interactive Plotly (hover tooltips, pan/zoom)
-fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="plotly")
+plotter = LocusZoomPlotter(species="canine", backend="plotly")
+fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
 fig.write_html("plot.html")

 # Interactive Bokeh (dashboard-ready)
-fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="bokeh")
+plotter = LocusZoomPlotter(species="canine", backend="bokeh")
+fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000)
 ```

 | Backend | Output | Best For | Features |
 |---------|--------|----------|----------|
-| `matplotlib` | Static PNG/PDF/SVG |
-| `plotly` | Interactive HTML | Web reports,
-| `bokeh` | Interactive HTML |
+| `matplotlib` | Static PNG/PDF/SVG | Publication-ready figures | Full feature set with SNP labels |
+| `plotly` | Interactive HTML | Web reports, exploration | Hover tooltips, pan/zoom |
+| `bokeh` | Interactive HTML | Dashboard integration | Hover tooltips, pan/zoom |

 > **Note:** All backends support scatter plots, gene tracks, recombination overlay, and LD legend. SNP labels (auto-positioned with adjustText) are matplotlib-only; interactive backends use hover tooltips instead.

@@ -211,6 +236,10 @@ fig = plotter.plot(gwas_df, chrom=1, start=1000000, end=2000000, backend="bokeh"
 Compare multiple GWAS results vertically with shared x-axis:

 ```python
+from pylocuszoom import LocusZoomPlotter
+
+plotter = LocusZoomPlotter(species="canine")
+
 fig = plotter.plot_stacked(
     [gwas_height, gwas_bmi, gwas_whr],
     chrom=1,

@@ -221,22 +250,29 @@ fig = plotter.plot_stacked(
 )
 ```

-
+
+*Stacked plot comparing two phenotypes with LD coloring and shared gene track.*

 ## eQTL Overlay

 Add expression QTL data as a separate panel:

 ```python
+from pylocuszoom import LocusZoomPlotter
+
 eqtl_df = pd.DataFrame({
     "pos": [1000500, 1001200, 1002000],
     "p_value": [1e-6, 1e-4, 0.01],
     "gene": ["BRCA1", "BRCA1", "BRCA1"],
 })

+plotter = LocusZoomPlotter(species="canine")
+
 fig = plotter.plot_stacked(
     [gwas_df],
-    chrom=1,
+    chrom=1,
+    start=1000000,
+    end=2000000,
     eqtl_df=eqtl_df,
     eqtl_gene="BRCA1",
     genes_df=genes_df,

@@ -244,21 +280,28 @@ fig = plotter.plot_stacked(
 ```

 
+*eQTL overlay with effect direction (up/down triangles) and magnitude binning.*

 ## Fine-mapping Visualization

 Visualize SuSiE or other fine-mapping results with credible set coloring:

 ```python
+from pylocuszoom import LocusZoomPlotter
+
 finemapping_df = pd.DataFrame({
     "pos": [1000500, 1001200, 1002000, 1003500],
     "pip": [0.85, 0.12, 0.02, 0.45],  # Posterior inclusion probability
     "cs": [1, 1, 0, 2],  # Credible set assignment (0 = not in CS)
 })

+plotter = LocusZoomPlotter(species="canine")
+
 fig = plotter.plot_stacked(
     [gwas_df],
-    chrom=1,
+    chrom=1,
+    start=1000000,
+    end=2000000,
     finemapping_df=finemapping_df,
     finemapping_cs_col="cs",
     genes_df=genes_df,

@@ -266,6 +309,7 @@ fig = plotter.plot_stacked(
 ```

 
+*Fine-mapping visualization with PIP line and credible set coloring (CS1/CS2).*

 ## PheWAS Plots

@@ -286,6 +330,7 @@ fig = plotter.plot_phewas(
 ```

 
+*PheWAS plot showing associations across phenotype categories with significance threshold.*

 ## Forest Plots

@@ -308,19 +353,18 @@ fig = plotter.plot_forest(
 ```

 
+*Forest plot with effect sizes, confidence intervals, and weight-proportional markers.*

 ## PySpark Support

-For large-scale genomics data,
+For large-scale genomics data, convert PySpark DataFrames with `to_pandas()` before plotting:

 ```python
 from pylocuszoom import LocusZoomPlotter, to_pandas

-# PySpark DataFrame (
-fig = plotter.plot(spark_gwas_df, chrom=1, start=1000000, end=2000000)
-
-# Or convert manually with sampling for very large data
+# Convert PySpark DataFrame (optionally sampled for very large data)
 pandas_df = to_pandas(spark_gwas_df, sample_size=100000)
+fig = plotter.plot(pandas_df, chrom=1, start=1000000, end=2000000)
 ```

 Install PySpark support: `uv add pylocuszoom[spark]`

@@ -393,7 +437,7 @@ gwas_df = pd.DataFrame({
 |--------|------|----------|-------------|
 | `chr` | str or int | Yes | Chromosome identifier. Accepts "1", "chr1", or 1. The "chr" prefix is stripped for matching. |
 | `start` | int | Yes | Gene start position (bp, 1-based). Transcript start for strand-aware genes. |
-| `end` | int | Yes | Gene end position (bp, 1-based). Must be
+| `end` | int | Yes | Gene end position (bp, 1-based). Must be >= start. |
 | `gene_name` | str | Yes | Gene symbol displayed in track (e.g., "BRCA1", "TP53"). Keep short for readability. |

 Example:

@@ -495,6 +539,7 @@ Optional:
 ## Documentation

 - [User Guide](docs/USER_GUIDE.md) - Comprehensive documentation with API reference
+- [Code Map](docs/CODEMAP.md) - Architecture diagram with source code links
 - [Architecture](docs/ARCHITECTURE.md) - Design decisions and component overview
 - [Example Notebook](examples/getting_started.ipynb) - Interactive tutorial
 - [CHANGELOG](CHANGELOG.md) - Version history