pylocuszoom 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +9 -1
- pylocuszoom/_plotter_utils.py +66 -0
- pylocuszoom/backends/base.py +56 -0
- pylocuszoom/backends/bokeh_backend.py +141 -29
- pylocuszoom/backends/matplotlib_backend.py +60 -0
- pylocuszoom/backends/plotly_backend.py +297 -88
- pylocuszoom/ensembl.py +6 -11
- pylocuszoom/gene_track.py +2 -24
- pylocuszoom/labels.py +6 -2
- pylocuszoom/manhattan.py +246 -0
- pylocuszoom/manhattan_plotter.py +760 -0
- pylocuszoom/plotter.py +236 -270
- pylocuszoom/qq.py +123 -0
- pylocuszoom/recombination.py +7 -7
- pylocuszoom/stats_plotter.py +319 -0
- {pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.1.dist-info}/METADATA +130 -20
- pylocuszoom-1.1.1.dist-info/RECORD +36 -0
- pylocuszoom-1.0.0.dist-info/RECORD +0 -31
- {pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.1.dist-info}/WHEEL +0 -0
- {pylocuszoom-1.0.0.dist-info → pylocuszoom-1.1.1.dist-info}/licenses/LICENSE.md +0 -0
pylocuszoom/ensembl.py
CHANGED
|
@@ -18,7 +18,7 @@ import pandas as pd
|
|
|
18
18
|
import requests
|
|
19
19
|
|
|
20
20
|
from .logging import logger
|
|
21
|
-
from .utils import ValidationError
|
|
21
|
+
from .utils import ValidationError, normalize_chrom
|
|
22
22
|
|
|
23
23
|
# Ensembl API limits regions to 5Mb
|
|
24
24
|
ENSEMBL_MAX_REGION_SIZE = 5_000_000
|
|
@@ -47,11 +47,6 @@ ENSEMBL_MAX_RETRIES = 3
|
|
|
47
47
|
ENSEMBL_RETRY_DELAY = 1.0 # seconds, doubles on each retry
|
|
48
48
|
|
|
49
49
|
|
|
50
|
-
def _normalize_chrom(chrom: str | int) -> str:
|
|
51
|
-
"""Normalize chromosome name by removing 'chr' prefix."""
|
|
52
|
-
return str(chrom).replace("chr", "")
|
|
53
|
-
|
|
54
|
-
|
|
55
50
|
def _validate_region_size(start: int, end: int, context: str) -> None:
|
|
56
51
|
"""Validate region size is within Ensembl API limits.
|
|
57
52
|
|
|
@@ -129,7 +124,7 @@ def get_cached_genes(
|
|
|
129
124
|
DataFrame if cache hit, None if cache miss.
|
|
130
125
|
"""
|
|
131
126
|
ensembl_species = get_ensembl_species_name(species)
|
|
132
|
-
chrom_str =
|
|
127
|
+
chrom_str = normalize_chrom(chrom)
|
|
133
128
|
cache_key = _cache_key(ensembl_species, chrom_str, start, end)
|
|
134
129
|
|
|
135
130
|
species_dir = cache_dir / ensembl_species
|
|
@@ -161,7 +156,7 @@ def save_cached_genes(
|
|
|
161
156
|
end: Region end position.
|
|
162
157
|
"""
|
|
163
158
|
ensembl_species = get_ensembl_species_name(species)
|
|
164
|
-
chrom_str =
|
|
159
|
+
chrom_str = normalize_chrom(chrom)
|
|
165
160
|
cache_key = _cache_key(ensembl_species, chrom_str, start, end)
|
|
166
161
|
|
|
167
162
|
species_dir = cache_dir / ensembl_species
|
|
@@ -266,7 +261,7 @@ def fetch_genes_from_ensembl(
|
|
|
266
261
|
_validate_region_size(start, end, "genes_df")
|
|
267
262
|
|
|
268
263
|
ensembl_species = get_ensembl_species_name(species)
|
|
269
|
-
chrom_str =
|
|
264
|
+
chrom_str = normalize_chrom(chrom)
|
|
270
265
|
|
|
271
266
|
# Build region string
|
|
272
267
|
region = f"{chrom_str}:{start}-{end}"
|
|
@@ -334,7 +329,7 @@ def fetch_exons_from_ensembl(
|
|
|
334
329
|
_validate_region_size(start, end, "exons_df")
|
|
335
330
|
|
|
336
331
|
ensembl_species = get_ensembl_species_name(species)
|
|
337
|
-
chrom_str =
|
|
332
|
+
chrom_str = normalize_chrom(chrom)
|
|
338
333
|
region = f"{chrom_str}:{start}-{end}"
|
|
339
334
|
|
|
340
335
|
url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
|
|
@@ -408,7 +403,7 @@ def get_genes_for_region(
|
|
|
408
403
|
if cache_dir is None:
|
|
409
404
|
cache_dir = get_ensembl_cache_dir()
|
|
410
405
|
|
|
411
|
-
chrom_str =
|
|
406
|
+
chrom_str = normalize_chrom(chrom)
|
|
412
407
|
|
|
413
408
|
# Check cache first
|
|
414
409
|
if use_cache:
|
pylocuszoom/gene_track.py
CHANGED
|
@@ -175,17 +175,6 @@ def _draw_strand_arrows_matplotlib(
|
|
|
175
175
|
gene_start, gene_end, region_width, strand
|
|
176
176
|
)
|
|
177
177
|
|
|
178
|
-
# Draw connecting line between arrow centers
|
|
179
|
-
if len(arrow_tip_positions) > 1:
|
|
180
|
-
ax.plot(
|
|
181
|
-
[arrow_tip_positions[0], arrow_tip_positions[-1]],
|
|
182
|
-
[y_gene, y_gene],
|
|
183
|
-
color=arrow_color,
|
|
184
|
-
linewidth=1.0,
|
|
185
|
-
zorder=4,
|
|
186
|
-
solid_capstyle="butt",
|
|
187
|
-
)
|
|
188
|
-
|
|
189
178
|
for tip_x in arrow_tip_positions:
|
|
190
179
|
if strand == "+":
|
|
191
180
|
base_x = tip_x - tri_width
|
|
@@ -224,17 +213,6 @@ def _draw_strand_arrows_generic(
|
|
|
224
213
|
gene_start, gene_end, region_width, strand
|
|
225
214
|
)
|
|
226
215
|
|
|
227
|
-
# Draw connecting line between arrow centers
|
|
228
|
-
if len(arrow_tip_positions) > 1:
|
|
229
|
-
backend.line(
|
|
230
|
-
ax,
|
|
231
|
-
x=pd.Series([arrow_tip_positions[0], arrow_tip_positions[-1]]),
|
|
232
|
-
y=pd.Series([y_gene, y_gene]),
|
|
233
|
-
color=arrow_color,
|
|
234
|
-
linewidth=1.0,
|
|
235
|
-
zorder=4,
|
|
236
|
-
)
|
|
237
|
-
|
|
238
216
|
for tip_x in arrow_tip_positions:
|
|
239
217
|
if strand == "+":
|
|
240
218
|
base_x = tip_x - tri_width
|
|
@@ -406,7 +384,7 @@ def plot_gene_track(
|
|
|
406
384
|
gene_name,
|
|
407
385
|
ha="center",
|
|
408
386
|
va="bottom",
|
|
409
|
-
fontsize=
|
|
387
|
+
fontsize=9,
|
|
410
388
|
color="#000000",
|
|
411
389
|
fontweight="medium",
|
|
412
390
|
style="italic",
|
|
@@ -553,7 +531,7 @@ def plot_gene_track_generic(
|
|
|
553
531
|
label_pos,
|
|
554
532
|
y_label,
|
|
555
533
|
gene_name,
|
|
556
|
-
fontsize=
|
|
534
|
+
fontsize=9,
|
|
557
535
|
ha="center",
|
|
558
536
|
va="bottom",
|
|
559
537
|
color="#000000",
|
pylocuszoom/labels.py
CHANGED
|
@@ -11,6 +11,8 @@ import pandas as pd
|
|
|
11
11
|
from matplotlib.axes import Axes
|
|
12
12
|
from matplotlib.text import Annotation
|
|
13
13
|
|
|
14
|
+
from pylocuszoom.logging import logger
|
|
15
|
+
|
|
14
16
|
|
|
15
17
|
def add_snp_labels(
|
|
16
18
|
ax: Axes,
|
|
@@ -111,7 +113,9 @@ def add_snp_labels(
|
|
|
111
113
|
expand_points=(1.5, 1.5),
|
|
112
114
|
)
|
|
113
115
|
except ImportError:
|
|
114
|
-
|
|
115
|
-
|
|
116
|
+
logger.warning(
|
|
117
|
+
"adjustText not installed - SNP labels may overlap. "
|
|
118
|
+
"Install with: pip install adjustText"
|
|
119
|
+
)
|
|
116
120
|
|
|
117
121
|
return texts
|
pylocuszoom/manhattan.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""Manhattan plot data preparation and chromosome ordering."""
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
import colorcet as cc
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
# Species aliases
|
|
10
|
+
SPECIES_ALIASES: dict[str, str] = {
|
|
11
|
+
"dog": "canine",
|
|
12
|
+
"cat": "feline",
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
# Chromosome orders for supported species
|
|
16
|
+
CHROMOSOME_ORDERS: dict[str, list[str]] = {
|
|
17
|
+
"canine": [str(i) for i in range(1, 39)] + ["X", "Y", "MT"],
|
|
18
|
+
"feline": [
|
|
19
|
+
"A1",
|
|
20
|
+
"A2",
|
|
21
|
+
"A3",
|
|
22
|
+
"B1",
|
|
23
|
+
"B2",
|
|
24
|
+
"B3",
|
|
25
|
+
"B4",
|
|
26
|
+
"C1",
|
|
27
|
+
"C2",
|
|
28
|
+
"D1",
|
|
29
|
+
"D2",
|
|
30
|
+
"D3",
|
|
31
|
+
"D4",
|
|
32
|
+
"E1",
|
|
33
|
+
"E2",
|
|
34
|
+
"E3",
|
|
35
|
+
"X",
|
|
36
|
+
"Y",
|
|
37
|
+
"MT",
|
|
38
|
+
],
|
|
39
|
+
"human": [str(i) for i in range(1, 23)] + ["X", "Y", "MT"],
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_chromosome_order(
|
|
44
|
+
species: Literal["canine", "feline", "human", "dog", "cat"] | None = None,
|
|
45
|
+
custom_order: list[str] | None = None,
|
|
46
|
+
) -> list[str]:
|
|
47
|
+
"""Get chromosome order for a species.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
species: Species name for built-in order. Supports aliases:
|
|
51
|
+
'dog' -> 'canine', 'cat' -> 'feline'.
|
|
52
|
+
custom_order: Custom chromosome order (overrides species).
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
List of chromosome names in display order.
|
|
56
|
+
|
|
57
|
+
Raises:
|
|
58
|
+
ValueError: If neither species nor custom_order provided,
|
|
59
|
+
or if species is unknown.
|
|
60
|
+
"""
|
|
61
|
+
if custom_order is not None:
|
|
62
|
+
return custom_order
|
|
63
|
+
if species is not None:
|
|
64
|
+
# Resolve aliases
|
|
65
|
+
resolved_species = SPECIES_ALIASES.get(species, species)
|
|
66
|
+
if resolved_species not in CHROMOSOME_ORDERS:
|
|
67
|
+
raise ValueError(
|
|
68
|
+
f"Unknown species '{species}'. "
|
|
69
|
+
f"Use one of {list(CHROMOSOME_ORDERS.keys())} "
|
|
70
|
+
f"(or aliases: {list(SPECIES_ALIASES.keys())}) "
|
|
71
|
+
f"or provide custom_order."
|
|
72
|
+
)
|
|
73
|
+
return CHROMOSOME_ORDERS[resolved_species]
|
|
74
|
+
raise ValueError("Must provide either species or custom_order")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_chromosome_colors(n_chromosomes: int) -> list[str]:
    """Return one color per chromosome from a categorical palette.

    Colors come from the colorcet glasbey_dark palette, chosen for good
    visual separation with saturated colors. They are taken in palette
    order and wrap around to the start of the palette when more colors
    are requested than the palette holds.

    Args:
        n_chromosomes: Number of chromosomes to color.

    Returns:
        List of hex color strings.
    """
    palette = cc.b_glasbey_bw_minc_20_maxl_70
    palette_size = len(palette)

    chosen: list[str] = []
    for position in range(n_chromosomes):
        # Modulo makes the palette cycle once it is exhausted.
        chosen.append(palette[position % palette_size])
    return chosen
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def prepare_manhattan_data(
    df: pd.DataFrame,
    chrom_col: str = "chrom",
    pos_col: str = "pos",
    p_col: str = "p",
    species: Literal["canine", "feline", "human", "dog", "cat"] | None = None,
    custom_order: list[str] | None = None,
) -> pd.DataFrame:
    """Prepare DataFrame for Manhattan plot rendering.

    Computes cumulative positions for x-axis and assigns chromosome colors.
    Chromosomes are laid out left to right in display order, each starting
    where the previous one's largest position ended plus a 1 Mb gap.
    Chromosomes present in the data but absent from the requested order
    are appended after the ordered ones, sorted by name.

    Args:
        df: GWAS results DataFrame. It is not modified; a copy is returned.
        chrom_col: Column name for chromosome.
        pos_col: Column name for position.
        p_col: Column name for p-value.
        species: Species for chromosome ordering.
        custom_order: Custom chromosome order.

    Returns:
        DataFrame sorted by chromosome then position, with additional columns:
        - _chrom_str: Chromosome name as a string
        - _chrom_idx: Integer index for chromosome (its position in
          ``attrs["chrom_order"]``)
        - _cumulative_pos: X-axis position
        - _neg_log_p: -log10(p-value)
        - _color: Hex color for chromosome

        ``attrs["chrom_order"]`` holds the full display order (including
        chromosomes not in the requested order) and ``attrs["chrom_centers"]``
        maps each chromosome present in the data to the mean of its
        cumulative positions, for x-axis labels.

    Raises:
        ValueError: If a required column is missing, or if the chromosome
            order cannot be determined from species/custom_order.
    """
    # Validate required columns
    for col, name in [(chrom_col, "chrom"), (pos_col, "pos"), (p_col, "p")]:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame (for {name})")

    # Chromosome names are compared as strings below, so stringify the order
    # too; otherwise e.g. custom_order=[1, 2] would never match the data.
    chrom_order = [str(chrom) for chrom in get_chromosome_order(species, custom_order)]

    # Create working copy
    result = df.copy()

    # Normalize chromosome names (handle int vs str)
    result["_chrom_str"] = result[chrom_col].astype(str)

    # Chromosomes missing from the requested order go last, sorted by name.
    unknown_chroms = sorted(set(result["_chrom_str"]) - set(chrom_order))
    all_chroms = chrom_order + unknown_chroms

    # Every chromosome gets its own index. Sharing a single index for all
    # unknown chromosomes would interleave their rows when sorting by position.
    chrom_to_idx = {chrom: i for i, chrom in enumerate(all_chroms)}
    result["_chrom_idx"] = result["_chrom_str"].map(chrom_to_idx).astype("int64")

    # Sort by chromosome index then position
    result = result.sort_values(["_chrom_idx", pos_col])

    # Largest position per chromosome, computed in a single pass.
    max_pos = result.groupby("_chrom_str", sort=False)[pos_col].max().to_dict()

    # Walk the display order, stacking chromosomes end to end.
    chrom_offsets = {}
    cumulative = 0
    for chrom in all_chroms:
        if chrom in max_pos:
            chrom_offsets[chrom] = cumulative
            cumulative += max_pos[chrom] + 1_000_000  # 1Mb gap

    # Vectorised offset lookup; also works when the frame is empty, where a
    # row-wise apply would not yield a single column.
    result["_cumulative_pos"] = result["_chrom_str"].map(chrom_offsets) + result[pos_col]

    # Calculate -log10(p); clip so p == 0 does not produce infinity.
    result["_neg_log_p"] = -np.log10(result[p_col].clip(lower=1e-300))

    # Assign colors
    colors = get_chromosome_colors(len(all_chroms))
    chrom_to_color = dict(zip(all_chroms, colors))
    result["_color"] = result["_chrom_str"].map(chrom_to_color)

    # Calculate chromosome centers for x-axis labels
    mean_pos = (
        result.groupby("_chrom_str", sort=False)["_cumulative_pos"].mean().to_dict()
    )
    chrom_centers = {
        chrom: mean_pos[chrom] for chrom in all_chroms if chrom in mean_pos
    }

    result.attrs["chrom_centers"] = chrom_centers
    result.attrs["chrom_order"] = all_chroms

    return result
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def prepare_categorical_data(
    df: pd.DataFrame,
    category_col: str,
    p_col: str = "p",
    category_order: list[str] | None = None,
) -> pd.DataFrame:
    """Prepare DataFrame for categorical Manhattan plot (PheWAS-style).

    Each category occupies one integer slot on the x-axis; points are
    spread within +/-0.3 of their slot using a fixed-seed jitter so the
    layout is reproducible. The global NumPy random state is left untouched.

    Args:
        df: Results DataFrame with categories and p-values. It is not
            modified; a copy is returned.
        category_col: Column name for category.
        p_col: Column name for p-value.
        category_order: Custom category order. Entries are compared to the
            data as strings. When omitted, the distinct non-null categories
            are used, sorted by their string form.

    Returns:
        DataFrame with additional columns for plotting:
        - _cat_str: Category as a string
        - _cat_idx: Integer index of the category in the order; categories
          not in the order get ``len(category_order)``
        - _x_pos: Jittered x-axis position
        - _neg_log_p: -log10(p-value)
        - _color: Hex color for the category (missing for categories not
          in the order)

        ``attrs["category_order"]`` holds the order used and
        ``attrs["category_centers"]`` maps each category to its x-axis slot.

    Raises:
        ValueError: If ``category_col`` or ``p_col`` is not a column of ``df``.
    """
    # Validate required columns
    if category_col not in df.columns:
        raise ValueError(f"Column '{category_col}' not found in DataFrame")
    if p_col not in df.columns:
        raise ValueError(f"Column '{p_col}' not found in DataFrame")

    result = df.copy()

    if category_order is None:
        # Stringify before sorting so mixed types sort safely; the set drops
        # duplicates that arise when distinct values share a string form.
        unique_vals = result[category_col].dropna().unique()
        category_order = sorted({str(v) for v in unique_vals})
    else:
        # Copy (never alias the caller's list) and stringify so lookups
        # against the stringified category column can match.
        category_order = [str(cat) for cat in category_order]

    # Convert category column to string for consistent handling
    result["_cat_str"] = result[category_col].astype(str)

    # Map categories to index; anything outside the order lands just past the end.
    cat_to_idx = {cat: i for i, cat in enumerate(category_order)}
    overflow_idx = len(category_order)
    result["_cat_idx"] = result["_cat_str"].map(
        lambda cat: cat_to_idx.get(cat, overflow_idx)
    )

    # Reproducible jitter from a private generator, so the caller's global
    # NumPy random stream is not reseeded as a side effect.
    rng = np.random.RandomState(42)
    result["_x_pos"] = result["_cat_idx"] + rng.uniform(-0.3, 0.3, size=len(result))

    # Calculate -log10(p); clip so p == 0 does not produce infinity.
    result["_neg_log_p"] = -np.log10(result[p_col].clip(lower=1e-300))

    # Assign colors (use string values for lookup)
    colors = get_chromosome_colors(len(category_order))
    cat_to_color = dict(zip(category_order, colors))
    result["_color"] = result["_cat_str"].map(cat_to_color)

    result.attrs["category_order"] = category_order
    result.attrs["category_centers"] = {cat: i for i, cat in enumerate(category_order)}

    return result
|