pylocuszoom 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/ensembl.py CHANGED
@@ -18,7 +18,7 @@ import pandas as pd
18
18
  import requests
19
19
 
20
20
  from .logging import logger
21
- from .utils import ValidationError
21
+ from .utils import ValidationError, normalize_chrom
22
22
 
23
23
  # Ensembl API limits regions to 5Mb
24
24
  ENSEMBL_MAX_REGION_SIZE = 5_000_000
@@ -47,11 +47,6 @@ ENSEMBL_MAX_RETRIES = 3
47
47
  ENSEMBL_RETRY_DELAY = 1.0 # seconds, doubles on each retry
48
48
 
49
49
 
50
- def _normalize_chrom(chrom: str | int) -> str:
51
- """Normalize chromosome name by removing 'chr' prefix."""
52
- return str(chrom).replace("chr", "")
53
-
54
-
55
50
  def _validate_region_size(start: int, end: int, context: str) -> None:
56
51
  """Validate region size is within Ensembl API limits.
57
52
 
@@ -129,7 +124,7 @@ def get_cached_genes(
129
124
  DataFrame if cache hit, None if cache miss.
130
125
  """
131
126
  ensembl_species = get_ensembl_species_name(species)
132
- chrom_str = _normalize_chrom(chrom)
127
+ chrom_str = normalize_chrom(chrom)
133
128
  cache_key = _cache_key(ensembl_species, chrom_str, start, end)
134
129
 
135
130
  species_dir = cache_dir / ensembl_species
@@ -161,7 +156,7 @@ def save_cached_genes(
161
156
  end: Region end position.
162
157
  """
163
158
  ensembl_species = get_ensembl_species_name(species)
164
- chrom_str = _normalize_chrom(chrom)
159
+ chrom_str = normalize_chrom(chrom)
165
160
  cache_key = _cache_key(ensembl_species, chrom_str, start, end)
166
161
 
167
162
  species_dir = cache_dir / ensembl_species
@@ -266,7 +261,7 @@ def fetch_genes_from_ensembl(
266
261
  _validate_region_size(start, end, "genes_df")
267
262
 
268
263
  ensembl_species = get_ensembl_species_name(species)
269
- chrom_str = _normalize_chrom(chrom)
264
+ chrom_str = normalize_chrom(chrom)
270
265
 
271
266
  # Build region string
272
267
  region = f"{chrom_str}:{start}-{end}"
@@ -334,7 +329,7 @@ def fetch_exons_from_ensembl(
334
329
  _validate_region_size(start, end, "exons_df")
335
330
 
336
331
  ensembl_species = get_ensembl_species_name(species)
337
- chrom_str = _normalize_chrom(chrom)
332
+ chrom_str = normalize_chrom(chrom)
338
333
  region = f"{chrom_str}:{start}-{end}"
339
334
 
340
335
  url = f"{ENSEMBL_REST_URL}/overlap/region/{ensembl_species}/{region}"
@@ -408,7 +403,7 @@ def get_genes_for_region(
408
403
  if cache_dir is None:
409
404
  cache_dir = get_ensembl_cache_dir()
410
405
 
411
- chrom_str = _normalize_chrom(chrom)
406
+ chrom_str = normalize_chrom(chrom)
412
407
 
413
408
  # Check cache first
414
409
  if use_cache:
pylocuszoom/gene_track.py CHANGED
@@ -175,17 +175,6 @@ def _draw_strand_arrows_matplotlib(
175
175
  gene_start, gene_end, region_width, strand
176
176
  )
177
177
 
178
- # Draw connecting line between arrow centers
179
- if len(arrow_tip_positions) > 1:
180
- ax.plot(
181
- [arrow_tip_positions[0], arrow_tip_positions[-1]],
182
- [y_gene, y_gene],
183
- color=arrow_color,
184
- linewidth=1.0,
185
- zorder=4,
186
- solid_capstyle="butt",
187
- )
188
-
189
178
  for tip_x in arrow_tip_positions:
190
179
  if strand == "+":
191
180
  base_x = tip_x - tri_width
@@ -224,17 +213,6 @@ def _draw_strand_arrows_generic(
224
213
  gene_start, gene_end, region_width, strand
225
214
  )
226
215
 
227
- # Draw connecting line between arrow centers
228
- if len(arrow_tip_positions) > 1:
229
- backend.line(
230
- ax,
231
- x=pd.Series([arrow_tip_positions[0], arrow_tip_positions[-1]]),
232
- y=pd.Series([y_gene, y_gene]),
233
- color=arrow_color,
234
- linewidth=1.0,
235
- zorder=4,
236
- )
237
-
238
216
  for tip_x in arrow_tip_positions:
239
217
  if strand == "+":
240
218
  base_x = tip_x - tri_width
@@ -406,7 +384,7 @@ def plot_gene_track(
406
384
  gene_name,
407
385
  ha="center",
408
386
  va="bottom",
409
- fontsize=7,
387
+ fontsize=9,
410
388
  color="#000000",
411
389
  fontweight="medium",
412
390
  style="italic",
@@ -553,7 +531,7 @@ def plot_gene_track_generic(
553
531
  label_pos,
554
532
  y_label,
555
533
  gene_name,
556
- fontsize=7,
534
+ fontsize=9,
557
535
  ha="center",
558
536
  va="bottom",
559
537
  color="#000000",
pylocuszoom/labels.py CHANGED
@@ -11,6 +11,8 @@ import pandas as pd
11
11
  from matplotlib.axes import Axes
12
12
  from matplotlib.text import Annotation
13
13
 
14
+ from pylocuszoom.logging import logger
15
+
14
16
 
15
17
  def add_snp_labels(
16
18
  ax: Axes,
@@ -111,7 +113,9 @@ def add_snp_labels(
111
113
  expand_points=(1.5, 1.5),
112
114
  )
113
115
  except ImportError:
114
- # adjustText not installed, labels may overlap
115
- pass
116
+ logger.warning(
117
+ "adjustText not installed - SNP labels may overlap. "
118
+ "Install with: pip install adjustText"
119
+ )
116
120
 
117
121
  return texts
@@ -0,0 +1,246 @@
1
+ """Manhattan plot data preparation and chromosome ordering."""
2
+
3
+ from typing import Literal
4
+
5
+ import colorcet as cc
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
# Species aliases: colloquial names mapped onto the canonical keys of
# CHROMOSOME_ORDERS.
SPECIES_ALIASES: dict[str, str] = {
    "dog": "canine",
    "cat": "feline",
}

# Chromosome display orders for supported species.
# Names are written without a "chr" prefix.
CHROMOSOME_ORDERS: dict[str, list[str]] = {
    # Dog: autosomes 1-38, then sex chromosomes and mitochondrion.
    "canine": [str(i) for i in range(1, 39)] + ["X", "Y", "MT"],
    # Cat: autosomes are named by karyotype group (A1 .. F2) rather than
    # numbered. The F group has to be listed explicitly; any chromosome that
    # is absent from this list is handled as "unknown" and drawn after MT.
    "feline": [
        "A1",
        "A2",
        "A3",
        "B1",
        "B2",
        "B3",
        "B4",
        "C1",
        "C2",
        "D1",
        "D2",
        "D3",
        "D4",
        "E1",
        "E2",
        "E3",
        "F1",
        "F2",
        "X",
        "Y",
        "MT",
    ],
    # Human: autosomes 1-22, then sex chromosomes and mitochondrion.
    "human": [str(i) for i in range(1, 23)] + ["X", "Y", "MT"],
}
41
+
42
+
43
+ def get_chromosome_order(
44
+ species: Literal["canine", "feline", "human", "dog", "cat"] | None = None,
45
+ custom_order: list[str] | None = None,
46
+ ) -> list[str]:
47
+ """Get chromosome order for a species.
48
+
49
+ Args:
50
+ species: Species name for built-in order. Supports aliases:
51
+ 'dog' -> 'canine', 'cat' -> 'feline'.
52
+ custom_order: Custom chromosome order (overrides species).
53
+
54
+ Returns:
55
+ List of chromosome names in display order.
56
+
57
+ Raises:
58
+ ValueError: If neither species nor custom_order provided,
59
+ or if species is unknown.
60
+ """
61
+ if custom_order is not None:
62
+ return custom_order
63
+ if species is not None:
64
+ # Resolve aliases
65
+ resolved_species = SPECIES_ALIASES.get(species, species)
66
+ if resolved_species not in CHROMOSOME_ORDERS:
67
+ raise ValueError(
68
+ f"Unknown species '{species}'. "
69
+ f"Use one of {list(CHROMOSOME_ORDERS.keys())} "
70
+ f"(or aliases: {list(SPECIES_ALIASES.keys())}) "
71
+ f"or provide custom_order."
72
+ )
73
+ return CHROMOSOME_ORDERS[resolved_species]
74
+ raise ValueError("Must provide either species or custom_order")
75
+
76
+
77
def get_chromosome_colors(n_chromosomes: int) -> list[str]:
    """Return one color per chromosome from a categorical colorcet palette.

    Colors are taken in palette order from colorcet's
    ``b_glasbey_bw_minc_20_maxl_70`` list. When more colors are requested
    than the palette holds, selection wraps around to the start.

    Args:
        n_chromosomes: Number of chromosomes to color.

    Returns:
        List of hex color strings, one per requested chromosome.
    """
    palette = cc.b_glasbey_bw_minc_20_maxl_70
    palette_size = len(palette)

    colors = []
    for index in range(n_chromosomes):
        # Modulo keeps the lookup in range once the palette is exhausted.
        colors.append(palette[index % palette_size])
    return colors
91
+
92
+
93
def prepare_manhattan_data(
    df: pd.DataFrame,
    chrom_col: str = "chrom",
    pos_col: str = "pos",
    p_col: str = "p",
    species: Literal["canine", "feline", "human", "dog", "cat"] | None = None,
    custom_order: list[str] | None = None,
) -> pd.DataFrame:
    """Prepare DataFrame for Manhattan plot rendering.

    Computes cumulative positions for x-axis and assigns chromosome colors.
    The input frame is not modified; a sorted copy is returned.

    Args:
        df: GWAS results DataFrame.
        chrom_col: Column name for chromosome.
        pos_col: Column name for position.
        p_col: Column name for p-value.
        species: Species for chromosome ordering.
        custom_order: Custom chromosome order.

    Returns:
        DataFrame sorted by chromosome then position, with additional columns:
        - _chrom_str: Chromosome name as a string
        - _chrom_idx: Integer index for chromosome in the requested order;
          every chromosome missing from that order gets ``len(order)``
        - _cumulative_pos: X-axis position
        - _neg_log_p: -log10(p-value), with p clipped below at 1e-300
        - _color: Hex color for chromosome

        ``attrs["chrom_order"]`` holds the requested order followed by any
        unknown chromosomes (sorted by name); ``attrs["chrom_centers"]`` maps
        each chromosome present in the data to the mean of its cumulative
        positions.

    Raises:
        ValueError: If a required column is missing, or if the chromosome
            order cannot be resolved from ``species`` / ``custom_order``.
    """
    # Validate required columns
    for col, name in [(chrom_col, "chrom"), (pos_col, "pos"), (p_col, "p")]:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame (for {name})")

    chrom_order = list(get_chromosome_order(species, custom_order))

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()

    # Normalize chromosome names (handle int vs str)
    result["_chrom_str"] = result[chrom_col].astype(str)

    # Map chromosomes to their order index; unknown ones share the index
    # one past the end of the requested order.
    chrom_to_idx = {chrom: i for i, chrom in enumerate(chrom_order)}
    unknown_idx = len(chrom_order)
    result["_chrom_idx"] = result["_chrom_str"].map(
        lambda name: chrom_to_idx.get(name, unknown_idx)
    )

    # Sort by chromosome index, then name, then position. The name is the
    # tie-breaker that keeps rows of different unknown chromosomes (which all
    # share one index) grouped together instead of interleaved by position.
    result = result.sort_values(["_chrom_idx", "_chrom_str", pos_col])

    # Chromosomes present in the data but absent from the requested order are
    # appended after it, sorted by name.
    unknown_chroms = set(result["_chrom_str"]) - set(chrom_order)
    all_chroms = chrom_order + sorted(unknown_chroms)

    # One pass over the data for the per-chromosome maximum position.
    max_pos = result.groupby("_chrom_str", sort=False)[pos_col].max()

    # Lay chromosomes end to end along the x-axis with a 1Mb gap between them;
    # chromosomes without any rows take up no space.
    chrom_offsets = {}
    cumulative = 0
    for chrom in all_chroms:
        if chrom in max_pos.index:
            chrom_offsets[chrom] = cumulative
            cumulative += max_pos[chrom] + 1_000_000  # 1Mb gap

    # Every chromosome present in the data received an offset above, so this
    # vectorized lookup never produces a missing value. It also works on an
    # empty frame, where a row-wise apply would not yield a single column.
    result["_cumulative_pos"] = (
        result["_chrom_str"].map(chrom_offsets) + result[pos_col]
    )

    # Calculate -log10(p); the clip keeps p == 0 from producing infinity.
    result["_neg_log_p"] = -np.log10(result[p_col].clip(lower=1e-300))

    # Assign colors by position in the full chromosome list
    colors = get_chromosome_colors(len(all_chroms))
    chrom_to_color = {chrom: colors[i] for i, chrom in enumerate(all_chroms)}
    result["_color"] = result["_chrom_str"].map(chrom_to_color)

    # Chromosome centers for x-axis labels (mean cumulative position)
    mean_pos = result.groupby("_chrom_str", sort=False)["_cumulative_pos"].mean()
    chrom_centers = {
        chrom: mean_pos[chrom] for chrom in all_chroms if chrom in mean_pos.index
    }

    result.attrs["chrom_centers"] = chrom_centers
    result.attrs["chrom_order"] = all_chroms

    return result
186
+
187
+
188
def prepare_categorical_data(
    df: pd.DataFrame,
    category_col: str,
    p_col: str = "p",
    category_order: list[str] | None = None,
) -> pd.DataFrame:
    """Prepare DataFrame for categorical Manhattan plot (PheWAS-style).

    The input frame is not modified and the global NumPy random state is
    left untouched; jitter comes from a private, fixed-seed generator.

    Args:
        df: Results DataFrame with categories and p-values.
        category_col: Column name for category.
        p_col: Column name for p-value.
        category_order: Custom category order. Entries are compared as
            strings. Defaults to the sorted unique non-missing values.

    Returns:
        Copy of ``df`` with additional columns:
        - _cat_str: Category as a string
        - _cat_idx: Index of the category in the order; categories missing
          from the order get ``len(order)``
        - _x_pos: ``_cat_idx`` plus uniform jitter in [-0.3, 0.3)
        - _neg_log_p: -log10(p-value), with p clipped below at 1e-300
        - _color: Color for the category (missing for categories that are
          not in the order)

        ``attrs["category_order"]`` holds the order used and
        ``attrs["category_centers"]`` maps each category to its index.

    Raises:
        ValueError: If ``category_col`` or ``p_col`` is not in ``df``.
    """
    # Validate required columns
    if category_col not in df.columns:
        raise ValueError(f"Column '{category_col}' not found in DataFrame")
    if p_col not in df.columns:
        raise ValueError(f"Column '{p_col}' not found in DataFrame")

    result = df.copy()

    if category_order is None:
        # Unique non-missing values, stringified so mixed types sort safely.
        unique_vals = result[category_col].dropna().unique()
        category_order = sorted(str(v) for v in unique_vals)
    else:
        # Lookups below use string keys, so a caller-supplied order is
        # stringified as well (this also avoids aliasing the caller's list).
        category_order = [str(cat) for cat in category_order]

    # Convert category column to string for consistent handling
    result["_cat_str"] = result[category_col].astype(str)

    # Map categories to index; anything not in the order lands one past
    # the end.
    cat_to_idx = {cat: i for i, cat in enumerate(category_order)}
    fallback_idx = len(category_order)
    result["_cat_idx"] = result["_cat_str"].map(
        lambda cat: cat_to_idx.get(cat, fallback_idx)
    )

    # Jitter points horizontally within their category. A local generator
    # seeded with 42 yields the same stream as seeding the global state
    # would, without reseeding the caller's RNG as a side effect.
    rng = np.random.RandomState(42)
    result["_x_pos"] = result["_cat_idx"] + rng.uniform(-0.3, 0.3, size=len(result))

    # Calculate -log10(p); the clip keeps p == 0 from producing infinity.
    result["_neg_log_p"] = -np.log10(result[p_col].clip(lower=1e-300))

    # Assign colors (use string values for lookup)
    colors = get_chromosome_colors(len(category_order))
    cat_to_color = {cat: colors[i] for i, cat in enumerate(category_order)}
    result["_color"] = result["_cat_str"].map(cat_to_color)

    result.attrs["category_order"] = category_order
    result.attrs["category_centers"] = dict(cat_to_idx)

    return result