pylocuszoom 0.8.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ """Manhattan plot data preparation and chromosome ordering."""
2
+
3
+ from typing import Literal
4
+
5
+ import colorcet as cc
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
# Common-name aliases, mapped to the canonical species keys used below.
SPECIES_ALIASES: dict[str, str] = dict(dog="canine", cat="feline")
14
+
15
# Built-in display order of chromosomes, keyed by canonical species name.
# Numbered chromosomes come first, followed by X, Y and MT.
CHROMOSOME_ORDERS: dict[str, list[str]] = {
    "canine": [*map(str, range(1, 39)), "X", "Y", "MT"],
    "feline": (
        "A1 A2 A3 "
        "B1 B2 B3 B4 "
        "C1 C2 "
        "D1 D2 D3 D4 "
        "E1 E2 E3 "
        "X Y MT"
    ).split(),
    "human": [*map(str, range(1, 23)), "X", "Y", "MT"],
}
41
+
42
+
43
+ def get_chromosome_order(
44
+ species: Literal["canine", "feline", "human", "dog", "cat"] | None = None,
45
+ custom_order: list[str] | None = None,
46
+ ) -> list[str]:
47
+ """Get chromosome order for a species.
48
+
49
+ Args:
50
+ species: Species name for built-in order. Supports aliases:
51
+ 'dog' -> 'canine', 'cat' -> 'feline'.
52
+ custom_order: Custom chromosome order (overrides species).
53
+
54
+ Returns:
55
+ List of chromosome names in display order.
56
+
57
+ Raises:
58
+ ValueError: If neither species nor custom_order provided,
59
+ or if species is unknown.
60
+ """
61
+ if custom_order is not None:
62
+ return custom_order
63
+ if species is not None:
64
+ # Resolve aliases
65
+ resolved_species = SPECIES_ALIASES.get(species, species)
66
+ if resolved_species not in CHROMOSOME_ORDERS:
67
+ raise ValueError(
68
+ f"Unknown species '{species}'. "
69
+ f"Use one of {list(CHROMOSOME_ORDERS.keys())} "
70
+ f"(or aliases: {list(SPECIES_ALIASES.keys())}) "
71
+ f"or provide custom_order."
72
+ )
73
+ return CHROMOSOME_ORDERS[resolved_species]
74
+ raise ValueError("Must provide either species or custom_order")
75
+
76
+
77
def get_chromosome_colors(n_chromosomes: int) -> list[str]:
    """Return one color per chromosome from a colorcet Glasbey palette.

    Colors are taken in order from ``cc.b_glasbey_bw_minc_20_maxl_70``;
    when more colors are requested than the palette holds, the palette
    is reused from its start.

    Args:
        n_chromosomes: How many colors to produce.

    Returns:
        List of ``n_chromosomes`` palette entries.
    """
    glasbey = cc.b_glasbey_bw_minc_20_maxl_70
    picked: list[str] = []
    for position in range(n_chromosomes):
        # Wrap around once the palette is exhausted.
        picked.append(glasbey[position % len(glasbey)])
    return picked
91
+
92
+
93
def prepare_manhattan_data(
    df: pd.DataFrame,
    chrom_col: str = "chrom",
    pos_col: str = "pos",
    p_col: str = "p",
    species: Literal["canine", "feline", "human", "dog", "cat"] | None = None,
    custom_order: list[str] | None = None,
) -> pd.DataFrame:
    """Prepare DataFrame for Manhattan plot rendering.

    Computes cumulative positions for x-axis and assigns chromosome colors.
    Chromosomes present in the data but absent from the requested order are
    appended after the ordered ones, sorted by name, and each receives its
    own index, offset and color.

    Args:
        df: GWAS results DataFrame. It is copied, not modified.
        chrom_col: Column name for chromosome.
        pos_col: Column name for position.
        p_col: Column name for p-value.
        species: Species for chromosome ordering.
        custom_order: Custom chromosome order.

    Returns:
        Copy of ``df`` sorted by chromosome index then position, with
        additional columns:
        - _chrom_str: Chromosome name as a string
        - _chrom_idx: Integer index into ``attrs["chrom_order"]``
        - _cumulative_pos: X-axis position
        - _neg_log_p: -log10(p-value), with p clipped below at 1e-300
        - _color: Color for chromosome
        ``attrs["chrom_order"]`` holds the full chromosome order and
        ``attrs["chrom_centers"]`` maps each chromosome that has data to the
        mean of its cumulative positions.

    Raises:
        ValueError: If a required column is missing, or if the chromosome
            order cannot be determined from species/custom_order.
    """
    # Validate required columns
    for col, name in [(chrom_col, "chrom"), (pos_col, "pos"), (p_col, "p")]:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame (for {name})")

    chrom_order = get_chromosome_order(species, custom_order)

    # Work on a copy so the caller's frame is untouched.
    result = df.copy()

    # Normalize chromosome names (handle int vs str)
    result["_chrom_str"] = result[chrom_col].astype(str)

    # Chromosomes outside the requested order go last, alphabetically.
    present = set(result["_chrom_str"])
    unknown_chroms = sorted(present - set(chrom_order))
    all_chroms = list(chrom_order) + unknown_chroms

    # Index against the full order so every chromosome, including unknown
    # ones, gets a distinct index that agrees with attrs["chrom_order"].
    chrom_to_idx = {chrom: i for i, chrom in enumerate(all_chroms)}
    result["_chrom_idx"] = result["_chrom_str"].map(chrom_to_idx).astype("int64")

    # Sort by chromosome index then position
    result = result.sort_values(["_chrom_idx", pos_col])

    # One grouped pass for the per-chromosome maximum position instead of
    # re-filtering the whole frame for each chromosome.
    max_pos = (
        result.groupby("_chrom_str", sort=False)[pos_col].max().to_dict()
        if present
        else {}
    )

    # Lay chromosomes end to end, separated by a 1Mb gap. Chromosomes
    # without data take no space on the axis.
    chrom_offsets = {}
    cumulative = 0
    for chrom in all_chroms:
        if chrom in max_pos:
            chrom_offsets[chrom] = cumulative
            cumulative += max_pos[chrom] + 1_000_000  # 1Mb gap

    # Vectorized offset lookup; every chromosome in the data has an offset.
    # Avoids row-wise apply, which is slow on large frames and yields a
    # DataFrame rather than a Series when the frame is empty.
    result["_cumulative_pos"] = (
        result["_chrom_str"].map(chrom_offsets) + result[pos_col]
    )

    # Clip before the log so p == 0 gives a finite value (300).
    result["_neg_log_p"] = -np.log10(result[p_col].clip(lower=1e-300))

    # Assign colors
    colors = get_chromosome_colors(len(all_chroms))
    chrom_to_color = dict(zip(all_chroms, colors))
    result["_color"] = result["_chrom_str"].map(chrom_to_color)

    # Chromosome centers for x-axis labels: mean cumulative position of the
    # points on each chromosome that has data.
    mean_pos = (
        result.groupby("_chrom_str", sort=False)["_cumulative_pos"].mean().to_dict()
        if present
        else {}
    )
    chrom_centers = {
        chrom: mean_pos[chrom] for chrom in all_chroms if chrom in mean_pos
    }

    result.attrs["chrom_centers"] = chrom_centers
    result.attrs["chrom_order"] = all_chroms

    return result
186
+
187
+
188
def prepare_categorical_data(
    df: pd.DataFrame,
    category_col: str,
    p_col: str = "p",
    category_order: list[str] | None = None,
) -> pd.DataFrame:
    """Prepare DataFrame for categorical Manhattan plot (PheWAS-style).

    Args:
        df: Results DataFrame with categories and p-values. It is copied,
            not modified.
        category_col: Column name for category.
        p_col: Column name for p-value.
        category_order: Custom category order. Entries are compared with the
            string form of the category values, so they are converted to
            strings. When omitted, the distinct non-null categories are
            used, sorted by their string form.

    Returns:
        Copy of ``df`` with additional columns:
        - _cat_str: Category as a string
        - _cat_idx: Index of the category in the order; categories not in
          the order get ``len(category_order)``
        - _x_pos: ``_cat_idx`` plus reproducible jitter in [-0.3, 0.3)
        - _neg_log_p: -log10(p-value), with p clipped below at 1e-300
        - _color: Color for the category (missing for categories not in
          the order)
        ``attrs["category_order"]`` holds the order used and
        ``attrs["category_centers"]`` maps each category to its index.

    Raises:
        ValueError: If ``category_col`` or ``p_col`` is not a column of df.
    """
    # Validate required columns
    if category_col not in df.columns:
        raise ValueError(f"Column '{category_col}' not found in DataFrame")
    if p_col not in df.columns:
        raise ValueError(f"Column '{p_col}' not found in DataFrame")

    result = df.copy()

    if category_order is None:
        # Stringify before sorting so mixed types sort safely; the set drops
        # values that collapse to the same string (e.g. 1 and "1").
        distinct = result[category_col].dropna().unique()
        category_order = sorted({str(value) for value in distinct})
    else:
        # Lookups below use string keys, so a non-string custom order would
        # otherwise never match any row.
        category_order = [str(cat) for cat in category_order]

    # Convert category column to string for consistent handling
    result["_cat_str"] = result[category_col].astype(str)

    # Map categories to index; anything not in the order goes past the end.
    cat_to_idx = {cat: i for i, cat in enumerate(category_order)}
    result["_cat_idx"] = (
        result["_cat_str"]
        .map(cat_to_idx)
        .fillna(len(category_order))
        .astype("int64")
    )

    # Jitter x positions so multiple points per category do not overlap.
    # A private generator seeded with 42 yields the same jitter as seeding
    # the global NumPy RNG, without resetting the caller's random state.
    rng = np.random.RandomState(42)
    result["_x_pos"] = result["_cat_idx"] + rng.uniform(-0.3, 0.3, size=len(result))

    # Clip before the log so p == 0 gives a finite value (300).
    result["_neg_log_p"] = -np.log10(result[p_col].clip(lower=1e-300))

    # Assign colors (use string values for lookup)
    colors = get_chromosome_colors(len(category_order))
    cat_to_color = dict(zip(category_order, colors))
    result["_color"] = result["_cat_str"].map(cat_to_color)

    result.attrs["category_order"] = category_order
    result.attrs["category_centers"] = dict(cat_to_idx)

    return result