pylocuszoom 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pylocuszoom/__init__.py +120 -0
- pylocuszoom/backends/__init__.py +52 -0
- pylocuszoom/backends/base.py +341 -0
- pylocuszoom/backends/bokeh_backend.py +441 -0
- pylocuszoom/backends/matplotlib_backend.py +288 -0
- pylocuszoom/backends/plotly_backend.py +474 -0
- pylocuszoom/colors.py +107 -0
- pylocuszoom/eqtl.py +218 -0
- pylocuszoom/gene_track.py +311 -0
- pylocuszoom/labels.py +118 -0
- pylocuszoom/ld.py +209 -0
- pylocuszoom/logging.py +153 -0
- pylocuszoom/plotter.py +733 -0
- pylocuszoom/recombination.py +432 -0
- pylocuszoom/reference_data/__init__.py +4 -0
- pylocuszoom/utils.py +194 -0
- pylocuszoom-0.1.0.dist-info/METADATA +367 -0
- pylocuszoom-0.1.0.dist-info/RECORD +20 -0
- pylocuszoom-0.1.0.dist-info/WHEEL +4 -0
- pylocuszoom-0.1.0.dist-info/licenses/LICENSE.md +17 -0

pylocuszoom/recombination.py
ADDED

@@ -0,0 +1,432 @@
"""Recombination rate overlay and data management.

Provides:
- Recombination rate overlay for regional plots
- Download and loading of species-specific recombination maps
- Liftover support for CanFam3.1 to CanFam4 coordinate conversion
"""

import os
import tarfile
import tempfile
import urllib.request
from pathlib import Path
from typing import Optional, Union

import pandas as pd
from matplotlib.axes import Axes

from .logging import logger

# Recombination overlay color
RECOMB_COLOR = "#7FCDFF"  # Light blue

# Data sources by species
DOG_RECOMB_URL = (
    "https://github.com/cflerin/dog_recombination/raw/master/dog_genetic_maps.tar.gz"
)

# Liftover chain files
CANFAM3_TO_CANFAM4_CHAIN_URL = "https://hgdownload.soe.ucsc.edu/gbdb/canFam3/liftOver/canFam3ToCanFam4.over.chain.gz"


def _normalize_build(build: Optional[str]) -> Optional[str]:
    """Normalize a genome build name to canonical form.

    Args:
        build: Build name (e.g., "canfam4", "CanFam4.0", "UU_Cfam_GSD_1.0").

    Returns:
        Normalized build name ("canfam3" or "canfam4"), or None if not
        specified. Unrecognized builds are returned lowercased.
    """
    if build is None:
        return None
    build_lower = build.lower().replace(".", "").replace("_", "")
    if "canfam4" in build_lower or "uucfamgsd" in build_lower:
        return "canfam4"
    if "canfam3" in build_lower:
        return "canfam3"
    return build.lower()
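
# Editor's note: normalization is substring-based after stripping dots and
# underscores; a few illustrative calls:
#
#     _normalize_build("UU_Cfam_GSD_1.0")  # -> "canfam4"
#     _normalize_build("CanFam3.1")        # -> "canfam3"
#     _normalize_build(None)               # -> None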


def get_chain_file_path() -> Path:
    """Get the path to the CanFam3-to-CanFam4 liftover chain file."""
    return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"


def download_liftover_chain(force: bool = False) -> Path:
    """Download the CanFam3-to-CanFam4 liftover chain file from UCSC.

    Args:
        force: Re-download even if the file exists.

    Returns:
        Path to the downloaded chain file.
    """
    chain_path = get_chain_file_path()

    if chain_path.exists() and not force:
        return chain_path

    chain_path.parent.mkdir(parents=True, exist_ok=True)

    logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
    logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")

    try:
        urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
    except Exception as e:
        logger.debug(f"urllib download failed: {e}")
        # Fall back to requests, which handles redirects and proxies more robustly
        try:
            import requests

            response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
            response.raise_for_status()
            chain_path.write_bytes(response.content)
        except ImportError:
            raise RuntimeError(
                "Failed to download chain file. Install requests for the "
                "fallback downloader: pip install requests"
            ) from e

    logger.info(f"Chain file saved to: {chain_path}")
    return chain_path


def liftover_recombination_map(
    recomb_df: pd.DataFrame,
    from_build: str = "canfam3",
    to_build: str = "canfam4",
    chrom: Optional[int] = None,
) -> pd.DataFrame:
    """Liftover recombination map coordinates between genome builds.

    Args:
        recomb_df: DataFrame with 'pos' column (and optionally 'chr').
        from_build: Source genome build (default: canfam3).
        to_build: Target genome build (default: canfam4).
        chrom: Chromosome number (required if 'chr' not in recomb_df).

    Returns:
        DataFrame with lifted coordinates, sorted by position. Positions
        that fail to map are dropped.
    """
    from pyliftover import LiftOver

    # Download chain file if needed
    chain_path = download_liftover_chain()

    logger.debug(f"Lifting over coordinates from {from_build} to {to_build}")
    lo = LiftOver(str(chain_path))

    # Get chromosome for each position
    if "chr" in recomb_df.columns:
        chroms = recomb_df["chr"].astype(str)
    elif chrom is not None:
        chroms = pd.Series([str(chrom)] * len(recomb_df))
    else:
        raise ValueError("Either 'chr' column or chrom parameter required")

    # Liftover each position
    new_positions = []
    keep_mask = []

    for chr_val, pos in zip(chroms, recomb_df["pos"]):
        chr_str = f"chr{chr_val}" if not str(chr_val).startswith("chr") else chr_val
        result = lo.convert_coordinate(chr_str, int(pos))

        if result and len(result) > 0:
            # Take first mapping (usually the only one)
            _, new_pos, _, _ = result[0]
            new_positions.append(int(new_pos))
            keep_mask.append(True)
        else:
            new_positions.append(None)
            keep_mask.append(False)

    # Create output DataFrame; restore the integer dtype once the None
    # placeholders for unmapped positions have been dropped
    result_df = recomb_df.copy()
    result_df["pos"] = new_positions
    result_df = result_df[keep_mask].copy()
    result_df["pos"] = result_df["pos"].astype(int)

    unmapped = len(recomb_df) - len(result_df)
    if unmapped > 0:
        logger.debug(f"Dropped {unmapped} positions that failed to liftover")

    return result_df.sort_values("pos").reset_index(drop=True)
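
# Editor's note: a minimal usage sketch (made-up coordinates; requires
# pyliftover and a network connection for the first chain download):
#
#     import pandas as pd
#     recomb = pd.DataFrame({"pos": [1_000_000, 2_500_000], "rate": [0.4, 1.1]})
#     lifted = liftover_recombination_map(recomb, chrom=1)
#     # 'pos' is now in CanFam4 coordinates; unmapped rows are dropped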


def get_default_data_dir() -> Path:
    """Get the default directory for recombination map data.

    Returns a 'recombination_maps' subdirectory of the platform cache:
    - macOS/Linux: $XDG_CACHE_HOME/snp-scope-plot or ~/.cache/snp-scope-plot
    - Windows: %LOCALAPPDATA%/snp-scope-plot
    - Databricks: /dbfs/FileStore/reference_data/recombination_maps
    """
    if os.name == "nt":  # Windows
        base = Path(os.environ.get("LOCALAPPDATA", Path.home()))
    elif os.path.exists("/dbfs"):  # Databricks
        return Path("/dbfs/FileStore/reference_data/recombination_maps")
    else:
        # macOS and Linux
        xdg_cache = os.environ.get("XDG_CACHE_HOME")
        if xdg_cache:
            base = Path(xdg_cache)
        else:
            base = Path.home() / ".cache"

    return base / "snp-scope-plot" / "recombination_maps"


def download_dog_recombination_maps(
    output_dir: Optional[str] = None,
    force: bool = False,
) -> Path:
    """Download dog recombination rate maps from Campbell et al. 2016.

    Downloads from: https://github.com/cflerin/dog_recombination

    Data is in CanFam3.1 coordinates with columns:
    - chr: Chromosome number
    - pos: Physical position (bp)
    - rate: Recombination rate (cM/Mb)
    - cM: Cumulative genetic distance (centiMorgans)

    Args:
        output_dir: Directory to save maps. Uses platform cache if None.
        force: Re-download even if files exist.

    Returns:
        Path to the directory containing recombination map files.
    """
    # Determine output directory
    if output_dir is None:
        output_path = get_default_data_dir()
    else:
        output_path = Path(output_dir)

    # Check if already downloaded (at least the 38 dog autosomes;
    # the X map may or may not be present)
    if output_path.exists() and not force:
        existing_files = list(output_path.glob("chr*_recomb.tsv"))
        if len(existing_files) >= 38:
            return output_path

    # Create output directory
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info("Downloading dog recombination maps from GitHub...")
    logger.debug(f"Source: {DOG_RECOMB_URL}")

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download tar.gz file
        tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"

        try:
            urllib.request.urlretrieve(DOG_RECOMB_URL, tar_path)
        except Exception as e:
            logger.debug(f"urllib download failed: {e}")
            logger.debug("Trying alternative method with requests...")
            try:
                import requests

                response = requests.get(DOG_RECOMB_URL, timeout=60)
                response.raise_for_status()
                tar_path.write_bytes(response.content)
            except ImportError:
                raise RuntimeError(
                    "Failed to download. Install requests for the fallback "
                    "downloader: pip install requests"
                ) from e

        logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")

        # Extract tar.gz (the archive comes from a fixed, trusted URL)
        logger.debug("Extracting genetic maps...")
        with tarfile.open(tar_path, "r:gz") as tar:
            tar.extractall(tmpdir)

        # Find and process the extracted files
        extracted_dir = Path(tmpdir)

        # Look for genetic map files (may be in a subdirectory)
        map_files = list(extracted_dir.rglob("chr*.txt"))
        if not map_files:
            map_files = list(extracted_dir.rglob("*chr*.tsv"))

        if not map_files:
            all_files = list(extracted_dir.rglob("*"))
            logger.error(f"Extracted files: {[f.name for f in all_files[:20]]}")
            raise RuntimeError("Could not find chromosome map files in archive")

        logger.debug(f"Found {len(map_files)} chromosome files")

        # Copy and rename files to the chr<N>_recomb.tsv convention
        for map_file in map_files:
            name = map_file.stem
            if "chr" in name.lower():
                chrom = name.lower().split("chr")[-1].split("_")[0].split(".")[0]
                output_file = output_path / f"chr{chrom}_recomb.tsv"

                with open(map_file, "r") as f:
                    content = f.read()

                # Ensure header is present
                lines = content.strip().split("\n")
                if not lines[0].startswith("chr") and not lines[0].startswith("pos"):
                    content = "chr\tpos\trate\tcM\n" + content

                with open(output_file, "w") as f:
                    f.write(content)

    logger.info(f"Recombination maps saved to: {output_path}")
    return output_path


def load_recombination_map(
    chrom: Union[int, str],
    species: str = "dog",
    data_dir: Optional[str] = None,
) -> pd.DataFrame:
    """Load the recombination map for a specific chromosome.

    Args:
        chrom: Chromosome number (1-38 for dog, 1-18 for cat) or 'X'.
        species: Species name ('dog', 'cat').
        data_dir: Directory containing recombination maps.

    Returns:
        DataFrame with columns: pos, rate, cM.

    Raises:
        FileNotFoundError: If the map file is not found.
    """
    if data_dir is None:
        data_dir = get_default_data_dir()

    data_path = Path(data_dir)
    chrom_str = str(chrom).replace("chr", "")
    map_file = data_path / f"chr{chrom_str}_recomb.tsv"

    if not map_file.exists():
        raise FileNotFoundError(
            f"Recombination map not found: {map_file}\n"
            f"Run download_{species}_recombination_maps() first to download the data."
        )

    df = pd.read_csv(map_file, sep="\t")

    # Ensure numeric columns
    df["pos"] = pd.to_numeric(df["pos"], errors="coerce")
    df["rate"] = pd.to_numeric(df["rate"], errors="coerce")
    if "cM" in df.columns:
        df["cM"] = pd.to_numeric(df["cM"], errors="coerce")

    return df.dropna(subset=["pos", "rate"])
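
# Editor's note: a minimal sketch of the download-then-load flow (cache
# location depends on the platform; see get_default_data_dir):
#
#     download_dog_recombination_maps()      # one-time download of per-chromosome TSVs
#     chr1_map = load_recombination_map(1)   # columns: pos, rate, cM
#     chrx_map = load_recombination_map("X", species="dog")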


def get_recombination_rate_for_region(
    chrom: int,
    start: int,
    end: int,
    species: str = "dog",
    data_dir: Optional[str] = None,
    genome_build: Optional[str] = None,
) -> pd.DataFrame:
    """Get recombination rate data for a genomic region.

    Args:
        chrom: Chromosome number.
        start: Start position (bp).
        end: End position (bp).
        species: Species name ('dog', 'cat').
        data_dir: Directory containing recombination maps.
        genome_build: Target genome build (e.g., "canfam4"). If specified and
            different from the source data (CanFam3.1), coordinates are
            lifted over.

    Returns:
        DataFrame with pos and rate columns for the region.

    Note:
        Built-in dog recombination maps are in CanFam3.1 coordinates.
        If genome_build="canfam4", positions are automatically lifted over.
        This requires pyliftover: pip install pyliftover
    """
    df = load_recombination_map(chrom, species=species, data_dir=data_dir)

    # Liftover if needed
    build = _normalize_build(genome_build)
    if species == "dog" and build == "canfam4":
        logger.debug(f"Lifting over recombination map for chr{chrom} to CanFam4")
        df = liftover_recombination_map(
            df, from_build="canfam3", to_build="canfam4", chrom=chrom
        )

    # Filter to region
    region_df = df[(df["pos"] >= start) & (df["pos"] <= end)].copy()

    return region_df[["pos", "rate"]]
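
# Editor's note: a sketch of fetching region rates in CanFam4 coordinates
# (chromosome and positions are illustrative only):
#
#     rates = get_recombination_rate_for_region(
#         chrom=1, start=50_000_000, end=51_000_000, genome_build="canfam4"
#     )
#     # rates has 'pos' (bp, CanFam4) and 'rate' (cM/Mb) columns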


def add_recombination_overlay(
    ax: Axes,
    recomb_df: pd.DataFrame,
    start: int,
    end: int,
) -> Optional[Axes]:
    """Add recombination rate as a secondary y-axis overlay.

    Plots recombination rate (cM/Mb) as a light blue line on a
    secondary y-axis, styled to match LocusZoom.

    Args:
        ax: Primary matplotlib axes object.
        recomb_df: DataFrame with 'pos' and 'rate' columns.
        start: Region start position.
        end: Region end position.

    Returns:
        Secondary axes object for recombination rate, or None if no data.
    """
    # Create secondary y-axis
    recomb_ax = ax.twinx()

    # Filter to region
    region_recomb = recomb_df[
        (recomb_df["pos"] >= start) & (recomb_df["pos"] <= end)
    ].copy()

    if region_recomb.empty:
        recomb_ax.set_visible(False)
        return None

    # Plot recombination rate as light blue line
    recomb_ax.plot(
        region_recomb["pos"],
        region_recomb["rate"],
        color=RECOMB_COLOR,
        linewidth=1.5,
        alpha=0.7,
        zorder=0,  # Behind scatter points
    )

    # Fill under curve
    recomb_ax.fill_between(
        region_recomb["pos"],
        0,
        region_recomb["rate"],
        color=RECOMB_COLOR,
        alpha=0.15,
        zorder=0,
    )

    # Format secondary axis
    recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color=RECOMB_COLOR, fontsize=9)
    recomb_ax.tick_params(axis="y", labelcolor=RECOMB_COLOR, labelsize=8)

    # Cap the scale so the recombination rate doesn't overwhelm the plot
    max_rate = region_recomb["rate"].max()
    recomb_ax.set_ylim(0, max(max_rate * 1.2, 20))

    # Remove top spine for cleaner look
    recomb_ax.spines["top"].set_visible(False)

    return recomb_ax
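
# Editor's note: an end-to-end sketch combining the pieces above with
# matplotlib (region coordinates are illustrative; the maps must have been
# downloaded first):
#
#     import matplotlib.pyplot as plt
#
#     fig, ax = plt.subplots(figsize=(10, 4))
#     # ... plot -log10(p) GWAS scatter on ax ...
#     rates = get_recombination_rate_for_region(chrom=1, start=50_000_000,
#                                               end=51_000_000)
#     add_recombination_overlay(ax, rates, start=50_000_000, end=51_000_000)
#     plt.show()
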
pylocuszoom/utils.py
ADDED

@@ -0,0 +1,194 @@
"""Utility functions for pyLocusZoom.

Shared helpers used across multiple modules.
"""

from pathlib import Path
from typing import TYPE_CHECKING, Any, List, Optional, Union

import pandas as pd

if TYPE_CHECKING:
    from pyspark.sql import DataFrame as SparkDataFrame

# Type alias for DataFrames (pandas or PySpark)
DataFrameLike = Union[pd.DataFrame, "SparkDataFrame", Any]


class ValidationError(ValueError):
    """Raised when input validation fails."""


def is_spark_dataframe(df: Any) -> bool:
    """Check if an object is a PySpark DataFrame.

    Args:
        df: Object to check.

    Returns:
        True if PySpark DataFrame, False otherwise.
    """
    # Check class name and module to avoid importing pyspark
    return type(df).__name__ == "DataFrame" and type(df).__module__.startswith("pyspark")


def to_pandas(
    df: DataFrameLike,
    sample_size: Optional[int] = None,
) -> pd.DataFrame:
    """Convert a DataFrame-like object to a pandas DataFrame.

    Supports pandas DataFrames (returned as-is) and PySpark DataFrames
    (converted to pandas). For large PySpark DataFrames, use sample_size
    to limit the data transferred.

    Args:
        df: pandas DataFrame or PySpark DataFrame.
        sample_size: For PySpark, limit to approximately this many rows.
            If None, converts the entire DataFrame (may be slow for large
            data).

    Returns:
        pandas DataFrame.

    Raises:
        TypeError: If df is not a supported DataFrame type.

    Example:
        >>> # PySpark DataFrame
        >>> pdf = to_pandas(spark_df, sample_size=100000)
        >>>
        >>> # pandas DataFrame (passthrough)
        >>> pdf = to_pandas(pandas_df)
    """
    if isinstance(df, pd.DataFrame):
        return df

    if is_spark_dataframe(df):
        if sample_size is not None:
            # Fraction-based sampling is approximate, so the result may
            # contain slightly more or fewer rows than sample_size
            total = df.count()
            if total > sample_size:
                fraction = sample_size / total
                df = df.sample(fraction=fraction, seed=42)
        return df.toPandas()

    # Try duck-typed conversion as a fallback
    if hasattr(df, "to_pandas"):
        return df.to_pandas()
    if hasattr(df, "toPandas"):
        return df.toPandas()

    raise TypeError(
        f"Unsupported DataFrame type: {type(df).__name__}. "
        f"Expected pandas.DataFrame or pyspark.sql.DataFrame"
    )


def normalize_chrom(chrom: Union[int, str]) -> str:
    """Normalize a chromosome identifier by removing the 'chr' prefix.

    Args:
        chrom: Chromosome as integer (1, 2, ...) or string ("chr1", "1").

    Returns:
        String without the 'chr' prefix (e.g., "1", "X").

    Example:
        >>> normalize_chrom(1)
        '1'
        >>> normalize_chrom("chr1")
        '1'
        >>> normalize_chrom("chrX")
        'X'
    """
    # removeprefix strips only a leading 'chr', unlike str.replace,
    # which would remove every occurrence
    return str(chrom).removeprefix("chr")


def validate_dataframe(
    df: pd.DataFrame,
    required_cols: List[str],
    name: str = "DataFrame",
) -> None:
    """Validate that a DataFrame has the required columns.

    Args:
        df: DataFrame to validate.
        required_cols: List of required column names.
        name: Name for error messages (e.g., "gwas_df", "genes_df").

    Raises:
        ValidationError: If required columns are missing.

    Example:
        >>> validate_dataframe(df, ["chr", "start", "end"], "genes_df")
    """
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        available = list(df.columns)
        raise ValidationError(
            f"{name} missing required columns: {missing}. "
            f"Available columns: {available}"
        )


def validate_gwas_df(
    df: pd.DataFrame,
    pos_col: str = "ps",
    p_col: str = "p_wald",
    rs_col: Optional[str] = None,
) -> None:
    """Validate a GWAS results DataFrame.

    Args:
        df: GWAS results DataFrame.
        pos_col: Column name for position.
        p_col: Column name for p-values.
        rs_col: Column name for SNP IDs (optional).

    Raises:
        ValidationError: If required columns are missing.
    """
    required = [pos_col, p_col]
    if rs_col:
        required.append(rs_col)
    validate_dataframe(df, required, "gwas_df")
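
# Editor's note: the defaults ("ps", "p_wald") appear to follow GEMMA's
# association output convention; a sketch with custom column names:
#
#     validate_gwas_df(results, pos_col="pos", p_col="pvalue", rs_col="rsid")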


def validate_genes_df(df: pd.DataFrame) -> None:
    """Validate a gene annotations DataFrame.

    Args:
        df: Gene annotations DataFrame.

    Raises:
        ValidationError: If required columns are missing.
    """
    validate_dataframe(df, ["chr", "start", "end", "gene_name"], "genes_df")


def validate_plink_files(bfile_path: Union[str, Path]) -> Path:
    """Validate that a PLINK binary fileset exists.

    Checks for .bed, .bim, and .fam files.

    Args:
        bfile_path: Path prefix for PLINK files (without extension).

    Returns:
        Path object if files exist.

    Raises:
        ValidationError: If any PLINK files are missing.
    """
    path = Path(bfile_path)
    missing = []
    for ext in [".bed", ".bim", ".fam"]:
        # Append the extension rather than using Path.with_suffix, which
        # would clobber any dot already in the prefix (e.g., "data.v2")
        if not Path(f"{path}{ext}").exists():
            missing.append(ext)

    if missing:
        raise ValidationError(
            f"PLINK files missing for {path}: {missing}. "
            f"Expected: {path}.bed, {path}.bim, {path}.fam"
        )
    return path