pylocuszoom 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,432 @@
+ """Recombination rate overlay and data management.
+
+ Provides:
+ - Recombination rate overlay for regional plots
+ - Download and loading of species-specific recombination maps
+ - Liftover support for CanFam3.1 to CanFam4 coordinate conversion
+ """
+
+ import os
+ import tarfile
+ import tempfile
+ import urllib.request
+ from pathlib import Path
+ from typing import Optional, Union
+
+ import pandas as pd
+ from matplotlib.axes import Axes
+
+ from .logging import logger
+
+ # Recombination overlay color
+ RECOMB_COLOR = "#7FCDFF"  # Light blue
+
+ # Data sources by species
+ DOG_RECOMB_URL = (
+     "https://github.com/cflerin/dog_recombination/raw/master/dog_genetic_maps.tar.gz"
+ )
+
+ # Liftover chain files
+ CANFAM3_TO_CANFAM4_CHAIN_URL = "https://hgdownload.soe.ucsc.edu/gbdb/canFam3/liftOver/canFam3ToCanFam4.over.chain.gz"
+
+
+ def _normalize_build(build: Optional[str]) -> Optional[str]:
+     """Normalize genome build name to canonical form.
+
+     Args:
+         build: Build name (e.g., "canfam4", "CanFam4.0", "UU_Cfam_GSD_1.0")
+
+     Returns:
+         Normalized build name ("canfam3" or "canfam4"), or None if not specified.
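+
+     Example:
+         >>> _normalize_build("UU_Cfam_GSD_1.0")
+         'canfam4'
+         >>> _normalize_build("CanFam3.1")
+         'canfam3'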
+     """
+     if build is None:
+         return None
+     build_lower = build.lower().replace(".", "").replace("_", "")
+     if "canfam4" in build_lower or "uucfamgsd" in build_lower:
+         return "canfam4"
+     if "canfam3" in build_lower:
+         return "canfam3"
+     return build.lower()
+
+
+ def get_chain_file_path() -> Path:
+     """Get path to the CanFam3 to CanFam4 liftover chain file."""
+     return get_default_data_dir() / "canFam3ToCanFam4.over.chain.gz"
+
+
+ def download_liftover_chain(force: bool = False) -> Path:
+     """Download the CanFam3 to CanFam4 liftover chain file.
+
+     Args:
+         force: Re-download even if file exists.
+
+     Returns:
+         Path to the downloaded chain file.
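+
+     Example:
+         >>> chain_path = download_liftover_chain()  # no-op if already cached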
+     """
+     chain_path = get_chain_file_path()
+
+     if chain_path.exists() and not force:
+         return chain_path
+
+     chain_path.parent.mkdir(parents=True, exist_ok=True)
+
+     logger.info("Downloading CanFam3 to CanFam4 liftover chain...")
+     logger.debug(f"Source: {CANFAM3_TO_CANFAM4_CHAIN_URL}")
+
+     try:
+         urllib.request.urlretrieve(CANFAM3_TO_CANFAM4_CHAIN_URL, chain_path)
+     except Exception as e:
+         logger.debug(f"urllib download failed: {e}")
+         try:
+             import requests
+
+             response = requests.get(CANFAM3_TO_CANFAM4_CHAIN_URL, timeout=60)
+             response.raise_for_status()
+             chain_path.write_bytes(response.content)
+         except ImportError:
+             raise RuntimeError(
+                 "Failed to download. Install requests: pip install requests"
+             ) from e
+
+     logger.info(f"Chain file saved to: {chain_path}")
+     return chain_path
+
+
+ def liftover_recombination_map(
+     recomb_df: pd.DataFrame,
+     from_build: str = "canfam3",
+     to_build: str = "canfam4",
+     chrom: Optional[Union[int, str]] = None,
+ ) -> pd.DataFrame:
+     """Liftover recombination map coordinates between genome builds.
+
+     Args:
+         recomb_df: DataFrame with 'pos' column (and optionally 'chr').
+         from_build: Source genome build (default: canfam3).
+         to_build: Target genome build (default: canfam4).
+         chrom: Chromosome number or name (required if 'chr' not in recomb_df).
+
+     Returns:
+         DataFrame with lifted coordinates. Positions that fail to map are dropped.
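+
+     Example:
+         >>> df = pd.DataFrame({"pos": [1_000_000, 2_000_000], "rate": [0.5, 1.2]})
+         >>> lifted = liftover_recombination_map(df, chrom=1)  # requires pyliftover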
+     """
+     from pyliftover import LiftOver
+
+     # Download chain file if needed. Note: only the CanFam3.1 -> CanFam4 chain
+     # is currently bundled, matching the default from_build/to_build.
+     chain_path = download_liftover_chain()
+
+     logger.debug(f"Lifting over coordinates from {from_build} to {to_build}")
+     lo = LiftOver(str(chain_path))
+
+     # Get chromosome for each position
+     if "chr" in recomb_df.columns:
+         chroms = recomb_df["chr"].astype(str)
+     elif chrom is not None:
+         chroms = pd.Series([str(chrom)] * len(recomb_df))
+     else:
+         raise ValueError("Either 'chr' column or chrom parameter required")
+
+     # Liftover each position
+     new_positions = []
+     keep_mask = []
+
+     for chr_val, pos in zip(chroms, recomb_df["pos"]):
+         chr_str = f"chr{chr_val}" if not str(chr_val).startswith("chr") else chr_val
+         result = lo.convert_coordinate(chr_str, int(pos))
+
+         if result and len(result) > 0:
+             # Take first mapping (usually the only one)
+             _, new_pos, _, _ = result[0]
+             new_positions.append(int(new_pos))
+             keep_mask.append(True)
+         else:
+             new_positions.append(None)
+             keep_mask.append(False)
+
+     # Create output DataFrame
+     result_df = recomb_df.copy()
+     result_df["pos"] = new_positions
+     result_df = result_df[keep_mask].copy()
+     # Restore integer dtype (None entries made the column object-typed)
+     result_df["pos"] = result_df["pos"].astype(int)
+
+     unmapped = len(recomb_df) - len(result_df)
+     if unmapped > 0:
+         logger.debug(f"Dropped {unmapped} positions that failed to liftover")
+
+     return result_df.sort_values("pos").reset_index(drop=True)
+
+
+ def get_default_data_dir() -> Path:
+     """Get default directory for recombination map data.
+
+     Returns a platform-appropriate cache directory:
+     - Linux/macOS: ~/.cache/snp-scope-plot/recombination_maps (or under $XDG_CACHE_HOME)
+     - Windows: %LOCALAPPDATA%/snp-scope-plot/recombination_maps
+     - Databricks: /dbfs/FileStore/reference_data/recombination_maps
+     """
+     if os.name == "nt":  # Windows
+         base = Path(os.environ.get("LOCALAPPDATA", Path.home()))
+     elif os.path.exists("/dbfs"):  # Databricks
+         return Path("/dbfs/FileStore/reference_data/recombination_maps")
+     else:
+         # macOS and Linux
+         xdg_cache = os.environ.get("XDG_CACHE_HOME")
+         if xdg_cache:
+             base = Path(xdg_cache)
+         else:
+             base = Path.home() / ".cache"
+
+     return base / "snp-scope-plot" / "recombination_maps"
+
+
+ def download_dog_recombination_maps(
+     output_dir: Optional[str] = None,
+     force: bool = False,
+ ) -> Path:
+     """Download dog recombination rate maps from Campbell et al. 2016.
+
+     Downloads from: https://github.com/cflerin/dog_recombination
+
+     Data is in CanFam3.1 coordinates with columns:
+     - chr: Chromosome number
+     - pos: Physical position (bp)
+     - rate: Recombination rate (cM/Mb)
+     - cM: Cumulative genetic distance (centiMorgans)
+
+     Args:
+         output_dir: Directory to save maps. Uses platform cache if None.
+         force: Re-download even if files exist.
+
+     Returns:
+         Path to the directory containing recombination map files.
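+
+     Example:
+         >>> maps_dir = download_dog_recombination_maps()
+         >>> (maps_dir / "chr1_recomb.tsv").exists()
+         True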
+     """
+     # Determine output directory
+     if output_dir is None:
+         output_path = get_default_data_dir()
+     else:
+         output_path = Path(output_dir)
+
+     # Check if already downloaded
+     if output_path.exists() and not force:
+         existing_files = list(output_path.glob("chr*_recomb.tsv"))
+         if len(existing_files) >= 38:  # at least all 38 dog autosomes
+             return output_path
+
+     # Create output directory
+     output_path.mkdir(parents=True, exist_ok=True)
+
+     logger.info("Downloading dog recombination maps from GitHub...")
+     logger.debug(f"Source: {DOG_RECOMB_URL}")
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # Download tar.gz file
+         tar_path = Path(tmpdir) / "dog_genetic_maps.tar.gz"
+
+         try:
+             urllib.request.urlretrieve(DOG_RECOMB_URL, tar_path)
+         except Exception as e:
+             logger.debug(f"urllib download failed: {e}")
+             logger.debug("Trying alternative method with requests...")
+             try:
+                 import requests
+
+                 response = requests.get(DOG_RECOMB_URL, timeout=60)
+                 response.raise_for_status()
+                 tar_path.write_bytes(response.content)
+             except ImportError:
+                 raise RuntimeError(
+                     "Failed to download. Install requests: pip install requests"
+                 ) from e
+
+         logger.debug(f"Downloaded {tar_path.stat().st_size / 1024:.1f} KB")
+
+         # Extract tar.gz
+         logger.debug("Extracting genetic maps...")
+         with tarfile.open(tar_path, "r:gz") as tar:
+             tar.extractall(tmpdir)
+
+         # Find and process the extracted files
+         extracted_dir = Path(tmpdir)
+
+         # Look for genetic map files (may be in a subdirectory)
+         map_files = list(extracted_dir.rglob("chr*.txt"))
+         if not map_files:
+             map_files = list(extracted_dir.rglob("*chr*.tsv"))
+
+         if not map_files:
+             all_files = list(extracted_dir.rglob("*"))
+             logger.error(f"Extracted files: {[f.name for f in all_files[:20]]}")
+             raise RuntimeError("Could not find chromosome map files in archive")
+
+         logger.debug(f"Found {len(map_files)} chromosome files")
+
+         # Copy and rename files
+         for map_file in map_files:
+             name = map_file.stem
+             if "chr" in name.lower():
+                 chrom = name.lower().split("chr")[-1].split("_")[0].split(".")[0]
+                 output_file = output_path / f"chr{chrom}_recomb.tsv"
+
+                 with open(map_file, "r") as f:
+                     content = f.read()
+
+                 # Ensure header is present
+                 lines = content.strip().split("\n")
+                 if not lines[0].startswith("chr") and not lines[0].startswith("pos"):
+                     content = "chr\tpos\trate\tcM\n" + content
+
+                 with open(output_file, "w") as f:
+                     f.write(content)
+
+     logger.info(f"Recombination maps saved to: {output_path}")
+     return output_path
+
+
+ def load_recombination_map(
+     chrom: Union[int, str],
+     species: str = "dog",
+     data_dir: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """Load recombination map for a specific chromosome.
+
+     Args:
+         chrom: Chromosome number (1-38 for dog, 1-18 for cat) or 'X'.
+         species: Species name ('dog', 'cat').
+         data_dir: Directory containing recombination maps.
+
+     Returns:
+         DataFrame with columns: pos, rate, cM.
+
+     Raises:
+         FileNotFoundError: If map file not found.
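+
+     Example:
+         >>> df = load_recombination_map(1)  # assumes maps are already downloaded
+         >>> df[["pos", "rate"]].head()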
+     """
+     if data_dir is None:
+         data_dir = get_default_data_dir()
+
+     data_path = Path(data_dir)
+     chrom_str = str(chrom).replace("chr", "")
+     map_file = data_path / f"chr{chrom_str}_recomb.tsv"
+
+     if not map_file.exists():
+         raise FileNotFoundError(
+             f"Recombination map not found: {map_file}\n"
+             f"Run download_{species}_recombination_maps() first to download the data."
+         )
+
+     df = pd.read_csv(map_file, sep="\t")
+
+     # Ensure numeric columns
+     df["pos"] = pd.to_numeric(df["pos"], errors="coerce")
+     df["rate"] = pd.to_numeric(df["rate"], errors="coerce")
+     if "cM" in df.columns:
+         df["cM"] = pd.to_numeric(df["cM"], errors="coerce")
+
+     return df.dropna(subset=["pos", "rate"])
+
+
+ def get_recombination_rate_for_region(
+     chrom: Union[int, str],
+     start: int,
+     end: int,
+     species: str = "dog",
+     data_dir: Optional[str] = None,
+     genome_build: Optional[str] = None,
+ ) -> pd.DataFrame:
+     """Get recombination rate data for a genomic region.
+
+     Args:
+         chrom: Chromosome number.
+         start: Start position (bp).
+         end: End position (bp).
+         species: Species name ('dog', 'cat').
+         data_dir: Directory containing recombination maps.
+         genome_build: Target genome build (e.g., "canfam4"). If specified and
+             different from source data (CanFam3.1), coordinates are lifted over.
+
+     Returns:
+         DataFrame with pos and rate columns for the region.
+
+     Note:
+         Built-in dog recombination maps are in CanFam3.1 coordinates.
+         If genome_build="canfam4", positions are automatically lifted over.
+         This requires pyliftover: pip install pyliftover
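+
+     Example:
+         >>> recomb = get_recombination_rate_for_region(
+         ...     chrom=1, start=1_000_000, end=2_000_000, genome_build="canfam4"
+         ... )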
+     """
+     df = load_recombination_map(chrom, species=species, data_dir=data_dir)
+
+     # Liftover if needed
+     build = _normalize_build(genome_build)
+     if species == "dog" and build == "canfam4":
+         logger.debug(f"Lifting over recombination map for chr{chrom} to CanFam4")
+         df = liftover_recombination_map(
+             df, from_build="canfam3", to_build="canfam4", chrom=chrom
+         )
+
+     # Filter to region
+     region_df = df[(df["pos"] >= start) & (df["pos"] <= end)].copy()
+
+     return region_df[["pos", "rate"]]
+
+
+ def add_recombination_overlay(
+     ax: Axes,
+     recomb_df: pd.DataFrame,
+     start: int,
+     end: int,
+ ) -> Optional[Axes]:
+     """Add recombination rate as secondary y-axis overlay.
+
+     Plots recombination rate (cM/Mb) as a light blue line on a
+     secondary y-axis, styled to match LocusZoom.
+
+     Args:
+         ax: Primary matplotlib axes object.
+         recomb_df: DataFrame with 'pos' and 'rate' columns.
+         start: Region start position.
+         end: Region end position.
+
+     Returns:
+         Secondary axes object for recombination rate, or None if no data.
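+
+     Example:
+         >>> import matplotlib.pyplot as plt
+         >>> fig, ax = plt.subplots()
+         >>> # recomb_df as returned by get_recombination_rate_for_region()
+         >>> recomb_ax = add_recombination_overlay(ax, recomb_df, 1_000_000, 2_000_000)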
+     """
+     # Create secondary y-axis
+     recomb_ax = ax.twinx()
+
+     # Filter to region
+     region_recomb = recomb_df[
+         (recomb_df["pos"] >= start) & (recomb_df["pos"] <= end)
+     ].copy()
+
+     if region_recomb.empty:
+         recomb_ax.set_visible(False)
+         return None
+
+     # Plot recombination rate as light blue line
+     recomb_ax.plot(
+         region_recomb["pos"],
+         region_recomb["rate"],
+         color=RECOMB_COLOR,
+         linewidth=1.5,
+         alpha=0.7,
+         zorder=0,  # Behind scatter points
+     )
+
+     # Fill under curve
+     recomb_ax.fill_between(
+         region_recomb["pos"],
+         0,
+         region_recomb["rate"],
+         color=RECOMB_COLOR,
+         alpha=0.15,
+         zorder=0,
+     )
+
+     # Format secondary axis
+     recomb_ax.set_ylabel("Recombination rate (cM/Mb)", color=RECOMB_COLOR, fontsize=9)
+     recomb_ax.tick_params(axis="y", labelcolor=RECOMB_COLOR, labelsize=8)
+
+     # Scale the axis so the recombination curve doesn't overwhelm the plot
+     max_rate = region_recomb["rate"].max()
+     recomb_ax.set_ylim(0, max(max_rate * 1.2, 20))
+
+     # Remove top spine for cleaner look
+     recomb_ax.spines["top"].set_visible(False)
+
+     return recomb_ax
@@ -0,0 +1,4 @@
+ """Reference data for snp-scope-plot.
+
+ Contains species-specific reference data downloaders and loaders.
+ """
pylocuszoom/utils.py ADDED
@@ -0,0 +1,194 @@
+ """Utility functions for pyLocusZoom.
+
+ Shared helpers used across multiple modules.
+ """
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, List, Optional, Union
+
+ import pandas as pd
+
+ if TYPE_CHECKING:
+     from pyspark.sql import DataFrame as SparkDataFrame
+
+ # Type alias for DataFrames (pandas or PySpark)
+ DataFrameLike = Union[pd.DataFrame, "SparkDataFrame", Any]
+
+
+ class ValidationError(ValueError):
+     """Raised when input validation fails."""
+
+
+ def is_spark_dataframe(df: Any) -> bool:
+     """Check if object is a PySpark DataFrame.
+
+     Args:
+         df: Object to check.
+
+     Returns:
+         True if PySpark DataFrame, False otherwise.
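+
+     Example:
+         >>> is_spark_dataframe(pd.DataFrame())
+         False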
+     """
+     # Check class name to avoid importing pyspark
+     return type(df).__name__ == "DataFrame" and type(df).__module__.startswith("pyspark")
+
+
+ def to_pandas(
+     df: DataFrameLike,
+     sample_size: Optional[int] = None,
+ ) -> pd.DataFrame:
+     """Convert DataFrame-like object to pandas DataFrame.
+
+     Supports pandas DataFrames (returned as-is) and PySpark DataFrames
+     (converted to pandas). For large PySpark DataFrames, use sample_size
+     to limit the data transferred.
+
+     Args:
+         df: pandas DataFrame or PySpark DataFrame.
+         sample_size: For PySpark, limit to approximately this many rows.
+             If None, converts the entire DataFrame (may be slow for large data).
+
+     Returns:
+         pandas DataFrame.
+
+     Raises:
+         TypeError: If df is not a supported DataFrame type.
+
+     Example:
+         >>> # PySpark DataFrame
+         >>> pdf = to_pandas(spark_df, sample_size=100000)
+         >>>
+         >>> # pandas DataFrame (passthrough)
+         >>> pdf = to_pandas(pandas_df)
+     """
+     if isinstance(df, pd.DataFrame):
+         return df
+
+     if is_spark_dataframe(df):
+         if sample_size is not None:
+             # Sample to limit data transfer (fractional sampling, so the
+             # returned row count is approximate)
+             total = df.count()
+             if total > sample_size:
+                 fraction = sample_size / total
+                 df = df.sample(fraction=fraction, seed=42)
+         return df.toPandas()
+
+     # Try pandas conversion as fallback
+     if hasattr(df, "to_pandas"):
+         return df.to_pandas()
+     if hasattr(df, "toPandas"):
+         return df.toPandas()
+
+     raise TypeError(
+         f"Unsupported DataFrame type: {type(df).__name__}. "
+         f"Expected pandas.DataFrame or pyspark.sql.DataFrame"
+     )
+
+
+ def normalize_chrom(chrom: Union[int, str]) -> str:
+     """Normalize chromosome identifier by removing 'chr' prefix.
+
+     Args:
+         chrom: Chromosome as integer (1, 2, ...) or string ("chr1", "1").
+
+     Returns:
+         String without 'chr' prefix (e.g., "1", "X").
+
+     Example:
+         >>> normalize_chrom(1)
+         '1'
+         >>> normalize_chrom("chr1")
+         '1'
+         >>> normalize_chrom("chrX")
+         'X'
+     """
+     s = str(chrom)
+     # Strip only a leading 'chr' (case-insensitive), not other occurrences
+     return s[3:] if s.lower().startswith("chr") else s
+
+
+ def validate_dataframe(
+     df: pd.DataFrame,
+     required_cols: List[str],
+     name: str = "DataFrame",
+ ) -> None:
+     """Validate that a DataFrame has required columns.
+
+     Args:
+         df: DataFrame to validate.
+         required_cols: List of required column names.
+         name: Name for error messages (e.g., "gwas_df", "genes_df").
+
+     Raises:
+         ValidationError: If required columns are missing.
+
+     Example:
+         >>> validate_dataframe(df, ["chr", "start", "end"], "genes_df")
+     """
+     missing = [col for col in required_cols if col not in df.columns]
+     if missing:
+         available = list(df.columns)
+         raise ValidationError(
+             f"{name} missing required columns: {missing}. "
+             f"Available columns: {available}"
+         )
+
+
+ def validate_gwas_df(
+     df: pd.DataFrame,
+     pos_col: str = "ps",
+     p_col: str = "p_wald",
+     rs_col: Optional[str] = None,
+ ) -> None:
+     """Validate GWAS results DataFrame.
+
+     Args:
+         df: GWAS results DataFrame.
+         pos_col: Column name for position.
+         p_col: Column name for p-values.
+         rs_col: Column name for SNP IDs (optional).
+
+     Raises:
+         ValidationError: If required columns are missing.
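+
+     Example:
+         >>> validate_gwas_df(gwas_df, pos_col="ps", p_col="p_wald")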
+     """
+     required = [pos_col, p_col]
+     if rs_col:
+         required.append(rs_col)
+     validate_dataframe(df, required, "gwas_df")
+
+
+ def validate_genes_df(df: pd.DataFrame) -> None:
+     """Validate gene annotations DataFrame.
+
+     Args:
+         df: Gene annotations DataFrame.
+
+     Raises:
+         ValidationError: If required columns are missing.
+     """
+     validate_dataframe(df, ["chr", "start", "end", "gene_name"], "genes_df")
+
+
+ def validate_plink_files(bfile_path: Union[str, Path]) -> Path:
+     """Validate that PLINK binary fileset exists.
+
+     Checks for .bed, .bim, and .fam files.
+
+     Args:
+         bfile_path: Path prefix for PLINK files (without extension).
+
+     Returns:
+         Path object if files exist.
+
+     Raises:
+         ValidationError: If any PLINK files are missing.
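+
+     Example:
+         >>> bfile = validate_plink_files("data/cohort")  # expects data/cohort.bed etc.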
+     """
+     path = Path(bfile_path)
+     missing = []
+     for ext in [".bed", ".bim", ".fam"]:
+         # Append the extension rather than using with_suffix(), which would
+         # clobber any dot in the prefix (e.g., "cohort.v2" -> "cohort.bed")
+         if not (path.parent / (path.name + ext)).exists():
+             missing.append(ext)
+
+     if missing:
+         raise ValidationError(
+             f"PLINK files missing for {path}: {missing}. "
+             f"Expected: {path}.bed, {path}.bim, {path}.fam"
+         )
+     return path