pylocuszoom 1.2.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pylocuszoom/ld.py CHANGED
@@ -16,6 +16,72 @@ from .logging import logger
16
16
  from .utils import validate_plink_files
17
17
 
18
18
 
19
def build_pairwise_ld_command(
    plink_path: str,
    bfile_path: str,
    output_path: str,
    snp_list_file: Optional[str] = None,
    chrom: Optional[int] = None,
    start: Optional[int] = None,
    end: Optional[int] = None,
    species: Optional[str] = "canine",
    metric: str = "r2",
) -> list:
    """Assemble the PLINK argument list for a square pairwise LD matrix.

    The resulting command asks PLINK for an N x N matrix via ``--r2 square``
    (or ``--r dprime square``) and adds ``--write-snplist`` so the variant
    order of the matrix can be recovered afterwards.

    Args:
        plink_path: Path to PLINK executable.
        bfile_path: Input binary fileset prefix (.bed/.bim/.fam).
        output_path: Output prefix (creates .ld and .snplist files).
        snp_list_file: Path to file with SNP IDs to extract (one per line).
            Skipped when None or empty.
        chrom: Chromosome number for region-based extraction.
        start: Start position (bp) for region-based extraction.
        end: End position (bp) for region-based extraction.
        species: 'canine' adds ``--dog``, 'feline' adds ``--chr-set 18``;
            any other value (including None) adds no species flag.
        metric: 'dprime' selects D'; every other value selects r2.

    Returns:
        List of command arguments for subprocess.
    """
    # Chromosome-set flags per species; anything unlisted contributes nothing.
    species_flags = {
        "canine": ["--dog"],
        "feline": ["--chr-set", "18"],
    }

    # NOTE(review): PLINK 1.9 documents 'dprime' as a modifier for
    # table-formatted reports -- confirm it is accepted together with 'square'.
    if metric == "dprime":
        ld_flags = ["--r", "dprime", "square"]
    else:
        ld_flags = ["--r2", "square"]

    args = [
        plink_path,
        *species_flags.get(species, []),
        "--bfile",
        bfile_path,
        "--out",
        output_path,
        *ld_flags,
        # Emits <output_path>.snplist so the matrix row order is known.
        "--write-snplist",
    ]

    # Restrict to an explicit SNP list when one was supplied.
    if snp_list_file:
        args += ["--extract", snp_list_file]

    # Region filters are independent; each is added only when given.
    region_filters = (("--chr", chrom), ("--from-bp", start), ("--to-bp", end))
    for flag, value in region_filters:
        if value is not None:
            args += [flag, str(value)]

    return args
83
+
84
+
19
85
  def find_plink() -> Optional[str]:
20
86
  """Find PLINK executable on PATH.
21
87
 
@@ -84,6 +150,51 @@ def build_ld_command(
84
150
  return cmd
85
151
 
86
152
 
153
def parse_pairwise_ld_output(
    ld_file: str, snplist_file: str
) -> tuple[pd.DataFrame, list[str]]:
    """Parse PLINK pairwise LD matrix output files.

    PLINK --r2 square outputs:
    - .ld file: N x N matrix of R2/D' values (whitespace-separated, no headers)
    - .snplist file: SNP IDs in order (one per line)

    Args:
        ld_file: Path to .ld output file (square matrix).
        snplist_file: Path to .snplist output file (SNP IDs).

    Returns:
        Tuple of (DataFrame with R2/D' values, list of SNP IDs).
        DataFrame has SNP IDs as both index and columns.
        Returns (empty DataFrame, empty list) if either file is missing,
        the SNP list is empty, or the .ld file contains no data.

    Raises:
        ValueError: If the matrix in ``ld_file`` is not N x N for the N SNP
            IDs read from ``snplist_file``.
    """
    # Nothing to parse when either output file is absent.
    if not os.path.exists(ld_file) or not os.path.exists(snplist_file):
        return pd.DataFrame(), []

    # Read SNP list, ignoring blank lines.
    with open(snplist_file) as f:
        snp_ids = [line.strip() for line in f if line.strip()]

    if not snp_ids:
        return pd.DataFrame(), []

    # Read the raw matrix first (values can be numbers or 'nan'). Labels are
    # attached only after the shape is verified: passing them to read_csv
    # would let pandas silently pad or truncate columns on a mismatch.
    try:
        matrix = pd.read_csv(ld_file, sep=r"\s+", header=None)
    except pd.errors.EmptyDataError:
        # An empty .ld file carries no LD values at all.
        return pd.DataFrame(), []

    n_snps = len(snp_ids)
    if matrix.shape != (n_snps, n_snps):
        raise ValueError(
            f"LD matrix shape {matrix.shape} does not match "
            f"{n_snps} SNP IDs in {snplist_file}"
        )

    # Label both axes with the SNP IDs.
    matrix.columns = snp_ids
    matrix.index = snp_ids

    return matrix, snp_ids
196
+
197
+
87
198
  def parse_ld_output(ld_file: str, lead_snp: str) -> pd.DataFrame:
88
199
  """Parse PLINK .ld output file.
89
200
 
@@ -208,3 +319,131 @@ def calculate_ld(
208
319
  # Clean up temp directory
209
320
  if cleanup_working_dir and os.path.exists(working_dir):
210
321
  shutil.rmtree(working_dir, ignore_errors=True)
322
+
323
+
324
def calculate_pairwise_ld(
    bfile_path: str,
    snp_list: list[str] | None = None,
    chrom: int | None = None,
    start: int | None = None,
    end: int | None = None,
    plink_path: str | None = None,
    working_dir: str | None = None,
    species: str | None = "canine",
    metric: str = "r2",
) -> tuple[pd.DataFrame, list[str]]:
    """Calculate pairwise LD matrix for a set of variants.

    Runs PLINK --r2 square to compute an N x N LD matrix, suitable for
    LD heatmap visualization.

    PLINK is launched with ``working_dir`` as its current directory, so all
    paths handed to it are converted to absolute paths first; relative
    ``bfile_path`` / ``working_dir`` values are interpreted against the
    caller's current directory.

    Args:
        bfile_path: Path to PLINK binary fileset (.bed/.bim/.fam prefix).
        snp_list: List of SNP IDs to compute pairwise LD between.
        chrom: Chromosome number for region-based extraction.
        start: Start position (bp) for region-based extraction.
        end: End position (bp) for region-based extraction.
        plink_path: Path to PLINK executable. Auto-detects if None.
        working_dir: Directory for PLINK output files. Uses temp dir if None
            (the temp dir is removed before returning).
        species: Species flag ('canine', 'feline', or None for human).
        metric: LD metric ('r2' or 'dprime').

    Returns:
        Tuple of (LD matrix DataFrame, list of SNP IDs).
        DataFrame has SNP IDs as both index and columns.
        Returns (empty DataFrame, empty list) if PLINK fails.

    Raises:
        FileNotFoundError: If PLINK executable not found.
        ValidationError: If PLINK binary files (.bed/.bim/.fam) are missing
            (as reported by ``validate_plink_files``).
        ValidationError: If requested SNPs are not found in reference panel.

    Example:
        >>> matrix, snp_ids = calculate_pairwise_ld(
        ...     bfile_path="/path/to/genotypes",
        ...     snp_list=["rs1", "rs2", "rs3"],
        ... )
        >>> # matrix is 3x3 DataFrame with LD values
        >>> matrix.loc["rs1", "rs2"]  # LD between rs1 and rs2
    """
    from .utils import ValidationError

    # Find PLINK
    if plink_path is None:
        plink_path = find_plink()
    if plink_path is None:
        raise FileNotFoundError(
            "PLINK not found. Install PLINK 1.9 or specify plink_path."
        )

    logger.debug(f"Using PLINK at {plink_path}")

    # Validate PLINK files exist
    validate_plink_files(bfile_path)

    # PLINK runs with cwd=working_dir, so relative paths would be resolved
    # against that directory instead of the caller's. Pin them down now.
    bfile_path = os.path.abspath(bfile_path)
    if os.path.dirname(plink_path):
        # Only path-like values; a bare name such as "plink" is left alone.
        plink_path = os.path.abspath(plink_path)

    # Use temp directory if working_dir not specified
    cleanup_working_dir = False
    if working_dir is None:
        working_dir = tempfile.mkdtemp(prefix="snp_scope_pairwise_ld_")
        cleanup_working_dir = True
    working_dir = os.path.abspath(working_dir)

    try:
        os.makedirs(working_dir, exist_ok=True)
        output_prefix = os.path.join(working_dir, "pairwise_ld")

        # Write SNP list to file if provided
        snp_list_file = None
        if snp_list:
            snp_list_file = os.path.join(working_dir, "snp_list.txt")
            with open(snp_list_file, "w") as f:
                f.writelines(f"{snp}\n" for snp in snp_list)

        # Build and run PLINK command
        cmd = build_pairwise_ld_command(
            plink_path=plink_path,
            bfile_path=bfile_path,
            output_path=output_prefix,
            snp_list_file=snp_list_file,
            chrom=chrom,
            start=start,
            end=end,
            species=species,
            metric=metric,
        )

        logger.debug(f"Running PLINK command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            cwd=working_dir,
            capture_output=True,
            text=True,
        )

        # A failed run is reported as an empty result, not an exception.
        if result.returncode != 0:
            logger.warning(
                f"PLINK pairwise LD calculation failed: {result.stderr[:200]}"
            )
            return pd.DataFrame(), []

        # Parse output
        ld_file = f"{output_prefix}.ld"
        snplist_file = f"{output_prefix}.snplist"

        matrix, found_snps = parse_pairwise_ld_output(ld_file, snplist_file)

        # Validate all requested SNPs were found
        if snp_list:
            missing_snps = set(snp_list) - set(found_snps)
            if missing_snps:
                raise ValidationError(
                    f"SNPs not found in reference panel: {', '.join(sorted(missing_snps))}"
                )

        return matrix, found_snps

    finally:
        # Clean up temp directory (only when this function created it)
        if cleanup_working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir, ignore_errors=True)
@@ -0,0 +1,252 @@
1
+ """LD heatmap generator for pairwise linkage disequilibrium visualization.
2
+
3
+ Provides triangular heatmap display of pairwise LD values (R² or D')
4
+ with colorbar legend and SNP highlighting support.
5
+ """
6
+
7
+ from typing import Any, List, Optional, Tuple, Union
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+ from .backends import BackendType, get_backend
13
+ from .colors import (
14
+ LD_HEATMAP_COLORS,
15
+ LEAD_SNP_HIGHLIGHT_COLOR,
16
+ SECONDARY_HIGHLIGHT_COLOR,
17
+ )
18
+
19
+
20
class LDHeatmapPlotter:
    """LD heatmap generator for pairwise LD visualization.

    Creates triangular heatmaps showing pairwise linkage disequilibrium
    between variants. Supports R² and D' metrics, lead SNP highlighting,
    and multiple backend renderers.

    SNP highlighting is implemented for these backends:
    - matplotlib (default)
    - plotly
    - bokeh

    Args:
        species: Species name ('canine', 'feline', 'human', or None).
            Currently unused but kept for API consistency.
        backend: Plotting backend ('matplotlib', 'plotly', or 'bokeh').

    Example:
        >>> plotter = LDHeatmapPlotter()
        >>> fig = plotter.plot_ld_heatmap(ld_matrix, lead_snp="rs12345")
        >>> fig.savefig("ld_heatmap.png", dpi=150)  # matplotlib backend
    """

    def __init__(
        self,
        species: Optional[str] = "canine",
        backend: BackendType = "matplotlib",
    ):
        """Initialize the LD heatmap plotter."""
        self.species = species  # Kept for backward compatibility, currently unused
        self._backend = get_backend(backend)
        # Remembered so _highlight_snp can pick the backend-specific drawing call.
        self.backend_name = backend

    def plot_ld_heatmap(
        self,
        ld_matrix: Union[pd.DataFrame, np.ndarray],
        snp_ids: Optional[List[str]] = None,
        lead_snp: Optional[str] = None,
        highlight_snps: Optional[List[str]] = None,
        metric: str = "r2",
        figsize: Tuple[float, float] = (8, 8),
        title: Optional[str] = None,
        show_colorbar: bool = True,
    ) -> Any:
        """Create triangular LD heatmap.

        Args:
            ld_matrix: Square DataFrame or numpy array with pairwise LD values.
                NaN values denote missing data; how they are drawn is decided
                by the backend's ``add_heatmap``.
            snp_ids: SNP IDs for axis labels (any sequence). If None, uses the
                DataFrame index, or 0..N-1 for arrays.
            lead_snp: SNP ID to highlight as lead variant
                (drawn with LEAD_SNP_HIGHLIGHT_COLOR).
            highlight_snps: Additional SNP IDs to highlight
                (drawn with SECONDARY_HIGHLIGHT_COLOR). Duplicates and the
                lead SNP itself are drawn only once; the lead colour wins.
            metric: LD metric label for colorbar ("r2" gives "R²", anything
                else gives "D'").
            figsize: Figure size as (width, height).
            title: Plot title.
            show_colorbar: Whether to show colorbar legend.

        Returns:
            Figure object (type depends on backend).

        Raises:
            ValueError: If ld_matrix is not square.
            ValueError: If snp_ids length does not match the matrix dimension.
            ValueError: If lead_snp not found in snp_ids.
            ValueError: If any highlight_snps not found in snp_ids.

        Example:
            >>> fig = plotter.plot_ld_heatmap(
            ...     ld_matrix,
            ...     snp_ids=["rs1", "rs2", "rs3"],
            ...     lead_snp="rs1",
            ...     metric="r2",
            ... )
        """
        # Extract the raw values; labels are resolved after shape validation.
        is_frame = isinstance(ld_matrix, pd.DataFrame)
        data = ld_matrix.values if is_frame else np.asarray(ld_matrix)

        # Validate square matrix before touching data.shape[0], so scalar or
        # 1-D input raises the documented ValueError instead of IndexError.
        if data.ndim != 2 or data.shape[0] != data.shape[1]:
            raise ValueError(f"ld_matrix must be square, got shape {data.shape}")

        if snp_ids is None:
            if is_frame:
                snp_ids = list(ld_matrix.index.astype(str))
            else:
                snp_ids = [str(i) for i in range(data.shape[0])]
        else:
            # Accept any sequence (tuple, pandas Index, ...), not just list.
            snp_ids = list(snp_ids)

        n_snps = len(snp_ids)
        if data.shape[0] != n_snps:
            raise ValueError(
                f"snp_ids length ({n_snps}) does not match matrix dimension ({data.shape[0]})"
            )

        # One pass to map each SNP ID to its first position in the matrix.
        positions: dict = {}
        for idx, snp in enumerate(snp_ids):
            positions.setdefault(snp, idx)

        # Validate lead_snp
        lead_idx = None
        if lead_snp is not None:
            if lead_snp not in positions:
                raise ValueError(f"lead_snp '{lead_snp}' not found in snp_ids")
            lead_idx = positions[lead_snp]

        # Validate highlight_snps
        highlight_indices: List[int] = []
        for snp in highlight_snps or []:
            if snp not in positions:
                raise ValueError(f"highlight_snp '{snp}' not found in snp_ids")
            idx = positions[snp]
            # Skip repeats and the lead SNP so its highlight is not over-drawn.
            if idx != lead_idx and idx not in highlight_indices:
                highlight_indices.append(idx)

        # Create figure with single panel
        fig, axes = self._backend.create_figure(
            n_panels=1,
            height_ratios=[1.0],
            figsize=figsize,
            sharex=False,
        )
        ax = axes[0]

        # Render triangular heatmap; cells are addressed by integer position.
        mappable = self._backend.add_heatmap(
            ax,
            data=data,
            x_coords=list(range(n_snps)),
            y_coords=list(range(n_snps)),
            cmap_colors=LD_HEATMAP_COLORS,
            vmin=0.0,
            vmax=1.0,
            mask_upper=True,
        )

        # Add colorbar
        if show_colorbar:
            label = "R²" if metric == "r2" else "D'"
            self._backend.add_colorbar(ax, mappable, label=label)

        # Highlight lead SNP
        if lead_idx is not None:
            self._highlight_snp(
                ax=ax,
                fig=fig,
                snp_idx=lead_idx,
                n_snps=n_snps,
                color=LEAD_SNP_HIGHLIGHT_COLOR,
            )

        # Highlight additional SNPs
        for idx in highlight_indices:
            self._highlight_snp(
                ax=ax,
                fig=fig,
                snp_idx=idx,
                n_snps=n_snps,
                color=SECONDARY_HIGHLIGHT_COLOR,
            )

        # Set axis ticks with SNP labels
        tick_positions = list(range(n_snps))
        self._backend.set_xticks(ax, tick_positions, snp_ids, rotation=90)
        self._backend.set_yticks(ax, tick_positions, snp_ids)

        # Set title
        if title:
            self._backend.set_title(ax, title)

        # Finalize layout
        self._backend.finalize_layout(fig)

        return fig

    def _highlight_snp(
        self,
        ax: Any,
        fig: Any,
        snp_idx: int,
        n_snps: int,
        color: str,
    ) -> None:
        """Add visual highlight for a SNP's row/column in the heatmap.

        Draws an unfilled unit-square border around every cell (x, y) with
        x <= y that lies in the SNP's row or column, i.e. the cells of the
        triangle kept by ``mask_upper=True``. Backends other than
        matplotlib, plotly and bokeh are silently left without highlights.

        Args:
            ax: Axes object from backend.
            fig: Figure object from backend.
            snp_idx: Index of the SNP to highlight.
            n_snps: Total number of SNPs in the matrix.
            color: Highlight color.
        """
        # Compute all cell positions to highlight (x, y pairs)
        # Row cells: columns 0 to snp_idx (diagonal included), row = snp_idx
        row_cells = [(j, snp_idx) for j in range(snp_idx + 1)]
        # Column cells: column = snp_idx, rows snp_idx+1 to end (skip diagonal)
        col_cells = [(snp_idx, i) for i in range(snp_idx + 1, n_snps)]
        all_cells = row_cells + col_cells

        if self.backend_name == "matplotlib":
            from matplotlib.patches import Rectangle

            for x, y in all_cells:
                # Cells are centred on integer coordinates, hence the -0.5.
                rect = Rectangle(
                    (x - 0.5, y - 0.5),
                    1.0,
                    1.0,
                    fill=False,
                    edgecolor=color,
                    linewidth=2,
                    zorder=10,
                )
                ax.add_patch(rect)

        elif self.backend_name == "plotly":
            for x, y in all_cells:
                fig.add_shape(
                    type="rect",
                    x0=x - 0.5,
                    x1=x + 0.5,
                    y0=y - 0.5,
                    y1=y + 0.5,
                    line=dict(color=color, width=2),
                    fillcolor="rgba(0,0,0,0)",
                )

        elif self.backend_name == "bokeh":
            for x, y in all_cells:
                # Rect is specified by its centre plus width/height here.
                ax.rect(
                    x=x,
                    y=y,
                    width=1,
                    height=1,
                    fill_alpha=0,
                    line_color=color,
                    line_width=2,
                )