pylocuszoom 1.1.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,18 +4,20 @@ Provides utilities for loading, validating, and preparing statistical
4
4
  fine-mapping results (SuSiE, FINEMAP, etc.) for visualization.
5
5
  """
6
6
 
7
- from typing import List, Optional
7
+ from typing import Any, List, Optional
8
8
 
9
9
  import pandas as pd
10
10
 
11
+ from .backends.base import PlotBackend
12
+ from .backends.hover import HoverConfig, HoverDataBuilder
13
+ from .colors import PIP_LINE_COLOR, get_credible_set_color
11
14
  from .exceptions import FinemappingValidationError, ValidationError
12
15
  from .logging import logger
13
16
  from .utils import filter_by_region
14
17
  from .validation import DataFrameValidator
15
18
 
16
- # Required columns for fine-mapping data
19
+ # Required columns for fine-mapping data (default column names)
17
20
  REQUIRED_FINEMAPPING_COLS = ["pos", "pip"]
18
- OPTIONAL_FINEMAPPING_COLS = ["rs", "cs", "cs_id", "effect", "se"]
19
21
 
20
22
 
21
23
  def validate_finemapping_df(
@@ -207,3 +209,109 @@ def calculate_credible_set_coverage(
207
209
  coverage[cs_id] = cs_data[pip_col].sum()
208
210
 
209
211
  return coverage
212
+
213
+
214
def plot_finemapping(
    backend: PlotBackend,
    ax: Any,
    df: pd.DataFrame,
    pos_col: str = "pos",
    pip_col: str = "pip",
    cs_col: Optional[str] = "cs",
    show_credible_sets: bool = True,
    pip_threshold: float = 0.0,
) -> None:
    """Render fine-mapping results as a PIP line with credible-set scatter.

    Draws posterior inclusion probabilities as a line over position and,
    when credible-set assignments are available, overlays scatter points
    colored per credible set.

    Args:
        backend: Plotting backend implementing PlotBackend protocol.
        ax: Axes or panel to plot on.
        df: Fine-mapping DataFrame with pos and pip columns.
        pos_col: Column name for position.
        pip_col: Column name for posterior inclusion probability.
        cs_col: Column name for credible set assignment (optional).
        show_credible_sets: Whether to color points by credible set.
        pip_threshold: Minimum PIP to display as scatter point.
    """
    # Assemble hover tooltips: PIP always, credible set only when present.
    hover_cols = {pip_col: "PIP"}
    if cs_col and cs_col in df.columns:
        hover_cols[cs_col] = "Credible Set"
    builder = HoverDataBuilder(
        HoverConfig(
            pos_col=pos_col if pos_col in df.columns else None,
            extra_cols=hover_cols,
        )
    )

    # The line trace needs x values in increasing order.
    df = df.sort_values(pos_col)

    backend.line(
        ax,
        df[pos_col],
        df[pip_col],
        color=PIP_LINE_COLOR,
        linewidth=1.5,
        alpha=0.8,
        zorder=1,
    )

    # Credible-set coloring is attempted only when requested and available.
    use_cs = show_credible_sets and cs_col is not None and cs_col in df.columns
    cs_ids = get_credible_sets(df, cs_col) if use_cs else []

    if not cs_ids:
        # No credible sets: optionally highlight variants above the threshold.
        if pip_threshold > 0:
            strong = df[df[pip_col] >= pip_threshold]
            if not strong.empty:
                backend.scatter(
                    ax,
                    strong[pos_col],
                    strong[pip_col],
                    colors=PIP_LINE_COLOR,
                    sizes=50,
                    marker="o",
                    edgecolor="black",
                    linewidth=0.5,
                    zorder=3,
                    hover_data=builder.build_dataframe(strong),
                )
        return

    # One scatter layer per credible set, each with its own color.
    for cs_id in cs_ids:
        members = df[df[cs_col] == cs_id]
        backend.scatter(
            ax,
            members[pos_col],
            members[pip_col],
            colors=get_credible_set_color(cs_id),
            sizes=50,
            marker="o",
            edgecolor="black",
            linewidth=0.5,
            zorder=3,
            hover_data=builder.build_dataframe(members),
        )

    # Grey points for variants outside every credible set (threshold-gated).
    if pip_threshold > 0:
        outside = df[(df[cs_col].isna()) | (df[cs_col] == 0)]
        outside = outside[outside[pip_col] >= pip_threshold]
        if not outside.empty:
            backend.scatter(
                ax,
                outside[pos_col],
                outside[pip_col],
                colors="#BEBEBE",
                sizes=30,
                marker="o",
                edgecolor="black",
                linewidth=0.3,
                zorder=2,
                hover_data=builder.build_dataframe(outside),
            )
pylocuszoom/labels.py CHANGED
@@ -24,6 +24,7 @@ def add_snp_labels(
24
24
  genes_df: Optional[pd.DataFrame] = None,
25
25
  chrom: Optional[Union[int, str]] = None,
26
26
  max_label_length: int = 15,
27
+ adjust: bool = True,
27
28
  **kwargs: Any,
28
29
  ) -> List[Annotation]:
29
30
  """Add text labels to top SNPs in the regional plot.
@@ -41,6 +42,8 @@ def add_snp_labels(
41
42
  genes_df: Unused, kept for backward compatibility.
42
43
  chrom: Unused, kept for backward compatibility.
43
44
  max_label_length: Maximum label length before truncation.
45
+ adjust: If True, run adjustText immediately. If False, caller must
46
+ call adjust_snp_labels() after setting axis limits.
44
47
 
45
48
  Returns:
46
49
  List of matplotlib text annotation objects.
@@ -101,21 +104,43 @@ def add_snp_labels(
101
104
  )
102
105
  texts.append(text)
103
106
 
104
- # Only use adjustText when there are multiple labels to avoid overlap
105
- if len(texts) > 1:
106
- try:
107
- from adjustText import adjust_text
108
-
109
- adjust_text(
110
- texts,
111
- ax=ax,
112
- arrowprops=dict(arrowstyle="-", color="gray", lw=0.5),
113
- expand_points=(1.5, 1.5),
114
- )
115
- except ImportError:
116
- logger.warning(
117
- "adjustText not installed - SNP labels may overlap. "
118
- "Install with: pip install adjustText"
119
- )
107
+ if adjust:
108
+ adjust_snp_labels(ax, texts)
120
109
 
121
110
  return texts
111
+
112
+
113
def adjust_snp_labels(ax: Axes, texts: List[Annotation]) -> None:
    """Reposition SNP labels so they do not overlap each other.

    Call this AFTER all axis limits are final: adjustText uses the current
    plot bounds to keep labels inside the visible area.

    Args:
        ax: Matplotlib axes object.
        texts: List of text annotation objects from add_snp_labels().

    Example:
        >>> texts = add_snp_labels(ax, df, adjust=False)
        >>> ax.set_xlim(start, end)
        >>> ax.set_ylim(0, max_y)
        >>> adjust_snp_labels(ax, texts)
    """
    # A single label (or none) cannot overlap anything; nothing to do.
    if len(texts) <= 1:
        return

    try:
        # adjustText is an optional dependency; degrade gracefully without it.
        from adjustText import adjust_text

        connector_style = dict(arrowstyle="-", color="gray", lw=0.5)
        adjust_text(
            texts,
            ax=ax,
            arrowprops=connector_style,
            expand_points=(1.5, 1.5),
        )
    except ImportError:
        logger.warning(
            "adjustText not installed - SNP labels may overlap. "
            "Install with: pip install adjustText"
        )
pylocuszoom/ld.py CHANGED
@@ -16,6 +16,72 @@ from .logging import logger
16
16
  from .utils import validate_plink_files
17
17
 
18
18
 
19
def build_pairwise_ld_command(
    plink_path: str,
    bfile_path: str,
    output_path: str,
    snp_list_file: Optional[str] = None,
    chrom: Optional[int] = None,
    start: Optional[int] = None,
    end: Optional[int] = None,
    species: Optional[str] = "canine",
    metric: str = "r2",
) -> list:
    """Assemble the PLINK argument list for an N x N pairwise LD matrix.

    Produces the command line for PLINK's --r2 square (or --r dprime square)
    mode, optionally restricted to a SNP list and/or a genomic region.

    Args:
        plink_path: Path to PLINK executable.
        bfile_path: Input binary fileset prefix (.bed/.bim/.fam).
        output_path: Output prefix (creates .ld and .snplist files).
        snp_list_file: Path to file with SNP IDs to extract (one per line).
        chrom: Chromosome number for region-based extraction.
        start: Start position (bp) for region-based extraction.
        end: End position (bp) for region-based extraction.
        species: Species flag ('canine', 'feline', or None for human).
        metric: LD metric ('r2' or 'dprime').

    Returns:
        List of command arguments for subprocess.
    """
    args = [plink_path]

    # Non-human genomes need an explicit chromosome-set flag.
    if species == "canine":
        args += ["--dog"]
    elif species == "feline":
        args += ["--chr-set", "18"]

    args += ["--bfile", bfile_path, "--out", output_path]

    # 'square' requests the full N x N matrix rather than pairwise rows.
    if metric == "dprime":
        args += ["--r", "dprime", "square"]
    else:
        args += ["--r2", "square"]

    # The .snplist output records the row/column order of the matrix.
    args.append("--write-snplist")

    # Optional SNP-ID extraction file.
    if snp_list_file:
        args += ["--extract", snp_list_file]

    # Optional genomic-region restriction.
    if chrom is not None:
        args += ["--chr", str(chrom)]
    if start is not None:
        args += ["--from-bp", str(start)]
    if end is not None:
        args += ["--to-bp", str(end)]

    return args
83
+
84
+
19
85
  def find_plink() -> Optional[str]:
20
86
  """Find PLINK executable on PATH.
21
87
 
@@ -84,6 +150,51 @@ def build_ld_command(
84
150
  return cmd
85
151
 
86
152
 
153
def parse_pairwise_ld_output(
    ld_file: str, snplist_file: str
) -> tuple[pd.DataFrame, list[str]]:
    """Parse PLINK pairwise LD matrix output files.

    PLINK --r2 square outputs:
        - .ld file: N x N matrix of R2/D' values (whitespace-separated,
          no headers)
        - .snplist file: SNP IDs in order (one per line)

    Args:
        ld_file: Path to .ld output file (square matrix).
        snplist_file: Path to .snplist output file (SNP IDs).

    Returns:
        Tuple of (DataFrame with R2/D' values, list of SNP IDs).
        DataFrame has SNP IDs as both index and columns.
        Returns (empty DataFrame, empty list) if files not found.
    """
    # Missing output usually means PLINK failed upstream; degrade gracefully.
    if not (os.path.exists(ld_file) and os.path.exists(snplist_file)):
        return pd.DataFrame(), []

    # SNP IDs define the row/column labels of the matrix, in file order.
    with open(snplist_file) as handle:
        snp_ids = [entry.strip() for entry in handle if entry.strip()]

    if not snp_ids:
        return pd.DataFrame(), []

    # Headerless whitespace-separated matrix; cells may be numeric or 'nan'.
    ld_matrix = pd.read_csv(
        ld_file,
        sep=r"\s+",
        header=None,
        names=snp_ids,
        index_col=False,
    )
    # Label rows to match the columns so .loc[snp_a, snp_b] works.
    ld_matrix.index = snp_ids

    return ld_matrix, snp_ids
196
+
197
+
87
198
  def parse_ld_output(ld_file: str, lead_snp: str) -> pd.DataFrame:
88
199
  """Parse PLINK .ld output file.
89
200
 
@@ -208,3 +319,131 @@ def calculate_ld(
208
319
  # Clean up temp directory
209
320
  if cleanup_working_dir and os.path.exists(working_dir):
210
321
  shutil.rmtree(working_dir, ignore_errors=True)
322
+
323
+
324
def calculate_pairwise_ld(
    bfile_path: str,
    snp_list: list[str] | None = None,
    chrom: int | None = None,
    start: int | None = None,
    end: int | None = None,
    plink_path: str | None = None,
    working_dir: str | None = None,
    species: str = "canine",
    metric: str = "r2",
) -> tuple[pd.DataFrame, list[str]]:
    """Calculate pairwise LD matrix for a set of variants.

    Runs PLINK --r2 square to compute an N x N LD matrix, suitable for
    LD heatmap visualization.

    Args:
        bfile_path: Path to PLINK binary fileset (.bed/.bim/.fam prefix).
        snp_list: List of SNP IDs to compute pairwise LD between.
        chrom: Chromosome number for region-based extraction.
        start: Start position (bp) for region-based extraction.
        end: End position (bp) for region-based extraction.
        plink_path: Path to PLINK executable. Auto-detects if None.
        working_dir: Directory for PLINK output files. Uses temp dir if None.
        species: Species flag ('canine', 'feline', or None for human).
        metric: LD metric ('r2' or 'dprime').

    Returns:
        Tuple of (LD matrix DataFrame, list of SNP IDs).
        DataFrame has SNP IDs as both index and columns.
        Returns (empty DataFrame, empty list) if PLINK fails.

    Raises:
        FileNotFoundError: If PLINK executable not found.
        ValidationError: If PLINK binary files (.bed/.bim/.fam) are missing.
        ValidationError: If requested SNPs are not found in reference panel.

    Example:
        >>> matrix, snp_ids = calculate_pairwise_ld(
        ...     bfile_path="/path/to/genotypes",
        ...     snp_list=["rs1", "rs2", "rs3"],
        ... )
        >>> # matrix is 3x3 DataFrame with LD values
        >>> matrix.loc["rs1", "rs2"]  # LD between rs1 and rs2
    """
    # NOTE(review): elsewhere in this package ValidationError is imported
    # from .exceptions, not .utils — confirm .utils re-exports it, otherwise
    # this raises ImportError at call time.
    from .utils import ValidationError

    # Find PLINK executable (explicit path wins over PATH auto-detection).
    if plink_path is None:
        plink_path = find_plink()
        if plink_path is None:
            raise FileNotFoundError(
                "PLINK not found. Install PLINK 1.9 or specify plink_path."
            )

    logger.debug(f"Using PLINK at {plink_path}")

    # Validate PLINK files exist before spawning the subprocess.
    validate_plink_files(bfile_path)

    # Use temp directory if working_dir not specified; only auto-created
    # directories are removed in the finally block below.
    cleanup_working_dir = False
    if working_dir is None:
        working_dir = tempfile.mkdtemp(prefix="snp_scope_pairwise_ld_")
        cleanup_working_dir = True

    try:
        os.makedirs(working_dir, exist_ok=True)
        output_prefix = os.path.join(working_dir, "pairwise_ld")

        # Write SNP list to file if provided (PLINK --extract reads a file,
        # one SNP ID per line).
        snp_list_file = None
        if snp_list:
            snp_list_file = os.path.join(working_dir, "snp_list.txt")
            with open(snp_list_file, "w") as f:
                for snp in snp_list:
                    f.write(f"{snp}\n")

        # Build and run PLINK command
        cmd = build_pairwise_ld_command(
            plink_path=plink_path,
            bfile_path=bfile_path,
            output_path=output_prefix,
            snp_list_file=snp_list_file,
            chrom=chrom,
            start=start,
            end=end,
            species=species,
            metric=metric,
        )

        logger.debug(f"Running PLINK command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            cwd=working_dir,
            capture_output=True,
            text=True,
        )

        # A failed PLINK run is treated as best-effort (empty result), not
        # an exception; callers detect failure via the empty DataFrame.
        if result.returncode != 0:
            logger.warning(
                f"PLINK pairwise LD calculation failed: {result.stderr[:200]}"
            )
            return pd.DataFrame(), []

        # Parse the .ld matrix and the .snplist ordering file.
        ld_file = f"{output_prefix}.ld"
        snplist_file = f"{output_prefix}.snplist"

        matrix, found_snps = parse_pairwise_ld_output(ld_file, snplist_file)

        # Validate all requested SNPs were found; any missing SNP means the
        # reference panel cannot supply LD for it, so fail loudly.
        if snp_list:
            missing_snps = set(snp_list) - set(found_snps)
            if missing_snps:
                raise ValidationError(
                    f"SNPs not found in reference panel: {', '.join(sorted(missing_snps))}"
                )

        return matrix, found_snps

    finally:
        # Clean up temp directory (runs on both success and failure paths,
        # but only for directories this function created itself).
        if cleanup_working_dir and os.path.exists(working_dir):
            shutil.rmtree(working_dir, ignore_errors=True)