pycmplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
1
+ """
2
+ pycmplot.plotting.linear
3
+ ========================
4
+ Multi-track linear Manhattan plot.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from typing import Optional
11
+
12
+ import matplotlib.colors as mcolors
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ from matplotlib.patches import FancyArrowPatch
16
+ from natsort import natsort_keygen
17
+
18
+ from pycmplot.constants import CHROM_ORDER
19
+ from pycmplot.stats import get_highlight_snps
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Annotation helpers (cluster-aware label spreading)
26
+ # ---------------------------------------------------------------------------
27
+
28
+ def _cluster_annotations_by_chr(
29
+ annot_df,
30
+ chr_col: str = "CHR",
31
+ x_col: str = "x",
32
+ window_size: float = 50e6,
33
+ ) -> list[list]:
34
+ """Cluster annotations within each chromosome by genomic proximity."""
35
+ clusters: list[list] = []
36
+ for _chr_name, df_chr in annot_df.groupby(chr_col):
37
+ df_chr = df_chr.sort_values(x_col)
38
+ current_cluster = [df_chr.index[0]]
39
+ last_x = df_chr.iloc[0][x_col]
40
+
41
+ for idx, row in df_chr.iloc[1:].iterrows():
42
+ x = row[x_col]
43
+ if x - last_x <= window_size:
44
+ current_cluster.append(idx)
45
+ else:
46
+ clusters.append(current_cluster)
47
+ current_cluster = [idx]
48
+ last_x = x
49
+
50
+ clusters.append(current_cluster)
51
+ return clusters
52
+
53
+
54
def _draw_annotation_arrows(
    ax,
    annot_df,
    chr_col: str,
    label_col: str,
    offsets: dict,
    chr_max: dict,
    spread_width: float = 60e6,
    y_tip: float = 0.0,
    y_text: float = 0.55,
) -> None:
    """Draw angled FancyArrowPatch arrows from text labels to signal positions.

    Labels of one chromosome are laid out on evenly spaced slots, starting one
    slot to the left of the chromosome's first signal.  The slot width starts
    at *spread_width* and is narrowed for chromosomes shorter than that.  A
    chromosome's slots are shifted right until they no longer collide with the
    last slot used by the previous chromosome.

    Parameters
    ----------
    ax:
        Matplotlib axes receiving the arrows and the label text.
    annot_df:
        Annotation DataFrame with *chr_col*, *label_col* and a cumulative
        ``"x"`` column.
    chr_col, label_col:
        Column names for the chromosome and the label text.
    offsets, chr_max:
        Per-chromosome x-offset and maximum position, keyed by chromosome name.
    spread_width:
        Preferred horizontal distance between neighbouring labels.
    y_tip, y_text:
        Y coordinate of the arrow tip (drawn 0.05 below *y_tip*) and of the
        label anchor.

    Raises
    ------
    ValueError
        If *spread_width* is not positive.
    """
    if spread_width <= 0:
        raise ValueError("spread_width must be positive")
    if annot_df.empty:
        return

    annot_df = annot_df.sort_values(by=[chr_col, "x"], key=natsort_keygen())
    last_xtext = 0 - spread_width

    # Granularity used when narrowing the spread.  The original derived this
    # from the first two characters of ``str(spread_width)``, which only works
    # for some magnitudes; spread_width / 120 reproduces the default (500 kb
    # for 60 Mb) without parsing a float's text form.
    shrink_step = spread_width / 120.0

    for chr_name, df_chr in annot_df.groupby(chr_col, sort=False):
        df_chr = df_chr.sort_values("x")
        chr_start = offsets[chr_name]
        chr_end = offsets[chr_name] + chr_max[chr_name]

        x_signals = df_chr["x"].values
        labels = df_chr[label_col].values
        n = len(df_chr)

        # Adaptive spread: shrink in whole steps until it fits the chromosome,
        # but never below one step so the slot width stays strictly positive
        # (a zero width used to break np.arange and loop forever below).
        chr_range = chr_end - chr_start
        sw = spread_width
        if sw > chr_range:
            n_steps = int(np.ceil((sw - chr_range) / shrink_step))
            sw = max(sw - n_steps * shrink_step, shrink_step)

        # Exactly n slots; np.arange with a float step may yield n-1 or n+1.
        x_texts = (x_signals[0] - sw) + sw * np.arange(n)

        # Shift right by whole slots until clear of the previous chromosome.
        overlap = last_xtext - x_texts[0]
        if overlap >= 0:
            x_texts = x_texts + (np.floor(overlap / sw) + 1) * sw

        for x_sig, x_txt, label in zip(x_signals, x_texts, labels):
            dx = x_txt - x_sig
            # Bend the arrow towards the side the label sits on.
            rad = 0.15 * np.sign(dx)

            arrow = FancyArrowPatch(
                (x_txt, y_text),
                (x_sig, y_tip - 0.05),
                arrowstyle="-|>",
                mutation_scale=12,
                lw=0.6,
                color="grey",
                alpha=0.5,
                connectionstyle=f"arc3,rad={rad}",
            )
            ax.add_patch(arrow)

            ax.text(
                x_txt,
                y_text + 0.02,
                str(label),
                rotation=45,
                ha="left",
                va="bottom",
                fontsize=10,
                clip_on=False,
                color="black",
                fontstyle="italic",
                fontweight="regular",
            )

        last_xtext = x_texts[-1]
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Public function
130
+ # ---------------------------------------------------------------------------
131
+
132
def plot_linear(
    tracks: list,
    track_labels: Optional[list[str]] = None,
    annot_df=None,
    highlight: bool = False,
    highlight_thresh: float = 1e-7,
    chr_col: str = "CHR",
    pos_col: str = "BP",
    p_col: str = "P",
    trim_pval: Optional[float] = None,
    logp: bool = True,
    label_col: str = "label",
    chr_order: Optional[list[str]] = None,
    chr_spacing: float = 9e6,
    track_heights: Optional[list[float]] = None,
    track_spacing: float = 0.10,
    point_size: float = 5,
    colors: Optional[list[str]] = None,
    sig_lines: Optional[list[dict]] = None,
    plot_title: Optional[str] = None,
    fig_format: Optional[str] = None,
    dpi: int = 300,
    figsize: tuple = (15, 9),
):
    """Generate a multi-track linear Manhattan plot.

    Parameters
    ----------
    tracks:
        List of DataFrames, one per GWAS trait.  Each must have columns
        *chr_col*, *pos_col*, and *p_col*.
    track_labels:
        Y-axis labels for each track.  Missing labels default to
        ``"Track <n>"``.
    annot_df:
        Optional DataFrame of lead SNPs to annotate (must contain *chr_col*,
        *pos_col*, *label_col*).  A *p_col* column is optional; when present
        it is trimmed/transformed like the tracks.
    highlight:
        Highlight loci within ``500 kb`` of a lead SNP.
    highlight_thresh:
        P-value cutoff passed to the lead-SNP search when *highlight* is set.
    chr_col, pos_col, p_col:
        Column names for chromosome, base-pair position and p-value.
    trim_pval:
        When truthy, keep only rows whose p-value is below this value.
    logp:
        Plot ``-log10(p)`` (stored in a ``logP`` column) instead of raw
        *p_col* values.
    label_col:
        Column to use in the annot_df e.g. column containing gene names.
    chr_order:
        Chromosome names in plotting order; defaults to ``CHROM_ORDER``.
        Rows on other chromosomes are dropped.
    chr_spacing:
        Gap (bp) inserted between chromosomes on the x-axis.
    track_heights:
        Height ratios for the annotation row followed by each track; defaults
        to ``[1, 3, 3, ...]``.
    track_spacing:
        Vertical spacing (``hspace``) between rows.
    point_size:
        Scatter marker size.
    colors:
        Colours alternated between consecutive chromosomes.
    sig_lines:
        List of ``{"genome": float, "suggestive": float}`` dicts, one per track.
    plot_title:
        Output file path (extension determines format when *fig_format* is ``None``).
    fig_format:
        Override output format (e.g. ``'png'``, ``'pdf'``).
    dpi, figsize:
        Resolution used when saving and figure size in inches.

    Returns
    -------
    (fig, axes)
        The figure and its axes; ``axes[0]`` is the annotation row.
    """
    if chr_order is None:
        chr_order = CHROM_ORDER

    chr_to_idx = {c: i for i, c in enumerate(chr_order)}

    # ------------------------------------------------------------------
    # Prep DataFrames
    # ------------------------------------------------------------------
    def _prep(df, is_track: bool = True):
        """Normalise one frame; p-value steps are optional for annotations."""
        df = df.copy()
        # Annotation frames are only required to carry chr/pos/label, so the
        # p-value handling must not assume p_col exists there.
        has_p = is_track or p_col in df.columns
        if has_p and trim_pval:
            df = df[df[p_col] < trim_pval].copy()
        if has_p and logp:
            df["logP"] = -np.log10(df[p_col])

        # Upper-case first so "chr1", "Chr1" and "CHR1" all lose the prefix.
        df[chr_col] = (
            df[chr_col]
            .astype(str)
            .str.upper()
            .str.replace(r"^CHR", "", regex=True)
            .replace({"23": "X", "24": "Y", "M": "MT", "MTDNA": "MT"})
        )

        if is_track and highlight:
            # get_highlight_snps reads fixed "CHR"/"POS"/"P" columns, so hand
            # it a view under those names and copy the flag back by position.
            src_cols = [chr_col, pos_col, p_col]
            std_cols = ["CHR", "POS", "P"]
            if "logP" in df.columns:
                src_cols.append("logP")
                std_cols.append("logP")
            view = df[src_cols].copy()
            view.columns = std_cols
            marked, _ = get_highlight_snps(
                df=view,
                window=500_000,
                highlight_thresh=highlight_thresh,
                logp=logp,
            )
            df["in_locus"] = marked["in_locus"].to_numpy()

        df = df[df[chr_col].isin(chr_order)].copy()
        df["chr_idx"] = df[chr_col].map(chr_to_idx)
        return df.sort_values(["chr_idx", pos_col])

    tracks = [_prep(df) for df in tracks]
    if annot_df is not None:
        annot_df = _prep(annot_df, is_track=False)

    # ------------------------------------------------------------------
    # Cumulative x-axis positions
    # ------------------------------------------------------------------
    observed_max: dict = {}
    for df in tracks:
        if df.empty:
            continue
        for chrom, top in df.groupby(chr_col)[pos_col].max().items():
            observed_max[chrom] = max(observed_max.get(chrom, 0), top)

    chr_max: dict[str, float] = {}
    offsets: dict[str, float] = {}
    offset = 0.0
    for c in chr_order:
        max_pos = observed_max.get(c, 0)
        chr_max[c] = max_pos
        offsets[c] = offset
        offset += max_pos + chr_spacing

    def _add_cum(df):
        """Add the genome-wide ``x`` coordinate (position + chromosome offset)."""
        df = df.copy()
        df["x"] = df[pos_col] + df[chr_col].map(offsets)
        return df

    tracks = [_add_cum(df) for df in tracks]
    if annot_df is not None:
        annot_df = _add_cum(annot_df)

    # ------------------------------------------------------------------
    # Figure layout
    # ------------------------------------------------------------------
    n_tracks = len(tracks)

    if track_heights is None:
        track_heights = [1] + [3] * n_tracks

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        n_tracks + 1, 1,
        height_ratios=track_heights,
        hspace=track_spacing,
    )

    ax_annot = fig.add_subplot(gs[0, 0])
    axes = [ax_annot]
    for i in range(n_tracks):
        axes.append(fig.add_subplot(gs[i + 1, 0], sharex=ax_annot))

    if colors is None:
        colors = ["gray", "steelblue"]

    # ------------------------------------------------------------------
    # Plot data tracks
    # ------------------------------------------------------------------
    # Pad missing labels so a short track_labels list cannot silently drop
    # tracks from the zip below.
    t_labels = list(track_labels or [])
    t_labels += [f"Track {i+1}" for i in range(len(t_labels), n_tracks)]

    left_pad = chr_spacing * 0.2
    xmax = max((offsets[c] + chr_max[c] for c in chr_order), default=0.0)

    for i, (ax, df, t_label) in enumerate(zip(axes[1:], tracks, t_labels)):
        # Filter first: the colour list must match the rows actually drawn.
        df = df[df[p_col] >= 0]
        color_cycle = [colors[int(j) % len(colors)] for j in df["chr_idx"]]

        y_vals = df["logP"] if logp else df[p_col]
        ax.scatter(df["x"], y_vals, c=color_cycle, s=point_size)

        if highlight:
            sig = df[df["in_locus"]]
            if not sig.empty:
                sig_y = sig["logP"] if logp else sig[p_col]
                ax.scatter(sig["x"].to_numpy(), sig_y.to_numpy(), s=point_size,
                           marker="o", color="brown")

        ax.set_ylabel(t_label, color="black")

        if sig_lines is not None and i < len(sig_lines):
            sl = sig_lines[i]
            if "genome" in sl:
                ax.axhline(y=sl["genome"], color="red", linestyle="--", linewidth=0.6)
            if "suggestive" in sl:
                ax.axhline(y=sl["suggestive"], color="grey", linestyle="--", linewidth=0.5)

        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

        ax.set_xlim(-left_pad, xmax)

    # ------------------------------------------------------------------
    # Annotation track
    # ------------------------------------------------------------------
    if annot_df is not None:
        # Vertical lines across all data tracks
        for x in annot_df["x"].values:
            for ax in axes[1:]:
                ax.axvline(x, color="grey", alpha=0.45, linewidth=0.7,
                           linestyle="--", zorder=0)

        _draw_annotation_arrows(
            ax_annot,
            annot_df,
            chr_col=chr_col,
            label_col=label_col,
            offsets=offsets,
            chr_max=chr_max,
            spread_width=60e6,
        )

    ax_annot.set_ylim(0, 1)
    ax_annot.axis("off")

    # ------------------------------------------------------------------
    # Chromosome labels on x-axis
    # ------------------------------------------------------------------
    xticks, xlabels = [], []
    for c in chr_order:
        if chr_max[c] == 0:
            continue
        start = offsets[c]
        end = offsets[c] + chr_max[c]
        mid = (start + end) / 2
        xticks.append(mid)
        xlabels.append(c)
        for ax in axes:
            ax.axvline(end, color="lightgray", linewidth=0.1, alpha=0.05)

    axes[-1].set_xticks(xticks)
    axes[-1].set_xticklabels(xlabels)
    axes[-1].set_xlabel("Chromosome", fontsize=12)

    for ax in axes[:-1]:
        ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False)
        ax.spines["bottom"].set_visible(False)

    plt.subplots_adjust(hspace=track_spacing, left=0.08)
    plt.tight_layout()

    fig.text(
        0.03, 0.5,
        "-log\u2081\u2080(P)" if logp else p_col,
        va="center",
        rotation="vertical",
        fontsize=12,
    )

    if plot_title:
        # Imported here because this module does not import Path at the top.
        from pathlib import Path

        fmt = fig_format or Path(plot_title).suffix.lstrip(".") or "png"
        fig.savefig(plot_title, format=fmt, dpi=dpi)
        logger.info("Saved linear Manhattan plot: %s", plot_title)

    return fig, axes
pycmplot/resources.py ADDED
@@ -0,0 +1,116 @@
1
+ """
2
+ pycmplot.resources
3
+ ==================
4
+ Centralised configuration for external resource files that cannot be bundled
5
+ with the package (large reference files, chain files, etc.).
6
+
7
+ Users can supply paths in three ways, in order of priority:
8
+
9
+ 1. Pass a :class:`ResourceConfig` instance directly to functions that need it.
10
+ 2. Set environment variables before running:
11
+
12
+ .. code-block:: bash
13
+
14
+ export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain
15
+ export PYCMPLOT_GENEINFO_HG38=/path/to/Homo_sapiens.GRCh38.geneinfo.tsv.gz
16
+ export PYCMPLOT_GENEINFO_HG19=/path/to/Homo_sapiens.GRCh37.geneinfo.tsv.gz
17
+ export PYCMPLOT_FEATURESINFO=/path/to/Homo_sapiens.GRCh38.features.tsv.gz
18
+
19
+ 3. Edit the defaults in this module for a site-wide installation.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ from dataclasses import dataclass, field
26
+ from pathlib import Path
27
+ from importlib.resources import files
28
+
29
+ # define _env
30
+ def _env(var: str, default: str | None = None) -> str | None:
31
+ return os.environ.get(var, default)
32
+
33
+ # define packaged data helper
34
+ def _pkg_data(filename: str) -> str:
35
+ return str(files("pycmplot.data") / filename)
36
+
37
+
38
+
39
@dataclass
class ResourceConfig:
    """Paths to external reference files used by pycmplot.

    Each field defaults to its ``PYCMPLOT_*`` environment variable and falls
    back to the file of the same name inside the ``pycmplot.data`` package.

    Attributes
    ----------
    chain_hg19_hg38 :
        LiftOver chain file for hg19 → hg38 conversion.
    geneinfo_hg38 :
        Tab-delimited gene info file for GRCh38 (used for nearest-gene annotation).
    geneinfo_hg19 :
        Tab-delimited gene info file for GRCh37 (fallback when data is hg19).

    Notes
    -----
    A ``featuresinfo`` field (extended features info file, all biotypes, for
    GRCh38) is currently disabled; see the commented-out definition below.
    """

    chain_hg19_hg38: str | None = field(
        default_factory=lambda: _env(
            "PYCMPLOT_CHAIN_HG19_HG38",
            _pkg_data("hg19ToHg38.over.chain"),
        )
    )
    geneinfo_hg38: str | None = field(
        default_factory=lambda: _env(
            "PYCMPLOT_GENEINFO_HG38",
            _pkg_data("Homo_sapiens.GRCh38.geneinfo.tsv.gz"),
        )
    )
    geneinfo_hg19: str | None = field(
        default_factory=lambda: _env(
            "PYCMPLOT_GENEINFO_HG19",
            _pkg_data("Homo_sapiens.GRCh37.geneinfo.tsv.gz"),
        )
    )
    # TODO: disabled field, kept for reference until it is re-enabled.
    #featuresinfo: str | None = field(
    #    default_factory=lambda: _env(
    #        "PYCMPLOT_FEATURESINFO",
    #        _pkg_data("featuresinfo.tsv.gz"),
    #    )
    #)

    @staticmethod
    def _packaged_path(filename: str) -> str | None:
        """Return an on-disk path for *filename* in ``pycmplot.data``, or ``None``."""
        import atexit
        from contextlib import ExitStack
        from importlib.resources import as_file

        try:
            resource = files("pycmplot.data") / filename
            if not resource.is_file():
                return None
        except (ModuleNotFoundError, FileNotFoundError, TypeError, ValueError):
            return None

        # as_file may extract to a temporary file (e.g. zipped installs); keep
        # it alive until interpreter exit so the returned path stays valid.
        stack = ExitStack()
        atexit.register(stack.close)
        return str(stack.enter_context(as_file(resource)))

    def require(self, attr: str) -> str:
        """Return the path for *attr*, raising a clear error if it is unusable.

        Parameters
        ----------
        attr:
            Name of one of the path fields of this class.

        Returns
        -------
        str
            The configured path if it exists, otherwise the packaged file of
            the same name from ``pycmplot.data`` if that exists.

        Raises
        ------
        FileNotFoundError
            If the field is unset/empty, or neither the configured path nor a
            packaged file of the same name exists.
        AttributeError
            If *attr* is not an attribute of this object.
        """
        val = getattr(self, attr)
        # Empty strings count as unset: Path("") is "." and always "exists".
        if not val:
            env_var = f"PYCMPLOT_{attr.upper()}"
            raise FileNotFoundError(
                f"Resource '{attr}' is not configured.\n"
                f"Set the environment variable {env_var} or pass a "
                f"ResourceConfig({attr}='/path/to/file') to the function."
            )

        path = Path(val)
        if path.exists():
            return str(path)

        # Fallback: look for a packaged file with the same base name.
        packaged = self._packaged_path(path.name)
        if packaged is not None:
            return packaged

        raise FileNotFoundError(
            f"Resource file not found: {val}\n"
            f"Check the path set for '{attr}'."
        )
113
+
114
+
115
+ # Module-level default instance — picks up environment variables automatically.
116
+ default_resources = ResourceConfig()
pycmplot/stats.py ADDED
@@ -0,0 +1,106 @@
1
+ """
2
+ pycmplot.stats
3
+ ==============
4
+ Statistical helper functions for identifying lead SNPs and loci to highlight.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
def get_lead_snps(
    df: pd.DataFrame,
    highlight_thresh: float = 5e-8,
    logp: bool = False,
    window: int = 500_000,
) -> pd.DataFrame:
    """Identify independent lead SNPs by greedy distance clumping.

    Starting from the most significant SNP, each subsequent SNP is kept only
    if it is > *window* bp away from all previously kept leads on the same
    chromosome.

    Parameters
    ----------
    df:
        Summary statistics DataFrame containing columns ``CHR``, ``POS``,
        and ``P`` (or ``logP`` when *logp* is ``True``).
    highlight_thresh:
        P-value significance cutoff.  It is always given on the p-value
        scale; when *logp* is ``True`` it is converted with ``-log10`` before
        being compared against ``logP``.
    logp:
        If ``True``, filter and rank by the ``logP`` column instead of ``P``.
    window:
        Clumping window in base-pairs (default 500 kb).

    Returns
    -------
    pd.DataFrame
        Rows of *df* that are lead SNPs, most significant first, with the
        original index labels, columns and dtypes.  Empty (but with all
        columns) when nothing passes the threshold.
    """
    if logp:
        thresh = -np.log10(float(highlight_thresh))
        sig = df[df["logP"] >= thresh]
        rank_col, ascending = "logP", False
    else:
        sig = df[df["P"] <= highlight_thresh]
        rank_col, ascending = "P", True

    # Stable sort keeps tied SNPs in input order, so results are reproducible.
    sig = sig.sort_values(rank_col, ascending=ascending, kind="mergesort")

    chroms = sig["CHR"].to_numpy()
    positions = sig["POS"].to_numpy()
    available = np.ones(len(sig), dtype=bool)
    lead_rows: list[int] = []

    # Walk candidates once in significance order.  Advancing by position
    # (rather than re-filtering until empty) guarantees termination even when
    # a row cannot clump itself away, e.g. a missing POS.
    for i in range(len(sig)):
        if not available[i]:
            continue
        lead_rows.append(i)
        in_window = (chroms == chroms[i]) & (np.abs(positions - positions[i]) <= window)
        available &= ~in_window

    return sig.iloc[lead_rows].copy()
66
+
67
+
68
def get_highlight_snps(
    df: pd.DataFrame,
    highlight_thresh: float = 5e-8,
    logp: bool = False,
    window: int = 500_000,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Mark all SNPs within *window* bp of a lead SNP.

    Adds an ``in_locus`` boolean column to a copy of *df* and returns the
    annotated DataFrame together with the lead SNP DataFrame.

    Parameters
    ----------
    df, highlight_thresh, window:
        See :func:`get_lead_snps`.
    logp:
        Rank leads by the ``logP`` column.  It is honoured only when *df*
        actually has a ``logP`` column; otherwise ``P`` is used, which keeps
        callers that pass ``logp=True`` with p-values only working.

    Returns
    -------
    (df_annotated, leads_df)
    """
    df = df.copy()
    df["in_locus"] = False

    leads_df = get_lead_snps(
        df=df,
        highlight_thresh=highlight_thresh,
        # Previously hard-coded to False, which silently ignored *logp*.
        logp=bool(logp) and "logP" in df.columns,
        window=window,
    )

    # Guard: an empty result may not carry the CHR/POS columns.
    if not leads_df.empty:
        for chrom, lead_pos in zip(leads_df["CHR"], leads_df["POS"]):
            near_lead = (df["CHR"] == chrom) & df["POS"].between(
                lead_pos - window, lead_pos + window
            )
            df.loc[near_lead, "in_locus"] = True

    return df, leads_df