pycmplot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycmplot/__init__.py ADDED
@@ -0,0 +1,43 @@
1
"""
pycmplot
========
Multi-track circular and linear Manhattan plot generation for GWAS summary statistics.

Quickstart
----------
Command-line::

    pycmplot -s file1.gz,file2.gz -l HbF,MCV --logp --mode lm

Python API::

    from pycmplot.plotting.linear import plot_linear
    from pycmplot.plotting.circular import plot_circular
    from pycmplot.stats import get_lead_snps
    from pycmplot.annotation import get_hits_summary_table

Public surface
--------------
The following names are re-exported at package level, so each of them can
also be imported directly with ``from pycmplot import <name>``:

- ``plot_linear`` (from ``pycmplot.plotting.linear``)
- ``plot_circular``, ``compute_track_radii_dict``
  (from ``pycmplot.plotting.circular``)
- ``get_lead_snps``, ``get_highlight_snps`` (from ``pycmplot.stats``)
- ``get_sumstats_and_merged_sector_list`` (from ``pycmplot.io``)
- ``get_hits_summary_table`` (from ``pycmplot.annotation``)
- ``hg38_chr_lengths``, ``BIOTYPE_WEIGHTS`` (from ``pycmplot.constants``)
- ``ResourceConfig`` (from ``pycmplot.resources``)

The package version string is available as ``pycmplot.__version__``.
"""

# Re-exports: importing the package eagerly imports each of these submodules.
from pycmplot.plotting.linear import plot_linear
from pycmplot.plotting.circular import plot_circular, compute_track_radii_dict
from pycmplot.stats import get_lead_snps, get_highlight_snps
from pycmplot.io import get_sumstats_and_merged_sector_list
from pycmplot.annotation import get_hits_summary_table
from pycmplot.constants import hg38_chr_lengths, BIOTYPE_WEIGHTS
from pycmplot.resources import ResourceConfig

# Names exposed by ``from pycmplot import *``; mirrors the imports above.
__all__ = [
    "plot_linear",
    "plot_circular",
    "compute_track_radii_dict",
    "get_lead_snps",
    "get_highlight_snps",
    "get_sumstats_and_merged_sector_list",
    "get_hits_summary_table",
    "hg38_chr_lengths",
    "BIOTYPE_WEIGHTS",
    "ResourceConfig",
]

__version__ = "0.1.0"
pycmplot/_core.py ADDED
@@ -0,0 +1,419 @@
1
+ """
2
+ pycmplot._core
3
+ ==============
4
+ Main entry point — orchestrates CLI parsing, data loading, and plotting.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import re
11
+ import sys
12
+ import warnings
13
+ from pathlib import Path
14
+
15
+ import numpy as np
16
+
17
# Suppress noisy font-manager warnings before any matplotlib import:
# only ERROR and above from the "matplotlib.font_manager" logger get through.
logging.getLogger("matplotlib.font_manager").setLevel(logging.ERROR)
# NOTE(review): no category or module filter is given, so this call ignores
# *every* Python warning in the process from the moment this module is
# imported, not just matplotlib's.
warnings.filterwarnings("ignore")

# Root logger configuration: INFO level, messages rendered as "[LEVEL] text".
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
# Module-level logger used by main() for progress messages.
logger = logging.getLogger(__name__)
23
+
24
+
25
# Header names recognised for each required column. They are consulted when
# the CLI does not name a column explicitly, or when the named column is not
# present in a given file.
_CHR_NAMES = (
    "CHR", "CHROM", "Chromosome", "#CHROM", "#CHR",
    "Chrom", "chrom", "chr", "chromosome", "#chr", "#chrom",
)
_POS_NAMES = ("BP", "POS", "bp", "pos", "Basepair")
_SNP_NAMES = (
    "SNP", "RSID", "rsID", "MarkerName", "MarkerID",
    "Predictor", "Marker", "SNPID", "ID",
)
_PVAL_NAMES = ("P", "P-value", "Wald_P", "pvalue", "p_val", "pval")
_BUILD_NAMES = ("BUILD", "Genome", "Genome_Build", "Genome-build")


def _pick_column(header, preferred: str | None, fallbacks, role: str, fpath: str) -> str:
    """Return the header column to use for *role*, or exit with a clear error.

    An explicitly requested column (*preferred*) wins whenever it is present
    in *header*. Otherwise the first header column whose name appears in
    *fallbacks* is used, in header order.

    Raises:
        SystemExit: if neither *preferred* nor any fallback name is found.
            The message names the missing role and the names that were tried.
    """
    columns = list(header)
    if preferred and preferred in columns:
        return preferred

    known = set(fallbacks)  # built once, not once per header column
    for col in columns:
        if col in known:
            return col

    tried = ([preferred] if preferred else []) + list(fallbacks)
    sys.exit(
        f"Error: could not find a required column in {fpath}.\n"
        f" Header: {columns}\n"
        f" Details: no {role} column found (tried: {', '.join(tried)})"
    )


def _annotation_label(row, annotate):
    """Return ``(label, font_style)`` for one hits-table row.

    Any *annotate* value other than "GENE" (case-insensitive) labels the hit
    with its SNP identifier in a normal font; "GENE" uses a gene name in
    italics.
    """
    if str(annotate).upper() != "GENE":
        return row["SNP"], "normal"
    # NOTE(review): genic hits are labelled with ``nearest_upstream_gene`` and
    # non-genic hits with ``top_gene``; this looks like it may be inverted --
    # confirm against the hits-table schema. Behaviour kept as in the original.
    gene = row["nearest_upstream_gene"] if row["genic"] else row["top_gene"]
    return gene, "italic"


def main() -> None:
    """CLI entry point — ``pycmplot`` console script.

    Steps, in order:

    1. Parse the command line and print the banner.
    2. Create the output directory and derive the output file names from the
       plot title, the track labels, the mode and the p-value scale.
    3. For every summary-statistics file, detect the delimiter (unless one
       was given) and map its chromosome / position / SNP / p-value / build
       columns to the canonical names ``CHR``, ``POS``, ``SNP``, ``P`` and
       ``BUILD``.
    4. Load the data via ``get_sumstats_and_merged_sector_list``.
    5. Draw a circular Manhattan plot when the mode is ``CM``
       (case-insensitive); any other mode draws a linear plot.

    Raises:
        SystemExit: if the number of files and labels differ, or if a
            required column cannot be found in a file's header.
    """

    # ------------------------------------------------------------------
    # Deferred imports so ``import pycmplot`` remains fast
    # ------------------------------------------------------------------
    from pycirclize import Circos

    from pycmplot.cli import get_arguments, DESCMSG
    from pycmplot.io import (
        get_sumstats_and_merged_sector_list,
        detect_delimiter,
        resolve_delimiter,
        get_file_header,
    )
    from pycmplot.plotting.linear import plot_linear
    from pycmplot.plotting.circular import plot_circular, compute_track_radii_dict
    from pycmplot.resources import ResourceConfig

    # ------------------------------------------------------------------
    # Parse CLI
    # ------------------------------------------------------------------
    args = get_arguments(DESCMSG)
    print(DESCMSG)

    mode = args.mode
    logp = args.logp
    annotate = args.annotate
    highlight = args.highlight
    highlight_thresh = args.highlight_thresh
    signif_line = args.signif_line
    plot_title = args.plot_title
    output_format = args.output_format
    dpi = args.dpi
    r_min = args.r_min

    # ------------------------------------------------------------------
    # Resolve delimiter (None means: autodetect per file)
    # ------------------------------------------------------------------
    sep = resolve_delimiter(args.delim) if args.delim else None

    # ------------------------------------------------------------------
    # Output paths
    # ------------------------------------------------------------------
    out_path = Path(args.output_dir).resolve()
    out_path.mkdir(parents=True, exist_ok=True)

    # A missing title must not crash the file-name derivation; the title is
    # optional further down as well (``if plot_title:``).
    pltitle = re.sub(r"[^a-zA-Z0-9\s]", "", plot_title or "").replace(" ", "_")
    labels = [lbl.strip() for lbl in args.labels.strip().split(",")]
    sum_stats = [s.strip() for s in args.sum_stats.strip().split(",")]

    if len(sum_stats) != len(labels):
        sys.exit(
            "Error: number of summary stats files and labels must match.\n"
            f" Files: {sum_stats}\n"
            f" Labels: {labels}"
        )

    plt_base = str(out_path / f"{pltitle}_{'_'.join(labels)}_{mode.lower()}")
    suffix = "_logp" if logp else "_pval"
    plt_name = f"{plt_base}{suffix}.{output_format.lower()}"
    table_out = f"{plt_base}{suffix}_locus_summary_table.tsv"

    # ------------------------------------------------------------------
    # Resolve column names per file
    # ------------------------------------------------------------------
    sumstats_hdr_dic: dict = {}

    for name, fpath in zip(labels, sum_stats):
        if sep:
            file_sep, dialect = sep, None
        else:
            file_sep, dialect = detect_delimiter(fpath, sample_size=5_000)

        hdr = get_file_header(fpath, delim=file_sep, dialect=dialect)

        chrom_col = _pick_column(hdr, args.chrom_column, _CHR_NAMES, "chromosome", fpath)
        pos_col = _pick_column(hdr, args.pos_column, _POS_NAMES, "position", fpath)
        snp_col = _pick_column(hdr, args.snp_column, _SNP_NAMES, "SNP", fpath)
        pcol = _pick_column(hdr, args.pval_column, _PVAL_NAMES, "p-value", fpath)
        bcol = _pick_column(hdr, args.build_column, _BUILD_NAMES, "genome build", fpath)

        old_cols = [chrom_col, pos_col, snp_col, pcol, bcol]
        new_cols = {
            chrom_col: "CHR",
            pos_col: "POS",
            snp_col: "SNP",
            pcol: "P",
            bcol: "BUILD",
        }
        col_dtypes = {
            chrom_col: str,
            pos_col: object,
            snp_col: str,
            pcol: float,
            bcol: str,
        }

        sumstats_hdr_dic[name] = [old_cols, col_dtypes, new_cols, file_sep]

    # ------------------------------------------------------------------
    # Colours
    # ------------------------------------------------------------------
    colors = [c.strip() for c in args.colors.strip().split(",")]

    # ------------------------------------------------------------------
    # ResourceConfig — picks up environment variables automatically
    # ------------------------------------------------------------------
    resources = ResourceConfig()

    # ------------------------------------------------------------------
    # Load data, compute sectors, get hits table
    # ------------------------------------------------------------------
    (
        merged_assoc_sector_sizes,
        sumstats_loaded,
        hits_table,
        signif_lines,
    ) = get_sumstats_and_merged_sector_list(
        sum_stats=sum_stats,
        labels=labels,
        trim_pval=args.trim_pval,
        logp=logp,
        file_info=sumstats_hdr_dic,
        sort_tracks=args.sort_track,
        table_out=table_out,
        highlight=highlight,
        highlight_thresh=highlight_thresh,
        signif_threshold=args.signif_threshold,
        signif_line=signif_line,
        suggest_threshold=args.suggest_threshold,
        resources=resources,
    )

    # ------------------------------------------------------------------
    # CIRCULAR MANHATTAN
    # ------------------------------------------------------------------
    if mode.upper() == "CM":
        logger.info("Generating CIRCULAR MANHATTAN Plot ...")
        circos = Circos(merged_assoc_sector_sizes, space=0.8)

        if plot_title:
            circos.text(text=plot_title, size=args.plot_title_size, weight="normal")

        radii = compute_track_radii_dict(
            n_tracks=len(sumstats_loaded),
            pad=args.pad,
            r_min=r_min,
            r_max=args.r_max,
            annotate=bool(annotate),
        )

        # The last entry of ``radii`` is used as the annotation track.
        annotation_track_radius = radii[next(reversed(radii))]

        # Reverse so outermost track is plotted first
        radii_reversed = dict(reversed(list(radii.items())))

        inside_loc = r_min - 3
        outside_loc = 101
        chrom_label_loc = (
            outside_loc if args.chrom_label_side == "outside" else inside_loc
        )

        if annotate:
            # Move the annotation radius to the end so the zip() below pairs
            # the remaining radii with the studies and drops it by truncation.
            first_key = next(iter(radii_reversed))
            radii_reversed["annot_track_r"] = radii_reversed.pop(first_key)

        # Rows lacking a plotted value are removed up front. Assigning
        # ``df[col] = df[col].dropna()`` would re-align on the index and
        # leave the frame unchanged, so drop whole rows instead.
        required_cols = ["P", "logP"] if logp else ["P"]

        for index, (sector_radius, sumstat_name, sumstats_value, signif_dict) in enumerate(
            zip(
                radii_reversed.values(),
                sumstats_loaded.keys(),
                sumstats_loaded.values(),
                signif_lines,
            )
        ):
            assoc = sumstats_value[0].dropna(subset=required_cols).copy()
            assoc["CHR"] = assoc["CHR"].replace("23", "X").replace("24", "Y")

            sig_thresh = signif_dict["genome"]
            sug_thresh = signif_dict["suggestive"]
            logger.info("SIGNIFICANCE THRESHOLD: %s", sig_thresh)
            logger.info("SUGGESTIVE THRESHOLD: %s", sug_thresh)

            for sector in circos.sectors:
                plot_circular(
                    sector=sector,
                    sector_radius=sector_radius,
                    annotation_r=annotation_track_radius if annotate else None,
                    sector_sizes=merged_assoc_sector_sizes,
                    track_index=index,
                    chrom_label_loc=chrom_label_loc,
                    chrom_label_size=args.chrom_label_size,
                    track_label_size=args.track_label_size,
                    track_label_orientation=args.track_label_orientation,
                    assoc=assoc,
                    assoc_label=sumstat_name,
                    logp=logp,
                    # NOTE(review): the threshold value is passed as
                    # ``signif_line`` while the CLI flag feeds
                    # ``suggest_line`` -- confirm against plot_circular.
                    signif_line=sig_thresh,
                    signif_threshold=sig_thresh,
                    suggest_line=bool(signif_line),
                    suggest_threshold=sug_thresh,
                    highlight=highlight,
                    highlight_thresh=highlight_thresh,
                    colors=colors,
                )

        # ------------------------------------------------------------------
        # Circular: gene/SNP annotations
        # ------------------------------------------------------------------
        if annotate and not hits_table.empty:
            r_low = annotation_track_radius[0]

            for _, row in hits_table.iterrows():
                label, fstyle = _annotation_label(row, annotate)

                for sector in circos.sectors:
                    if str(row["CHR"]) != sector.name:
                        continue

                    # Invisible track that only carries the annotation text.
                    a_track = sector.add_track(annotation_track_radius)
                    a_track.axis(fc="none", lw=0, ec="none", alpha=0)

                    pos = row["POS"]

                    a_track.annotate(
                        x=pos,
                        label=str(label),
                        min_r=r_low,
                        max_r=r_low + 3,
                        label_size=args.annotation_size,
                        text_kws={
                            "size": "large",
                            "color": "black",
                            "alpha": 1,
                            "fontstyle": fstyle,
                            "fontweight": "normal",
                            "multialignment": "left",
                        },
                    )

                    if args.highlight_line:
                        # Dashed guide from the innermost track up to the label.
                        sector_rlim = [t.r_lim for t in sector.tracks]
                        sector_min_r = min(sector_rlim)[0]
                        sector.line(
                            r=[sector_min_r, r_low],
                            start=pos,
                            end=pos,
                            color="lightgrey",
                            lw=0.4,
                            ls="--",
                        )

        # ------------------------------------------------------------------
        # Circular: single y-axis label on last sector
        # ------------------------------------------------------------------
        sector_names = list(merged_assoc_sector_sizes.keys())
        last_sector_name = sector_names[-1] if sector_names else None

        for sector in circos.sectors:
            if sector.name != last_sector_name:
                continue

            if logp:
                # Render the "10" of log10 with Unicode subscript digits.
                SUB = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
                y_label = "-log10(p-value)".translate(SUB)
            else:
                y_label = "p-value"

            sector_rlim = [t.r_lim for t in sector.tracks]
            sector_min_r = min(sector_rlim)[0]
            sector_max_r = max(sector_rlim)[1]

            sector.text(
                y_label,
                x=sector.end - (sector.end - sector.start) / 5,
                r=(sector_min_r + sector_max_r) / 2
                + (sector_min_r + sector_max_r) / 12,
                adjust_rotation=False,
                ignore_range_error=True,
                size=float(args.track_label_size),
                color="black",
                fontstyle="italic",
                fontweight="regular",
                rotation=92,
                rotation_mode="default",
                va="top",
                ha="right",
            )

        # Lower-case only the file name: lower-casing the whole path would
        # also rewrite the directory components and point at a directory that
        # does not exist on case-sensitive filesystems.
        fig_path = Path(plt_name)
        fig_path = fig_path.with_name(fig_path.name.lower())
        circos.plotfig().savefig(fname=str(fig_path), dpi=dpi)
        logger.info("Saved circular Manhattan plot: %s", fig_path)

    # ------------------------------------------------------------------
    # LINEAR MANHATTAN
    # ------------------------------------------------------------------
    else:
        logger.info("Generating LINEAR MANHATTAN Plot ...")
        dfs = [v[0] for v in sumstats_loaded.values()]
        t_labels = list(sumstats_loaded.keys())

        track_heights = args.track_heights
        if not track_heights:
            t_heights = None
        else:
            t_heights = [float(x) for x in track_heights.strip().split(",")]

        plot_linear(
            tracks=dfs,
            track_labels=t_labels,
            chr_col="CHR",
            pos_col="POS",
            p_col="P",
            trim_pval=args.trim_pval,
            logp=bool(logp),
            point_size=args.point_size,
            highlight=highlight,
            annot_df=hits_table if not hits_table.empty else None,
            label_col="top_gene",
            chr_spacing=args.chr_spacing,
            track_heights=t_heights,
            track_spacing=args.track_spacing,
            colors=colors,
            sig_lines=signif_lines,
            # The output path (not the human-readable title) is what the
            # original passes here; kept unchanged.
            plot_title=plt_name,
            dpi=dpi,
            fig_format=output_format,
            figsize=(15, 9),
        )
        logger.info("Saved linear Manhattan plot: %s", plt_name)
416
+
417
+
418
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()