pycmplot 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycmplot/_core.py +57 -1
- pycmplot/cli.py +38 -17
- pycmplot/io.py +35 -18
- pycmplot/plotting/circular.py +11 -7
- pycmplot/plotting/linear.py +140 -9
- pycmplot/plotting/qq.py +643 -0
- {pycmplot-0.2.1.dist-info → pycmplot-0.2.2.dist-info}/METADATA +3 -2
- {pycmplot-0.2.1.dist-info → pycmplot-0.2.2.dist-info}/RECORD +12 -11
- {pycmplot-0.2.1.dist-info → pycmplot-0.2.2.dist-info}/WHEEL +0 -0
- {pycmplot-0.2.1.dist-info → pycmplot-0.2.2.dist-info}/entry_points.txt +0 -0
- {pycmplot-0.2.1.dist-info → pycmplot-0.2.2.dist-info}/licenses/LICENSE +0 -0
- {pycmplot-0.2.1.dist-info → pycmplot-0.2.2.dist-info}/top_level.txt +0 -0
pycmplot/_core.py
CHANGED
|
@@ -92,6 +92,7 @@ def main() -> None:
|
|
|
92
92
|
)
|
|
93
93
|
from pycmplot.plotting.linear import plot_linear
|
|
94
94
|
from pycmplot.plotting.circular import plot_circular
|
|
95
|
+
from pycmplot.plotting.qq import plot_qq_combined, plot_qq_separate, plot_qq_overlay
|
|
95
96
|
from pycmplot.resources import ResourceConfig
|
|
96
97
|
|
|
97
98
|
# ------------------------------------------------------------------
|
|
@@ -110,6 +111,13 @@ def main() -> None:
|
|
|
110
111
|
labels_raw = args.labels
|
|
111
112
|
pcol_arg = args.pval_column
|
|
112
113
|
logp = args.logp
|
|
114
|
+
qq = args.qq_plot
|
|
115
|
+
qq_separate = args.qq_separate
|
|
116
|
+
qq_ncols = args.qq_ncols
|
|
117
|
+
qq_thin = args.qq_thin
|
|
118
|
+
thin_below = args.thin_below
|
|
119
|
+
qq_max_points = args.qq_max_points
|
|
120
|
+
qq_overlay = args.qq_overlay
|
|
113
121
|
chrom_label_size = args.chrom_label_size
|
|
114
122
|
chrom_label_side = args.chrom_label_side
|
|
115
123
|
track_label_size = args.track_label_size
|
|
@@ -164,7 +172,8 @@ def main() -> None:
|
|
|
164
172
|
# ------------------------------------------------------------------
|
|
165
173
|
(
|
|
166
174
|
plt_name,
|
|
167
|
-
table_out
|
|
175
|
+
table_out,
|
|
176
|
+
plt_base,
|
|
168
177
|
) = get_output_paths(
|
|
169
178
|
labels,
|
|
170
179
|
mode = mode,
|
|
@@ -202,6 +211,7 @@ def main() -> None:
|
|
|
202
211
|
sumstats_loaded,
|
|
203
212
|
hits_table,
|
|
204
213
|
signif_lines,
|
|
214
|
+
pval_dict,
|
|
205
215
|
) = get_sumstats_and_merged_sector_list(
|
|
206
216
|
sum_stats=sum_stats,
|
|
207
217
|
labels=labels,
|
|
@@ -296,6 +306,52 @@ def main() -> None:
|
|
|
296
306
|
figsize=(15, 9)
|
|
297
307
|
)
|
|
298
308
|
|
|
309
|
+
# ------------------------------------------------------------------
|
|
310
|
+
# QQ PLOT
|
|
311
|
+
# ------------------------------------------------------------------
|
|
312
|
+
if qq and sumstats_loaded:
|
|
313
|
+
logger.info("Generating QQ Plot(s) ...")
|
|
314
|
+
qq_stem = f"{plt_base}_qq"
|
|
315
|
+
|
|
316
|
+
if qq_separate:
|
|
317
|
+
plot_qq_separate(
|
|
318
|
+
pval_dict=pval_dict,
|
|
319
|
+
thin=qq_thin,
|
|
320
|
+
thin_below=thin_below,
|
|
321
|
+
max_points=qq_max_points,
|
|
322
|
+
output_path=qq_stem,
|
|
323
|
+
colors=colors,
|
|
324
|
+
signif_threshold=signif_threshold or 5e-8,
|
|
325
|
+
dpi=dpi,
|
|
326
|
+
fig_format=output_format,
|
|
327
|
+
)
|
|
328
|
+
elif qq_overlay:
|
|
329
|
+
plot_qq_overlay(
|
|
330
|
+
pval_dict=pval_dict,
|
|
331
|
+
thin=qq_thin,
|
|
332
|
+
thin_below=thin_below,
|
|
333
|
+
max_points=qq_max_points,
|
|
334
|
+
colors=colors,
|
|
335
|
+
signif_threshold=signif_threshold or 5e-8,
|
|
336
|
+
dpi=dpi,
|
|
337
|
+
title=plot_title,
|
|
338
|
+
output_path=f"{qq_stem}_overlay",
|
|
339
|
+
fig_format=output_format,
|
|
340
|
+
)
|
|
341
|
+
else:
|
|
342
|
+
plot_qq_combined(
|
|
343
|
+
pval_dict=pval_dict,
|
|
344
|
+
thin=qq_thin,
|
|
345
|
+
thin_below=thin_below,
|
|
346
|
+
max_points=qq_max_points,
|
|
347
|
+
colors=colors,
|
|
348
|
+
ncols=qq_ncols,
|
|
349
|
+
signif_threshold=signif_threshold or 5e-8,
|
|
350
|
+
dpi=dpi,
|
|
351
|
+
title=plot_title,
|
|
352
|
+
output_path=f"{qq_stem}_combined",
|
|
353
|
+
fig_format=output_format,
|
|
354
|
+
)
|
|
299
355
|
|
|
300
356
|
if __name__ == "__main__":
|
|
301
357
|
main()
|
pycmplot/cli.py
CHANGED
|
@@ -329,29 +329,50 @@ def get_arguments(descmsg: str = DESCMSG) -> argparse.Namespace:
|
|
|
329
329
|
)
|
|
330
330
|
opt.add_argument(
|
|
331
331
|
"-bc", "--build_column", required=False, type=str, metavar="str",
|
|
332
|
-
|
|
333
|
-
|
|
332
|
+
help=("Name of column containing genome build (hg18/hg19/hg38)."
|
|
333
|
+
"Or use ``--build`` below to supply genome builds per summary stat file."
|
|
334
|
+
))
|
|
334
335
|
opt.add_argument(
|
|
335
|
-
"-b","--build",
|
|
336
|
-
help=
|
|
337
|
-
Comma-sperated list of genome build of summary stats file(s) listed
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
"""
|
|
343
|
-
required=False,
|
|
344
|
-
type=str,
|
|
345
|
-
metavar='str'
|
|
336
|
+
"-b","--build", required=False, type=str, metavar='str',
|
|
337
|
+
help=
|
|
338
|
+
"""Comma-sperated list of genome build of summary stats file(s) listed
|
|
339
|
+
in the same order as sumstats files. e.g. hg19,hg38,hg38,hg19 means:
|
|
340
|
+
file1.txt.gz --> hg19
|
|
341
|
+
file2.txt.gz --> hg38
|
|
342
|
+
file3.tsv --> hg38 ... etc
|
|
343
|
+
"""
|
|
346
344
|
)
|
|
347
345
|
opt.add_argument(
|
|
348
346
|
"--logp", action="store_true",
|
|
349
347
|
help="Plot −log₁₀(p) instead of raw p-values."
|
|
350
348
|
)
|
|
351
|
-
opt.add_argument(
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
349
|
+
opt.add_argument("-qq", "--qq_plot", action="store_true",
|
|
350
|
+
help="Generate QQ-plot(s) alongside the Manhattan plot.")
|
|
351
|
+
opt.add_argument("-qq_sep", "--qq_separate", action="store_true",
|
|
352
|
+
help=(
|
|
353
|
+
"Save one QQ-plot file per sumstat instead of a "
|
|
354
|
+
"combined multi-panel figure. Only used when -qq is set."
|
|
355
|
+
))
|
|
356
|
+
opt.add_argument("-qq_cols", "--qq_ncols", default=3, type=int, metavar="int",
|
|
357
|
+
help="Number of columns in the combined QQ-plot grid (default: 3).")
|
|
358
|
+
opt.add_argument("-qq_thin", "--qq_thin", action="store_true", default=False,
|
|
359
|
+
help=(
|
|
360
|
+
"Thin null-like p-values before QQ plotting for speed (default: off)."
|
|
361
|
+
"Include this flag to turn on for speed."
|
|
362
|
+
))
|
|
363
|
+
opt.add_argument("-thin_below", "--thin_below", type=float, metavar="float", default=0.01,
|
|
364
|
+
help=(
|
|
365
|
+
"P-value threshold below which all points are always kept."
|
|
366
|
+
"Points above this threshold are downsampled (default: 0.01)."
|
|
367
|
+
))
|
|
368
|
+
opt.add_argument("-qq_max_pts", "--qq_max_points", default=50000, type=int, metavar="int",
|
|
369
|
+
help="Max points to plot per QQ track after thinning (default: 50000).")
|
|
370
|
+
opt.add_argument("-qq_ov", "--qq_overlay", action="store_true",
|
|
371
|
+
help=(
|
|
372
|
+
"Plot all sumstats on a single overlaid QQ-plot, "
|
|
373
|
+
"each coloured by label with lambda in the legend. "
|
|
374
|
+
"Only used when -qq is set."
|
|
375
|
+
))
|
|
355
376
|
opt.add_argument(
|
|
356
377
|
"-tp", "--trim_pval", type=float, metavar="float",
|
|
357
378
|
help="Trim variants with p > this value before plotting."
|
pycmplot/io.py
CHANGED
|
@@ -291,7 +291,9 @@ def strip_comma_separated_input_streams(
|
|
|
291
291
|
|
|
292
292
|
if builds:
|
|
293
293
|
builds = [s.strip() for s in builds.strip().split(",")]
|
|
294
|
-
if len(sum_stats)
|
|
294
|
+
if len(sum_stats) == len(labels) == len(builds):
|
|
295
|
+
pass
|
|
296
|
+
else:
|
|
295
297
|
sys.exit(
|
|
296
298
|
"Error: number of summary stats files, labels, and builds must match.\n"
|
|
297
299
|
f" Files: {sum_stats}\n"
|
|
@@ -429,16 +431,16 @@ def get_output_paths(
|
|
|
429
431
|
|
|
430
432
|
labels = [re.sub(r"[^a-zA-Z0-9\s]", "", x).replace(" ", "_") for x in labels]
|
|
431
433
|
|
|
432
|
-
plt_base = str(out_path / f"{pltitle}_{'_'.join(labels)}_{mode.lower()}")
|
|
433
|
-
|
|
434
434
|
suffix = "_logp" if logp else "_pval"
|
|
435
435
|
|
|
436
|
-
|
|
436
|
+
plt_base = str(out_path / f"{pltitle}_{'_'.join(labels)}_{mode.lower()}{suffix}")
|
|
437
|
+
|
|
438
|
+
plt_name = f"{plt_base}.{output_format.lower()}"
|
|
437
439
|
|
|
438
|
-
table_out = f"{plt_base}
|
|
440
|
+
table_out = f"{plt_base}_locus_summary_table.tsv"
|
|
439
441
|
|
|
440
442
|
|
|
441
|
-
return plt_name, table_out
|
|
443
|
+
return plt_name, table_out, plt_base
|
|
442
444
|
|
|
443
445
|
|
|
444
446
|
|
|
@@ -561,11 +563,13 @@ def prep_pycmplot_input_info(
|
|
|
561
563
|
snp_candidates = [c for c in snp_candidates if c]
|
|
562
564
|
pvl_candidates = [c for c in pvl_candidates if c]
|
|
563
565
|
|
|
564
|
-
bld_candidates = [
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
566
|
+
bld_candidates = []
|
|
567
|
+
if buildc:
|
|
568
|
+
bld_candidates = ["BUILD", "Genome", "Genome_Build", "Genome-build"]
|
|
569
|
+
bld_candidates_l = [x.lower() for x in bld_candidates]
|
|
570
|
+
bld_candidates_u = [x.upper() for x in bld_candidates]
|
|
571
|
+
bld_candidates = [buildc] + bld_candidates + bld_candidates_l + bld_candidates_u
|
|
572
|
+
bld_candidates = [c for c in bld_candidates if c]
|
|
569
573
|
|
|
570
574
|
# ------------------------------------------------------------------
|
|
571
575
|
# Resolve column names per file
|
|
@@ -795,6 +799,7 @@ def get_sumstats_and_merged_sector_list(
|
|
|
795
799
|
}
|
|
796
800
|
|
|
797
801
|
sumstats_loaded: dict[str, list] = {}
|
|
802
|
+
pval_dict: dict[str, np.ndarray | pd.Series] = {}
|
|
798
803
|
all_lead_snps: list[pd.DataFrame] = []
|
|
799
804
|
|
|
800
805
|
for label in sumstats.keys() & (file_info or {}).keys():
|
|
@@ -818,6 +823,11 @@ def get_sumstats_and_merged_sector_list(
|
|
|
818
823
|
dtype=sumstat_dtypes,
|
|
819
824
|
).rename(columns=sumstat_newcols)
|
|
820
825
|
|
|
826
|
+
# Get dict of p-values for qq-plotting before applying trim_pval
|
|
827
|
+
logger.info("Extracting raw p-values for qq-plotting ...")
|
|
828
|
+
pval_dict[label] = df["P"].dropna().astype(float).values
|
|
829
|
+
|
|
830
|
+
|
|
821
831
|
# Add build column if not exist and build supplied
|
|
822
832
|
if build:
|
|
823
833
|
df['BUILD'] = build
|
|
@@ -917,7 +927,13 @@ def get_sumstats_and_merged_sector_list(
|
|
|
917
927
|
for _ in sumstats
|
|
918
928
|
]
|
|
919
929
|
|
|
920
|
-
|
|
930
|
+
|
|
931
|
+
# sort dicts by user-supplied order
|
|
932
|
+
sumstats_loaded = {key: sumstats_loaded[key] for key in labels if key in sumstats_loaded}
|
|
933
|
+
pval_dict = {key: pval_dict[key] for key in labels if key in pval_dict}
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
# or sort by user option
|
|
921
937
|
if sort_tracks is not None:
|
|
922
938
|
if sort_tracks.lower() == "label":
|
|
923
939
|
sumstats_loaded = dict(sorted(sumstats_loaded.items()))
|
|
@@ -928,6 +944,7 @@ def get_sumstats_and_merged_sector_list(
|
|
|
928
944
|
key=lambda item: (item[0], natsort.natsort_keygen()(item[1][1])),
|
|
929
945
|
)
|
|
930
946
|
)
|
|
947
|
+
|
|
931
948
|
|
|
932
949
|
# Compute per-sumstat sector sizes (chrom → [min_pos, max_pos])
|
|
933
950
|
assoc_sector_sizes_list: list[dict] = []
|
|
@@ -941,7 +958,7 @@ def get_sumstats_and_merged_sector_list(
|
|
|
941
958
|
for chrom in assoc["CHR"].unique():
|
|
942
959
|
sub = assoc[assoc["CHR"] == chrom]
|
|
943
960
|
lo_val = max(sub["POS"].min() - 1_000_000, 0)
|
|
944
|
-
hi_val = sub["POS"].max()
|
|
961
|
+
hi_val = sub["POS"].max()
|
|
945
962
|
assoc_dic[str(chrom)] = [lo_val, hi_val]
|
|
946
963
|
|
|
947
964
|
min_dic_val = min(assoc_dic.values())
|
|
@@ -955,9 +972,9 @@ def get_sumstats_and_merged_sector_list(
|
|
|
955
972
|
|
|
956
973
|
# Add spacer sector for y-axis labelling
|
|
957
974
|
if min_dic_val is not None:
|
|
958
|
-
if len(labels) <= 5:
|
|
959
|
-
|
|
960
|
-
else:
|
|
961
|
-
|
|
975
|
+
#if len(labels) <= 5:
|
|
976
|
+
# merged["Spacer1"] = [x + x / 2 for x in min_dic_val]
|
|
977
|
+
#else:
|
|
978
|
+
merged["Spacer1"] = [x * 2 for x in min_dic_val]
|
|
962
979
|
|
|
963
|
-
return merged, sumstats_loaded, hits_table, signif_lines
|
|
980
|
+
return merged, sumstats_loaded, hits_table, signif_lines, pval_dict
|
pycmplot/plotting/circular.py
CHANGED
|
@@ -380,6 +380,7 @@ def plot_circular(
|
|
|
380
380
|
annotate: str = None,
|
|
381
381
|
label_col: str = None,
|
|
382
382
|
chrom_label_side: str = 'inside',
|
|
383
|
+
chrom_label_size: float = 6,
|
|
383
384
|
signif_line: float = 5e-8,
|
|
384
385
|
highlight: bool = False,
|
|
385
386
|
highlight_thresh: float = 5e-8,
|
|
@@ -387,7 +388,6 @@ def plot_circular(
|
|
|
387
388
|
highlight_line: bool = False,
|
|
388
389
|
highlight_line_color: str = 'grey',
|
|
389
390
|
colors: list[str] = ['steelblue', 'grey'],
|
|
390
|
-
chrom_label_size: float = 6,
|
|
391
391
|
track_label_size: float = 6,
|
|
392
392
|
track_label_orientation: str = 'vertical',
|
|
393
393
|
hits_table: pd.DataFrame = None,
|
|
@@ -526,7 +526,8 @@ def plot_circular(
|
|
|
526
526
|
labels = list(sumstats_loaded.keys())
|
|
527
527
|
(
|
|
528
528
|
plt_name,
|
|
529
|
-
table_out
|
|
529
|
+
table_out,
|
|
530
|
+
plt_base,
|
|
530
531
|
) = get_output_paths(
|
|
531
532
|
labels,
|
|
532
533
|
mode='cm',
|
|
@@ -558,14 +559,16 @@ def plot_circular(
|
|
|
558
559
|
radii_reversed = dict(reversed(list(radii.items())))
|
|
559
560
|
|
|
560
561
|
inside_loc = r_min - 3
|
|
561
|
-
outside_loc =
|
|
562
|
-
chrom_label_loc = outside_loc if chrom_label_side == "outside" else inside_loc
|
|
562
|
+
outside_loc = r_max + 4
|
|
563
563
|
|
|
564
564
|
if annotate:
|
|
565
565
|
annot_key = next(iter(radii_reversed))
|
|
566
566
|
annot_r = radii_reversed.pop(annot_key)
|
|
567
|
+
outside_loc = max(list(radii_reversed.values())[0]) + 2
|
|
567
568
|
radii_reversed["annot_track_r"] = annot_r
|
|
568
569
|
|
|
570
|
+
chrom_label_loc = outside_loc if chrom_label_side == "outside" else inside_loc
|
|
571
|
+
|
|
569
572
|
for index, (sector_radius, sumstats_key, sumstats_value, signif_dict) in enumerate(
|
|
570
573
|
zip(
|
|
571
574
|
radii_reversed.values(),
|
|
@@ -647,7 +650,7 @@ def plot_circular(
|
|
|
647
650
|
x=pos,
|
|
648
651
|
label=str(label),
|
|
649
652
|
min_r=r_low,
|
|
650
|
-
max_r=r_low +
|
|
653
|
+
max_r=r_low + 6,
|
|
651
654
|
label_size=annotation_size,
|
|
652
655
|
text_kws={
|
|
653
656
|
"size": "large",
|
|
@@ -668,8 +671,9 @@ def plot_circular(
|
|
|
668
671
|
r=[sector_min_r, r_low],
|
|
669
672
|
start=pos,
|
|
670
673
|
end=pos,
|
|
674
|
+
alpha=0.4,
|
|
671
675
|
color=highlight_line_color,
|
|
672
|
-
lw=0.
|
|
676
|
+
lw=0.4,
|
|
673
677
|
ls="--",
|
|
674
678
|
)
|
|
675
679
|
|
|
@@ -709,6 +713,6 @@ def plot_circular(
|
|
|
709
713
|
|
|
710
714
|
if plt_name:
|
|
711
715
|
fig.savefig(fname=plt_name.lower(), dpi=dpi)
|
|
712
|
-
logger.info("Saved circular Manhattan plot: %s", plt_name)
|
|
716
|
+
logger.info("Saved circular Manhattan plot: %s", plt_name.lower())
|
|
713
717
|
|
|
714
718
|
return fig
|
pycmplot/plotting/linear.py
CHANGED
|
@@ -100,7 +100,7 @@ def _cluster_annotations_by_chr(
|
|
|
100
100
|
return clusters
|
|
101
101
|
|
|
102
102
|
|
|
103
|
-
def
|
|
103
|
+
def _draw_annotation_arrows(
|
|
104
104
|
ax,
|
|
105
105
|
annot_df,
|
|
106
106
|
chr_col: str,
|
|
@@ -212,7 +212,7 @@ def _draw_annotation_arrows_2(
|
|
|
212
212
|
|
|
213
213
|
|
|
214
214
|
|
|
215
|
-
def
|
|
215
|
+
def _draw_annotation_arrows_2(
|
|
216
216
|
ax,
|
|
217
217
|
annot_df,
|
|
218
218
|
chr_col: str,
|
|
@@ -361,6 +361,118 @@ def _draw_annotation_arrows(
|
|
|
361
361
|
|
|
362
362
|
last_xtext = max(x_texts)
|
|
363
363
|
|
|
364
|
+
|
|
365
|
+
# Using cumulative distance for annotations and separating clusters
|
|
366
|
+
def _draw_annotation_arrows_3(
    ax,
    annot_df,
    chr_col: str,
    label_col: str,
    offsets: dict,
    chr_max: dict,
    spread_width: float = 60e6,
    isolation_threshold: float = 80e6,
    stack_threshold: float = 10e6,
    y_text_base: float = 0.55,
    y_stack_step: float = 0.02,
    max_rad: float = 0.35,
    y_tip: float = 0.0,
) -> None:
    """Draw curved arrows and rotated labels for annotated signals on *ax*.

    Signals are processed chromosome by chromosome.  A signal whose nearest
    neighbour on the same chromosome is at least ``isolation_threshold`` away
    gets its label directly above it; all other signals on that chromosome
    are assigned evenly spaced label slots ``spread_width`` apart.  Labels
    whose x positions end up within ``stack_threshold`` of the previous label
    are raised above it; otherwise the label returns to ``y_text_base``.

    Parameters
    ----------
    ax:
        Axes the arrows and texts are added to (data coordinates are used).
    annot_df:
        Table with at least the columns *chr_col*, *label_col* and ``"x"``
        (the plotted x coordinate of each signal).
    chr_col, label_col:
        Names of the chromosome and label columns in *annot_df*.
    offsets, chr_max:
        Per-chromosome start offset and length, keyed by chromosome name.
    spread_width:
        Horizontal distance between consecutive spread label slots.  Must be
        positive.
    isolation_threshold:
        Minimum distance to the nearest neighbour for a label to sit straight
        above its signal.
    stack_threshold:
        Labels closer than this to the previous label are stacked upward.
    y_text_base, y_stack_step:
        Baseline label height and the increment used when stacking.
    max_rad:
        Absolute cap on the ``arc3`` curvature of an arrow.
    y_tip:
        Reference height of the arrow tips (tips are drawn 0.05 below it).

    Raises
    ------
    ValueError
        If ``spread_width`` is not positive (the collision-avoidance loop
        below could otherwise never terminate).
    """
    if spread_width <= 0:
        raise ValueError("spread_width must be positive")

    annot_df = annot_df.sort_values(by=[chr_col, "x"], key=natsort_keygen())
    # Right-most label position used so far; starts left of the plot origin.
    last_xtext = 0 - spread_width

    for chr_name, df_chr in annot_df.groupby(chr_col, sort=False):
        df_chr = df_chr.sort_values("x")
        chr_start = offsets[chr_name]
        chr_end = offsets[chr_name] + chr_max[chr_name]
        chr_range = chr_end - chr_start

        x_signals = df_chr["x"].values
        labels = df_chr[label_col].values
        n = len(x_signals)

        # ------------------------------------------------------------------
        # Compute label x positions (spread or straight)
        # ------------------------------------------------------------------
        # x_signals is sorted, so the nearest neighbour of each signal is one
        # of its two adjacent entries; this avoids an all-pairs comparison.
        gaps = np.abs(np.diff(x_signals)).astype(float)
        min_dist = np.minimum(
            np.concatenate(([np.inf], gaps)),
            np.concatenate((gaps, [np.inf])),
        )
        isolated = min_dist >= isolation_threshold

        # Tier 1 labels sit directly above their signal; the rest are spread.
        x_texts = np.asarray(x_signals, dtype=float).copy()
        spread_indices = np.flatnonzero(~isolated)

        if spread_indices.size:
            sw = spread_width
            # Shrink step: width / (its two leading digits) / 2, e.g. 60e6 ->
            # 0.5e6.  Fall back to width / 20 when the textual form of the
            # width does not start with two digits (e.g. 7.5 or 5e-05).
            try:
                lead = int(str(sw)[:2])
            except ValueError:
                lead = 0
            pad = sw / lead / 2 if lead > 0 else sw / 20
            while sw > chr_range and sw > pad:
                sw -= pad

            sig_start = x_signals[spread_indices[0]]
            xmin = sig_start - sw
            # Exactly one slot per label; np.arange with a float step can
            # return one element too many or too few.
            positions = xmin + sw * np.arange(spread_indices.size)

            # Shift the whole group right until it clears the previous
            # chromosome's labels.
            while positions[0] <= last_xtext:
                positions = positions + sw

            x_texts[spread_indices] = positions

        # ------------------------------------------------------------------
        # Compute label y positions from the distance to the previous label
        # ------------------------------------------------------------------
        y_texts = [y_text_base] * n

        for k in range(1, n):
            cum_dist = abs(x_texts[k] - x_texts[k - 1])
            if cum_dist <= stack_threshold:
                # too close to previous label — stack upward adaptively
                y_texts[k] = y_texts[k - 1] + y_stack_step + (
                    y_stack_step * (1 - cum_dist / stack_threshold)
                )
            else:
                y_texts[k] = y_text_base  # far enough — reset to baseline

        # ------------------------------------------------------------------
        # Draw arrows and labels
        # ------------------------------------------------------------------
        for x_sig, x_txt, y_txt, label in zip(x_signals, x_texts, y_texts, labels):
            dx = x_txt - x_sig
            # Curvature grows with horizontal displacement, capped at max_rad.
            rad = np.clip(dx / (spread_width * 2), -max_rad, max_rad)

            arrow = FancyArrowPatch(
                (x_txt, y_txt),
                (x_sig, y_tip - 0.05),
                arrowstyle="-|>",
                mutation_scale=12,
                lw=0.6,
                color="grey",
                alpha=0.5,
                connectionstyle=f"arc3,rad={rad}",
                transform=ax.transData,
            )
            ax.add_patch(arrow)

            ax.text(
                x_txt,
                y_txt + 0.02,
                str(label),
                rotation=45,
                ha="left",
                va="bottom",
                fontsize=10,
                clip_on=False,
                color="black",
                fontstyle="italic",
                fontweight="regular",
            )

        last_xtext = max(x_texts)
|
|
475
|
+
|
|
364
476
|
# ---------------------------------------------------------------------------
|
|
365
477
|
# Public function
|
|
366
478
|
# ---------------------------------------------------------------------------
|
|
@@ -647,7 +759,8 @@ def plot_linearm(
|
|
|
647
759
|
# Annotation track
|
|
648
760
|
# ------------------------------------------------------------------
|
|
649
761
|
if annotate and annot_df is not None:
|
|
650
|
-
|
|
762
|
+
|
|
763
|
+
|
|
651
764
|
_draw_annotation_arrows(
|
|
652
765
|
ax_annot,
|
|
653
766
|
annot_df,
|
|
@@ -657,9 +770,10 @@ def plot_linearm(
|
|
|
657
770
|
chr_max=chr_max,
|
|
658
771
|
spread_width=60e6,
|
|
659
772
|
)
|
|
660
|
-
|
|
773
|
+
|
|
661
774
|
|
|
662
|
-
|
|
775
|
+
"""
|
|
776
|
+
_draw_annotation_arrows_2(
|
|
663
777
|
ax=ax_annot,
|
|
664
778
|
annot_df=annot_df,
|
|
665
779
|
chr_col=chr_col,
|
|
@@ -667,13 +781,31 @@ def plot_linearm(
|
|
|
667
781
|
offsets=offsets,
|
|
668
782
|
chr_max=chr_max,
|
|
669
783
|
spread_width=60e6,
|
|
670
|
-
isolation_threshold=
|
|
784
|
+
isolation_threshold=40e6, # above this → straight (Tier 1)
|
|
671
785
|
stack_threshold=10e6, # below this → stack (Tier 3)
|
|
672
786
|
max_tilt=45, # max angleA departure from vertical
|
|
673
787
|
y_tip=0.0,
|
|
674
788
|
y_text=0.55,
|
|
675
789
|
y_stack_step=0.12, # vertical gap between stacked labels
|
|
676
790
|
)
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
_draw_annotation_arrows_3(
|
|
794
|
+
ax=ax_annot,
|
|
795
|
+
annot_df=annot_df,
|
|
796
|
+
chr_col=chr_col,
|
|
797
|
+
label_col=label_col,
|
|
798
|
+
offsets=offsets,
|
|
799
|
+
chr_max=chr_max,
|
|
800
|
+
spread_width=60e6,
|
|
801
|
+
isolation_threshold=80e6,
|
|
802
|
+
stack_threshold=90e6,
|
|
803
|
+
y_text_base=0.55,
|
|
804
|
+
y_stack_step=0.03,
|
|
805
|
+
max_rad=0.35,
|
|
806
|
+
y_tip=0.0,
|
|
807
|
+
)
|
|
808
|
+
"""
|
|
677
809
|
|
|
678
810
|
ax_annot.set_ylim(0, 1)
|
|
679
811
|
ax_annot.axis("off")
|
|
@@ -859,12 +991,11 @@ def plot_linear(
|
|
|
859
991
|
logger.info("'SNP' column is used for annotation since '%s' column could not be resolved in hits table.", label_col)
|
|
860
992
|
pass
|
|
861
993
|
|
|
862
|
-
logger.info(f"LABEL COL: {label}")
|
|
863
|
-
|
|
864
994
|
# plot name
|
|
865
995
|
(
|
|
866
996
|
plt_name,
|
|
867
|
-
table_out
|
|
997
|
+
table_out,
|
|
998
|
+
plt_base,
|
|
868
999
|
) = get_output_paths(
|
|
869
1000
|
labels = t_labels,
|
|
870
1001
|
mode='lm',
|
pycmplot/plotting/qq.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
"""
|
|
4
|
+
pycmplot.plotting.qq
|
|
5
|
+
====================
|
|
6
|
+
QQ (quantile-quantile) plots for GWAS p-values.
|
|
7
|
+
|
|
8
|
+
Speed notes
|
|
9
|
+
-----------
|
|
10
|
+
GWAS summary statistics often contain millions of SNPs. Most of those points
|
|
11
|
+
lie near the null diagonal and are visually redundant. Two optimisations are
|
|
12
|
+
applied by default:
|
|
13
|
+
|
|
14
|
+
1. **P-value thinning** (``thin_below`` / ``max_points``):
|
|
15
|
+
All points above a -log10(p) tail threshold are kept in full; the bulk
|
|
16
|
+
of null-like points below that threshold are deterministically downsampled to at
|
|
17
|
+
most ``max_points`` total. Lambda (λ) is always computed on the *full*
|
|
18
|
+
unfiltered array before thinning, so the statistic is never affected.
|
|
19
|
+
|
|
20
|
+
2. **Rasterised scatter** (``rasterized=True``):
|
|
21
|
+
The scatter layer is rendered as a bitmap inside vector formats (PDF/SVG),
|
|
22
|
+
dramatically reducing file size and save time for large point clouds.
|
|
23
|
+
|
|
24
|
+
Public functions
|
|
25
|
+
----------------
|
|
26
|
+
thin_pvals Downsample null-like p-values for fast plotting.
|
|
27
|
+
plot_qq_single Draw one QQ plot onto a given Axes.
|
|
28
|
+
plot_qq_combined All QQ plots in a single figure (grid layout).
|
|
29
|
+
plot_qq_separate One output file per sumstat.
|
|
30
|
+
plot_qq_overlay All sumstats overlaid on one axes, coloured by label.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
import logging
|
|
34
|
+
import math
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Optional
|
|
37
|
+
|
|
38
|
+
import matplotlib.pyplot as plt
|
|
39
|
+
import matplotlib.colors as mcolors
|
|
40
|
+
import numpy as np
|
|
41
|
+
import pandas as pd
|
|
42
|
+
from scipy.stats import beta as beta_dist
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Thinning helper
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
def thin_pvals(
    pvals: np.ndarray,
    tail_threshold: float = 0.01,
    max_points: int = 50_000,
    seed: int = 42,
) -> tuple[np.ndarray, np.ndarray, int]:
    """Downsample p-values for faster QQ plotting with no visible breaks.

    Invalid entries (non-finite, ``<= 0`` or ``> 1``) are dropped first.  If
    more than ``max_points`` values remain, a single **log-uniform** pass
    selects the subset to plot:

    1. Sort p-values ascending and convert to −log₁₀ scale.
    2. Lay ``max_points`` evenly spaced targets between the smallest and the
       largest −log₁₀(p) and keep, for each target, the first observed value
       that is at or above it.  Duplicate picks are merged, so fewer than
       ``max_points`` values may be returned.

    Because targets are uniform in −log₁₀ space, the sparse extreme tail is
    sampled densely relative to the crowded null bulk, without a hard
    boundary.  The most and the least significant p-values are always kept.

    Parameters
    ----------
    pvals:
        Full array of raw p-values.
    tail_threshold:
        Unused; kept for API compatibility.
    max_points:
        Maximum number of points to return (default 50 000).  Must not be
        negative.
    seed:
        Unused; kept for API compatibility (the selection is deterministic).

    Returns
    -------
    (kept_pvals, kept_ranks, n_full)
        *kept_pvals* — thinned p-values in ascending order.
        *kept_ranks* — 1-based ranks in the full sorted array.
        *n_full*     — number of valid p-values before thinning.

    Raises
    ------
    ValueError
        If ``max_points`` is negative.

    Notes
    -----
    Lambda (λ) must be computed on the full *pvals* array **before** calling
    this function — thinning changes the empirical distribution.
    """
    if max_points < 0:
        raise ValueError("max_points must be non-negative")

    pvals = np.asarray(pvals, dtype=float)
    pvals = pvals[np.isfinite(pvals) & (pvals > 0) & (pvals <= 1)]
    n_full = len(pvals)

    # Sort ascending; full_ranks[i] = i + 1
    pvals_sorted = np.sort(pvals)
    full_ranks = np.arange(1, n_full + 1)

    if n_full <= max_points:
        # Nothing to thin
        return pvals_sorted, full_ranks, n_full

    # Work in −log10 space so spacing is proportional to visual separation.
    # Ascending p means descending −log10(p); reverse it so the array is
    # ascending, as np.searchsorted requires.
    logp_asc = (-np.log10(pvals_sorted))[::-1]
    logp_tail = logp_asc[-1]   # largest −log10(p): most significant point
    logp_bulk = logp_asc[0]    # smallest −log10(p): least significant point

    # Evenly spaced target positions along the −log10(p) axis
    targets = np.linspace(logp_tail, logp_bulk, max_points)

    # First observed value at or above each target
    pos = np.searchsorted(logp_asc, targets, side="left")
    pos = np.clip(pos, 0, n_full - 1)

    # Map back to indices of the ascending-p array and deduplicate
    idx_asc = np.unique(n_full - 1 - pos)  # sorted, no duplicates

    kept_pvals = pvals_sorted[idx_asc]
    kept_ranks = full_ranks[idx_asc]

    n_kept = len(kept_pvals)
    logger.debug(
        "QQ thinning: %d → %d points (%.1f%% retained)",
        n_full, n_kept, 100 * n_kept / n_full,
    )

    return kept_pvals, kept_ranks, n_full
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ---------------------------------------------------------------------------
|
|
139
|
+
# Core array builder
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
|
|
142
|
+
def _qq_arrays(
|
|
143
|
+
pvals: np.ndarray,
|
|
144
|
+
ranks: Optional[np.ndarray] = None,
|
|
145
|
+
n_full: Optional[int] = None,
|
|
146
|
+
ci: float = 0.95,
|
|
147
|
+
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
|
|
148
|
+
"""Return (expected, observed, ci_lower, ci_upper) in −log10 scale.
|
|
149
|
+
|
|
150
|
+
Parameters
|
|
151
|
+
----------
|
|
152
|
+
pvals:
|
|
153
|
+
Sorted (ascending) p-values to plot — may be a thinned subset.
|
|
154
|
+
ranks:
|
|
155
|
+
1-based ranks of *pvals* in the full distribution. If ``None``,
|
|
156
|
+
assumes *pvals* is the complete set and ranks are 1..n.
|
|
157
|
+
n_full:
|
|
158
|
+
Total number of SNPs in the full (pre-thinning) dataset. Used to
|
|
159
|
+
compute correct expected quantiles. Defaults to ``len(pvals)``.
|
|
160
|
+
ci:
|
|
161
|
+
Confidence interval level.
|
|
162
|
+
"""
|
|
163
|
+
pvals = np.asarray(pvals, dtype=float)
|
|
164
|
+
n = len(pvals)
|
|
165
|
+
|
|
166
|
+
if n_full is None:
|
|
167
|
+
n_full = n
|
|
168
|
+
if ranks is None:
|
|
169
|
+
ranks = np.arange(1, n + 1)
|
|
170
|
+
|
|
171
|
+
# Expected −log10(p): rank i → expected p = i/(n_full+1)
|
|
172
|
+
expected = -np.log10(ranks / (n_full + 1))
|
|
173
|
+
|
|
174
|
+
# Observed −log10(p): rank i paired with the i-th smallest p-value
|
|
175
|
+
observed = -np.log10(pvals)
|
|
176
|
+
|
|
177
|
+
# CI from the beta distribution (uses original ranks in full dataset)
|
|
178
|
+
alpha = 1.0 - ci
|
|
179
|
+
ci_lo = -np.log10(beta_dist.ppf(1 - alpha / 2, ranks, n_full - ranks + 1))
|
|
180
|
+
ci_hi = -np.log10(beta_dist.ppf( alpha / 2, ranks, n_full - ranks + 1))
|
|
181
|
+
|
|
182
|
+
# Sort by expected ascending for clean polygon fill
|
|
183
|
+
order = np.argsort(expected)
|
|
184
|
+
return expected[order], observed[order], ci_lo[order], ci_hi[order]
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# ---------------------------------------------------------------------------
|
|
188
|
+
# Lambda
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
|
|
191
|
+
def _compute_lambda(pvals: np.ndarray) -> float:
|
|
192
|
+
"""Genomic inflation factor λ = median(χ²_obs) / median(χ²_expected)."""
|
|
193
|
+
from scipy.stats import chi2
|
|
194
|
+
pvals = pvals[np.isfinite(pvals) & (pvals > 0) & (pvals <= 1)]
|
|
195
|
+
if len(pvals) == 0:
|
|
196
|
+
return float("nan")
|
|
197
|
+
obs_median_chi2 = chi2.ppf(1 - np.median(pvals), df=1)
|
|
198
|
+
expected_median_chi2 = chi2.ppf(0.5, df=1) # ≈ 0.4549
|
|
199
|
+
return round(float(obs_median_chi2 / expected_median_chi2), 4)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# ---------------------------------------------------------------------------
|
|
203
|
+
# Single-axis QQ plot
|
|
204
|
+
# ---------------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
def plot_qq_single(
    pvals: np.ndarray | pd.Series,
    ax: plt.Axes,
    label: Optional[str] = None,
    color: str = "steelblue",
    point_size: float = 8,
    ci: float = 0.95,
    ci_alpha: float = 0.15,
    signif_threshold: Optional[float] = 5e-8,
    show_lambda: bool = True,
    title: Optional[str] = None,
    # --- speed options ---
    thin: bool = False,
    thin_below: float = 0.01,
    max_points: int = 50_000,
    rasterized: bool = True,
) -> plt.Axes:
    """Draw a single QQ plot onto *ax*.

    P-values that are non-finite or outside (0, 1] are discarded before
    anything is computed.  When none remain, a warning is logged and only
    the axis labels, spines and title are applied, so that one unusable
    track does not abort a multi-panel figure.

    Parameters
    ----------
    pvals:
        Array or Series of raw p-values (not −log10).
    ax:
        Matplotlib Axes to draw on.
    label:
        Legend label for the scatter points.  The legend is only drawn
        when a label is given and there is data to plot.
    color:
        Colour for points and CI fill.
    point_size:
        Scatter point size.
    ci:
        Confidence interval level (default 0.95).
    ci_alpha:
        Transparency of the CI band.
    signif_threshold:
        Currently unused: the significance line is disabled in this
        version.  The parameter is accepted so existing callers keep
        working.
    show_lambda:
        Annotate the plot with the genomic inflation factor λ (skipped
        when λ is ``nan``).
    title:
        Axes title.
    thin:
        Enable p-value thinning for speed (default ``False``).  Thinning
        is only applied when more than *max_points* valid p-values remain.
    thin_below:
        P-value threshold passed to ``thin_pvals`` as ``tail_threshold``;
        points below it are always kept, points above it are downsampled.
    max_points:
        Maximum number of points to plot after thinning (default 50 000).
    rasterized:
        Render the scatter as a bitmap inside vector output formats —
        greatly reduces PDF/SVG file size (default ``True``).

    Returns
    -------
    plt.Axes
    """
    pvals_full = np.asarray(pvals, dtype=float)
    pvals_full = pvals_full[np.isfinite(pvals_full) & (pvals_full > 0) & (pvals_full <= 1)]
    has_data = pvals_full.size > 0

    # Lambda always on the full (unthinned) array; nan when it is empty.
    lam = _compute_lambda(pvals_full)

    if has_data:
        if thin and len(pvals_full) > max_points:
            plot_pvals, plot_ranks, n_full = thin_pvals(
                pvals_full, tail_threshold=thin_below, max_points=max_points
            )
        else:
            plot_pvals = np.sort(pvals_full)
            plot_ranks = np.arange(1, len(plot_pvals) + 1)
            n_full = len(plot_pvals)

        expected, observed, ci_lo, ci_hi = _qq_arrays(
            plot_pvals, ranks=plot_ranks, n_full=n_full, ci=ci
        )

        # CI band.  ``:g`` avoids the truncation of int(ci * 100), which
        # turns e.g. 0.58 into "57% CI" because 0.58 * 100 < 58 in floats.
        ax.fill_between(
            expected, ci_lo, ci_hi,
            color=color, alpha=ci_alpha, linewidth=0,
            label=f"{ci * 100:g}% CI",
        )

        # Diagonal null line, extended 5% past the largest plotted value.
        max_val = max(expected.max(), observed.max()) * 1.05
        ax.plot([0, max_val], [0, max_val], color="grey", linewidth=0.8,
                linestyle="--", zorder=1)

        # Observed points
        ax.scatter(
            expected, observed,
            s=point_size, color=color, alpha=0.85,
            label=label, zorder=2, edgecolors="none",
            rasterized=rasterized,
        )
    else:
        # Reducing an empty array with .max() would raise; leave the axes
        # empty instead and tell the user why.
        logger.warning(
            "No valid p-values for QQ plot%s; leaving axes empty.",
            f" '{label}'" if label else "",
        )

    # NOTE: no significance line is drawn here; *signif_threshold* is
    # accepted but not used by this function.

    # Lambda annotation
    if show_lambda and not math.isnan(lam):
        ax.text(
            0.05, 0.95,
            f"λ = {lam:.4f}",
            transform=ax.transAxes,
            va="top", ha="left",
            fontsize=9, fontstyle="italic",
            color="black",
        )

    ax.set_xlabel("Expected −log₁₀(p)", fontsize=10)
    ax.set_ylabel("Observed −log₁₀(p)", fontsize=10)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    if title:
        ax.set_title(title, fontsize=10, pad=6)
    if label and has_data:
        ax.legend(fontsize=8, frameon=False, loc="lower right")

    return ax
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
# ---------------------------------------------------------------------------
|
|
334
|
+
# Combined multi-panel figure
|
|
335
|
+
# ---------------------------------------------------------------------------
|
|
336
|
+
|
|
337
|
+
def plot_qq_combined(
    pval_dict: dict[str, np.ndarray | pd.Series],
    colors: Optional[list[str]] = None,
    point_size: float = 8,
    ci: float = 0.95,
    signif_threshold: Optional[float] = 5e-8,
    show_lambda: bool = True,
    ncols: int = 3,
    figsize: Optional[tuple] = None,
    dpi: int = 300,
    title: Optional[str] = None,
    output_path: Optional[str] = None,
    fig_format: str = "png",
    thin: bool = False,
    thin_below: float = 0.01,
    max_points: int = 50_000,
    rasterized: bool = True,
) -> tuple[plt.Figure, list[plt.Axes]]:
    """Plot all QQ plots in a single figure arranged in a grid.

    Parameters
    ----------
    pval_dict:
        Ordered dict of ``{label: p_value_array}``.  Each label is used
        both as the legend label and as the panel title.
    colors:
        Currently ignored: every track is coloured from the ``tab10``
        palette, cycling after ten tracks.  Accepted so existing callers
        keep working.
    point_size, ci, signif_threshold, show_lambda:
        Forwarded unchanged to :func:`plot_qq_single` for every panel.
    ncols:
        Number of columns in the subplot grid (default 3).  Must be at
        least 1.
    figsize:
        Figure size.  When ``None`` it is 4.5 × 4.5 inches per grid cell.
    dpi:
        Resolution used when saving the figure.
    title:
        Optional figure-level title.
    output_path:
        If given, the figure is saved to ``f"{output_path}.{fmt}"``; the
        format extension is always appended to this value.
    fig_format:
        Output format.  When empty, the suffix of *output_path* is used,
        falling back to ``"png"``.
    thin, thin_below, max_points, rasterized:
        See :func:`plot_qq_single`.

    Returns
    -------
    (fig, axes)
        The figure and the list of axes that carry a plot (unused grid
        cells are hidden and not returned).

    Raises
    ------
    ValueError
        If *pval_dict* is empty or *ncols* is smaller than 1.
    """
    n = len(pval_dict)
    if n == 0:
        raise ValueError("pval_dict is empty.")
    if ncols < 1:
        # Without this guard ncols=0 surfaces as a ZeroDivisionError below.
        raise ValueError(f"ncols must be >= 1, got {ncols}.")

    nrows = math.ceil(n / ncols)

    # The palette is always tab10; the *colors* argument is not consulted.
    cmap = plt.get_cmap("tab10")
    palette = [mcolors.to_hex(cmap(i % 10)) for i in range(n)]

    if figsize is None:
        figsize = (ncols * 4.5, nrows * 4.5)

    # squeeze=False keeps a 2-D array even for a 1×1 grid so flatten() works.
    fig, axes_grid = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axes_flat = axes_grid.flatten()

    for idx, (label, pvals) in enumerate(pval_dict.items()):
        plot_qq_single(
            pvals=pvals,
            ax=axes_flat[idx],
            label=label,
            color=palette[idx],
            point_size=point_size,
            ci=ci,
            signif_threshold=signif_threshold,
            show_lambda=show_lambda,
            title=label,
            thin=thin,
            thin_below=thin_below,
            max_points=max_points,
            rasterized=rasterized,
        )

    # Hide grid cells left over when n is not a multiple of ncols.
    for ax in axes_flat[n:]:
        ax.set_visible(False)

    if title:
        fig.suptitle(title, fontsize=13, y=1.01)

    # Lay out this figure explicitly rather than whichever one pyplot
    # currently considers active.
    fig.tight_layout()

    if output_path:
        fmt = fig_format or Path(output_path).suffix.lstrip(".") or "png"
        out_file = f"{output_path}.{fmt}"
        fig.savefig(out_file, format=fmt, dpi=dpi, bbox_inches="tight")
        logger.info("Saved combined QQ plot: %s", out_file)

    return fig, list(axes_flat[:n])
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
# ---------------------------------------------------------------------------
|
|
431
|
+
# Separate figures — one file per sumstat
|
|
432
|
+
# ---------------------------------------------------------------------------
|
|
433
|
+
|
|
434
|
+
def plot_qq_separate(
    pval_dict: dict[str, np.ndarray | pd.Series],
    output_path: str = ".",
    colors: Optional[list[str]] = None,
    point_size: float = 8,
    ci: float = 0.95,
    signif_threshold: Optional[float] = 5e-8,
    show_lambda: bool = True,
    figsize: tuple = (5, 5),
    dpi: int = 300,
    fig_format: str = "png",
    thin: bool = False,
    thin_below: float = 0.01,
    max_points: int = 50_000,
    rasterized: bool = True,
) -> list[str]:
    """Save one QQ plot per sumstat as individual files.

    Parameters
    ----------
    pval_dict:
        Ordered dict of ``{label: p_value_array}``.  Each label is used as
        legend label, panel title and (sanitised) filename component.
    output_path:
        Filename prefix.  Each file is written to
        ``f"{output_path}_{safe_label}.{fig_format}"``, where
        ``safe_label`` is the label with spaces replaced by ``_`` and
        ``/`` replaced by ``-``.
    colors:
        Currently ignored: every track is coloured from the ``tab10``
        palette, cycling after ten tracks.  Accepted so existing callers
        keep working.
    point_size, ci, signif_threshold, show_lambda:
        Forwarded unchanged to :func:`plot_qq_single`.
    figsize:
        Size of each individual figure.
    dpi:
        Resolution used when saving.
    fig_format:
        Output format and file extension.
    thin, thin_below, max_points, rasterized:
        See :func:`plot_qq_single`.

    Returns
    -------
    List of output file paths, in the order of *pval_dict*.  Empty when
    *pval_dict* is empty.
    """
    if not pval_dict:
        return []

    n = len(pval_dict)

    # The palette is always tab10; the *colors* argument is not consulted.
    cmap = plt.get_cmap("tab10")
    palette = [mcolors.to_hex(cmap(i % 10)) for i in range(n)]

    saved: list[str] = []

    for idx, (label, pvals) in enumerate(pval_dict.items()):
        fig, ax = plt.subplots(figsize=figsize)
        try:
            plot_qq_single(
                pvals=pvals,
                ax=ax,
                label=label,
                color=palette[idx],
                point_size=point_size,
                ci=ci,
                signif_threshold=signif_threshold,
                show_lambda=show_lambda,
                title=label,
                thin=thin,
                thin_below=thin_below,
                max_points=max_points,
                rasterized=rasterized,
            )

            fig.tight_layout()

            safe_label = label.replace(" ", "_").replace("/", "-")
            out_path = f"{output_path}_{safe_label}.{fig_format}"
            fig.savefig(out_path, format=fig_format, dpi=dpi, bbox_inches="tight")
        finally:
            # Close even when plotting or saving fails, so a failing track
            # does not leave an open figure registered with pyplot.
            plt.close(fig)

        logger.info("Saved QQ plot: %s", out_path)
        saved.append(out_path)

    return saved
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# ---------------------------------------------------------------------------
|
|
514
|
+
# Overlay — all sumstats on one axes
|
|
515
|
+
# ---------------------------------------------------------------------------
|
|
516
|
+
|
|
517
|
+
def plot_qq_overlay(
    pval_dict: dict[str, np.ndarray | pd.Series],
    colors: Optional[list[str]] = None,
    point_size: float = 8,
    ci: float = 0.95,
    ci_alpha: float = 0.10,
    signif_threshold: Optional[float] = 5e-8,
    show_lambda: bool = True,
    figsize: tuple = (6, 6),
    dpi: int = 300,
    title: Optional[str] = None,
    output_path: Optional[str] = None,
    fig_format: str = "png",
    thin: bool = False,
    thin_below: float = 0.01,
    max_points: int = 50_000,
    rasterized: bool = True,
) -> tuple[plt.Figure, plt.Axes]:
    """Plot all sumstats on a single QQ axes, each coloured differently.

    Lambda (λ) values appear in the legend label for each sumstat.  A
    sumstat with no valid p-values (finite and within (0, 1]) is skipped
    with a warning instead of aborting the whole figure.

    Parameters
    ----------
    pval_dict:
        Ordered dict of ``{label: p_value_array}``.
    colors:
        Currently ignored: every sumstat is coloured from the ``tab10``
        palette, cycling after ten entries.  Accepted so existing callers
        keep working.
    point_size:
        Scatter point size.
    ci:
        Confidence interval level for the bands.
    ci_alpha:
        Transparency of CI bands (default 0.10).
    signif_threshold:
        Currently unused: the significance line is disabled in this
        version.  Accepted so existing callers keep working.
    show_lambda:
        Append λ to each legend entry.
    figsize:
        Figure size.
    dpi:
        Resolution used when saving the figure.
    title:
        Optional axes title.
    output_path:
        If given, the figure is saved to ``f"{output_path}.{fmt}"``; the
        format extension is always appended to this value.
    fig_format:
        Output format.  When empty, the suffix of *output_path* is used,
        falling back to ``"png"``.
    thin, thin_below, max_points:
        Thinning is applied per sumstat via ``thin_pvals`` when *thin* is
        true and more than *max_points* valid p-values remain;
        *thin_below* is passed as ``tail_threshold``.
    rasterized:
        Passed to the scatter call.

    Returns
    -------
    (fig, ax)

    Raises
    ------
    ValueError
        If *pval_dict* is empty.
    """
    n = len(pval_dict)
    if n == 0:
        raise ValueError("pval_dict is empty.")

    # The palette is always tab10; the *colors* argument is not consulted.
    cmap = plt.get_cmap("tab10")
    palette = [mcolors.to_hex(cmap(i % 10)) for i in range(n)]

    fig, ax = plt.subplots(figsize=figsize)
    global_max = 0.0
    n_drawn = 0

    for idx, (label, pvals) in enumerate(pval_dict.items()):
        pvals_full = np.asarray(pvals, dtype=float)
        pvals_full = pvals_full[np.isfinite(pvals_full) & (pvals_full > 0) & (pvals_full <= 1)]

        if pvals_full.size == 0:
            # .max() on an empty array would raise further down.
            logger.warning(
                "No valid p-values for '%s'; skipping it in the overlay QQ plot.",
                label,
            )
            continue

        # Lambda on full array before any thinning
        lam = _compute_lambda(pvals_full)

        if thin and len(pvals_full) > max_points:
            plot_pvals, plot_ranks, n_full = thin_pvals(
                pvals_full, tail_threshold=thin_below, max_points=max_points
            )
        else:
            plot_pvals = np.sort(pvals_full)
            plot_ranks = np.arange(1, len(plot_pvals) + 1)
            n_full = len(plot_pvals)

        expected, observed, ci_lo, ci_hi = _qq_arrays(
            plot_pvals, ranks=plot_ranks, n_full=n_full, ci=ci
        )

        color = palette[idx]
        legend_label = f"{label} (λ={lam:.4f})" if show_lambda else label

        ax.fill_between(
            expected, ci_lo, ci_hi,
            color=color, alpha=ci_alpha, linewidth=0,
        )
        # Later sumstats get a higher zorder so they are drawn on top.
        ax.scatter(
            expected, observed,
            s=point_size, color=color, alpha=0.85,
            label=legend_label, zorder=2 + idx, edgecolors="none",
            rasterized=rasterized,
        )

        global_max = max(global_max, expected.max(), observed.max())
        n_drawn += 1

    # Diagonal null line, extended 5% past the largest plotted value.
    ax.plot(
        [0, global_max * 1.05], [0, global_max * 1.05],
        color="grey", linewidth=0.8, linestyle="--", zorder=1,
    )

    # NOTE: no significance line is drawn here; *signif_threshold* is
    # accepted but not used by this function.

    ax.set_xlabel("Expected −log₁₀(p)", fontsize=11)
    ax.set_ylabel("Observed −log₁₀(p)", fontsize=11)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    if n_drawn:
        ax.legend(
            fontsize=8, frameon=True, framealpha=0.7,
            edgecolor="lightgrey", loc="lower right",
        )

    if title:
        ax.set_title(title, fontsize=11, pad=8)

    # Lay out this figure explicitly rather than whichever one pyplot
    # currently considers active.
    fig.tight_layout()

    if output_path:
        fmt = fig_format or Path(output_path).suffix.lstrip(".") or "png"
        out_file = f"{output_path}.{fmt}"
        fig.savefig(out_file, format=fmt, dpi=dpi, bbox_inches="tight")
        logger.info("Saved overlay QQ plot: %s", out_file)

    return fig, ax
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycmplot
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Multi-track circular and linear Manhattan plot generation for GWAS summary statistics
|
|
5
5
|
Author: Kevin Esoh
|
|
6
6
|
Author-email: Kevin Esoh <kesohku1@jh.edu>
|
|
@@ -183,7 +183,8 @@ pycmplot \
|
|
|
183
183
|
| `-b, --build` | Comma-separated genome builds of sumstats | off |
|
|
184
184
|
| `-bc, --build_column` | Genome build column name (containing hg18/hg19/hg38) | off |
|
|
185
185
|
| `-m, --mode` | `lm` linear or `cm` circular | `lm` |
|
|
186
|
-
| `-qq, --qq_plot` | Also generate a QQ-plot | off
|
|
186
|
+
| `-qq, --qq_plot` | Also generate a QQ-plot | off |
|
|
187
|
+
| `-qq_thin, --qq_thin` | Thin p-values for faster QQ-plotting | off |
|
|
187
188
|
| `--logp` | Plot -log10(p) | off |
|
|
188
189
|
| `-sig, --signif_threshold` | Genome-wide significance threshold | off (auto 0.05/N) |
|
|
189
190
|
| `-sigl, --signif_line` | Value for genome-wide significance line if different from `-sig` | 5e-8 |
|
|
@@ -1,21 +1,22 @@
|
|
|
1
1
|
pycmplot/__init__.py,sha256=fGBHi1vh_9_eu2Xks5B5tPk51RZxji5s-khGhSTNQCI,1288
|
|
2
|
-
pycmplot/_core.py,sha256=
|
|
2
|
+
pycmplot/_core.py,sha256=H8Ab0db5cV9wbkuq8Nq9xjTqwRkra5ook6xu6A-N-ys,13021
|
|
3
3
|
pycmplot/annotation.py,sha256=gMgDfnHmMYpkLuQIaJNHmtXVHHIeSUAcviLrisF2vmY,20886
|
|
4
|
-
pycmplot/cli.py,sha256=
|
|
4
|
+
pycmplot/cli.py,sha256=k6uBR5V0Y89VSzq-GHZxfR0XmBGS6aEUt66fqYEmQ40,19430
|
|
5
5
|
pycmplot/constants.py,sha256=XaT3pTWM3dkawU1cA0HFpaNnUupSjv28wpPgmnVEjL0,3431
|
|
6
|
-
pycmplot/io.py,sha256=
|
|
6
|
+
pycmplot/io.py,sha256=hWM4x_euyYUZ3gzFaP9tkZYHi_1fj8XE1wE9f5XOdbE,35301
|
|
7
7
|
pycmplot/liftover.py,sha256=ZawfO9ZKZADFwyXZBnbrovh4TnV-ja1qHHnIgtxSCBM,6942
|
|
8
8
|
pycmplot/resources.py,sha256=r0zHy_-9wu98lkqKENYrptX54uO6np_x94_ju3v2KYE,6414
|
|
9
9
|
pycmplot/stats.py,sha256=8TXHxfGc4sUr3rE3cHnS2mXfIS1PPj0YgDk1C-z2Pqk,5813
|
|
10
10
|
pycmplot/data/Homo_sapiens.GRCh37.geneinfo.tsv.gz,sha256=kLldtgT5-k4ZzU5jN--woFZEuOaWe9pQ4g4hhB3sdQI,840666
|
|
11
11
|
pycmplot/data/Homo_sapiens.GRCh38.geneinfo.tsv.gz,sha256=cRAuNxifZi12yOxNKrVt0uTS52HwDDFzV_1N4E2Qwuw,626249
|
|
12
12
|
pycmplot/data/hg19ToHg38.over.chain,sha256=oHPYkUIztVQtKXYauOxLOBUFKxOWSRnBKh77LjEfvzk,606773
|
|
13
|
-
pycmplot/plotting/circular.py,sha256=
|
|
14
|
-
pycmplot/plotting/linear.py,sha256=
|
|
15
|
-
pycmplot
|
|
13
|
+
pycmplot/plotting/circular.py,sha256=Xa_2GDW7lLKb8PeN_IL3_hvfkQaqcbQ68Q5QrxjCKs4,27295
|
|
14
|
+
pycmplot/plotting/linear.py,sha256=lrioORKyOaz34UAOhBWg3W2kLq6nBT7HNkPxy8hX340,37416
|
|
15
|
+
pycmplot/plotting/qq.py,sha256=ylPJj9gMlDqTOR7JWnQ7wCfvEBUczqEUVMsFiXXEt3s,20883
|
|
16
|
+
pycmplot-0.2.2.dist-info/licenses/LICENSE,sha256=7HtJWU-I9Tayt7xnvHU0D6oVqeTp3hMqCbTxbkYBTZQ,20904
|
|
16
17
|
pycmplot_docs/docs/conf.py,sha256=gUt_OitflxpaOrIjeP2aYJ_LCWqTRRdmo_HIcVVf3hI,2992
|
|
17
|
-
pycmplot-0.2.
|
|
18
|
-
pycmplot-0.2.
|
|
19
|
-
pycmplot-0.2.
|
|
20
|
-
pycmplot-0.2.
|
|
21
|
-
pycmplot-0.2.
|
|
18
|
+
pycmplot-0.2.2.dist-info/METADATA,sha256=9qN2xJp3CGcTp8kxjfmzAz7AhfJVPmk60jTqJ8sxeqI,8108
|
|
19
|
+
pycmplot-0.2.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
pycmplot-0.2.2.dist-info/entry_points.txt,sha256=cE8IAltA_Q-QQuWQ5DE3Lv-9ktYQ_jyWaD6I97QbeyU,49
|
|
21
|
+
pycmplot-0.2.2.dist-info/top_level.txt,sha256=gxbPirasq6TczoykxC2gfk5_En7R65BN-J5ADiV5i3c,23
|
|
22
|
+
pycmplot-0.2.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|