gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +22 -5
- gwaslab/g_Sumstats.py +110 -163
- gwaslab/g_SumstatsPair.py +76 -25
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +36 -17
- gwaslab/hm_harmonize_sumstats.py +354 -221
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +49 -43
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/io_to_formats.py +428 -295
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +3 -3
- gwaslab/qc_fix_sumstats.py +891 -778
- gwaslab/util_ex_calculate_ldmatrix.py +31 -13
- gwaslab/util_ex_gwascatalog.py +25 -25
- gwaslab/util_ex_ldproxyfinder.py +10 -10
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +30 -30
- gwaslab/util_in_filter_value.py +201 -74
- gwaslab/util_in_get_density.py +10 -10
- gwaslab/util_in_get_sig.py +445 -71
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +42 -37
- gwaslab/viz_aux_reposition_text.py +10 -7
- gwaslab/viz_aux_save_figure.py +18 -8
- gwaslab/viz_plot_compare_af.py +32 -33
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +34 -26
- gwaslab/viz_plot_mqqplot.py +126 -75
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +36 -33
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +65 -61
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.37.dist-info/RECORD +0 -72
- /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/viz_plot_regionalplot.py
CHANGED
|
@@ -86,7 +86,7 @@ def _plot_regional(
|
|
|
86
86
|
region_ld_threshold = region_ld_threshold,
|
|
87
87
|
region_ld_colors = region_ld_colors,
|
|
88
88
|
marker_size= marker_size,
|
|
89
|
-
log=log)
|
|
89
|
+
log=log,verbose=verbose)
|
|
90
90
|
else:
|
|
91
91
|
ax1, lead_id = _pinpoint_lead(sumstats = sumstats,
|
|
92
92
|
ax1 = ax1,
|
|
@@ -94,14 +94,14 @@ def _plot_regional(
|
|
|
94
94
|
region_ld_threshold = region_ld_threshold,
|
|
95
95
|
region_ld_colors = region_ld_colors1,
|
|
96
96
|
marker_size= marker_size,
|
|
97
|
-
log=log)
|
|
97
|
+
log=log,verbose=verbose)
|
|
98
98
|
ax1, lead_id2 = _pinpoint_lead(sumstats = sumstats,
|
|
99
99
|
ax1 = ax1,
|
|
100
100
|
region_ref=region_ref_second,
|
|
101
101
|
region_ld_threshold = region_ld_threshold,
|
|
102
102
|
region_ld_colors = region_ld_colors2,
|
|
103
103
|
marker_size= marker_size,
|
|
104
|
-
log=log)
|
|
104
|
+
log=log,verbose=verbose)
|
|
105
105
|
|
|
106
106
|
if (vcf_path is not None) and region_ld_legend:
|
|
107
107
|
if region_ref_second is None:
|
|
@@ -122,6 +122,8 @@ def _plot_regional(
|
|
|
122
122
|
region_ld_colors=region_ld_colors2,
|
|
123
123
|
position=2)
|
|
124
124
|
cbar = [cbar1, cbar2]
|
|
125
|
+
else:
|
|
126
|
+
cbar=None
|
|
125
127
|
if region_title is not None:
|
|
126
128
|
ax1 = _add_region_title(region_title, ax1=ax1,region_title_args=region_title_args )
|
|
127
129
|
## recombinnation rate ##################################################
|
|
@@ -238,7 +240,7 @@ def _plot_regional(
|
|
|
238
240
|
return ax1, ax3, ax4, cbar, lead_snp_i, lead_snp_i2
|
|
239
241
|
|
|
240
242
|
# + ###########################################################################################################################################################################
|
|
241
|
-
def _get_lead_id(sumstats=None, region_ref=None, log=None):
|
|
243
|
+
def _get_lead_id(sumstats=None, region_ref=None, log=None, verbose=True):
|
|
242
244
|
region_ref_to_check = copy.copy(region_ref)
|
|
243
245
|
try:
|
|
244
246
|
if len(region_ref_to_check)>0 and type(region_ref_to_check) is not str:
|
|
@@ -258,23 +260,23 @@ def _get_lead_id(sumstats=None, region_ref=None, log=None):
|
|
|
258
260
|
if region_ref_to_check is not None:
|
|
259
261
|
if type(lead_id) is list:
|
|
260
262
|
if len(lead_id)==0 :
|
|
261
|
-
log.
|
|
263
|
+
log.warning("{} not found. Roll back to lead variant...".format(region_ref_to_check))
|
|
262
264
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
263
265
|
else:
|
|
264
266
|
log.write(" -Reference variant ID: {} - {}".format(region_ref_to_check, lead_id))
|
|
265
267
|
|
|
266
268
|
if lead_id is None:
|
|
267
|
-
log.write(" -Extracting lead variant...")
|
|
269
|
+
log.write(" -Extracting lead variant...", verbose=verbose)
|
|
268
270
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
269
271
|
|
|
270
272
|
return lead_id
|
|
271
273
|
|
|
272
|
-
def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log):
|
|
274
|
+
def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log, verbose):
|
|
273
275
|
if region_ref is None:
|
|
274
|
-
log.write(" -Extracting lead variant...")
|
|
276
|
+
log.write(" -Extracting lead variant..." , verbose=verbose)
|
|
275
277
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
276
278
|
else:
|
|
277
|
-
lead_id = _get_lead_id(sumstats, region_ref, log)
|
|
279
|
+
lead_id = _get_lead_id(sumstats, region_ref, log, verbose)
|
|
278
280
|
|
|
279
281
|
ax1.scatter(sumstats.loc[lead_id,"i"],sumstats.loc[lead_id,"scaled_P"],
|
|
280
282
|
color=region_ld_colors[-1],
|
|
@@ -396,7 +398,7 @@ def _plot_gene_track(
|
|
|
396
398
|
log=Log()):
|
|
397
399
|
|
|
398
400
|
# load gtf
|
|
399
|
-
|
|
401
|
+
log.write(" -Loading gtf files from:" + gtf_path, verbose=verbose)
|
|
400
402
|
uniq_gene_region,exons = process_gtf( gtf_path = gtf_path ,
|
|
401
403
|
region = region,
|
|
402
404
|
region_flank_factor = region_flank_factor,
|
|
@@ -414,7 +416,7 @@ def _plot_gene_track(
|
|
|
414
416
|
font_size_in_pixels= taf[2] * pixels_per_track
|
|
415
417
|
font_size_in_points = font_size_in_pixels * pixels_per_point
|
|
416
418
|
linewidth_in_points= pixels_per_track * pixels_per_point
|
|
417
|
-
|
|
419
|
+
log.write(" -plotting gene track..", verbose=verbose)
|
|
418
420
|
|
|
419
421
|
sig_gene_name = "Undefined"
|
|
420
422
|
sig_gene_name2 = "Undefined"
|
|
@@ -422,6 +424,7 @@ def _plot_gene_track(
|
|
|
422
424
|
texts_to_adjust_middle = []
|
|
423
425
|
texts_to_adjust_right = []
|
|
424
426
|
for index,row in uniq_gene_region.iterrows():
|
|
427
|
+
|
|
425
428
|
gene_color="#020080"
|
|
426
429
|
#if row[6][0]=="+":
|
|
427
430
|
if row["strand"][0]=="+":
|
|
@@ -494,7 +497,7 @@ def _plot_gene_track(
|
|
|
494
497
|
ax3.plot((gene_track_start_i+row["start"],gene_track_start_i+row["end"]),
|
|
495
498
|
(row["stack"]*2,row["stack"]*2),linewidth=linewidth_in_points*taf[3],color=exon_color,solid_capstyle="butt")
|
|
496
499
|
|
|
497
|
-
|
|
500
|
+
log.write(" -Finished plotting gene track..", verbose=verbose)
|
|
498
501
|
|
|
499
502
|
return ax3,texts_to_adjust_middle
|
|
500
503
|
|
|
@@ -502,25 +505,26 @@ def _plot_gene_track(
|
|
|
502
505
|
# Helpers
|
|
503
506
|
# -############################################################################################################################################################################
|
|
504
507
|
def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, verbose, pos ,nea,ea, region_ld_threshold, vcf_chr_dict,tabix):
|
|
505
|
-
|
|
506
|
-
|
|
508
|
+
log.write("Start to load reference genotype...", verbose=verbose)
|
|
509
|
+
log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
|
|
507
510
|
|
|
508
511
|
|
|
509
512
|
|
|
510
513
|
# load genotype data of the targeted region
|
|
511
514
|
ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
|
|
512
515
|
if ref_genotype is None:
|
|
513
|
-
|
|
516
|
+
log.warning("No data was retrieved. Skipping ...")
|
|
514
517
|
ref_genotype=dict()
|
|
515
518
|
ref_genotype["variants/POS"]=np.array([],dtype="int64")
|
|
516
|
-
|
|
517
|
-
|
|
519
|
+
log.write(" -Retrieving index...", verbose=verbose)
|
|
520
|
+
log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
|
|
518
521
|
# match sumstats pos and ref pos:
|
|
519
522
|
# get ref index for its first appearance of sumstats pos
|
|
520
523
|
#######################################################################################
|
|
521
524
|
def match_varaint(x):
|
|
522
525
|
# x: "POS,NEA,EA"
|
|
523
526
|
if np.any(ref_genotype["variants/POS"] == x.iloc[0]):
|
|
527
|
+
# position match
|
|
524
528
|
if len(np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0])>1:
|
|
525
529
|
# multiple position matches
|
|
526
530
|
for j in np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]:
|
|
@@ -530,17 +534,16 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
530
534
|
return j
|
|
531
535
|
elif x.iloc[1] in ref_genotype["variants/ALT"][j]:
|
|
532
536
|
if x.iloc[2] == ref_genotype["variants/REF"][j]:
|
|
533
|
-
return j
|
|
534
|
-
|
|
535
|
-
return None
|
|
537
|
+
return j
|
|
538
|
+
return None
|
|
536
539
|
else:
|
|
537
540
|
# single match
|
|
538
541
|
return np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0][0]
|
|
539
542
|
else:
|
|
540
543
|
# no position match
|
|
541
544
|
return None
|
|
542
|
-
|
|
543
|
-
sumstats["REFINDEX"] = sumstats
|
|
545
|
+
log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)
|
|
546
|
+
sumstats["REFINDEX"] = sumstats[[pos,nea,ea]].apply(lambda x: match_varaint(x),axis=1)
|
|
544
547
|
#############################################################################################
|
|
545
548
|
#sumstats["REFINDEX"] = sumstats[pos].apply(lambda x: np.where(ref_genotype["variants/POS"] == x )[0][0] if np.any(ref_genotype["variants/POS"] == x) else None)
|
|
546
549
|
|
|
@@ -549,7 +552,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
549
552
|
if region_ref is None:
|
|
550
553
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
551
554
|
else:
|
|
552
|
-
lead_id = _get_lead_id(sumstats, region_ref, log)
|
|
555
|
+
lead_id = _get_lead_id(sumstats, region_ref, log, verbose)
|
|
553
556
|
lead_pos = sumstats.loc[lead_id,pos]
|
|
554
557
|
|
|
555
558
|
# if lead pos is available:
|
|
@@ -565,12 +568,12 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
565
568
|
lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
|
|
566
569
|
try:
|
|
567
570
|
if len(set(lead_snp_genotype[0]))==1:
|
|
568
|
-
log.
|
|
571
|
+
log.warning("The variant is mono-allelic in reference VCF. LD can not be calculated.")
|
|
569
572
|
except:
|
|
570
573
|
pass
|
|
571
574
|
other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
|
|
572
575
|
|
|
573
|
-
|
|
576
|
+
log.write(" -Calculating Rsq...", verbose=verbose)
|
|
574
577
|
|
|
575
578
|
if len(other_snp_genotype)>1:
|
|
576
579
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
|
|
@@ -578,7 +581,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
578
581
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
|
|
579
582
|
sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
|
|
580
583
|
else:
|
|
581
|
-
|
|
584
|
+
log.write(" -Lead SNP not found in reference...", verbose=verbose)
|
|
582
585
|
sumstats["RSQ"]=None
|
|
583
586
|
|
|
584
587
|
sumstats["RSQ"] = sumstats["RSQ"].astype("float")
|
|
@@ -598,7 +601,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
598
601
|
#####################################################################################################
|
|
599
602
|
if region_ref_second is not None:
|
|
600
603
|
|
|
601
|
-
lead_id2 = _get_lead_id(sumstats, region_ref_second, log)
|
|
604
|
+
lead_id2 = _get_lead_id(sumstats, region_ref_second, log, verbose)
|
|
602
605
|
|
|
603
606
|
lead_pos2 = sumstats.loc[lead_id2,pos]
|
|
604
607
|
if lead_pos2 in ref_genotype["variants/POS"]:
|
|
@@ -613,12 +616,12 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
613
616
|
lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
|
|
614
617
|
try:
|
|
615
618
|
if len(set(lead_snp_genotype[0]))==1:
|
|
616
|
-
log.
|
|
619
|
+
log.warning("The variant is mono-allelic in reference VCF. LD can not be calculated.")
|
|
617
620
|
except:
|
|
618
621
|
pass
|
|
619
622
|
other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
|
|
620
623
|
|
|
621
|
-
|
|
624
|
+
log.write(" -Calculating Rsq...", verbose=verbose)
|
|
622
625
|
|
|
623
626
|
if len(other_snp_genotype)>1:
|
|
624
627
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
|
|
@@ -626,7 +629,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
626
629
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
|
|
627
630
|
sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ2"] = valid_r2
|
|
628
631
|
else:
|
|
629
|
-
|
|
632
|
+
log.write(" -Lead SNP not found in reference...", verbose=verbose)
|
|
630
633
|
sumstats["RSQ2"]=None
|
|
631
634
|
|
|
632
635
|
sumstats["RSQ2"] = sumstats["RSQ2"].astype("float")
|
|
@@ -650,7 +653,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
650
653
|
#sumstats.loc[lead_id,"LEAD2"]
|
|
651
654
|
####################################################################################################
|
|
652
655
|
|
|
653
|
-
|
|
656
|
+
log.write("Finished loading reference genotype successfully!", verbose=verbose)
|
|
654
657
|
return sumstats
|
|
655
658
|
|
|
656
659
|
# -############################################################################################################################################################################
|
|
@@ -712,8 +715,8 @@ def process_gtf(gtf_path,
|
|
|
712
715
|
# extract protein coding gene
|
|
713
716
|
if region_protein_coding is True:
|
|
714
717
|
#genes_1mb = genes_1mb.loc[genes_1mb["gene_biotype"]=="protein_coding",:].copy()
|
|
715
|
-
pc_genes_1mb_list = genes_1mb.loc[(genes_1mb["feature"]=="gene")& (genes_1mb["gene_biotype"]=="protein_coding"),"name"].values
|
|
716
|
-
genes_1mb = genes_1mb.loc[genes_1mb["name"].isin(pc_genes_1mb_list),:]
|
|
718
|
+
pc_genes_1mb_list = genes_1mb.loc[(genes_1mb["feature"]=="gene")& (genes_1mb["gene_biotype"]=="protein_coding") & (genes_1mb["name"]!=""),"name"].values
|
|
719
|
+
genes_1mb = genes_1mb.loc[(genes_1mb["feature"].isin(["exon","gene"])) & (genes_1mb["name"].isin(pc_genes_1mb_list)),:]
|
|
717
720
|
# extract exon
|
|
718
721
|
exons = genes_1mb.loc[genes_1mb["feature"]=="exon",:].copy()
|
|
719
722
|
|
gwaslab/viz_plot_rg_heatmap.py
CHANGED
|
@@ -8,6 +8,7 @@ import matplotlib
|
|
|
8
8
|
from gwaslab.g_Log import Log
|
|
9
9
|
import scipy.stats as ss
|
|
10
10
|
from gwaslab.viz_aux_save_figure import save_figure
|
|
11
|
+
|
|
11
12
|
#################################################################################################
|
|
12
13
|
def convert_p_to_width(p,sig_level):
|
|
13
14
|
width_factor= -np.log10(sig_level)
|
|
@@ -54,7 +55,7 @@ def plot_rg(ldscrg,
|
|
|
54
55
|
save=None,
|
|
55
56
|
save_args=None):
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
log.write("Start to create ldsc genetic correlation heatmap..." ,verbose=verbose)
|
|
58
59
|
# configure arguments
|
|
59
60
|
if fig_args is None:
|
|
60
61
|
fig_args = {"dpi":300}
|
|
@@ -78,14 +79,14 @@ def plot_rg(ldscrg,
|
|
|
78
79
|
save_args = {}
|
|
79
80
|
|
|
80
81
|
#drop na records in P column
|
|
81
|
-
|
|
82
|
+
log.write("Raw dataset records:",len(ldscrg) ,verbose=verbose)
|
|
82
83
|
df=ldscrg.dropna(subset=[p]).copy()
|
|
83
84
|
|
|
84
|
-
|
|
85
|
+
log.write(" -Raw dataset non-NA records:",len(df) ,verbose=verbose)
|
|
85
86
|
# create unique pair column
|
|
86
87
|
df["p1p2"]=df.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
|
|
87
88
|
|
|
88
|
-
|
|
89
|
+
log.write("Filling diagnal line and duplicated pair for plotting..." ,verbose=verbose)
|
|
89
90
|
# fill na
|
|
90
91
|
df_fill_reverse = df.loc[(df[p2].isin(df[p1].values)) & (df[p1].isin(df[p2].values)),:].copy()
|
|
91
92
|
df_fill_reverse = df_fill_reverse.rename(columns={p1:p2,p2:p1})
|
|
@@ -96,16 +97,23 @@ def plot_rg(ldscrg,
|
|
|
96
97
|
p2_dup_list = list(df.loc[(df[p1].isin(df[p2].values)),"p1"].values)
|
|
97
98
|
p_dup_list = p2_dup_list + p1_dup_list
|
|
98
99
|
if len(set(p_dup_list)) > 0:
|
|
99
|
-
|
|
100
|
+
log.write(" -Diagnal records:", len(set(p_dup_list)) ,verbose=verbose)
|
|
100
101
|
df_fill_dia["p1"] = p_dup_list
|
|
101
102
|
df_fill_dia["p2"] = df_fill_dia["p1"]
|
|
102
103
|
df_fill_dia["rg"] = 1
|
|
103
104
|
|
|
104
105
|
df_fill_na = pd.DataFrame(columns=df.columns)
|
|
105
106
|
df_fill_na[[p1,p2]] = [(i,j) for i in df[p1].sort_values(ascending=False).drop_duplicates() for j in df[p2].sort_values(ascending=False).drop_duplicates()]
|
|
107
|
+
|
|
108
|
+
to_concate=[]
|
|
109
|
+
for i in [df,df_fill_reverse,df_fill_dia,df_fill_na]:
|
|
110
|
+
if len(i)>0:
|
|
111
|
+
to_concate.append(i.dropna(axis=1))
|
|
112
|
+
|
|
106
113
|
# fill diagonal
|
|
107
|
-
df = pd.concat(
|
|
108
|
-
|
|
114
|
+
df = pd.concat(to_concate,ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
|
|
115
|
+
|
|
116
|
+
#log.write(" -Dataset shape match:", len(df)==)
|
|
109
117
|
#
|
|
110
118
|
## remove record with p1 = p2, dropna in P column
|
|
111
119
|
dfp=ldscrg.loc[ldscrg[p1]!=ldscrg[p2],:].dropna(subset=[p]).copy()
|
|
@@ -116,11 +124,11 @@ def plot_rg(ldscrg,
|
|
|
116
124
|
## drop duplicate and keep only unique pairs
|
|
117
125
|
dfp = dfp.drop_duplicates(subset=["p1p2"]).copy()
|
|
118
126
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
127
|
+
log.write("Valid unique trait pairs:",len(dfp) ,verbose=verbose)
|
|
128
|
+
log.write(" -Valid unique trait1:",dfp["p1"].nunique() ,verbose=verbose)
|
|
129
|
+
log.write(" -Valid unique trait2:",dfp["p2"].nunique() ,verbose=verbose)
|
|
130
|
+
log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05) ,verbose=verbose)
|
|
131
|
+
log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))) ,verbose=verbose)
|
|
124
132
|
|
|
125
133
|
#if correction=="fdr":
|
|
126
134
|
# fdr corrected p
|
|
@@ -131,7 +139,7 @@ def plot_rg(ldscrg,
|
|
|
131
139
|
dfp["fdr_p"]=ss.false_discovery_control(dfp[p],method=fdr_method)
|
|
132
140
|
dfp["fdr"] =ss.false_discovery_control(dfp[p],method=fdr_method) < 0.05
|
|
133
141
|
|
|
134
|
-
|
|
142
|
+
log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]) ,verbose=verbose)
|
|
135
143
|
# convert to dict for annotation and plotting
|
|
136
144
|
df_rawp = dfp.set_index("p1p2").loc[:,p].to_dict()
|
|
137
145
|
dfp = dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
|
|
@@ -167,7 +175,7 @@ def plot_rg(ldscrg,
|
|
|
167
175
|
df["x"]=df[p2].map(dic_p2)
|
|
168
176
|
df["x_y"]=df[p2].map(dic_p1)
|
|
169
177
|
|
|
170
|
-
|
|
178
|
+
log.write("Plotting heatmap..." ,verbose=verbose)
|
|
171
179
|
########ticks###############################################
|
|
172
180
|
fig,ax = plt.subplots(**fig_args)
|
|
173
181
|
|
|
@@ -196,7 +204,7 @@ def plot_rg(ldscrg,
|
|
|
196
204
|
panno_list={1:{},2:{}}
|
|
197
205
|
rgtoanno=[]
|
|
198
206
|
|
|
199
|
-
|
|
207
|
+
log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]) ,verbose=verbose)
|
|
200
208
|
|
|
201
209
|
for i,row in df.iterrows():
|
|
202
210
|
xcenter=row["x"]
|
|
@@ -298,11 +306,11 @@ def plot_rg(ldscrg,
|
|
|
298
306
|
|
|
299
307
|
# annotate p
|
|
300
308
|
if panno is True:
|
|
301
|
-
|
|
309
|
+
log.write("P value annotation text (Order: Bon -> FDR -> Pnom): " ,verbose=verbose)
|
|
302
310
|
for i,correction in enumerate(corrections):
|
|
303
311
|
for j,sig_level in enumerate(sig_levels):
|
|
304
312
|
index = len(sig_levels)*i + j
|
|
305
|
-
|
|
313
|
+
log.write(" -{} : {}-corrected P < {} ".format(panno_texts[index], correction, sig_level) ,verbose=verbose)
|
|
306
314
|
for panno_set_number in panno_list.keys():
|
|
307
315
|
for key, i in panno_list[panno_set_number].items():
|
|
308
316
|
if panno_set_number == 1:
|
|
@@ -318,14 +326,8 @@ def plot_rg(ldscrg,
|
|
|
318
326
|
ax.set_aspect('equal', adjustable='box')
|
|
319
327
|
|
|
320
328
|
save_figure(fig, save, keyword="ldscrg",save_args=save_args, log=log, verbose=verbose)
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
# fig.savefig("./ldscrg_heatmap.png",bbox_inches="tight",**save_args)
|
|
325
|
-
# log.write(" -Saved to "+ "./ldscrg_heatmap.png" + " successfully!" )
|
|
326
|
-
# else:
|
|
327
|
-
# fig.savefig(save,bbox_inches="tight",**save_args)
|
|
328
|
-
# log.write(" -Saved to "+ save + " successfully!" )
|
|
329
|
-
if verbose: log.write("Finished creating ldsc genetic correlation heatmap!")
|
|
329
|
+
|
|
330
|
+
log.write("Finished creating ldsc genetic correlation heatmap!" ,verbose=verbose)
|
|
331
|
+
|
|
330
332
|
return fig,ax,log,df
|
|
331
333
|
|
|
@@ -59,23 +59,25 @@ def plot_stacked_mqq(objects,
|
|
|
59
59
|
log=Log(),
|
|
60
60
|
**mqq_args
|
|
61
61
|
):
|
|
62
|
+
|
|
62
63
|
log.write("Start to create stacked mqq plot by iteratively calling plot_mqq:",verbose=verbose)
|
|
63
64
|
# load sumstats
|
|
65
|
+
|
|
66
|
+
##########################################################################################################################################
|
|
64
67
|
sumstats_list = []
|
|
65
68
|
for each_object in objects:
|
|
66
69
|
sumstats_list.append(each_object.data)
|
|
67
70
|
|
|
68
|
-
|
|
69
71
|
if fig_args is None:
|
|
70
72
|
fig_args = {"dpi":200}
|
|
71
73
|
if region_lead_grid_line is None:
|
|
72
74
|
region_lead_grid_line = {"alpha":0.5,"linewidth" : 2,"linestyle":"--","color":"#FF0000"}
|
|
73
75
|
if title_pos is None:
|
|
74
|
-
title_pos = [0.
|
|
76
|
+
title_pos = [0.01,0.97]
|
|
75
77
|
if title_args is None:
|
|
76
78
|
title_args = {}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
+
|
|
80
|
+
# create figure and axes ##################################################################################################################
|
|
79
81
|
if mode=="r":
|
|
80
82
|
if len(vcfs)==1:
|
|
81
83
|
vcfs = vcfs *len(sumstats_list)
|
|
@@ -105,27 +107,29 @@ def plot_stacked_mqq(objects,
|
|
|
105
107
|
**fig_args)
|
|
106
108
|
plt.subplots_adjust(hspace=region_hspace)
|
|
107
109
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
110
|
+
##########################################################################################################################################
|
|
111
111
|
mqq_args_for_each_plot = _sort_args(mqq_args, n_plot)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
112
|
+
##########################################################################################################################################
|
|
113
|
+
# get x axis dict
|
|
115
114
|
if mode=="m":
|
|
116
115
|
_posdiccul = _get_chrom_dic(sumstats_list,chrom="CHR",pos="POS",chrpad=0.02)
|
|
117
116
|
else:
|
|
118
117
|
_posdiccul=None
|
|
119
118
|
|
|
119
|
+
##########################################################################################################################################
|
|
120
|
+
# a dict to store lead variants of each plot
|
|
120
121
|
lead_variants_is={}
|
|
122
|
+
|
|
123
|
+
##########################################################################################################################################
|
|
121
124
|
# plot manhattan plot
|
|
122
125
|
for index,sumstats in enumerate(sumstats_list):
|
|
126
|
+
|
|
127
|
+
#################################################################
|
|
123
128
|
if mode=="m" or mode=="r":
|
|
124
129
|
figax = (fig,axes[index],axes[-1])
|
|
125
130
|
elif mode=="mqq":
|
|
126
131
|
figax = (fig,axes[index,0],axes[index,1])
|
|
127
|
-
|
|
128
|
-
|
|
132
|
+
#################################################################
|
|
129
133
|
if index==0:
|
|
130
134
|
# plot last m and gene track
|
|
131
135
|
fig,log,lead_i,lead_i2 = mqqplot(sumstats,
|
|
@@ -151,6 +155,7 @@ def plot_stacked_mqq(objects,
|
|
|
151
155
|
)
|
|
152
156
|
lead_variants_is[index] = (lead_i,lead_i2)
|
|
153
157
|
else:
|
|
158
|
+
# plot only the scatter plot
|
|
154
159
|
fig,log,lead_i,lead_i2 = mqqplot(sumstats,
|
|
155
160
|
chrom="CHR",
|
|
156
161
|
pos="POS",
|
|
@@ -178,13 +183,32 @@ def plot_stacked_mqq(objects,
|
|
|
178
183
|
# adjust labels
|
|
179
184
|
# drop labels for each plot
|
|
180
185
|
# set a common laebl for all plots
|
|
181
|
-
|
|
182
|
-
axes[index].set_ylabel("")
|
|
186
|
+
|
|
183
187
|
|
|
184
188
|
if titles is not None:
|
|
185
189
|
for index,title in enumerate(titles):
|
|
186
190
|
axes[index].text(title_pos[0], title_pos[1] , title, transform=axes[index].transAxes,ha="left", va='top',**title_args)
|
|
191
|
+
##########################################################################################################################################
|
|
192
|
+
# draw the line for lead variants
|
|
193
|
+
_draw_grid_line_for_lead_variants(mode, lead_variants_is, n_plot, axes, region_lead_grid_line)
|
|
194
|
+
|
|
195
|
+
##########################################################################################################################################
|
|
196
|
+
_drop_old_y_labels(axes, n_plot)
|
|
197
|
+
|
|
198
|
+
_add_new_y_label(mode, fig, gene_track_height,n_plot,subplot_height )
|
|
199
|
+
|
|
200
|
+
##########################################################################################################################################
|
|
201
|
+
save_figure(fig = fig, save = save, keyword= "stacked_" + mode, save_args=save_args, log = log, verbose=verbose)
|
|
202
|
+
|
|
203
|
+
log.write("Finished creating stacked mqq plot by iteratively calling plot_mqq.",verbose=verbose)
|
|
204
|
+
|
|
205
|
+
return fig, log
|
|
187
206
|
|
|
207
|
+
def _drop_old_y_labels(axes, n_plot):
|
|
208
|
+
for index in range(n_plot):
|
|
209
|
+
axes[index].set_ylabel("")
|
|
210
|
+
|
|
211
|
+
def _draw_grid_line_for_lead_variants(mode, lead_variants_is, n_plot, axes, region_lead_grid_line):
|
|
188
212
|
if mode=="r":
|
|
189
213
|
for index, sig_is in lead_variants_is.items():
|
|
190
214
|
for sig_i in sig_is:
|
|
@@ -192,19 +216,14 @@ def plot_stacked_mqq(objects,
|
|
|
192
216
|
for each_axis_index in range(n_plot + 1):
|
|
193
217
|
axes[each_axis_index].axvline(x=sig_i, zorder=2,**region_lead_grid_line)
|
|
194
218
|
|
|
195
|
-
|
|
219
|
+
def _add_new_y_label(mode, fig, gene_track_height,n_plot,subplot_height ):
|
|
196
220
|
gene_track_height_ratio = gene_track_height/(gene_track_height + n_plot*subplot_height)
|
|
197
221
|
ylabel_height = (1 - gene_track_height_ratio)*0.5 + gene_track_height_ratio
|
|
198
222
|
if mode=="r":
|
|
199
223
|
fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
|
|
200
224
|
fig.text(0.93, ylabel_height, "Recombination rate(cM/Mb)", va='center', rotation=-90)
|
|
201
225
|
elif mode=="m":
|
|
202
|
-
fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
|
|
203
|
-
|
|
204
|
-
save_figure(fig = fig, save = save, keyword= "stacked_" + mode, save_args=save_args, log = log, verbose=verbose)
|
|
205
|
-
log.write("Finished creating stacked mqq plot by iteratively calling plot_mqq.",verbose=verbose)
|
|
206
|
-
return fig, log
|
|
207
|
-
|
|
226
|
+
fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
|
|
208
227
|
|
|
209
228
|
def _sort_args(mqq_args, n_plot):
|
|
210
229
|
mqq_args_for_each_plot={i:{} for i in range(n_plot)}
|