gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/bd_common_data.py +6 -3
- gwaslab/bd_download.py +9 -9
- gwaslab/bd_get_hapmap3.py +43 -9
- gwaslab/g_Log.py +14 -5
- gwaslab/g_Sumstats.py +86 -18
- gwaslab/g_SumstatsPair.py +70 -23
- gwaslab/g_SumstatsT.py +2 -2
- gwaslab/g_version.py +10 -10
- gwaslab/hm_casting.py +9 -4
- gwaslab/hm_harmonize_sumstats.py +88 -83
- gwaslab/io_preformat_input.py +14 -14
- gwaslab/io_read_ldsc.py +49 -1
- gwaslab/ldsc_irwls.py +198 -0
- gwaslab/ldsc_jackknife.py +514 -0
- gwaslab/ldsc_ldscore.py +417 -0
- gwaslab/ldsc_parse.py +294 -0
- gwaslab/ldsc_regressions.py +747 -0
- gwaslab/ldsc_sumstats.py +629 -0
- gwaslab/qc_check_datatype.py +1 -1
- gwaslab/qc_fix_sumstats.py +163 -161
- gwaslab/util_ex_calculate_ldmatrix.py +2 -2
- gwaslab/util_ex_gwascatalog.py +24 -24
- gwaslab/util_ex_ldproxyfinder.py +9 -9
- gwaslab/util_ex_ldsc.py +189 -0
- gwaslab/util_in_calculate_gc.py +6 -6
- gwaslab/util_in_calculate_power.py +42 -43
- gwaslab/util_in_convert_h2.py +8 -8
- gwaslab/util_in_fill_data.py +28 -28
- gwaslab/util_in_filter_value.py +91 -52
- gwaslab/util_in_get_density.py +8 -8
- gwaslab/util_in_get_sig.py +407 -65
- gwaslab/viz_aux_annotate_plot.py +12 -12
- gwaslab/viz_aux_quickfix.py +18 -18
- gwaslab/viz_aux_reposition_text.py +3 -3
- gwaslab/viz_aux_save_figure.py +14 -5
- gwaslab/viz_plot_compare_af.py +29 -30
- gwaslab/viz_plot_compare_effect.py +63 -71
- gwaslab/viz_plot_miamiplot2.py +6 -6
- gwaslab/viz_plot_mqqplot.py +17 -3
- gwaslab/viz_plot_qqplot.py +1 -1
- gwaslab/viz_plot_regionalplot.py +33 -32
- gwaslab/viz_plot_rg_heatmap.py +28 -26
- gwaslab/viz_plot_stackedregional.py +40 -21
- gwaslab/viz_plot_trumpetplot.py +50 -55
- gwaslab-3.4.39.dist-info/LICENSE +674 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
- gwaslab-3.4.39.dist-info/RECORD +80 -0
- gwaslab-3.4.38.dist-info/RECORD +0 -72
- /gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
gwaslab/viz_plot_regionalplot.py
CHANGED
|
@@ -86,7 +86,7 @@ def _plot_regional(
|
|
|
86
86
|
region_ld_threshold = region_ld_threshold,
|
|
87
87
|
region_ld_colors = region_ld_colors,
|
|
88
88
|
marker_size= marker_size,
|
|
89
|
-
log=log)
|
|
89
|
+
log=log,verbose=verbose)
|
|
90
90
|
else:
|
|
91
91
|
ax1, lead_id = _pinpoint_lead(sumstats = sumstats,
|
|
92
92
|
ax1 = ax1,
|
|
@@ -94,14 +94,14 @@ def _plot_regional(
|
|
|
94
94
|
region_ld_threshold = region_ld_threshold,
|
|
95
95
|
region_ld_colors = region_ld_colors1,
|
|
96
96
|
marker_size= marker_size,
|
|
97
|
-
log=log)
|
|
97
|
+
log=log,verbose=verbose)
|
|
98
98
|
ax1, lead_id2 = _pinpoint_lead(sumstats = sumstats,
|
|
99
99
|
ax1 = ax1,
|
|
100
100
|
region_ref=region_ref_second,
|
|
101
101
|
region_ld_threshold = region_ld_threshold,
|
|
102
102
|
region_ld_colors = region_ld_colors2,
|
|
103
103
|
marker_size= marker_size,
|
|
104
|
-
log=log)
|
|
104
|
+
log=log,verbose=verbose)
|
|
105
105
|
|
|
106
106
|
if (vcf_path is not None) and region_ld_legend:
|
|
107
107
|
if region_ref_second is None:
|
|
@@ -240,7 +240,7 @@ def _plot_regional(
|
|
|
240
240
|
return ax1, ax3, ax4, cbar, lead_snp_i, lead_snp_i2
|
|
241
241
|
|
|
242
242
|
# + ###########################################################################################################################################################################
|
|
243
|
-
def _get_lead_id(sumstats=None, region_ref=None, log=None):
|
|
243
|
+
def _get_lead_id(sumstats=None, region_ref=None, log=None, verbose=True):
|
|
244
244
|
region_ref_to_check = copy.copy(region_ref)
|
|
245
245
|
try:
|
|
246
246
|
if len(region_ref_to_check)>0 and type(region_ref_to_check) is not str:
|
|
@@ -260,23 +260,23 @@ def _get_lead_id(sumstats=None, region_ref=None, log=None):
|
|
|
260
260
|
if region_ref_to_check is not None:
|
|
261
261
|
if type(lead_id) is list:
|
|
262
262
|
if len(lead_id)==0 :
|
|
263
|
-
log.
|
|
263
|
+
log.warning("{} not found. Roll back to lead variant...".format(region_ref_to_check))
|
|
264
264
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
265
265
|
else:
|
|
266
266
|
log.write(" -Reference variant ID: {} - {}".format(region_ref_to_check, lead_id))
|
|
267
267
|
|
|
268
268
|
if lead_id is None:
|
|
269
|
-
log.write(" -Extracting lead variant...")
|
|
269
|
+
log.write(" -Extracting lead variant...", verbose=verbose)
|
|
270
270
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
271
271
|
|
|
272
272
|
return lead_id
|
|
273
273
|
|
|
274
|
-
def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log):
|
|
274
|
+
def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log, verbose):
|
|
275
275
|
if region_ref is None:
|
|
276
|
-
log.write(" -Extracting lead variant...")
|
|
276
|
+
log.write(" -Extracting lead variant..." , verbose=verbose)
|
|
277
277
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
278
278
|
else:
|
|
279
|
-
lead_id = _get_lead_id(sumstats, region_ref, log)
|
|
279
|
+
lead_id = _get_lead_id(sumstats, region_ref, log, verbose)
|
|
280
280
|
|
|
281
281
|
ax1.scatter(sumstats.loc[lead_id,"i"],sumstats.loc[lead_id,"scaled_P"],
|
|
282
282
|
color=region_ld_colors[-1],
|
|
@@ -398,7 +398,7 @@ def _plot_gene_track(
|
|
|
398
398
|
log=Log()):
|
|
399
399
|
|
|
400
400
|
# load gtf
|
|
401
|
-
|
|
401
|
+
log.write(" -Loading gtf files from:" + gtf_path, verbose=verbose)
|
|
402
402
|
uniq_gene_region,exons = process_gtf( gtf_path = gtf_path ,
|
|
403
403
|
region = region,
|
|
404
404
|
region_flank_factor = region_flank_factor,
|
|
@@ -416,7 +416,7 @@ def _plot_gene_track(
|
|
|
416
416
|
font_size_in_pixels= taf[2] * pixels_per_track
|
|
417
417
|
font_size_in_points = font_size_in_pixels * pixels_per_point
|
|
418
418
|
linewidth_in_points= pixels_per_track * pixels_per_point
|
|
419
|
-
|
|
419
|
+
log.write(" -plotting gene track..", verbose=verbose)
|
|
420
420
|
|
|
421
421
|
sig_gene_name = "Undefined"
|
|
422
422
|
sig_gene_name2 = "Undefined"
|
|
@@ -424,6 +424,7 @@ def _plot_gene_track(
|
|
|
424
424
|
texts_to_adjust_middle = []
|
|
425
425
|
texts_to_adjust_right = []
|
|
426
426
|
for index,row in uniq_gene_region.iterrows():
|
|
427
|
+
|
|
427
428
|
gene_color="#020080"
|
|
428
429
|
#if row[6][0]=="+":
|
|
429
430
|
if row["strand"][0]=="+":
|
|
@@ -496,7 +497,7 @@ def _plot_gene_track(
|
|
|
496
497
|
ax3.plot((gene_track_start_i+row["start"],gene_track_start_i+row["end"]),
|
|
497
498
|
(row["stack"]*2,row["stack"]*2),linewidth=linewidth_in_points*taf[3],color=exon_color,solid_capstyle="butt")
|
|
498
499
|
|
|
499
|
-
|
|
500
|
+
log.write(" -Finished plotting gene track..", verbose=verbose)
|
|
500
501
|
|
|
501
502
|
return ax3,texts_to_adjust_middle
|
|
502
503
|
|
|
@@ -504,25 +505,26 @@ def _plot_gene_track(
|
|
|
504
505
|
# Helpers
|
|
505
506
|
# -############################################################################################################################################################################
|
|
506
507
|
def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, verbose, pos ,nea,ea, region_ld_threshold, vcf_chr_dict,tabix):
|
|
507
|
-
|
|
508
|
-
|
|
508
|
+
log.write("Start to load reference genotype...", verbose=verbose)
|
|
509
|
+
log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
|
|
509
510
|
|
|
510
511
|
|
|
511
512
|
|
|
512
513
|
# load genotype data of the targeted region
|
|
513
514
|
ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
|
|
514
515
|
if ref_genotype is None:
|
|
515
|
-
|
|
516
|
+
log.warning("No data was retrieved. Skipping ...")
|
|
516
517
|
ref_genotype=dict()
|
|
517
518
|
ref_genotype["variants/POS"]=np.array([],dtype="int64")
|
|
518
|
-
|
|
519
|
-
|
|
519
|
+
log.write(" -Retrieving index...", verbose=verbose)
|
|
520
|
+
log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
|
|
520
521
|
# match sumstats pos and ref pos:
|
|
521
522
|
# get ref index for its first appearance of sumstats pos
|
|
522
523
|
#######################################################################################
|
|
523
524
|
def match_varaint(x):
|
|
524
525
|
# x: "POS,NEA,EA"
|
|
525
526
|
if np.any(ref_genotype["variants/POS"] == x.iloc[0]):
|
|
527
|
+
# position match
|
|
526
528
|
if len(np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0])>1:
|
|
527
529
|
# multiple position matches
|
|
528
530
|
for j in np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]:
|
|
@@ -532,16 +534,15 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
532
534
|
return j
|
|
533
535
|
elif x.iloc[1] in ref_genotype["variants/ALT"][j]:
|
|
534
536
|
if x.iloc[2] == ref_genotype["variants/REF"][j]:
|
|
535
|
-
return j
|
|
536
|
-
|
|
537
|
-
return None
|
|
537
|
+
return j
|
|
538
|
+
return None
|
|
538
539
|
else:
|
|
539
540
|
# single match
|
|
540
541
|
return np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0][0]
|
|
541
542
|
else:
|
|
542
543
|
# no position match
|
|
543
544
|
return None
|
|
544
|
-
|
|
545
|
+
log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)
|
|
545
546
|
sumstats["REFINDEX"] = sumstats[[pos,nea,ea]].apply(lambda x: match_varaint(x),axis=1)
|
|
546
547
|
#############################################################################################
|
|
547
548
|
#sumstats["REFINDEX"] = sumstats[pos].apply(lambda x: np.where(ref_genotype["variants/POS"] == x )[0][0] if np.any(ref_genotype["variants/POS"] == x) else None)
|
|
@@ -551,7 +552,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
551
552
|
if region_ref is None:
|
|
552
553
|
lead_id = sumstats["scaled_P"].idxmax()
|
|
553
554
|
else:
|
|
554
|
-
lead_id = _get_lead_id(sumstats, region_ref, log)
|
|
555
|
+
lead_id = _get_lead_id(sumstats, region_ref, log, verbose)
|
|
555
556
|
lead_pos = sumstats.loc[lead_id,pos]
|
|
556
557
|
|
|
557
558
|
# if lead pos is available:
|
|
@@ -567,12 +568,12 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
567
568
|
lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
|
|
568
569
|
try:
|
|
569
570
|
if len(set(lead_snp_genotype[0]))==1:
|
|
570
|
-
log.
|
|
571
|
+
log.warning("The variant is mono-allelic in reference VCF. LD can not be calculated.")
|
|
571
572
|
except:
|
|
572
573
|
pass
|
|
573
574
|
other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
|
|
574
575
|
|
|
575
|
-
|
|
576
|
+
log.write(" -Calculating Rsq...", verbose=verbose)
|
|
576
577
|
|
|
577
578
|
if len(other_snp_genotype)>1:
|
|
578
579
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
|
|
@@ -580,7 +581,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
580
581
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
|
|
581
582
|
sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
|
|
582
583
|
else:
|
|
583
|
-
|
|
584
|
+
log.write(" -Lead SNP not found in reference...", verbose=verbose)
|
|
584
585
|
sumstats["RSQ"]=None
|
|
585
586
|
|
|
586
587
|
sumstats["RSQ"] = sumstats["RSQ"].astype("float")
|
|
@@ -600,7 +601,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
600
601
|
#####################################################################################################
|
|
601
602
|
if region_ref_second is not None:
|
|
602
603
|
|
|
603
|
-
lead_id2 = _get_lead_id(sumstats, region_ref_second, log)
|
|
604
|
+
lead_id2 = _get_lead_id(sumstats, region_ref_second, log, verbose)
|
|
604
605
|
|
|
605
606
|
lead_pos2 = sumstats.loc[lead_id2,pos]
|
|
606
607
|
if lead_pos2 in ref_genotype["variants/POS"]:
|
|
@@ -615,12 +616,12 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
615
616
|
lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
|
|
616
617
|
try:
|
|
617
618
|
if len(set(lead_snp_genotype[0]))==1:
|
|
618
|
-
log.
|
|
619
|
+
log.warning("The variant is mono-allelic in reference VCF. LD can not be calculated.")
|
|
619
620
|
except:
|
|
620
621
|
pass
|
|
621
622
|
other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
|
|
622
623
|
|
|
623
|
-
|
|
624
|
+
log.write(" -Calculating Rsq...", verbose=verbose)
|
|
624
625
|
|
|
625
626
|
if len(other_snp_genotype)>1:
|
|
626
627
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
|
|
@@ -628,7 +629,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
628
629
|
valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
|
|
629
630
|
sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ2"] = valid_r2
|
|
630
631
|
else:
|
|
631
|
-
|
|
632
|
+
log.write(" -Lead SNP not found in reference...", verbose=verbose)
|
|
632
633
|
sumstats["RSQ2"]=None
|
|
633
634
|
|
|
634
635
|
sumstats["RSQ2"] = sumstats["RSQ2"].astype("float")
|
|
@@ -652,7 +653,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
|
|
|
652
653
|
#sumstats.loc[lead_id,"LEAD2"]
|
|
653
654
|
####################################################################################################
|
|
654
655
|
|
|
655
|
-
|
|
656
|
+
log.write("Finished loading reference genotype successfully!", verbose=verbose)
|
|
656
657
|
return sumstats
|
|
657
658
|
|
|
658
659
|
# -############################################################################################################################################################################
|
|
@@ -714,8 +715,8 @@ def process_gtf(gtf_path,
|
|
|
714
715
|
# extract protein coding gene
|
|
715
716
|
if region_protein_coding is True:
|
|
716
717
|
#genes_1mb = genes_1mb.loc[genes_1mb["gene_biotype"]=="protein_coding",:].copy()
|
|
717
|
-
pc_genes_1mb_list = genes_1mb.loc[(genes_1mb["feature"]=="gene")& (genes_1mb["gene_biotype"]=="protein_coding"),"name"].values
|
|
718
|
-
genes_1mb = genes_1mb.loc[genes_1mb["name"].isin(pc_genes_1mb_list),:]
|
|
718
|
+
pc_genes_1mb_list = genes_1mb.loc[(genes_1mb["feature"]=="gene")& (genes_1mb["gene_biotype"]=="protein_coding") & (genes_1mb["name"]!=""),"name"].values
|
|
719
|
+
genes_1mb = genes_1mb.loc[(genes_1mb["feature"].isin(["exon","gene"])) & (genes_1mb["name"].isin(pc_genes_1mb_list)),:]
|
|
719
720
|
# extract exon
|
|
720
721
|
exons = genes_1mb.loc[genes_1mb["feature"]=="exon",:].copy()
|
|
721
722
|
|
gwaslab/viz_plot_rg_heatmap.py
CHANGED
|
@@ -8,6 +8,7 @@ import matplotlib
|
|
|
8
8
|
from gwaslab.g_Log import Log
|
|
9
9
|
import scipy.stats as ss
|
|
10
10
|
from gwaslab.viz_aux_save_figure import save_figure
|
|
11
|
+
|
|
11
12
|
#################################################################################################
|
|
12
13
|
def convert_p_to_width(p,sig_level):
|
|
13
14
|
width_factor= -np.log10(sig_level)
|
|
@@ -54,7 +55,7 @@ def plot_rg(ldscrg,
|
|
|
54
55
|
save=None,
|
|
55
56
|
save_args=None):
|
|
56
57
|
|
|
57
|
-
|
|
58
|
+
log.write("Start to create ldsc genetic correlation heatmap..." ,verbose=verbose)
|
|
58
59
|
# configure arguments
|
|
59
60
|
if fig_args is None:
|
|
60
61
|
fig_args = {"dpi":300}
|
|
@@ -78,14 +79,14 @@ def plot_rg(ldscrg,
|
|
|
78
79
|
save_args = {}
|
|
79
80
|
|
|
80
81
|
#drop na records in P column
|
|
81
|
-
|
|
82
|
+
log.write("Raw dataset records:",len(ldscrg) ,verbose=verbose)
|
|
82
83
|
df=ldscrg.dropna(subset=[p]).copy()
|
|
83
84
|
|
|
84
|
-
|
|
85
|
+
log.write(" -Raw dataset non-NA records:",len(df) ,verbose=verbose)
|
|
85
86
|
# create unique pair column
|
|
86
87
|
df["p1p2"]=df.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
|
|
87
88
|
|
|
88
|
-
|
|
89
|
+
log.write("Filling diagnal line and duplicated pair for plotting..." ,verbose=verbose)
|
|
89
90
|
# fill na
|
|
90
91
|
df_fill_reverse = df.loc[(df[p2].isin(df[p1].values)) & (df[p1].isin(df[p2].values)),:].copy()
|
|
91
92
|
df_fill_reverse = df_fill_reverse.rename(columns={p1:p2,p2:p1})
|
|
@@ -96,16 +97,23 @@ def plot_rg(ldscrg,
|
|
|
96
97
|
p2_dup_list = list(df.loc[(df[p1].isin(df[p2].values)),"p1"].values)
|
|
97
98
|
p_dup_list = p2_dup_list + p1_dup_list
|
|
98
99
|
if len(set(p_dup_list)) > 0:
|
|
99
|
-
|
|
100
|
+
log.write(" -Diagnal records:", len(set(p_dup_list)) ,verbose=verbose)
|
|
100
101
|
df_fill_dia["p1"] = p_dup_list
|
|
101
102
|
df_fill_dia["p2"] = df_fill_dia["p1"]
|
|
102
103
|
df_fill_dia["rg"] = 1
|
|
103
104
|
|
|
104
105
|
df_fill_na = pd.DataFrame(columns=df.columns)
|
|
105
106
|
df_fill_na[[p1,p2]] = [(i,j) for i in df[p1].sort_values(ascending=False).drop_duplicates() for j in df[p2].sort_values(ascending=False).drop_duplicates()]
|
|
107
|
+
|
|
108
|
+
to_concate=[]
|
|
109
|
+
for i in [df,df_fill_reverse,df_fill_dia,df_fill_na]:
|
|
110
|
+
if len(i)>0:
|
|
111
|
+
to_concate.append(i.dropna(axis=1))
|
|
112
|
+
|
|
106
113
|
# fill diagonal
|
|
107
|
-
df = pd.concat(
|
|
108
|
-
|
|
114
|
+
df = pd.concat(to_concate,ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
|
|
115
|
+
|
|
116
|
+
#log.write(" -Dataset shape match:", len(df)==)
|
|
109
117
|
#
|
|
110
118
|
## remove record with p1 = p2, dropna in P column
|
|
111
119
|
dfp=ldscrg.loc[ldscrg[p1]!=ldscrg[p2],:].dropna(subset=[p]).copy()
|
|
@@ -116,11 +124,11 @@ def plot_rg(ldscrg,
|
|
|
116
124
|
## drop duplicate and keep only unique pairs
|
|
117
125
|
dfp = dfp.drop_duplicates(subset=["p1p2"]).copy()
|
|
118
126
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
127
|
+
log.write("Valid unique trait pairs:",len(dfp) ,verbose=verbose)
|
|
128
|
+
log.write(" -Valid unique trait1:",dfp["p1"].nunique() ,verbose=verbose)
|
|
129
|
+
log.write(" -Valid unique trait2:",dfp["p2"].nunique() ,verbose=verbose)
|
|
130
|
+
log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05) ,verbose=verbose)
|
|
131
|
+
log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))) ,verbose=verbose)
|
|
124
132
|
|
|
125
133
|
#if correction=="fdr":
|
|
126
134
|
# fdr corrected p
|
|
@@ -131,7 +139,7 @@ def plot_rg(ldscrg,
|
|
|
131
139
|
dfp["fdr_p"]=ss.false_discovery_control(dfp[p],method=fdr_method)
|
|
132
140
|
dfp["fdr"] =ss.false_discovery_control(dfp[p],method=fdr_method) < 0.05
|
|
133
141
|
|
|
134
|
-
|
|
142
|
+
log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]) ,verbose=verbose)
|
|
135
143
|
# convert to dict for annotation and plotting
|
|
136
144
|
df_rawp = dfp.set_index("p1p2").loc[:,p].to_dict()
|
|
137
145
|
dfp = dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
|
|
@@ -167,7 +175,7 @@ def plot_rg(ldscrg,
|
|
|
167
175
|
df["x"]=df[p2].map(dic_p2)
|
|
168
176
|
df["x_y"]=df[p2].map(dic_p1)
|
|
169
177
|
|
|
170
|
-
|
|
178
|
+
log.write("Plotting heatmap..." ,verbose=verbose)
|
|
171
179
|
########ticks###############################################
|
|
172
180
|
fig,ax = plt.subplots(**fig_args)
|
|
173
181
|
|
|
@@ -196,7 +204,7 @@ def plot_rg(ldscrg,
|
|
|
196
204
|
panno_list={1:{},2:{}}
|
|
197
205
|
rgtoanno=[]
|
|
198
206
|
|
|
199
|
-
|
|
207
|
+
log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]) ,verbose=verbose)
|
|
200
208
|
|
|
201
209
|
for i,row in df.iterrows():
|
|
202
210
|
xcenter=row["x"]
|
|
@@ -298,11 +306,11 @@ def plot_rg(ldscrg,
|
|
|
298
306
|
|
|
299
307
|
# annotate p
|
|
300
308
|
if panno is True:
|
|
301
|
-
|
|
309
|
+
log.write("P value annotation text (Order: Bon -> FDR -> Pnom): " ,verbose=verbose)
|
|
302
310
|
for i,correction in enumerate(corrections):
|
|
303
311
|
for j,sig_level in enumerate(sig_levels):
|
|
304
312
|
index = len(sig_levels)*i + j
|
|
305
|
-
|
|
313
|
+
log.write(" -{} : {}-corrected P < {} ".format(panno_texts[index], correction, sig_level) ,verbose=verbose)
|
|
306
314
|
for panno_set_number in panno_list.keys():
|
|
307
315
|
for key, i in panno_list[panno_set_number].items():
|
|
308
316
|
if panno_set_number == 1:
|
|
@@ -318,14 +326,8 @@ def plot_rg(ldscrg,
|
|
|
318
326
|
ax.set_aspect('equal', adjustable='box')
|
|
319
327
|
|
|
320
328
|
save_figure(fig, save, keyword="ldscrg",save_args=save_args, log=log, verbose=verbose)
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
# fig.savefig("./ldscrg_heatmap.png",bbox_inches="tight",**save_args)
|
|
325
|
-
# log.write(" -Saved to "+ "./ldscrg_heatmap.png" + " successfully!" )
|
|
326
|
-
# else:
|
|
327
|
-
# fig.savefig(save,bbox_inches="tight",**save_args)
|
|
328
|
-
# log.write(" -Saved to "+ save + " successfully!" )
|
|
329
|
-
if verbose: log.write("Finished creating ldsc genetic correlation heatmap!")
|
|
329
|
+
|
|
330
|
+
log.write("Finished creating ldsc genetic correlation heatmap!" ,verbose=verbose)
|
|
331
|
+
|
|
330
332
|
return fig,ax,log,df
|
|
331
333
|
|
|
@@ -59,23 +59,25 @@ def plot_stacked_mqq(objects,
|
|
|
59
59
|
log=Log(),
|
|
60
60
|
**mqq_args
|
|
61
61
|
):
|
|
62
|
+
|
|
62
63
|
log.write("Start to create stacked mqq plot by iteratively calling plot_mqq:",verbose=verbose)
|
|
63
64
|
# load sumstats
|
|
65
|
+
|
|
66
|
+
##########################################################################################################################################
|
|
64
67
|
sumstats_list = []
|
|
65
68
|
for each_object in objects:
|
|
66
69
|
sumstats_list.append(each_object.data)
|
|
67
70
|
|
|
68
|
-
|
|
69
71
|
if fig_args is None:
|
|
70
72
|
fig_args = {"dpi":200}
|
|
71
73
|
if region_lead_grid_line is None:
|
|
72
74
|
region_lead_grid_line = {"alpha":0.5,"linewidth" : 2,"linestyle":"--","color":"#FF0000"}
|
|
73
75
|
if title_pos is None:
|
|
74
|
-
title_pos = [0.
|
|
76
|
+
title_pos = [0.01,0.97]
|
|
75
77
|
if title_args is None:
|
|
76
78
|
title_args = {}
|
|
77
|
-
|
|
78
|
-
|
|
79
|
+
|
|
80
|
+
# create figure and axes ##################################################################################################################
|
|
79
81
|
if mode=="r":
|
|
80
82
|
if len(vcfs)==1:
|
|
81
83
|
vcfs = vcfs *len(sumstats_list)
|
|
@@ -105,27 +107,29 @@ def plot_stacked_mqq(objects,
|
|
|
105
107
|
**fig_args)
|
|
106
108
|
plt.subplots_adjust(hspace=region_hspace)
|
|
107
109
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
110
|
+
##########################################################################################################################################
|
|
111
111
|
mqq_args_for_each_plot = _sort_args(mqq_args, n_plot)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
112
|
+
##########################################################################################################################################
|
|
113
|
+
# get x axis dict
|
|
115
114
|
if mode=="m":
|
|
116
115
|
_posdiccul = _get_chrom_dic(sumstats_list,chrom="CHR",pos="POS",chrpad=0.02)
|
|
117
116
|
else:
|
|
118
117
|
_posdiccul=None
|
|
119
118
|
|
|
119
|
+
##########################################################################################################################################
|
|
120
|
+
# a dict to store lead variants of each plot
|
|
120
121
|
lead_variants_is={}
|
|
122
|
+
|
|
123
|
+
##########################################################################################################################################
|
|
121
124
|
# plot manhattan plot
|
|
122
125
|
for index,sumstats in enumerate(sumstats_list):
|
|
126
|
+
|
|
127
|
+
#################################################################
|
|
123
128
|
if mode=="m" or mode=="r":
|
|
124
129
|
figax = (fig,axes[index],axes[-1])
|
|
125
130
|
elif mode=="mqq":
|
|
126
131
|
figax = (fig,axes[index,0],axes[index,1])
|
|
127
|
-
|
|
128
|
-
|
|
132
|
+
#################################################################
|
|
129
133
|
if index==0:
|
|
130
134
|
# plot last m and gene track
|
|
131
135
|
fig,log,lead_i,lead_i2 = mqqplot(sumstats,
|
|
@@ -151,6 +155,7 @@ def plot_stacked_mqq(objects,
|
|
|
151
155
|
)
|
|
152
156
|
lead_variants_is[index] = (lead_i,lead_i2)
|
|
153
157
|
else:
|
|
158
|
+
# plot only the scatter plot
|
|
154
159
|
fig,log,lead_i,lead_i2 = mqqplot(sumstats,
|
|
155
160
|
chrom="CHR",
|
|
156
161
|
pos="POS",
|
|
@@ -178,13 +183,32 @@ def plot_stacked_mqq(objects,
|
|
|
178
183
|
# adjust labels
|
|
179
184
|
# drop labels for each plot
|
|
180
185
|
# set a common laebl for all plots
|
|
181
|
-
|
|
182
|
-
axes[index].set_ylabel("")
|
|
186
|
+
|
|
183
187
|
|
|
184
188
|
if titles is not None:
|
|
185
189
|
for index,title in enumerate(titles):
|
|
186
190
|
axes[index].text(title_pos[0], title_pos[1] , title, transform=axes[index].transAxes,ha="left", va='top',**title_args)
|
|
191
|
+
##########################################################################################################################################
|
|
192
|
+
# draw the line for lead variants
|
|
193
|
+
_draw_grid_line_for_lead_variants(mode, lead_variants_is, n_plot, axes, region_lead_grid_line)
|
|
194
|
+
|
|
195
|
+
##########################################################################################################################################
|
|
196
|
+
_drop_old_y_labels(axes, n_plot)
|
|
197
|
+
|
|
198
|
+
_add_new_y_label(mode, fig, gene_track_height,n_plot,subplot_height )
|
|
199
|
+
|
|
200
|
+
##########################################################################################################################################
|
|
201
|
+
save_figure(fig = fig, save = save, keyword= "stacked_" + mode, save_args=save_args, log = log, verbose=verbose)
|
|
202
|
+
|
|
203
|
+
log.write("Finished creating stacked mqq plot by iteratively calling plot_mqq.",verbose=verbose)
|
|
204
|
+
|
|
205
|
+
return fig, log
|
|
187
206
|
|
|
207
|
+
def _drop_old_y_labels(axes, n_plot):
|
|
208
|
+
for index in range(n_plot):
|
|
209
|
+
axes[index].set_ylabel("")
|
|
210
|
+
|
|
211
|
+
def _draw_grid_line_for_lead_variants(mode, lead_variants_is, n_plot, axes, region_lead_grid_line):
|
|
188
212
|
if mode=="r":
|
|
189
213
|
for index, sig_is in lead_variants_is.items():
|
|
190
214
|
for sig_i in sig_is:
|
|
@@ -192,19 +216,14 @@ def plot_stacked_mqq(objects,
|
|
|
192
216
|
for each_axis_index in range(n_plot + 1):
|
|
193
217
|
axes[each_axis_index].axvline(x=sig_i, zorder=2,**region_lead_grid_line)
|
|
194
218
|
|
|
195
|
-
|
|
219
|
+
def _add_new_y_label(mode, fig, gene_track_height,n_plot,subplot_height ):
|
|
196
220
|
gene_track_height_ratio = gene_track_height/(gene_track_height + n_plot*subplot_height)
|
|
197
221
|
ylabel_height = (1 - gene_track_height_ratio)*0.5 + gene_track_height_ratio
|
|
198
222
|
if mode=="r":
|
|
199
223
|
fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
|
|
200
224
|
fig.text(0.93, ylabel_height, "Recombination rate(cM/Mb)", va='center', rotation=-90)
|
|
201
225
|
elif mode=="m":
|
|
202
|
-
fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
|
|
203
|
-
|
|
204
|
-
save_figure(fig = fig, save = save, keyword= "stacked_" + mode, save_args=save_args, log = log, verbose=verbose)
|
|
205
|
-
log.write("Finished creating stacked mqq plot by iteratively calling plot_mqq.",verbose=verbose)
|
|
206
|
-
return fig, log
|
|
207
|
-
|
|
226
|
+
fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
|
|
208
227
|
|
|
209
228
|
def _sort_args(mqq_args, n_plot):
|
|
210
229
|
mqq_args_for_each_plot={i:{} for i in range(n_plot)}
|