gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (57)
  1. gwaslab/bd_common_data.py +6 -3
  2. gwaslab/bd_download.py +9 -9
  3. gwaslab/bd_get_hapmap3.py +43 -9
  4. gwaslab/data/formatbook.json +722 -721
  5. gwaslab/g_Log.py +22 -5
  6. gwaslab/g_Sumstats.py +110 -163
  7. gwaslab/g_SumstatsPair.py +76 -25
  8. gwaslab/g_SumstatsT.py +2 -2
  9. gwaslab/g_Sumstats_summary.py +3 -3
  10. gwaslab/g_version.py +10 -10
  11. gwaslab/hm_casting.py +36 -17
  12. gwaslab/hm_harmonize_sumstats.py +354 -221
  13. gwaslab/hm_rsid_to_chrpos.py +1 -1
  14. gwaslab/io_preformat_input.py +49 -43
  15. gwaslab/io_read_ldsc.py +49 -1
  16. gwaslab/io_to_formats.py +428 -295
  17. gwaslab/ldsc_irwls.py +198 -0
  18. gwaslab/ldsc_jackknife.py +514 -0
  19. gwaslab/ldsc_ldscore.py +417 -0
  20. gwaslab/ldsc_parse.py +294 -0
  21. gwaslab/ldsc_regressions.py +747 -0
  22. gwaslab/ldsc_sumstats.py +629 -0
  23. gwaslab/qc_check_datatype.py +3 -3
  24. gwaslab/qc_fix_sumstats.py +891 -778
  25. gwaslab/util_ex_calculate_ldmatrix.py +31 -13
  26. gwaslab/util_ex_gwascatalog.py +25 -25
  27. gwaslab/util_ex_ldproxyfinder.py +10 -10
  28. gwaslab/util_ex_ldsc.py +189 -0
  29. gwaslab/util_ex_process_ref.py +3 -3
  30. gwaslab/util_ex_run_coloc.py +26 -4
  31. gwaslab/util_in_calculate_gc.py +6 -6
  32. gwaslab/util_in_calculate_power.py +42 -43
  33. gwaslab/util_in_convert_h2.py +8 -8
  34. gwaslab/util_in_fill_data.py +30 -30
  35. gwaslab/util_in_filter_value.py +201 -74
  36. gwaslab/util_in_get_density.py +10 -10
  37. gwaslab/util_in_get_sig.py +445 -71
  38. gwaslab/viz_aux_annotate_plot.py +12 -12
  39. gwaslab/viz_aux_quickfix.py +42 -37
  40. gwaslab/viz_aux_reposition_text.py +10 -7
  41. gwaslab/viz_aux_save_figure.py +18 -8
  42. gwaslab/viz_plot_compare_af.py +32 -33
  43. gwaslab/viz_plot_compare_effect.py +63 -71
  44. gwaslab/viz_plot_miamiplot2.py +34 -26
  45. gwaslab/viz_plot_mqqplot.py +126 -75
  46. gwaslab/viz_plot_qqplot.py +11 -8
  47. gwaslab/viz_plot_regionalplot.py +36 -33
  48. gwaslab/viz_plot_rg_heatmap.py +28 -26
  49. gwaslab/viz_plot_stackedregional.py +40 -21
  50. gwaslab/viz_plot_trumpetplot.py +65 -61
  51. gwaslab-3.4.39.dist-info/LICENSE +674 -0
  52. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
  53. gwaslab-3.4.39.dist-info/RECORD +80 -0
  54. gwaslab-3.4.37.dist-info/RECORD +0 -72
  55. /gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
  56. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
  57. {gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0
@@ -86,7 +86,7 @@ def _plot_regional(
86
86
  region_ld_threshold = region_ld_threshold,
87
87
  region_ld_colors = region_ld_colors,
88
88
  marker_size= marker_size,
89
- log=log)
89
+ log=log,verbose=verbose)
90
90
  else:
91
91
  ax1, lead_id = _pinpoint_lead(sumstats = sumstats,
92
92
  ax1 = ax1,
@@ -94,14 +94,14 @@ def _plot_regional(
94
94
  region_ld_threshold = region_ld_threshold,
95
95
  region_ld_colors = region_ld_colors1,
96
96
  marker_size= marker_size,
97
- log=log)
97
+ log=log,verbose=verbose)
98
98
  ax1, lead_id2 = _pinpoint_lead(sumstats = sumstats,
99
99
  ax1 = ax1,
100
100
  region_ref=region_ref_second,
101
101
  region_ld_threshold = region_ld_threshold,
102
102
  region_ld_colors = region_ld_colors2,
103
103
  marker_size= marker_size,
104
- log=log)
104
+ log=log,verbose=verbose)
105
105
 
106
106
  if (vcf_path is not None) and region_ld_legend:
107
107
  if region_ref_second is None:
@@ -122,6 +122,8 @@ def _plot_regional(
122
122
  region_ld_colors=region_ld_colors2,
123
123
  position=2)
124
124
  cbar = [cbar1, cbar2]
125
+ else:
126
+ cbar=None
125
127
  if region_title is not None:
126
128
  ax1 = _add_region_title(region_title, ax1=ax1,region_title_args=region_title_args )
127
129
  ## recombinnation rate ##################################################
@@ -238,7 +240,7 @@ def _plot_regional(
238
240
  return ax1, ax3, ax4, cbar, lead_snp_i, lead_snp_i2
239
241
 
240
242
  # + ###########################################################################################################################################################################
241
- def _get_lead_id(sumstats=None, region_ref=None, log=None):
243
+ def _get_lead_id(sumstats=None, region_ref=None, log=None, verbose=True):
242
244
  region_ref_to_check = copy.copy(region_ref)
243
245
  try:
244
246
  if len(region_ref_to_check)>0 and type(region_ref_to_check) is not str:
@@ -258,23 +260,23 @@ def _get_lead_id(sumstats=None, region_ref=None, log=None):
258
260
  if region_ref_to_check is not None:
259
261
  if type(lead_id) is list:
260
262
  if len(lead_id)==0 :
261
- log.write(" -WARNING: {} not found. Roll back to lead variant...".format(region_ref_to_check))
263
+ log.warning("{} not found. Roll back to lead variant...".format(region_ref_to_check))
262
264
  lead_id = sumstats["scaled_P"].idxmax()
263
265
  else:
264
266
  log.write(" -Reference variant ID: {} - {}".format(region_ref_to_check, lead_id))
265
267
 
266
268
  if lead_id is None:
267
- log.write(" -Extracting lead variant...")
269
+ log.write(" -Extracting lead variant...", verbose=verbose)
268
270
  lead_id = sumstats["scaled_P"].idxmax()
269
271
 
270
272
  return lead_id
271
273
 
272
- def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log):
274
+ def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log, verbose):
273
275
  if region_ref is None:
274
- log.write(" -Extracting lead variant...")
276
+ log.write(" -Extracting lead variant..." , verbose=verbose)
275
277
  lead_id = sumstats["scaled_P"].idxmax()
276
278
  else:
277
- lead_id = _get_lead_id(sumstats, region_ref, log)
279
+ lead_id = _get_lead_id(sumstats, region_ref, log, verbose)
278
280
 
279
281
  ax1.scatter(sumstats.loc[lead_id,"i"],sumstats.loc[lead_id,"scaled_P"],
280
282
  color=region_ld_colors[-1],
@@ -396,7 +398,7 @@ def _plot_gene_track(
396
398
  log=Log()):
397
399
 
398
400
  # load gtf
399
- if verbose: log.write(" -Loading gtf files from:" + gtf_path)
401
+ log.write(" -Loading gtf files from:" + gtf_path, verbose=verbose)
400
402
  uniq_gene_region,exons = process_gtf( gtf_path = gtf_path ,
401
403
  region = region,
402
404
  region_flank_factor = region_flank_factor,
@@ -414,7 +416,7 @@ def _plot_gene_track(
414
416
  font_size_in_pixels= taf[2] * pixels_per_track
415
417
  font_size_in_points = font_size_in_pixels * pixels_per_point
416
418
  linewidth_in_points= pixels_per_track * pixels_per_point
417
- if verbose: log.write(" -plotting gene track..")
419
+ log.write(" -plotting gene track..", verbose=verbose)
418
420
 
419
421
  sig_gene_name = "Undefined"
420
422
  sig_gene_name2 = "Undefined"
@@ -422,6 +424,7 @@ def _plot_gene_track(
422
424
  texts_to_adjust_middle = []
423
425
  texts_to_adjust_right = []
424
426
  for index,row in uniq_gene_region.iterrows():
427
+
425
428
  gene_color="#020080"
426
429
  #if row[6][0]=="+":
427
430
  if row["strand"][0]=="+":
@@ -494,7 +497,7 @@ def _plot_gene_track(
494
497
  ax3.plot((gene_track_start_i+row["start"],gene_track_start_i+row["end"]),
495
498
  (row["stack"]*2,row["stack"]*2),linewidth=linewidth_in_points*taf[3],color=exon_color,solid_capstyle="butt")
496
499
 
497
- if verbose: log.write(" -Finished plotting gene track..")
500
+ log.write(" -Finished plotting gene track..", verbose=verbose)
498
501
 
499
502
  return ax3,texts_to_adjust_middle
500
503
 
@@ -502,25 +505,26 @@ def _plot_gene_track(
502
505
  # Helpers
503
506
  # -############################################################################################################################################################################
504
507
  def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, verbose, pos ,nea,ea, region_ld_threshold, vcf_chr_dict,tabix):
505
- if verbose: log.write("Start to load reference genotype...")
506
- if verbose: log.write(" -reference vcf path : "+ vcf_path)
508
+ log.write("Start to load reference genotype...", verbose=verbose)
509
+ log.write(" -reference vcf path : "+ vcf_path, verbose=verbose)
507
510
 
508
511
 
509
512
 
510
513
  # load genotype data of the targeted region
511
514
  ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
512
515
  if ref_genotype is None:
513
- if verbose: log.write(" -Warning: no data was retrieved. Skipping ...")
516
+ log.warning("No data was retrieved. Skipping ...")
514
517
  ref_genotype=dict()
515
518
  ref_genotype["variants/POS"]=np.array([],dtype="int64")
516
- if verbose: log.write(" -Retrieving index...")
517
- if verbose: log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])))
519
+ log.write(" -Retrieving index...", verbose=verbose)
520
+ log.write(" -Ref variants in the region: {}".format(len(ref_genotype["variants/POS"])), verbose=verbose)
518
521
  # match sumstats pos and ref pos:
519
522
  # get ref index for its first appearance of sumstats pos
520
523
  #######################################################################################
521
524
  def match_varaint(x):
522
525
  # x: "POS,NEA,EA"
523
526
  if np.any(ref_genotype["variants/POS"] == x.iloc[0]):
527
+ # position match
524
528
  if len(np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0])>1:
525
529
  # multiple position matches
526
530
  for j in np.where(ref_genotype["variants/POS"] == x.iloc[0])[0]:
@@ -530,17 +534,16 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
530
534
  return j
531
535
  elif x.iloc[1] in ref_genotype["variants/ALT"][j]:
532
536
  if x.iloc[2] == ref_genotype["variants/REF"][j]:
533
- return j
534
- else:
535
- return None
537
+ return j
538
+ return None
536
539
  else:
537
540
  # single match
538
541
  return np.where(ref_genotype["variants/POS"] == x.iloc[0] )[0][0]
539
542
  else:
540
543
  # no position match
541
544
  return None
542
- if verbose: log.write(" -Matching variants using POS, NEA, EA ...")
543
- sumstats["REFINDEX"] = sumstats.loc[:,[pos,nea,ea]].apply(lambda x: match_varaint(x),axis=1)
545
+ log.write(" -Matching variants using POS, NEA, EA ...", verbose=verbose)
546
+ sumstats["REFINDEX"] = sumstats[[pos,nea,ea]].apply(lambda x: match_varaint(x),axis=1)
544
547
  #############################################################################################
545
548
  #sumstats["REFINDEX"] = sumstats[pos].apply(lambda x: np.where(ref_genotype["variants/POS"] == x )[0][0] if np.any(ref_genotype["variants/POS"] == x) else None)
546
549
 
@@ -549,7 +552,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
549
552
  if region_ref is None:
550
553
  lead_id = sumstats["scaled_P"].idxmax()
551
554
  else:
552
- lead_id = _get_lead_id(sumstats, region_ref, log)
555
+ lead_id = _get_lead_id(sumstats, region_ref, log, verbose)
553
556
  lead_pos = sumstats.loc[lead_id,pos]
554
557
 
555
558
  # if lead pos is available:
@@ -565,12 +568,12 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
565
568
  lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
566
569
  try:
567
570
  if len(set(lead_snp_genotype[0]))==1:
568
- log.write(" -WARNING: The variant is mono-allelic in reference VCF. LD can not be calculated.", verbose=verbose)
571
+ log.warning("The variant is mono-allelic in reference VCF. LD can not be calculated.")
569
572
  except:
570
573
  pass
571
574
  other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
572
575
 
573
- if verbose: log.write(" -Calculating Rsq...")
576
+ log.write(" -Calculating Rsq...", verbose=verbose)
574
577
 
575
578
  if len(other_snp_genotype)>1:
576
579
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
@@ -578,7 +581,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
578
581
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
579
582
  sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ"] = valid_r2
580
583
  else:
581
- if verbose: log.write(" -Lead SNP not found in reference...")
584
+ log.write(" -Lead SNP not found in reference...", verbose=verbose)
582
585
  sumstats["RSQ"]=None
583
586
 
584
587
  sumstats["RSQ"] = sumstats["RSQ"].astype("float")
@@ -598,7 +601,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
598
601
  #####################################################################################################
599
602
  if region_ref_second is not None:
600
603
 
601
- lead_id2 = _get_lead_id(sumstats, region_ref_second, log)
604
+ lead_id2 = _get_lead_id(sumstats, region_ref_second, log, verbose)
602
605
 
603
606
  lead_pos2 = sumstats.loc[lead_id2,pos]
604
607
  if lead_pos2 in ref_genotype["variants/POS"]:
@@ -613,12 +616,12 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
613
616
  lead_snp_genotype = GenotypeArray([ref_genotype["calldata/GT"][lead_snp_ref_index]]).to_n_alt()
614
617
  try:
615
618
  if len(set(lead_snp_genotype[0]))==1:
616
- log.write(" -WARNING: The variant is mono-allelic in reference VCF. LD can not be calculated.", verbose=verbose)
619
+ log.warning("The variant is mono-allelic in reference VCF. LD can not be calculated.")
617
620
  except:
618
621
  pass
619
622
  other_snp_genotype = GenotypeArray(ref_genotype["calldata/GT"][other_snps_ref_index]).to_n_alt()
620
623
 
621
- if verbose: log.write(" -Calculating Rsq...")
624
+ log.write(" -Calculating Rsq...", verbose=verbose)
622
625
 
623
626
  if len(other_snp_genotype)>1:
624
627
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype)[0],2)
@@ -626,7 +629,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
626
629
  valid_r2= np.power(rogers_huff_r_between(lead_snp_genotype,other_snp_genotype),2)
627
630
  sumstats.loc[~sumstats["REFINDEX"].isna(),"RSQ2"] = valid_r2
628
631
  else:
629
- if verbose: log.write(" -Lead SNP not found in reference...")
632
+ log.write(" -Lead SNP not found in reference...", verbose=verbose)
630
633
  sumstats["RSQ2"]=None
631
634
 
632
635
  sumstats["RSQ2"] = sumstats["RSQ2"].astype("float")
@@ -650,7 +653,7 @@ def process_vcf(sumstats, vcf_path, region,region_ref, region_ref_second, log, v
650
653
  #sumstats.loc[lead_id,"LEAD2"]
651
654
  ####################################################################################################
652
655
 
653
- if verbose: log.write("Finished loading reference genotype successfully!")
656
+ log.write("Finished loading reference genotype successfully!", verbose=verbose)
654
657
  return sumstats
655
658
 
656
659
  # -############################################################################################################################################################################
@@ -712,8 +715,8 @@ def process_gtf(gtf_path,
712
715
  # extract protein coding gene
713
716
  if region_protein_coding is True:
714
717
  #genes_1mb = genes_1mb.loc[genes_1mb["gene_biotype"]=="protein_coding",:].copy()
715
- pc_genes_1mb_list = genes_1mb.loc[(genes_1mb["feature"]=="gene")& (genes_1mb["gene_biotype"]=="protein_coding"),"name"].values
716
- genes_1mb = genes_1mb.loc[genes_1mb["name"].isin(pc_genes_1mb_list),:]
718
+ pc_genes_1mb_list = genes_1mb.loc[(genes_1mb["feature"]=="gene")& (genes_1mb["gene_biotype"]=="protein_coding") & (genes_1mb["name"]!=""),"name"].values
719
+ genes_1mb = genes_1mb.loc[(genes_1mb["feature"].isin(["exon","gene"])) & (genes_1mb["name"].isin(pc_genes_1mb_list)),:]
717
720
  # extract exon
718
721
  exons = genes_1mb.loc[genes_1mb["feature"]=="exon",:].copy()
719
722
 
@@ -8,6 +8,7 @@ import matplotlib
8
8
  from gwaslab.g_Log import Log
9
9
  import scipy.stats as ss
10
10
  from gwaslab.viz_aux_save_figure import save_figure
11
+
11
12
  #################################################################################################
12
13
  def convert_p_to_width(p,sig_level):
13
14
  width_factor= -np.log10(sig_level)
@@ -54,7 +55,7 @@ def plot_rg(ldscrg,
54
55
  save=None,
55
56
  save_args=None):
56
57
 
57
- if verbose: log.write("Start to create ldsc genetic correlation heatmap...")
58
+ log.write("Start to create ldsc genetic correlation heatmap..." ,verbose=verbose)
58
59
  # configure arguments
59
60
  if fig_args is None:
60
61
  fig_args = {"dpi":300}
@@ -78,14 +79,14 @@ def plot_rg(ldscrg,
78
79
  save_args = {}
79
80
 
80
81
  #drop na records in P column
81
- if verbose: log.write("Raw dataset records:",len(ldscrg))
82
+ log.write("Raw dataset records:",len(ldscrg) ,verbose=verbose)
82
83
  df=ldscrg.dropna(subset=[p]).copy()
83
84
 
84
- if verbose: log.write(" -Raw dataset non-NA records:",len(df))
85
+ log.write(" -Raw dataset non-NA records:",len(df) ,verbose=verbose)
85
86
  # create unique pair column
86
87
  df["p1p2"]=df.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
87
88
 
88
- if verbose: log.write("Filling diagnal line and duplicated pair for plotting...")
89
+ log.write("Filling diagnal line and duplicated pair for plotting..." ,verbose=verbose)
89
90
  # fill na
90
91
  df_fill_reverse = df.loc[(df[p2].isin(df[p1].values)) & (df[p1].isin(df[p2].values)),:].copy()
91
92
  df_fill_reverse = df_fill_reverse.rename(columns={p1:p2,p2:p1})
@@ -96,16 +97,23 @@ def plot_rg(ldscrg,
96
97
  p2_dup_list = list(df.loc[(df[p1].isin(df[p2].values)),"p1"].values)
97
98
  p_dup_list = p2_dup_list + p1_dup_list
98
99
  if len(set(p_dup_list)) > 0:
99
- if verbose: log.write(" -Diagnal records:", len(set(p_dup_list)))
100
+ log.write(" -Diagnal records:", len(set(p_dup_list)) ,verbose=verbose)
100
101
  df_fill_dia["p1"] = p_dup_list
101
102
  df_fill_dia["p2"] = df_fill_dia["p1"]
102
103
  df_fill_dia["rg"] = 1
103
104
 
104
105
  df_fill_na = pd.DataFrame(columns=df.columns)
105
106
  df_fill_na[[p1,p2]] = [(i,j) for i in df[p1].sort_values(ascending=False).drop_duplicates() for j in df[p2].sort_values(ascending=False).drop_duplicates()]
107
+
108
+ to_concate=[]
109
+ for i in [df,df_fill_reverse,df_fill_dia,df_fill_na]:
110
+ if len(i)>0:
111
+ to_concate.append(i.dropna(axis=1))
112
+
106
113
  # fill diagonal
107
- df = pd.concat([df,df_fill_reverse,df_fill_dia,df_fill_na],ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
108
- #if verbose: log.write(" -Dataset shape match:", len(df)==)
114
+ df = pd.concat(to_concate,ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
115
+
116
+ #log.write(" -Dataset shape match:", len(df)==)
109
117
  #
110
118
  ## remove record with p1 = p2, dropna in P column
111
119
  dfp=ldscrg.loc[ldscrg[p1]!=ldscrg[p2],:].dropna(subset=[p]).copy()
@@ -116,11 +124,11 @@ def plot_rg(ldscrg,
116
124
  ## drop duplicate and keep only unique pairs
117
125
  dfp = dfp.drop_duplicates(subset=["p1p2"]).copy()
118
126
 
119
- if verbose: log.write("Valid unique trait pairs:",len(dfp))
120
- if verbose: log.write(" -Valid unique trait1:",dfp["p1"].nunique())
121
- if verbose: log.write(" -Valid unique trait2:",dfp["p2"].nunique())
122
- if verbose: log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05))
123
- if verbose: log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))))
127
+ log.write("Valid unique trait pairs:",len(dfp) ,verbose=verbose)
128
+ log.write(" -Valid unique trait1:",dfp["p1"].nunique() ,verbose=verbose)
129
+ log.write(" -Valid unique trait2:",dfp["p2"].nunique() ,verbose=verbose)
130
+ log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05) ,verbose=verbose)
131
+ log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))) ,verbose=verbose)
124
132
 
125
133
  #if correction=="fdr":
126
134
  # fdr corrected p
@@ -131,7 +139,7 @@ def plot_rg(ldscrg,
131
139
  dfp["fdr_p"]=ss.false_discovery_control(dfp[p],method=fdr_method)
132
140
  dfp["fdr"] =ss.false_discovery_control(dfp[p],method=fdr_method) < 0.05
133
141
 
134
- if verbose: log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]))
142
+ log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]) ,verbose=verbose)
135
143
  # convert to dict for annotation and plotting
136
144
  df_rawp = dfp.set_index("p1p2").loc[:,p].to_dict()
137
145
  dfp = dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
@@ -167,7 +175,7 @@ def plot_rg(ldscrg,
167
175
  df["x"]=df[p2].map(dic_p2)
168
176
  df["x_y"]=df[p2].map(dic_p1)
169
177
 
170
- if verbose: log.write("Plotting heatmap...")
178
+ log.write("Plotting heatmap..." ,verbose=verbose)
171
179
  ########ticks###############################################
172
180
  fig,ax = plt.subplots(**fig_args)
173
181
 
@@ -196,7 +204,7 @@ def plot_rg(ldscrg,
196
204
  panno_list={1:{},2:{}}
197
205
  rgtoanno=[]
198
206
 
199
- if verbose: log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]))
207
+ log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]) ,verbose=verbose)
200
208
 
201
209
  for i,row in df.iterrows():
202
210
  xcenter=row["x"]
@@ -298,11 +306,11 @@ def plot_rg(ldscrg,
298
306
 
299
307
  # annotate p
300
308
  if panno is True:
301
- if verbose: log.write("P value annotation text : ")
309
+ log.write("P value annotation text (Order: Bon -> FDR -> Pnom): " ,verbose=verbose)
302
310
  for i,correction in enumerate(corrections):
303
311
  for j,sig_level in enumerate(sig_levels):
304
312
  index = len(sig_levels)*i + j
305
- if verbose: log.write(" -{} : {}-corrected P < {}".format(panno_texts[index], correction, sig_level))
313
+ log.write(" -{} : {}-corrected P < {} ".format(panno_texts[index], correction, sig_level) ,verbose=verbose)
306
314
  for panno_set_number in panno_list.keys():
307
315
  for key, i in panno_list[panno_set_number].items():
308
316
  if panno_set_number == 1:
@@ -318,14 +326,8 @@ def plot_rg(ldscrg,
318
326
  ax.set_aspect('equal', adjustable='box')
319
327
 
320
328
  save_figure(fig, save, keyword="ldscrg",save_args=save_args, log=log, verbose=verbose)
321
- #if save:
322
- # if verbose: log.write("Saving plot:")
323
- # if save==True:
324
- # fig.savefig("./ldscrg_heatmap.png",bbox_inches="tight",**save_args)
325
- # log.write(" -Saved to "+ "./ldscrg_heatmap.png" + " successfully!" )
326
- # else:
327
- # fig.savefig(save,bbox_inches="tight",**save_args)
328
- # log.write(" -Saved to "+ save + " successfully!" )
329
- if verbose: log.write("Finished creating ldsc genetic correlation heatmap!")
329
+
330
+ log.write("Finished creating ldsc genetic correlation heatmap!" ,verbose=verbose)
331
+
330
332
  return fig,ax,log,df
331
333
 
@@ -59,23 +59,25 @@ def plot_stacked_mqq(objects,
59
59
  log=Log(),
60
60
  **mqq_args
61
61
  ):
62
+
62
63
  log.write("Start to create stacked mqq plot by iteratively calling plot_mqq:",verbose=verbose)
63
64
  # load sumstats
65
+
66
+ ##########################################################################################################################################
64
67
  sumstats_list = []
65
68
  for each_object in objects:
66
69
  sumstats_list.append(each_object.data)
67
70
 
68
-
69
71
  if fig_args is None:
70
72
  fig_args = {"dpi":200}
71
73
  if region_lead_grid_line is None:
72
74
  region_lead_grid_line = {"alpha":0.5,"linewidth" : 2,"linestyle":"--","color":"#FF0000"}
73
75
  if title_pos is None:
74
- title_pos = [0.03,0.97]
76
+ title_pos = [0.01,0.97]
75
77
  if title_args is None:
76
78
  title_args = {}
77
- # create figure and axes
78
-
79
+
80
+ # create figure and axes ##################################################################################################################
79
81
  if mode=="r":
80
82
  if len(vcfs)==1:
81
83
  vcfs = vcfs *len(sumstats_list)
@@ -105,27 +107,29 @@ def plot_stacked_mqq(objects,
105
107
  **fig_args)
106
108
  plt.subplots_adjust(hspace=region_hspace)
107
109
 
108
- #
109
-
110
-
110
+ ##########################################################################################################################################
111
111
  mqq_args_for_each_plot = _sort_args(mqq_args, n_plot)
112
-
113
-
114
-
112
+ ##########################################################################################################################################
113
+ # get x axis dict
115
114
  if mode=="m":
116
115
  _posdiccul = _get_chrom_dic(sumstats_list,chrom="CHR",pos="POS",chrpad=0.02)
117
116
  else:
118
117
  _posdiccul=None
119
118
 
119
+ ##########################################################################################################################################
120
+ # a dict to store lead variants of each plot
120
121
  lead_variants_is={}
122
+
123
+ ##########################################################################################################################################
121
124
  # plot manhattan plot
122
125
  for index,sumstats in enumerate(sumstats_list):
126
+
127
+ #################################################################
123
128
  if mode=="m" or mode=="r":
124
129
  figax = (fig,axes[index],axes[-1])
125
130
  elif mode=="mqq":
126
131
  figax = (fig,axes[index,0],axes[index,1])
127
-
128
-
132
+ #################################################################
129
133
  if index==0:
130
134
  # plot last m and gene track
131
135
  fig,log,lead_i,lead_i2 = mqqplot(sumstats,
@@ -151,6 +155,7 @@ def plot_stacked_mqq(objects,
151
155
  )
152
156
  lead_variants_is[index] = (lead_i,lead_i2)
153
157
  else:
158
+ # plot only the scatter plot
154
159
  fig,log,lead_i,lead_i2 = mqqplot(sumstats,
155
160
  chrom="CHR",
156
161
  pos="POS",
@@ -178,13 +183,32 @@ def plot_stacked_mqq(objects,
178
183
  # adjust labels
179
184
  # drop labels for each plot
180
185
  # set a common laebl for all plots
181
- for index in range(n_plot):
182
- axes[index].set_ylabel("")
186
+
183
187
 
184
188
  if titles is not None:
185
189
  for index,title in enumerate(titles):
186
190
  axes[index].text(title_pos[0], title_pos[1] , title, transform=axes[index].transAxes,ha="left", va='top',**title_args)
191
+ ##########################################################################################################################################
192
+ # draw the line for lead variants
193
+ _draw_grid_line_for_lead_variants(mode, lead_variants_is, n_plot, axes, region_lead_grid_line)
194
+
195
+ ##########################################################################################################################################
196
+ _drop_old_y_labels(axes, n_plot)
197
+
198
+ _add_new_y_label(mode, fig, gene_track_height,n_plot,subplot_height )
199
+
200
+ ##########################################################################################################################################
201
+ save_figure(fig = fig, save = save, keyword= "stacked_" + mode, save_args=save_args, log = log, verbose=verbose)
202
+
203
+ log.write("Finished creating stacked mqq plot by iteratively calling plot_mqq.",verbose=verbose)
204
+
205
+ return fig, log
187
206
 
207
+ def _drop_old_y_labels(axes, n_plot):
208
+ for index in range(n_plot):
209
+ axes[index].set_ylabel("")
210
+
211
+ def _draw_grid_line_for_lead_variants(mode, lead_variants_is, n_plot, axes, region_lead_grid_line):
188
212
  if mode=="r":
189
213
  for index, sig_is in lead_variants_is.items():
190
214
  for sig_i in sig_is:
@@ -192,19 +216,14 @@ def plot_stacked_mqq(objects,
192
216
  for each_axis_index in range(n_plot + 1):
193
217
  axes[each_axis_index].axvline(x=sig_i, zorder=2,**region_lead_grid_line)
194
218
 
195
-
219
+ def _add_new_y_label(mode, fig, gene_track_height,n_plot,subplot_height ):
196
220
  gene_track_height_ratio = gene_track_height/(gene_track_height + n_plot*subplot_height)
197
221
  ylabel_height = (1 - gene_track_height_ratio)*0.5 + gene_track_height_ratio
198
222
  if mode=="r":
199
223
  fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
200
224
  fig.text(0.93, ylabel_height, "Recombination rate(cM/Mb)", va='center', rotation=-90)
201
225
  elif mode=="m":
202
- fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
203
-
204
- save_figure(fig = fig, save = save, keyword= "stacked_" + mode, save_args=save_args, log = log, verbose=verbose)
205
- log.write("Finished creating stacked mqq plot by iteratively calling plot_mqq.",verbose=verbose)
206
- return fig, log
207
-
226
+ fig.text(0.08, ylabel_height , "$-log_{10}(P)$", va='center', rotation='vertical')
208
227
 
209
228
  def _sort_args(mqq_args, n_plot):
210
229
  mqq_args_for_each_plot={i:{} for i in range(n_plot)}