gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

gwaslab/Sumstats.py CHANGED
@@ -428,12 +428,14 @@ class Sumstats():
428
428
  return new_Sumstats_object
429
429
  ######################################################################
430
430
 
431
- def check_af(self,**args):
432
- self.data = parallelecheckaf(self.data,log=self.log,**args)
431
+ def check_af(self,ref_infer,**args):
432
+ self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
433
+ self.meta["gwaslab"]["references"]["ref_infer_daf"] = ref_infer
434
+
433
435
 
434
436
  def plot_daf(self, **args):
435
- plot = plotdaf(self.data, **args)
436
-
437
+ fig,outliers = plotdaf(self.data, **args)
438
+ return fig, outliers
437
439
  def plot_mqq(self, build=None, **args):
438
440
 
439
441
  chrom="CHR"
gwaslab/__init__.py CHANGED
@@ -32,4 +32,6 @@ from gwaslab.download import update_record
32
32
  from gwaslab.to_pickle import dump_pickle
33
33
  from gwaslab.to_pickle import load_pickle
34
34
  from gwaslab.config import options
35
- from gwaslab.version import _show_version as show_version
35
+ from gwaslab.version import _show_version as show_version
36
+ from gwaslab.calculate_power import get_power
37
+ from gwaslab.calculate_power import get_beta
gwaslab/annotateplot.py CHANGED
@@ -328,7 +328,7 @@ def annotate_pair(
328
328
  arm_scale = arm_scale_d[anno_count]
329
329
 
330
330
  # vertical arm length in pixels
331
- armB_length_in_point = ax.transData.transform((skip,1.15*maxy_anno))[1]-ax.transData.transform((skip, row["scaled_P"]+1))[1]
331
+ armB_length_in_point = ax.transData.transform((skip,1.15*maxy_anno))[1]-ax.transData.transform((skip, row["scaled_P"]+1))[1]-arm_offset/2
332
332
  # times arm_scale to increase or reduce the length
333
333
  armB_length_in_point = armB_length_in_point*arm_scale
334
334
 
@@ -564,7 +564,7 @@ def annotate_subtype(
564
564
 
565
565
 
566
566
  xy=(row["i"],row["scaled_P"]+0.2)
567
- xytext=(last_pos,1.15*maxy*arm_scale)
567
+ xytext=(last_pos, 1.15*maxy*arm_scale)
568
568
 
569
569
  if anno_fixed_arm_length is not None:
570
570
  armB_length_in_point = anno_fixed_arm_length
gwaslab/calculate_power.py CHANGED
@@ -1,47 +1,124 @@
1
- def get_power(genotype_or=1.3 ,
1
+ import pandas as pd
2
+ import numpy as np
3
+ import scipy.stats as ss
4
+ from gwaslab.Log import Log
5
+ import scipy as sp
6
+
7
+ def get_power(
8
+ mode="b",
9
+ t=0,
10
+ genotype_or=1.3 ,
11
+ beta=0.3,
12
+ eaf=0.1,
13
+ n=10000,
2
14
  scase= 2000,
3
15
  scontrol= 15000,
4
16
  prevalence= 0.15,
5
17
  daf = 0.2,
6
- sig_level= 5e-8
18
+ sig_level= 5e-8,
19
+ vary=1,
20
+ log=Log(),
21
+ verbose=True
7
22
  ):
8
- print("Input settings:{}".format(daf))
9
- print(" -Number of cases:{}".format(scase))
10
- print(" -Number of controls:{}".format(scontrol))
11
- print(" -Risk allele OR:{:.3f}".format(genotype_or))
12
- print(" -Disease prevalence:{:.3f}".format(prevalence))
13
- print(" -Risk allele frequency: {:.3f}".format(daf))
14
- print(" -Significance level: {:.3e}".format(sig_level))
15
- # Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
16
- aaf = daf**2
17
- abf = 2 * (daf) * (1 - daf)
18
- bbf = (1- daf)**2
19
-
20
- # additive
21
- x = [ 2*genotype_or-1, genotype_or, 1 ]
22
-
23
- aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
24
- abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
25
- bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
26
- print("Probability of disease :")
27
- print(" - Individuals with AA genotype: {:.3f}".format(aap))
28
- print(" - Individuals with AB genotype: {:.3f}".format(abp))
29
- print(" - Individuals with BB genotype: {:.3f}".format(bbp))
30
-
31
- pcase= (aap * aaf + abp * abf*0.5) / prevalence
32
- pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
33
-
34
- vcase = pcase *(1-pcase)
35
- vcontrol =pcontrol *(1-pcontrol)
36
- print("Expected risk allele frequency:")
37
- print(" - In cases: {:.3f}".format(pcase))
38
- print(" - In controls: {:.3f}".format(pcontrol))
39
-
40
- num= (pcase - pcontrol)
41
- den= np.sqrt( (vcase/scase + vcontrol/scontrol)*0.5 )
42
- u = num / den
43
-
44
- c = ss.norm.isf(sig_level/2)
45
- power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
46
- print("Expected power: {:.3f}".format(power))
47
- return power
23
+ if mode=="b":
24
+ print("Input settings:{}".format(daf))
25
+ print(" -Number of cases:{}".format(scase))
26
+ print(" -Number of controls:{}".format(scontrol))
27
+ print(" -Risk allele OR:{:.3f}".format(genotype_or))
28
+ print(" -Disease prevalence:{:.3f}".format(prevalence))
29
+ print(" -Risk allele frequency: {:.3f}".format(daf))
30
+ print(" -Significance level: {:.3e}".format(sig_level))
31
+ # Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
32
+ aaf = daf**2
33
+ abf = 2 * (daf) * (1 - daf)
34
+ bbf = (1- daf)**2
35
+
36
+ # additive
37
+ x = [ 2*genotype_or-1, genotype_or, 1 ]
38
+
39
+ aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
40
+ abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
41
+ bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
42
+ print("Probability of disease :")
43
+ print(" - Individuals with AA genotype: {:.3f}".format(aap))
44
+ print(" - Individuals with AB genotype: {:.3f}".format(abp))
45
+ print(" - Individuals with BB genotype: {:.3f}".format(bbp))
46
+
47
+ pcase= (aap * aaf + abp * abf*0.5) / prevalence
48
+ pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
49
+
50
+ vcase = pcase *(1-pcase)
51
+ vcontrol =pcontrol *(1-pcontrol)
52
+ print("Expected risk allele frequency:")
53
+ print(" - In cases: {:.3f}".format(pcase))
54
+ print(" - In controls: {:.3f}".format(pcontrol))
55
+
56
+ num= (pcase - pcontrol)
57
+ den= np.sqrt( (vcase/scase + vcontrol/scontrol)*0.5 )
58
+ u = num / den
59
+
60
+ c = ss.norm.isf(sig_level/2)
61
+ power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
62
+ print("Expected power: {:.3f}".format(power))
63
+
64
+ elif mode=="q":
65
+ if verbose:
66
+ log.write("Significance level: {}".format(sig_level))
67
+ log.write("EAF: {}".format(eaf))
68
+ log.write("BETA: {}".format(beta))
69
+ log.write("N: {}".format(n))
70
+ log.write("H2: {}".format(2*eaf*(1-eaf)*(beta**2)))
71
+ c = ss.chi2.isf(sig_level/2,df=1)
72
+ NCP = n * 2*eaf*(1-eaf)*(beta**2)/vary
73
+ power = 1 - ss.ncx2.cdf(c,df=1,nc=NCP)
74
+
75
+ return power
76
+
77
+ def get_beta(
78
+ mode="b",
79
+ t=0,
80
+ genotype_or=1.3 ,
81
+ eaf=0.1,
82
+ n=10000,
83
+ scase= 2000,
84
+ scontrol= 15000,
85
+ prevalence= 0.15,
86
+ daf = 0.2,
87
+ sig_level= 5e-8,
88
+ vary=1,
89
+ log=Log(),
90
+ verbose=True,
91
+ n_matrix=500
92
+ ):
93
+ if mode=="q":
94
+ if t >0:
95
+ def calculate_power_single(
96
+ beta,
97
+ eaf,
98
+ n,
99
+ t,
100
+ sig_level=5e-8,vary=1):
101
+
102
+ c = ss.chi2.isf(sig_level/2,df=1)
103
+ h2 = 2*eaf*(1-eaf)*(beta**2)
104
+ NCP = n * h2/vary
105
+ power = 1 - ss.ncx2.cdf(c,df=1,nc=NCP)
106
+ return power
107
+
108
+ matrix = np.zeros((n_matrix,n_matrix),dtype=float)
109
+ eafs = np.linspace(0.5,0.0001,n_matrix)
110
+ betas = np.linspace(0.0001,10,n_matrix)
111
+
112
+ for i in range(n_matrix):
113
+ matrix[i,] = calculate_power_single(beta=betas,eaf=eafs[i],n=n,t=t)
114
+
115
+
116
+ i,j=1,1
117
+ eaf_beta = []
118
+ while i<n_matrix-1 and j<n_matrix-1:
119
+ if matrix[i,j] < t:
120
+ j+=1
121
+ else:
122
+ i+=1
123
+ eaf_beta.append((eafs[i],betas[j]))
124
+ return pd.DataFrame(eaf_beta)
gwaslab/compare_effect.py CHANGED
@@ -711,28 +711,67 @@ def reorderLegend(ax=None, order=None, add=None):
711
711
  def plotdaf(sumstats,
712
712
  eaf="EAF",
713
713
  daf="DAF",
714
- scatter_args={"s":1},
715
714
  threshold=0.16,
715
+ xlabel="Alternative Allele Frequency in Reference Population (RAF)",
716
+ ylabel="Effect Allele Frequency in Sumstats (EAF)",
716
717
  is_reg=True,
718
+ r2=True,
717
719
  is_45_helper_line=True,
718
720
  is_threshold=True,
719
- helper_line_args={"color":'black', "linestyle":'-',"lw":1},
720
- threshold_line_args={"color":'#cccccc', "linestyle":'dotted'},
721
- reg_line_args={"color":'#cccccc', "linestyle":'--'},
722
- plt_args={"figsize":(8,4),"dpi":300},
723
- histplot_args={"log_scale":(False,True)},
724
- fontargs={'family':'sans','fontname':'Arial','fontsize':8},
725
- verbose=True,
726
- log=Log()
721
+ helper_line_args=None,
722
+ threshold_line_args=None,
723
+ reg_line_args=None,
724
+ plt_args=None,
725
+ scatter_args=None,
726
+ scatter_args_outlier =None,
727
+ histplot_args=None,
728
+ font_args=None,
729
+ r2_args=None,
730
+ legend1=True,
731
+ legend2=True,
732
+ save=False,
733
+ save_args=None,
734
+ verbose=True,
735
+ log=Log()
727
736
  ):
728
737
 
729
-
738
+ if font_args is None:
739
+ font_args={'family':'sans','fontname':'Arial','fontsize':8}
740
+ if scatter_args is None:
741
+ scatter_args={"s":1}
742
+ if scatter_args_outlier is None:
743
+ scatter_args_outlier={"s":3,"c":"red"}
744
+ if plt_args is None:
745
+ plt_args={"figsize":(8,4),"dpi":300}
746
+ if histplot_args is None:
747
+ histplot_args={"log_scale":(False,True)}
748
+ if reg_line_args is None:
749
+ reg_line_args={"color":'#cccccc', "linestyle":'--'}
750
+ if threshold_line_args is None:
751
+ threshold_line_args={"color":'#cccccc', "linestyle":'dotted'}
752
+ if helper_line_args is None:
753
+ helper_line_args={"color":'black', "linestyle":'-',"lw":1}
754
+ if r2_args is None:
755
+ r2_args = {"va":"bottom","ha":"right"}
756
+
757
+
730
758
  if verbose: log.write("Start to plot Reference frequency vs Effect allele frequency plot...")
731
759
  if not ((eaf in sumstats.columns) and (daf in sumstats.columns)):
732
760
  raise ValueError("EAF and/or DAF columns were not detected.")
733
761
 
762
+ if "SNPID" in sumstats.columns:
763
+ snpid = "SNPID"
764
+ else:
765
+ snpid = "rsID"
766
+
767
+ alleles =[]
768
+ if "EA" in sumstats.columns:
769
+ alleles.append("EA")
770
+ if "NEA" in sumstats.columns:
771
+ alleles.append("NEA")
734
772
 
735
- sumstats = sumstats.loc[(~sumstats[eaf].isna())&(~sumstats[daf].isna()),[eaf,daf]].copy()
773
+
774
+ sumstats = sumstats.loc[(~sumstats[eaf].isna())&(~sumstats[daf].isna()),[snpid,eaf,daf]+alleles].copy()
736
775
  sumstats.loc[:,daf] = sumstats.loc[:,daf].astype("float")
737
776
  sumstats.loc[:,eaf] = sumstats.loc[:,eaf].astype("float")
738
777
  if verbose: log.write(" -Plotting valriants:" + str(len(sumstats)))
@@ -740,7 +779,15 @@ def plotdaf(sumstats,
740
779
  sumstats.loc[:,"RAF"]=sumstats[eaf] - sumstats[daf]
741
780
  sns.set_style("ticks")
742
781
  fig, (ax1, ax2) = plt.subplots(1, 2,**plt_args)
743
- ax1.scatter(sumstats["RAF"],sumstats[eaf],**scatter_args)
782
+ ax1.scatter(sumstats["RAF"],sumstats[eaf],label="Non-outlier", **scatter_args)
783
+
784
+ if is_threshold is True:
785
+ is_outliers = sumstats[daf].abs() > threshold
786
+ if sum(is_outliers)>0:
787
+ ax1.scatter(sumstats.loc[is_outliers, "RAF"],sumstats.loc[is_outliers, eaf],label="Outlier", **scatter_args_outlier)
788
+
789
+ if legend1 ==True:
790
+ ax1.legend()
744
791
 
745
792
  if is_reg is True:
746
793
  if verbose: log.write(" -Plotting regression line...")
@@ -749,6 +796,9 @@ def plotdaf(sumstats,
749
796
  if verbose:log.write(" -Intercept = ", reg[1])
750
797
  if verbose:log.write(" -R2 = ", reg[2])
751
798
  ax1.axline(xy1=(0,reg[1]),slope=reg[0],zorder=1,**reg_line_args)
799
+ if r2 is True:
800
+ ax1.text(0.98,0.02, "$R^2 = {:.3f}$".format(reg[2]), transform=ax1.transAxes, **r2_args)
801
+
752
802
  if is_threshold is True:
753
803
  if verbose: log.write(" -Threshold : " + str(threshold))
754
804
  num = sum(np.abs(sumstats[daf])>threshold )
@@ -756,22 +806,38 @@ def plotdaf(sumstats,
756
806
  if verbose: log.write(" -Percentage for variants with relatively large DAF : ",num/len(sumstats) )
757
807
  ax1.axline(xy1=(0,threshold),slope=1,zorder=1,**threshold_line_args)
758
808
  ax1.axline(xy1=(threshold,0),slope=1,zorder=1,**threshold_line_args)
809
+
759
810
  xl,xh=ax1.get_xlim()
760
811
  yl,yh=ax1.get_ylim()
812
+
761
813
  if is_45_helper_line is True:
762
814
  ax1.axline([0,0], [1,1],zorder=1, **helper_line_args)
763
- ax1.set_xlabel("Alternative Allele Frequency in Reference Population (RAF)",**fontargs)
764
- ax1.set_ylabel("Effect Allele Frequency in Sumstats (EAF)",**fontargs)
815
+
816
+ ax1.set_xlabel(xlabel,**font_args)
817
+ ax1.set_ylabel(ylabel,**font_args)
765
818
  ax1.set_xlim([0,1])
766
819
  ax1.set_ylim([0,1])
767
820
 
821
+
768
822
  sumstats.loc[:,"ID"] = sumstats.index
823
+
769
824
  to_plot = pd.melt(sumstats,id_vars=['ID'], value_vars=['EAF',"RAF"], var_name='Types', value_name='Allele Frequency')
770
825
 
771
- sns.histplot(data=to_plot, x="Allele Frequency", hue="Types", fill=True, ax=ax2,**histplot_args)
772
- ax2.set_xlabel("Allele Frequency",**fontargs)
826
+ sns.histplot(data=to_plot, x="Allele Frequency", hue="Types", fill=True, ax=ax2, legend = legend2 ,**histplot_args)
827
+ ax2.set_xlabel("Allele Frequency",**font_args)
828
+
829
+
773
830
  plt.tight_layout()
774
- return fig
831
+ if save:
832
+ if verbose: log.write("Saving plot:")
833
+ if save==True:
834
+ fig.savefig("./allele_frequency_comparison.png",bbox_inches="tight",**save_args)
835
+ log.write(" -Saved to "+ "./allele_frequency_comparison.png" + " successfully!" )
836
+ else:
837
+ fig.savefig(save,bbox_inches="tight",**save_args)
838
+ log.write(" -Saved to "+ save + " successfully!" )
839
+ sumstats = sumstats.drop(columns="ID")
840
+ return fig, sumstats[is_outliers].copy()
775
841
 
776
842
  def test_q(df,beta1,se1,beta2,se2,q_level=0.05):
777
843
  w1="Weight_1"
gwaslab/download.py CHANGED
@@ -210,13 +210,20 @@ def download_ref(name,
210
210
  local_path = directory + local_filename
211
211
  log.write(" -Downloading to:",local_path)
212
212
 
213
+ # if existing in default path
214
+ if search_local(local_path) == True:
215
+ log.write(" -File {} exists.".format(local_path))
216
+ else:
217
+ download_file(url,local_path)
218
+
213
219
  # download file
214
- download_file(url,local_path)
220
+ #download_file(url,local_path)
221
+
215
222
  # update record in config json
216
223
  if name+"_md5" in dicts.keys():
217
224
  file_status = check_file_integrity(local_path=local_path, md5sum=dicts[name+"_md5"],log=log)
218
225
  if file_status==0:
219
- log.write("Downloading ",name," failed! Please check the internet connection.")
226
+ log.write("Md5sum verification of ",name," failed! Please check again.")
220
227
  update_record(name,local_path)
221
228
 
222
229
  # if vcf.gz -> check tbi
@@ -224,7 +231,11 @@ def download_ref(name,
224
231
  if name+"_tbi" in dicts.keys():
225
232
  tbi_url = dicts[name+"_tbi"]
226
233
  try:
227
- download_file(tbi_url, local_path+".tbi")
234
+ if search_local(local_path+".tbi") == True:
235
+ log.write(" -File {} exists.".format(local_path+".tbi"))
236
+ else:
237
+ download_file(tbi_url,local_path+".tbi")
238
+ #download_file(tbi_url, local_path+".tbi")
228
239
  update_record(name+"_tbi",local_path+ ".tbi")
229
240
  log.write(" -Downloading to:",local_path+".tbi")
230
241
  except:
@@ -343,7 +354,8 @@ def check_and_download(name):
343
354
  data_path = get_path(name)
344
355
  return data_path
345
356
 
346
-
357
+ def search_local(file_path):
358
+ return path.exists(file_path)
347
359
  ##### format book ###################################################################################################
348
360
 
349
361
  def update_formatbook(log=Log()):
@@ -389,4 +401,7 @@ def check_format(fmt,log=Log()):
389
401
  for i in book[fmt].values():
390
402
  log.write(i,end="")
391
403
 
404
+
405
+
406
+
392
407
  ########################################################################################################