PyPI - gwaslab - Versions diffs - 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl - Mend

gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (20)

gwaslab/Sumstats.py +6 -4
gwaslab/__init__.py +3 -1
gwaslab/annotateplot.py +2 -2
gwaslab/calculate_power.py +119 -42
gwaslab/compare_effect.py +83 -17
gwaslab/download.py +19 -4
gwaslab/fill.py +183 -57
gwaslab/miamiplot.py +25 -10
gwaslab/mqqplot.py +4 -3
gwaslab/plotrg.py +208 -75
gwaslab/regionalplot.py +21 -3
gwaslab/retrievedata.py +49 -18
gwaslab/to_pickle.py +12 -0
gwaslab/trumpetplot.py +0 -0
gwaslab/version.py +3 -3
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/METADATA +2 -2
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/RECORD +20 -19
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/LICENSE +0 -0
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/WHEEL +0 -0
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/top_level.txt +0 -0

gwaslab/Sumstats.py CHANGED Viewed

@@ -428,12 +428,14 @@ class Sumstats():
             return new_Sumstats_object
     ######################################################################
-    def check_af(self,**args):
-        self.data = parallelecheckaf(self.data,log=self.log,**args)
+    def check_af(self,ref_infer,**args):
+        self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
+        self.meta["gwaslab"]["references"]["ref_infer_daf"] = ref_infer
     def plot_daf(self, **args):
-        plot = plotdaf(self.data, **args)
+        fig,outliers = plotdaf(self.data, **args)
+        return fig, outliers
     def plot_mqq(self, build=None, **args):
         chrom="CHR"

gwaslab/__init__.py CHANGED Viewed

@@ -32,4 +32,6 @@ from gwaslab.download import update_record
 from gwaslab.to_pickle import dump_pickle
 from gwaslab.to_pickle import load_pickle
 from gwaslab.config import options
-from gwaslab.version import _show_version as show_version
+from gwaslab.version import _show_version as show_version
+from gwaslab.calculate_power import get_power
+from gwaslab.calculate_power import get_beta

gwaslab/annotateplot.py CHANGED Viewed

@@ -328,7 +328,7 @@ def annotate_pair(
                             arm_scale = arm_scale_d[anno_count]
                     # vertical arm length in pixels
-                    armB_length_in_point = ax.transData.transform((skip,1.15*maxy_anno))[1]-ax.transData.transform((skip, row["scaled_P"]+1))[1]
+                    armB_length_in_point = ax.transData.transform((skip,1.15*maxy_anno))[1]-ax.transData.transform((skip, row["scaled_P"]+1))[1]-arm_offset/2
                     # times arm_scale to increase or reduce the length
                     armB_length_in_point = armB_length_in_point*arm_scale
@@ -564,7 +564,7 @@ def annotate_subtype(
             xy=(row["i"],row["scaled_P"]+0.2)
-            xytext=(last_pos,1.15*maxy*arm_scale)
+            xytext=(last_pos, 1.15*maxy*arm_scale)
             if anno_fixed_arm_length is not None:
                 armB_length_in_point = anno_fixed_arm_length

gwaslab/calculate_power.py CHANGED Viewed

@@ -1,47 +1,124 @@
-def get_power(genotype_or=1.3 ,
+import pandas as pd
+import numpy as np
+import scipy.stats as ss
+from gwaslab.Log import Log
+import scipy as sp
+def get_power(
+              mode="b",
+              t=0,
+              genotype_or=1.3 ,
+              beta=0.3,
+              eaf=0.1,
+              n=10000,
               scase= 2000,
               scontrol= 15000,
               prevalence= 0.15,
               daf = 0.2,
-              sig_level= 5e-8
+              sig_level= 5e-8,
+              vary=1,
+              log=Log(),
+              verbose=True
              ):
-    print("Input settings:{}".format(daf))
-    print(" -Number of cases:{}".format(scase))
-    print(" -Number of controls:{}".format(scontrol))
-    print(" -Risk allele OR:{:.3f}".format(genotype_or))
-    print(" -Disease prevalence:{:.3f}".format(prevalence))
-    print(" -Risk allele frequency: {:.3f}".format(daf))
-    print(" -Significance level: {:.3e}".format(sig_level))
-    # Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
-    aaf = daf**2
-    abf = 2 * (daf) * (1 - daf)
-    bbf = (1- daf)**2
-    # additive
-    x = [ 2*genotype_or-1, genotype_or, 1 ]
-    aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
-    abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
-    bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
-    print("Probability of disease :")
-    print(" - Individuals with AA genotype: {:.3f}".format(aap))
-    print(" - Individuals with AB genotype: {:.3f}".format(abp))
-    print(" - Individuals with BB genotype: {:.3f}".format(bbp))
-    pcase= (aap * aaf + abp * abf*0.5) / prevalence
-    pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
-    vcase = pcase *(1-pcase)
-    vcontrol =pcontrol *(1-pcontrol)
-    print("Expected risk allele frequency:")
-    print(" - In cases: {:.3f}".format(pcase))
-    print(" - In controls: {:.3f}".format(pcontrol))
-    num= (pcase - pcontrol)
-    den= np.sqrt( (vcase/scase +  vcontrol/scontrol)*0.5 )
-    u = num / den
-    c = ss.norm.isf(sig_level/2)
-    power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
-    print("Expected power: {:.3f}".format(power))
-    return power
+    if mode=="b":
+        print("Input settings:{}".format(daf))
+        print(" -Number of cases:{}".format(scase))
+        print(" -Number of controls:{}".format(scontrol))
+        print(" -Risk allele OR:{:.3f}".format(genotype_or))
+        print(" -Disease prevalence:{:.3f}".format(prevalence))
+        print(" -Risk allele frequency: {:.3f}".format(daf))
+        print(" -Significance level: {:.3e}".format(sig_level))
+        # Skol, A. D., Scott, L. J., Abecasis, G. R., & Boehnke, M. (2006). Joint analysis is more efficient than replication-based analysis for two-stage genome-wide association studies. Nature genetics, 38(2), 209-213.
+        aaf = daf**2
+        abf = 2 * (daf) * (1 - daf)
+        bbf = (1- daf)**2
+        # additive
+        x = [ 2*genotype_or-1, genotype_or, 1 ]
+        aap= x[0] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
+        abp= x[1] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
+        bbp= x[2] * prevalence / (x[0]*aaf + x[1]*abf + x[2]*bbf)
+        print("Probability of disease :")
+        print(" - Individuals with AA genotype: {:.3f}".format(aap))
+        print(" - Individuals with AB genotype: {:.3f}".format(abp))
+        print(" - Individuals with BB genotype: {:.3f}".format(bbp))
+        pcase= (aap * aaf + abp * abf*0.5) / prevalence
+        pcontrol=((1-aap )* aaf + (1-abp )* abf*0.5) / (1 - prevalence)
+        vcase = pcase *(1-pcase)
+        vcontrol =pcontrol *(1-pcontrol)
+        print("Expected risk allele frequency:")
+        print(" - In cases: {:.3f}".format(pcase))
+        print(" - In controls: {:.3f}".format(pcontrol))
+        num= (pcase - pcontrol)
+        den= np.sqrt( (vcase/scase +  vcontrol/scontrol)*0.5 )
+        u = num / den
+        c = ss.norm.isf(sig_level/2)
+        power = 1 - ss.norm.cdf(c-u) + ss.norm.cdf(-c-u)
+        print("Expected power: {:.3f}".format(power))
+    elif mode=="q":
+        if verbose:
+            log.write("Significance level: {}".format(sig_level))
+            log.write("EAF: {}".format(eaf))
+            log.write("BETA: {}".format(beta))
+            log.write("N: {}".format(n))
+            log.write("H2: {}".format(2*eaf*(1-eaf)*(beta**2)))
+            c = ss.chi2.isf(sig_level/2,df=1)
+            NCP = n * 2*eaf*(1-eaf)*(beta**2)/vary
+            power = 1 - ss.ncx2.cdf(c,df=1,nc=NCP)
+    return power
+def get_beta(
+              mode="b",
+              t=0,
+              genotype_or=1.3 ,
+              eaf=0.1,
+              n=10000,
+              scase= 2000,
+              scontrol= 15000,
+              prevalence= 0.15,
+              daf = 0.2,
+              sig_level= 5e-8,
+              vary=1,
+              log=Log(),
+              verbose=True,
+              n_matrix=500
+             ):
+    if mode=="q":
+        if t >0:
+            def calculate_power_single(
+                                beta,
+                                eaf,
+                                n,
+                                t,
+                                sig_level=5e-8,vary=1):
+                c = ss.chi2.isf(sig_level/2,df=1)
+                h2 = 2*eaf*(1-eaf)*(beta**2)
+                NCP = n * h2/vary
+                power = 1 - ss.ncx2.cdf(c,df=1,nc=NCP)
+                return power
+            matrix = np.zeros((n_matrix,n_matrix),dtype=float)
+            eafs = np.linspace(0.5,0.0001,n_matrix)
+            betas =  np.linspace(0.0001,10,n_matrix)
+            for i in range(n_matrix):
+                    matrix[i,] = calculate_power_single(beta=betas,eaf=eafs[i],n=n,t=t)
+            i,j=1,1
+            eaf_beta = []
+            while i<n_matrix-1 and j<n_matrix-1:
+                if matrix[i,j] < t:
+                    j+=1
+                else:
+                    i+=1
+                    eaf_beta.append((eafs[i],betas[j]))
+        return pd.DataFrame(eaf_beta)

gwaslab/compare_effect.py CHANGED Viewed

@@ -711,28 +711,67 @@ def reorderLegend(ax=None, order=None, add=None):
 def plotdaf(sumstats,
              eaf="EAF",
              daf="DAF",
-             scatter_args={"s":1},
              threshold=0.16,
+             xlabel="Alternative Allele Frequency in Reference Population (RAF)",
+             ylabel="Effect Allele Frequency in Sumstats (EAF)",
              is_reg=True,
+             r2=True,
              is_45_helper_line=True,
              is_threshold=True,
-             helper_line_args={"color":'black', "linestyle":'-',"lw":1},
-             threshold_line_args={"color":'#cccccc', "linestyle":'dotted'},
-             reg_line_args={"color":'#cccccc', "linestyle":'--'},
-             plt_args={"figsize":(8,4),"dpi":300},
-            histplot_args={"log_scale":(False,True)},
-            fontargs={'family':'sans','fontname':'Arial','fontsize':8},
-            verbose=True,
-            log=Log()
+             helper_line_args=None,
+             threshold_line_args=None,
+             reg_line_args=None,
+             plt_args=None,
+             scatter_args=None,
+             scatter_args_outlier =None,
+             histplot_args=None,
+             font_args=None,
+             r2_args=None,
+             legend1=True,
+             legend2=True,
+             save=False,
+             save_args=None,
+             verbose=True,
+             log=Log()
            ):
+    if font_args is None:
+        font_args={'family':'sans','fontname':'Arial','fontsize':8}
+    if scatter_args is None:
+        scatter_args={"s":1}
+    if scatter_args_outlier is None:
+        scatter_args_outlier={"s":3,"c":"red"}
+    if plt_args is None:
+        plt_args={"figsize":(8,4),"dpi":300}
+    if histplot_args is None:
+        histplot_args={"log_scale":(False,True)}
+    if reg_line_args is None:
+        reg_line_args={"color":'#cccccc', "linestyle":'--'}
+    if threshold_line_args is None:
+        threshold_line_args={"color":'#cccccc', "linestyle":'dotted'}
+    if helper_line_args is None:
+        helper_line_args={"color":'black', "linestyle":'-',"lw":1}
+    if r2_args is None:
+        r2_args = {"va":"bottom","ha":"right"}
     if verbose: log.write("Start to plot Reference frequency vs Effect allele frequency plot...")
     if not ((eaf in sumstats.columns) and (daf in sumstats.columns)):
         raise ValueError("EAF and/or DAF columns were not detected.")
+    if "SNPID" in sumstats.columns:
+        snpid = "SNPID"
+    else:
+        snpid = "rsID"
+    alleles =[]
+    if "EA" in sumstats.columns:
+        alleles.append("EA")
+    if "NEA" in sumstats.columns:
+        alleles.append("NEA")
-    sumstats = sumstats.loc[(~sumstats[eaf].isna())&(~sumstats[daf].isna()),[eaf,daf]].copy()
+    sumstats = sumstats.loc[(~sumstats[eaf].isna())&(~sumstats[daf].isna()),[snpid,eaf,daf]+alleles].copy()
     sumstats.loc[:,daf] = sumstats.loc[:,daf].astype("float")
     sumstats.loc[:,eaf] = sumstats.loc[:,eaf].astype("float")
     if verbose: log.write(" -Plotting valriants:" + str(len(sumstats)))
@@ -740,7 +779,15 @@ def plotdaf(sumstats,
     sumstats.loc[:,"RAF"]=sumstats[eaf] - sumstats[daf]
     sns.set_style("ticks")
     fig, (ax1, ax2) = plt.subplots(1, 2,**plt_args)
-    ax1.scatter(sumstats["RAF"],sumstats[eaf],**scatter_args)
+    ax1.scatter(sumstats["RAF"],sumstats[eaf],label="Non-outlier", **scatter_args)
+    if is_threshold is True:
+        is_outliers = sumstats[daf].abs() > threshold
+        if sum(is_outliers)>0:
+            ax1.scatter(sumstats.loc[is_outliers, "RAF"],sumstats.loc[is_outliers, eaf],label="Outlier", **scatter_args_outlier)
+    if legend1 ==True:
+        ax1.legend()
     if is_reg is True:
         if verbose: log.write(" -Plotting regression line...")
@@ -749,6 +796,9 @@ def plotdaf(sumstats,
         if verbose:log.write(" -Intercept = ", reg[1])
         if verbose:log.write(" -R2 = ", reg[2])
         ax1.axline(xy1=(0,reg[1]),slope=reg[0],zorder=1,**reg_line_args)
+        if r2 is True:
+            ax1.text(0.98,0.02, "$R^2 = {:.3f}$".format(reg[2]), transform=ax1.transAxes, **r2_args)
     if is_threshold is True:
         if verbose: log.write(" -Threshold : " + str(threshold))
         num = sum(np.abs(sumstats[daf])>threshold )
@@ -756,22 +806,38 @@ def plotdaf(sumstats,
         if verbose: log.write(" -Percentage for variants with relatively large DAF : ",num/len(sumstats) )
         ax1.axline(xy1=(0,threshold),slope=1,zorder=1,**threshold_line_args)
         ax1.axline(xy1=(threshold,0),slope=1,zorder=1,**threshold_line_args)
     xl,xh=ax1.get_xlim()
     yl,yh=ax1.get_ylim()
     if is_45_helper_line is True:
         ax1.axline([0,0], [1,1],zorder=1, **helper_line_args)
-    ax1.set_xlabel("Alternative Allele Frequency in Reference Population (RAF)",**fontargs)
-    ax1.set_ylabel("Effect Allele Frequency in Sumstats (EAF)",**fontargs)
+    ax1.set_xlabel(xlabel,**font_args)
+    ax1.set_ylabel(ylabel,**font_args)
     ax1.set_xlim([0,1])
     ax1.set_ylim([0,1])
     sumstats.loc[:,"ID"] = sumstats.index
     to_plot = pd.melt(sumstats,id_vars=['ID'], value_vars=['EAF',"RAF"], var_name='Types', value_name='Allele Frequency')
-    sns.histplot(data=to_plot, x="Allele Frequency", hue="Types", fill=True, ax=ax2,**histplot_args)
-    ax2.set_xlabel("Allele Frequency",**fontargs)
+    sns.histplot(data=to_plot, x="Allele Frequency", hue="Types", fill=True, ax=ax2, legend = legend2 ,**histplot_args)
+    ax2.set_xlabel("Allele Frequency",**font_args)
     plt.tight_layout()
-    return fig
+    if save:
+        if verbose: log.write("Saving plot:")
+        if save==True:
+            fig.savefig("./allele_frequency_comparison.png",bbox_inches="tight",**save_args)
+            log.write(" -Saved to "+ "./allele_frequency_comparison.png" + " successfully!" )
+        else:
+            fig.savefig(save,bbox_inches="tight",**save_args)
+            log.write(" -Saved to "+ save + " successfully!" )
+    sumstats = sumstats.drop(columns="ID")
+    return fig, sumstats[is_outliers].copy()
 def test_q(df,beta1,se1,beta2,se2,q_level=0.05):
     w1="Weight_1"

gwaslab/download.py CHANGED Viewed

@@ -210,13 +210,20 @@ def download_ref(name,
         local_path = directory + local_filename
         log.write(" -Downloading to:",local_path)
+        # if existing in default path
+        if search_local(local_path) == True:
+            log.write(" -File {} exists.".format(local_path))
+        else:
+            download_file(url,local_path)
         # download file
-        download_file(url,local_path)
+        #download_file(url,local_path)
         # update record in config json
         if name+"_md5" in dicts.keys():
             file_status = check_file_integrity(local_path=local_path, md5sum=dicts[name+"_md5"],log=log)
             if file_status==0:
-                log.write("Downloading ",name," failed! Please check the internet connection.")
+                log.write("Md5sum verification of ",name," failed! Please check again.")
         update_record(name,local_path)
         # if vcf.gz -> check tbi
@@ -224,7 +231,11 @@ def download_ref(name,
                 if name+"_tbi" in dicts.keys():
                     tbi_url = dicts[name+"_tbi"]
                 try:
-                    download_file(tbi_url, local_path+".tbi")
+                    if search_local(local_path+".tbi") == True:
+                        log.write(" -File {} exists.".format(local_path+".tbi"))
+                    else:
+                        download_file(tbi_url,local_path+".tbi")
+                    #download_file(tbi_url, local_path+".tbi")
                     update_record(name+"_tbi",local_path+ ".tbi")
                     log.write(" -Downloading to:",local_path+".tbi")
                 except:
@@ -343,7 +354,8 @@ def check_and_download(name):
     data_path = get_path(name)
     return data_path
+def search_local(file_path):
+    return path.exists(file_path)
 ##### format book ###################################################################################################
 def update_formatbook(log=Log()):
@@ -389,4 +401,7 @@ def check_format(fmt,log=Log()):
     for i in book[fmt].values():
         log.write(i,end="")
 ########################################################################################################

gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl