PyPI - gwaslab - Versions diffs - 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl - Mend

gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (20)

gwaslab/Sumstats.py +6 -4
gwaslab/__init__.py +3 -1
gwaslab/annotateplot.py +2 -2
gwaslab/calculate_power.py +119 -42
gwaslab/compare_effect.py +83 -17
gwaslab/download.py +19 -4
gwaslab/fill.py +183 -57
gwaslab/miamiplot.py +25 -10
gwaslab/mqqplot.py +4 -3
gwaslab/plotrg.py +208 -75
gwaslab/regionalplot.py +21 -3
gwaslab/retrievedata.py +49 -18
gwaslab/to_pickle.py +12 -0
gwaslab/trumpetplot.py +0 -0
gwaslab/version.py +3 -3
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/METADATA +2 -2
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/RECORD +20 -19
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/LICENSE +0 -0
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/WHEEL +0 -0
{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/top_level.txt +0 -0

gwaslab/plotrg.py CHANGED Viewed

@@ -17,7 +17,7 @@ def convert_p_to_width(p,sig_level):
         #scaled using mlog10(p)
         return max(-np.log10(p)/width_factor,0.1)
-def conver_rg_to_color(rg,cmap):
+def convert_rg_to_color(rg,cmap):
     #(1,1)
     if rg>1: rg=1
     if rg<-1: rg=-1
@@ -25,48 +25,120 @@ def conver_rg_to_color(rg,cmap):
 ####################################################################################################
 def plot_rg(ldscrg,
-        p1="p1",p2="p2",rg="rg",p="p",
-        sig_level=0.05,
-        rganno=False,
-        correction="",
+        p1="p1",
+        p2="p2",
+        rg="rg",
+        p="p",
+        sig_levels=None,
+        rganno="non",
+        panno=True,
+        corrections=None,
+        panno_texts=None,
+        equal_aspect=True,
         cmap = matplotlib.cm.get_cmap('RdBu'),
+        full_cell =None,
         log=Log(),
         panno_args=None,
+        rganno_args=None,
         verbose=True,
         asize=10,
         sort_key=None,
         square=False,
-        colorbarargs={"shrink":0.82},
-        **args):
+        colorbar_args=None,
+        fig_args=None,
+        xticklabel_args=None,
+        yticklabel_args=None,
+        fdr_method="i",
+        fontsize=10,
+        save=None,
+        save_args=None):
-    if verbose: log.write("Total non-NA records:",len(ldscrg.dropna(subset=[p])))
+    if verbose: log.write("Start to create ldsc genetic correlation heatmap...")
+    # configure arguments
+    if fig_args is None:
+        fig_args = {"dpi":300}
+    if colorbar_args is None:
+        colorbar_args={"shrink":0.82}
+    if yticklabel_args is None:
+        yticklabel_args={"fontsize":fontsize, "fontfamily":"Arial"}
+    if xticklabel_args is None:
+        xticklabel_args={"rotation":45,"horizontalalignment":"left", "verticalalignment":"bottom","fontsize":fontsize, "fontfamily":"Arial"}
+    if sig_levels is None:
+        sig_levels = [0.05]
+    if corrections is None:
+        corrections = ["non", "fdr","bon"]
+    if panno_texts is None:
+        panno_texts = ["*"*(i+1) for i in range(len(sig_levels)*len(corrections))]
+    if full_cell is None:
+        full_cell = ("fdr",0.05)
+    if rganno_args is None:
+        rganno_args ={}
+    #drop na records in P column
+    if verbose: log.write("Raw dataset records:",len(ldscrg))
     df=ldscrg.dropna(subset=[p]).copy()
+    if verbose: log.write(" -Raw dataset non-NA records:",len(df))
+    # create unique pair column
     df["p1p2"]=df.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
+    if verbose: log.write("Filling diagnal line and duplicated pair for plotting...")
+    # fill na
+    df_fill_reverse = df.loc[(df[p2].isin(df[p1].values)) & (df[p1].isin(df[p2].values)),:].copy()
+    df_fill_reverse = df_fill_reverse.rename(columns={p1:p2,p2:p1})
+    # fill dia
+    df_fill_dia = pd.DataFrame(columns=df.columns)
+    p1_dup_list = list(df.loc[(df[p2].isin(df[p1].values)),"p2"].values)
+    p2_dup_list = list(df.loc[(df[p1].isin(df[p2].values)),"p1"].values)
+    p_dup_list = p2_dup_list + p1_dup_list
+    if len(set(p_dup_list)) > 0:
+        if verbose: log.write(" -Diagnal records:", len(set(p_dup_list)))
+    df_fill_dia["p1"] = p_dup_list
+    df_fill_dia["p2"] = df_fill_dia["p1"]
+    df_fill_dia["rg"] = 1
+    df_fill_na = pd.DataFrame(columns=df.columns)
+    df_fill_na[[p1,p2]] = [(i,j) for i in df[p1].sort_values(ascending=False).drop_duplicates() for j in df[p2].sort_values(ascending=False).drop_duplicates()]
+    # fill diagonal
+    df = pd.concat([df,df_fill_reverse,df_fill_dia,df_fill_na],ignore_index=True).sort_values(by=p).drop_duplicates(subset=[p1,p2])
+    #if verbose: log.write(" -Dataset shape match:", len(df)==)
+    #
+    ## remove record with p1 = p2, dropna in P column
     dfp=ldscrg.loc[ldscrg[p1]!=ldscrg[p2],:].dropna(subset=[p]).copy()
+    ## create pair column
     dfp["p1p2"]=dfp.apply(lambda x:"_".join(sorted([x[p1],x[p2]])),axis=1)
+    ## drop duplicate and keep only unique pairs
     dfp = dfp.drop_duplicates(subset=["p1p2"]).copy()
-    if verbose: log.write("Valid unique records:",len(dfp))
-    if verbose: log.write("Significant correlations after Bonferroni correction:",sum(dfp[p]<0.05/len(dfp)))
+    if verbose: log.write("Valid unique trait pairs:",len(dfp))
+    if verbose: log.write(" -Valid unique trait1:",dfp["p1"].nunique())
+    if verbose: log.write(" -Valid unique trait2:",dfp["p2"].nunique())
+    if verbose: log.write(" -Significant correlations with P < 0.05:",sum(dfp[p]<0.05))
+    if verbose: log.write(" -Significant correlations after Bonferroni correction:",sum(dfp[p]<(0.05/len(dfp))))
-    if correction=="fdr":
-        dfp["fdr_p"]=fdrcorrection(dfp[p],alpha=1)[1]
-        dfp["fdr"]=fdrcorrection(dfp[p],alpha=sig_level)[0]
-        if verbose: log.write("Significant correlations after FDR correction:",sum(dfp["fdr"]))
-        dfp=dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
-    else:
-        dfp=dfp.set_index("p1p2").loc[:,p].to_dict()
+    #if correction=="fdr":
+        # fdr corrected p
+    dfp["fdr_p"]=fdrcorrection(dfp[p],alpha=1,method=fdr_method)[1]
+        # is fdr < sig_level
+    dfp["fdr"]=fdrcorrection(dfp[p],alpha=0.05,method=fdr_method)[0]
+    if verbose: log.write(" -Significant correlations with FDR <0.05:",sum(dfp["fdr"]))
+        # convert to dict for annotation and plotting
+    df_rawp = dfp.set_index("p1p2").loc[:,p].to_dict()
+    dfp = dfp.set_index("p1p2").loc[:,"fdr_p"].to_dict()
     #########ticks dict###########################################
     dic_p1={}
     dic_p2={}
     dic_p1_r={}
     dic_p2_r={}
+    ## sort position
     if sort_key is None:
+        # alphabetic order
         for i,p1_name in enumerate(df[p1].sort_values(ascending=False).drop_duplicates()):
             dic_p1[p1_name]  = i
             dic_p1_r[i] = p1_name
@@ -74,6 +146,7 @@ def plot_rg(ldscrg,
             dic_p2[p2_name]  = i
             dic_p2_r[i] = p2_name
     else:
+        # user-provided order
         for i,p1_name in enumerate(df[p1].sort_values(ascending=False,key=sort_key).drop_duplicates()):
             dic_p1[p1_name]  = i
             dic_p1_r[i] = p1_name
@@ -81,14 +154,17 @@ def plot_rg(ldscrg,
             dic_p2[p2_name]  = i
             dic_p2_r[i] = p2_name
+    # assign coordinate
     df["y"]=df[p1].map(dic_p1)
     df["y_x"]=df[p1].map(dic_p2)
     df["x"]=df[p2].map(dic_p2)
     df["x_y"]=df[p2].map(dic_p1)
-    if verbose: log.write("Plotting...")
+    if verbose: log.write("Plotting heatmap...")
     ########ticks###############################################
-    fig,ax = plt.subplots(dpi=300,**args)
+    fig,ax = plt.subplots(**fig_args)
+    # configure x/y ticks
     xticks=df["x"].sort_values().drop_duplicates().astype(int)
     yticks=df["y"].sort_values().drop_duplicates().astype(int)
     ax.xaxis.tick_top()
@@ -103,88 +179,145 @@ def plot_rg(ldscrg,
     ax.tick_params('both', length=0, width=0, which='minor')
     #labels
-    ax.set_yticklabels(yticks.map(dic_p1_r),fontsize=15)
-    ax.set_xticklabels(xticks.map(dic_p2_r),rotation=45,horizontalalignment="left", verticalalignment="bottom",fontsize=15)
-    width_max=1
+    ax.set_yticklabels(yticks.map(dic_p1_r),**yticklabel_args)
+    ax.set_xticklabels(xticks.map(dic_p2_r),**xticklabel_args)
     #########patches###########################################
     squares=[]
-    panno=[]
+    panno_list={1:{},2:{}}
     rgtoanno=[]
-    maxsigp=sig_level
-    #if correction=="fdr":
-    #    if len(df.loc[df["fdr"]==True,p])>=1:
-    #        maxsigp = df.loc[df["fdr"]==True,p].max()*1.0001
-    #
-    #    else:
-    #        maxsigp = sig_level/len(df.dropna(subset=[p]))
-    if correction=="fdr":
-        p="fdr_p"
+    if verbose: log.write("Full cell : {}-corrected P == {}".format(full_cell[0],full_cell[1]))
     for i,row in df.iterrows():
         xcenter=row["x"]
         ycenter=row["y"]
-        if row[p1]==row[p2]:
+        if np.isnan(row[rg]):
             width=1
             x=xcenter-width/2
             y=ycenter-width/2
-            rgba = conver_rg_to_color(1,cmap)
-        else:
-            adjusted_p = dfp["_".join(sorted([row[p1],row[p2]]))]
-            if adjusted_p<0.05 and square is True:
-                if xcenter + ycenter < len(df[p1].unique()):
-                    panno.append([xcenter,ycenter,adjusted_p])
-            elif adjusted_p<0.05:
-                panno.append([xcenter,ycenter,adjusted_p])
-            width= convert_p_to_width(adjusted_p,sig_level)
-            x=xcenter-width/2
-            y=ycenter-width/2
-            rgba = conver_rg_to_color(row[rg],cmap)
-            if xcenter + ycenter > len(df[p1].unique())-1 and (square is True) and (rganno is True):
-                rgtoanno.append([xcenter,ycenter,row[rg],rgba])
+            ax.plot([x,x+width],[y,y+width],c="grey")
+            ax.plot([x,x+width],[y+width,y],c="grey")
-        if xcenter + ycenter < len(df[p1].unique()) and (square is True) and (rganno is True):
-            squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
-        elif (square is not True):
-            squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
+        else:
+            if row[p1]==row[p2]:
+                # diagonal line
+                width=1
+                x=xcenter-width/2
+                y=ycenter-width/2
+                rgba = convert_rg_to_color(1,cmap)
+            else:
+                # get the adjusted p value from dict
+                if  xcenter + ycenter < len(df[p1].unique()):
+                    panno_set=1
+                else:
+                    panno_set=2
+                for i,correction in enumerate(corrections):
+                    for j,sig_level in enumerate(sig_levels):
+                        index = len(sig_levels)*i + j
+                        p1p2="_".join(sorted([row[p1],row[p2]]))
+                        raw_p = df_rawp[p1p2]
+                        if correction in ["B","bonferroni ","bon","Bon","b"]:
+                            current_threhold = sig_level/len(dfp)
+                            if raw_p < current_threhold:
+                                panno_list[panno_set][p1p2] = [xcenter,ycenter,raw_p,"bon",panno_texts[index]]
+                        elif correction in ["fdr","FDR","F","f"]:
+                            adjusted_p = dfp[p1p2]
+                            if adjusted_p < sig_level and square is True:
+                                #if square is True, only annotate half
+                                if xcenter + ycenter < len(df[p1].unique()):
+                                    panno_list[panno_set][p1p2]=[xcenter,ycenter,adjusted_p,"fdr",panno_texts[index]]
+                            elif adjusted_p < sig_level:
+                                    panno_list[panno_set][p1p2]=[xcenter,ycenter,adjusted_p,"fdr",panno_texts[index]]
+                        elif correction == "non":
+                            if raw_p < sig_level:
+                                panno_list[panno_set][p1p2]=[xcenter,ycenter,"raw",raw_p,panno_texts[index]]
+                # configuring the square
+                if full_cell[0] == "fdr":
+                    width= convert_p_to_width(adjusted_p,full_cell[1])
+                elif full_cell[0] == "bon":
+                    width= convert_p_to_width(raw_p*len(dfp),full_cell[1])
+                else:
+                    width= convert_p_to_width(raw_p,full_cell[1])
+                x=xcenter-width/2
+                y=ycenter-width/2
+                rgba = convert_rg_to_color(row[rg],cmap)
+                if xcenter + ycenter > len(df[p1].unique())-1 and (square is True) and (rganno == "half"):
+                    rgtoanno.append([xcenter,ycenter,row[rg],rgba])
+                elif "full" in rganno:
+                    rgtoanno.append([xcenter,ycenter,row[rg],rgba])
+            #if xcenter + ycenter < len(df[p1].unique()) and (square is True) and (rganno == "half"):
+            #    squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
+            #elif (square is not True):
+            if ("nb" not in rganno):
+                if rganno == "half":
+                    if xcenter + ycenter < len(df[p1].unique()) and (square is True):
+                        squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
+                else:
+                    squares.append(patches.Rectangle((x,y),width=width,height=width,fc=rgba,ec="white",lw=0))
     squares_collection = matplotlib.collections.PatchCollection(squares,match_original=True)
     ax.add_collection(squares_collection)
     if rganno is not False:
+        rganno_default_args = {"weight":"bold","ha":"center", "va":"center", "fontfamily":"Arial","fontsize":fontsize}
+        for key, value in rganno_args.items():
+            rganno_default_args[key] = value
         for i in rgtoanno:
             if i[2]>1: i[2]=1
             if i[2]<-1: i[2]=-1
-            ax.text(i[0],i[1],"{:.3f}".format(i[2]),color=i[3],weight="bold",ha="center", va="center",font="Arial")
+            if "color" in rganno_default_args.keys() or "c" in rganno_default_args.keys():
+                ax.text(i[0],i[1],"{:.3f}".format(i[2]),**rganno_default_args)
+            else:
+                ax.text(i[0],i[1],"{:.3f}".format(i[2]),color=i[3],**rganno_default_args)
-    panno_default_args={"size":asize,"color":"white","weight":"bold","ha":"center","va":"center","font":"Arial"}
+    # configure args for p annotation
+    panno_default_args={"size":asize,"color":"white","weight":"bold","horizontalalignment":"center","verticalalignment":"center_baseline","font":"Arial"}
     if panno_args is not None:
         for key, value in panno_args.items():
             panno_default_args[key] = value
-    for i in panno:
-        if i[2]<sig_level/len(dfp):
-            ax.text(i[0],i[1],"**", **panno_default_args)
-        else:
-            ax.text(i[0],i[1],"*", **panno_default_args)
+    # annotate p
+    if panno is True:
+        if verbose: log.write("P value annotation text : ")
+        for i,correction in enumerate(corrections):
+            for j,sig_level in enumerate(sig_levels):
+                index = len(sig_levels)*i + j
+                if verbose: log.write(" -{} : {}-corrected P < {}".format(panno_texts[index], correction, sig_level))
+        for panno_set_number in panno_list.keys():
+            for key, i in panno_list[panno_set_number].items():
+                if panno_set_number == 1:
+                    ax.text(i[0],i[1]-0.1,i[4], **panno_default_args)
+                else:
+                    ax.text(i[0],i[1]-0.1,i[4], **panno_default_args)
     ## color bar ###############################################
     norm = matplotlib.colors.Normalize(vmin=-1, vmax=1)
-    fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, **colorbarargs)
+    fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax, **colorbar_args)
-    return fig,ax,log
+    if equal_aspect is True:
+        ax.set_aspect('equal', adjustable='box')
+    if save:
+        if verbose: log.write("Saving plot:")
+        if save==True:
+            fig.savefig("./ldscrg_heatmap.png",bbox_inches="tight",**save_args)
+            log.write(" -Saved to "+ "./ldscrg_heatmap.png" + " successfully!" )
+        else:
+            fig.savefig(save,bbox_inches="tight",**save_args)
+            log.write(" -Saved to "+ save + " successfully!" )
+    if verbose: log.write("Finished creating ldsc genetic correlation heatmap!")
+    return fig,ax,log,df

gwaslab/regionalplot.py CHANGED Viewed

@@ -10,6 +10,7 @@ from gwaslab.CommonData import get_chr_to_number
 from gwaslab.CommonData import get_number_to_chr
 from gwaslab.CommonData import get_recombination_rate
 from gwaslab.CommonData import get_gtf
+from gwaslab.retrievedata import check_vcf_chr_prefix
 from pyensembl import EnsemblRelease
 from allel import GenotypeArray
 from allel import read_vcf
@@ -34,7 +35,7 @@ def _plot_regional(
     chrom_df,
     xtick_chr_dict,
     cut_line_color,
-    vcf_chr_dict = get_number_to_chr(),
+    vcf_chr_dict = None,
     gtf_path="default",
     gtf_chr_dict = get_number_to_chr(),
     gtf_gene_name=None,
@@ -65,7 +66,17 @@ def _plot_regional(
     pos="POS",
     verbose=True,
     log=Log()
-):
+):
+    if vcf_path is not None:
+        if vcf_chr_dict is None:
+            if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
+            prefix = check_vcf_chr_prefix(vcf_path)
+            if prefix is not None:
+                if verbose: log.write(" -Prefix for chromosomes: ",prefix)
+                vcf_chr_dict = get_number_to_chr(prefix=prefix)
+            else:
+                if verbose: log.write(" -No prefix for chromosomes." )
+                vcf_chr_dict = get_number_to_chr()
     # if regional plot : pinpoint lead , add color bar ##################################################
     if (region is not None) :
@@ -231,11 +242,17 @@ def _get_lead_id(sumstats, region_ref, log):
         if len(lead_id)>0:
             lead_id = int(lead_id[0])
     if region_ref is not None:
-        log.write(" -Lead variant ID: {} - {}".format(region_ref, lead_id))
+        if type(lead_id) is list:
+            if len(lead_id)==0 :
+                log.write(" -WARNING: {} not found. Roll back to lead variant...".format(region_ref))
+                lead_id = sumstats["scaled_P"].idxmax()
+        else:
+            log.write(" -Reference variant ID: {} - {}".format(region_ref, lead_id))
     if lead_id is None:
         log.write(" -Extracting lead variant...")
         lead_id = sumstats["scaled_P"].idxmax()
     return lead_id
 def _pinpoint_lead(sumstats,ax1,region_ref, region_ld_threshold, region_ld_colors, marker_size, log):
@@ -464,6 +481,7 @@ def _plot_gene_track(
 def process_vcf(sumstats, vcf_path, region,region_ref, region_ref2, log, verbose, pos ,nea,ea, region_ld_threshold, vcf_chr_dict,tabix):
     if verbose: log.write("Start to load reference genotype...")
     if verbose: log.write(" -reference vcf path : "+ vcf_path)
     # load genotype data of the targeted region
     ref_genotype = read_vcf(vcf_path,region=vcf_chr_dict[region[0]]+":"+str(region[1])+"-"+str(region[2]),tabix=tabix)
     if ref_genotype is None:

gwaslab/retrievedata.py CHANGED Viewed

@@ -290,7 +290,7 @@ def assign_rsid_single(sumstats,path,rsid="rsID",chr="CHR",pos="POS",ref="NEA",a
 def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsID",chr="CHR",pos="POS",ref="NEA",alt="EA",status="STATUS",
                           n_cores=1,chunksize=5000000,ref_snpid="SNPID",ref_rsid="rsID",
-                          overwrite="empty",verbose=True,log=Log(),chr_dict=get_number_to_chr()):
+                          overwrite="empty",verbose=True,log=Log(),chr_dict=None):
     '''
     overwrite mode :
     all ,    overwrite rsid for all availalbe rsid
@@ -303,7 +303,12 @@ def parallelizeassignrsid(sumstats, path, ref_mode="vcf",snpid="SNPID",rsid="rsI
         if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
         if verbose: log.write(" -CPU Cores to use :",n_cores)
         if verbose: log.write(" -Reference VCF file:", path)
+        chr_dict = auto_check_vcf_chr_dict(path, chr_dict, verbose, log)
         if verbose: log.write(" -Assigning rsID based on chr:pos and ref:alt/alt:ref...")
         ##############################################
         if rsid not in sumstats.columns:
             sumstats[rsid]=pd.Series(dtype="string")
@@ -476,11 +481,13 @@ def check_indel(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NE
 def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,remove_snp="",mode="pi",n_cores=1,remove_indel="",
                        chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",
-                       chr_dict=get_number_to_chr(),verbose=True,log=Log()):
+                       chr_dict=None,verbose=True,log=Log()):
     if verbose: log.write("Start to infer strand for palindromic SNPs...")
     if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
     if verbose: log.write(" -Reference vcf file:", ref_infer)
+    chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
     # check if the columns are complete
     if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
         raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
@@ -601,13 +608,16 @@ def parallelinferstrand(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.40,
 ################################################################################################################
-def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_cores=1,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=get_number_to_chr(),force=False, verbose=True,log=Log()):
+def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,column_name="DAF",suffix="",n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
     if verbose: log.write("Start to check the difference between EAF and refence vcf alt frequency ...")
     if verbose: log.write(" -Current Dataframe shape :",len(sumstats)," x ", len(sumstats.columns))
     if verbose: log.write(" -Reference vcf file:", ref_infer)
     if verbose: log.write(" -CPU Cores to use :",n_cores)
+    chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)
+    column_name = column_name + suffix
     # check if the columns are complete
     if not ((chr in sumstats.columns) and (pos in sumstats.columns) and (ref in sumstats.columns) and (alt in sumstats.columns) and (status in sumstats.columns)):
         raise ValueError("Not enough information: CHR, POS, NEA , EA, ALT, STATUS...")
@@ -618,7 +628,7 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
         if not force:
             good_chrpos =  sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
         if verbose: log.write(" -Checking variants:", sum(good_chrpos))
-        sumstats["DAF"]=np.nan
+        sumstats[column_name]=np.nan
     ########################
         if sum(~sumstats[eaf].isna())<10000:
@@ -626,8 +636,8 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
         df_split = np.array_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt,eaf]], n_cores)
         pool = Pool(n_cores)
         if sum(~sumstats[eaf].isna())>0:
-            map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
-            sumstats.loc[good_chrpos,["DAF"]] = pd.concat(pool.map(map_func,df_split))
+            map_func = partial(checkaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,column_name=column_name,chr_dict=chr_dict)
+            sumstats.loc[good_chrpos,[column_name]] = pd.concat(pool.map(map_func,df_split))
         pool.close()
         pool.join()
     ###########################
@@ -635,24 +645,24 @@ def parallelecheckaf(sumstats,ref_infer,ref_alt_freq=None,maf_threshold=0.4,n_co
         #sumstats.loc[good_chrpos,"DAF"] = status_inferred.values
         #sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
-        if verbose: log.write(" - DAF min:", np.nanmax(sumstats.loc[:,"DAF"]))
-        if verbose: log.write(" - DAF max:", np.nanmin(sumstats.loc[:,"DAF"]))
-        if verbose: log.write(" - abs(DAF) min:", np.nanmax(np.abs(sumstats.loc[:,"DAF"])))
-        if verbose: log.write(" - abs(DAF) max:", np.nanmin(np.abs(sumstats.loc[:,"DAF"])))
-        if verbose: log.write(" - DAF sd:", np.nanstd(sumstats.loc[:,"DAF"]))
-        if verbose: log.write(" - abs(DAF) sd:", np.nanstd(np.abs(sumstats.loc[:,"DAF"])))
+        if verbose: log.write(" - {} min:".format(column_name), np.nanmax(sumstats.loc[:,column_name]))
+        if verbose: log.write(" - {} max:".format(column_name), np.nanmin(sumstats.loc[:,column_name]))
+        if verbose: log.write(" - {} sd:".format(column_name), np.nanstd(sumstats.loc[:,column_name]))
+        if verbose: log.write(" - abs({}) min:".format(column_name), np.nanmin(np.abs(sumstats.loc[:,column_name])))
+        if verbose: log.write(" - abs({}) max:".format(column_name), np.nanmax(np.abs(sumstats.loc[:,column_name])))
+        if verbose: log.write(" - abs({}) sd:".format(column_name), np.nanstd(np.abs(sumstats.loc[:,column_name])))
+        if verbose: log.write("Finished allele frequency checking!")
     return sumstats
-def checkaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
+def checkaf(sumstats,ref_infer,ref_alt_freq=None,column_name="DAF",chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
     #vcf_reader = vcf.Reader(open(ref_infer, 'rb'))
     vcf_reader = VariantFile(ref_infer)
     def afapply(x,vcf,alt_freq,chr_dict):
             return check_daf(x[0],x[1]-1,x[1],x[2],x[3],x[4],vcf_reader,ref_alt_freq,chr_dict)
     map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
     status_inferred = sumstats.apply(map_func,axis=1)
-    sumstats.loc[:,"DAF"] = status_inferred.values
-    sumstats.loc[:,"DAF"]=sumstats.loc[:,"DAF"].astype("float")
+    sumstats.loc[:,column_name] = status_inferred.values
+    sumstats.loc[:,column_name]=sumstats.loc[:,column_name].astype("float")
     return sumstats
 def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
@@ -665,4 +675,25 @@ def check_daf(chr,start,end,ref,alt,eaf,vcf_reader,alt_freq,chr_dict=None):
                 return eaf - record.info[alt_freq][0]
     return np.nan
 ################################################################################################################
-################################################################################################################
+################################################################################################################
+def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
+    if vcf_path is not None:
+        if vcf_chr_dict is None:
+            if verbose: log.write(" -Checking prefix for chromosomes in vcf files..." )
+            prefix = check_vcf_chr_prefix(vcf_path)
+            if prefix is not None:
+                if verbose: log.write(" -Prefix for chromosomes: ",prefix)
+                vcf_chr_dict = get_number_to_chr(prefix=prefix)
+            else:
+                if verbose: log.write(" -No prefix for chromosomes in the VCF files." )
+                vcf_chr_dict = get_number_to_chr()
+    return vcf_chr_dict
+def check_vcf_chr_prefix(vcf_bcf_path):
+    vcf_bcf = VariantFile(vcf_bcf_path)
+    for i in list(vcf_bcf.header.contigs):
+        m = re.search('(chr|Chr|CHR)([0-9xXyYmM]+)', i)
+        if m is not None:
+            return m.group(1)
+    else:
+        return None

gwaslab/to_pickle.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import pickle
 import os
+import gc
 from gwaslab.Log import Log
 def dump_pickle(glsumstats,path="~/mysumstats.pickle",overwrite=False):
@@ -20,3 +21,14 @@ def load_pickle(path):
             return glsumstats
     else:
         Log().write("File not exists : ", path)
+def load_data_from_pickle(path,usecols=None):
+    data = load_pickle(path).data
+    existing_cols = []
+    if usecols is not None:
+        for i in usecols:
+            if i in data.columns:
+                existing_cols.append(i)
+        data = data.loc[:,existing_cols]
+        gc.collect()
+    return data

gwaslab/trumpetplot.py ADDED Viewed

File without changes

gwaslab/version.py CHANGED Viewed

@@ -2,13 +2,13 @@ from gwaslab.Log import Log
 def _show_version(log=Log()):
     # show when loading sumstats
-    log.write("GWASLab version 3.4.14 https://cloufield.github.io/gwaslab/")
+    log.write("GWASLab version 3.4.15 https://cloufield.github.io/gwaslab/")
     log.write("(C) 2022-2023, Yunye He, Kamatani Lab, MIT License, gwaslab@gmail.com")
 def gwaslab_info():
     # for output header
     dic={
-   "version":"3.4.14",
-   "release_date":"20230609"
+   "version":"3.4.15",
+   "release_date":"20230620"
     }
     return dic

{gwaslab-3.4.14.dist-info → gwaslab-3.4.16.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gwaslab
-Version: 3.4.14
+Version: 3.4.16
 Summary: A collection of handy tools for GWAS SumStats
 Author-email: Yunye <yunye@gwaslab.com>
 Project-URL: Homepage, https://cloufield.github.io/gwaslab/
@@ -45,7 +45,7 @@ Note: GWASLab is being updated very frequently for now. I will release the first
 ## Install
 ```
-pip install gwaslab==3.4.13
+pip install gwaslab==3.4.15
 ```

gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.14__py3-none-any.whl → 3.4.16__py3-none-any.whl