gwaslab 3.5.7__py3-none-any.whl → 3.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (67)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/data/reference.json +3 -1
  6. gwaslab/g_Sumstats.py +110 -25
  7. gwaslab/g_SumstatsMulti.py +287 -0
  8. gwaslab/g_SumstatsPair.py +101 -16
  9. gwaslab/g_Sumstats_polars.py +245 -0
  10. gwaslab/g_headers.py +12 -3
  11. gwaslab/g_meta.py +124 -47
  12. gwaslab/g_meta_update.py +48 -0
  13. gwaslab/g_vchange_status_polars.py +44 -0
  14. gwaslab/g_version.py +2 -2
  15. gwaslab/hm_casting.py +169 -110
  16. gwaslab/hm_casting_polars.py +202 -0
  17. gwaslab/hm_harmonize_sumstats.py +19 -8
  18. gwaslab/io_load_ld.py +529 -0
  19. gwaslab/io_preformat_input.py +11 -0
  20. gwaslab/io_preformat_input_polars.py +632 -0
  21. gwaslab/io_process_args.py +25 -1
  22. gwaslab/io_read_ldsc.py +34 -3
  23. gwaslab/io_read_pipcs.py +62 -6
  24. gwaslab/prscs_gigrnd.py +122 -0
  25. gwaslab/prscs_mcmc_gtb.py +136 -0
  26. gwaslab/prscs_parse_genet.py +98 -0
  27. gwaslab/qc_build.py +53 -0
  28. gwaslab/qc_check_datatype.py +10 -8
  29. gwaslab/qc_check_datatype_polars.py +128 -0
  30. gwaslab/qc_fix_sumstats.py +25 -23
  31. gwaslab/qc_fix_sumstats_polars.py +193 -0
  32. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  33. gwaslab/util_ex_gwascatalog.py +71 -28
  34. gwaslab/util_ex_infer_ancestry.py +65 -0
  35. gwaslab/util_ex_ldsc.py +67 -21
  36. gwaslab/util_ex_match_ldmatrix.py +396 -0
  37. gwaslab/util_ex_run_2samplemr.py +0 -2
  38. gwaslab/util_ex_run_ccgwas.py +155 -0
  39. gwaslab/util_ex_run_coloc.py +1 -1
  40. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  41. gwaslab/util_ex_run_magma.py +74 -0
  42. gwaslab/util_ex_run_mesusie.py +155 -0
  43. gwaslab/util_ex_run_mtag.py +92 -0
  44. gwaslab/util_ex_run_prscs.py +85 -0
  45. gwaslab/util_ex_run_susie.py +40 -9
  46. gwaslab/util_in_estimate_ess.py +18 -0
  47. gwaslab/util_in_fill_data.py +20 -1
  48. gwaslab/util_in_filter_value.py +10 -5
  49. gwaslab/util_in_get_sig.py +71 -13
  50. gwaslab/util_in_meta.py +168 -4
  51. gwaslab/util_in_meta_polars.py +174 -0
  52. gwaslab/viz_aux_annotate_plot.py +13 -2
  53. gwaslab/viz_plot_compare_effect.py +87 -23
  54. gwaslab/viz_plot_credible_sets.py +55 -11
  55. gwaslab/viz_plot_effect.py +22 -12
  56. gwaslab/viz_plot_miamiplot2.py +3 -2
  57. gwaslab/viz_plot_mqqplot.py +94 -84
  58. gwaslab/viz_plot_qqplot.py +9 -7
  59. gwaslab/viz_plot_regional2.py +2 -1
  60. gwaslab/viz_plot_stackedregional.py +4 -1
  61. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/METADATA +46 -68
  62. gwaslab-3.6.0.dist-info/RECORD +119 -0
  63. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/WHEEL +1 -1
  64. gwaslab-3.5.7.dist-info/RECORD +0 -96
  65. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE +0 -0
  66. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  67. {gwaslab-3.5.7.dist-info → gwaslab-3.6.0.dist-info}/top_level.txt +0 -0
gwaslab/util_ex_run_susie.py CHANGED

@@ -9,7 +9,20 @@ from gwaslab.g_version import _check_susie_version
  from gwaslab.qc_fix_sumstats import start_to
  from gwaslab.qc_fix_sumstats import finished

- def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr=0.1,refine="TRUE",L=10, fillldna=True, n=None, delete=False, susie_args="", log=Log(),verbose=True):
+ def _run_susie_rss(filepath,
+                    r="Rscript",
+                    mode="bs",
+                    max_iter=100000,
+                    min_abs_corr=0.1,
+                    refine="TRUE",
+                    L=10,
+                    fillldna=True,
+                    n=None,
+                    delete=False, # if True, delete the output file
+                    susie_args="",
+                    log=Log(),
+                    main_sumstats=None,
+                    verbose=True):
  ##start function with col checking##########################################################
  _start_line = "run finemapping using SuSieR from command line"
  _end_line = "running finemapping using SuSieR from command line"
@@ -43,8 +56,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
  for index, row in filelist.iterrows():
      gc.collect()
      study = row["STUDY"]
-     ld_r_matrix = row["LD_R_MATRIX"]
-     sumstats = row["LOCUS_SUMSTATS"]
+     ld_r_matrix = row["LD_R_MATRIX"] # LD matrix path
+     sumstats = row["LOCUS_SUMSTATS"] # sumstats path
      output_prefix = sumstats.replace(".sumstats.gz","")
      log.write(" -Running for: {} - {}".format(row["SNPID"],row["STUDY"] ))
      log.write(" -Locus sumstats:{}".format(sumstats))
@@ -54,7 +67,7 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
      rscript='''
      library(susieR)

-     sumstats <- read.csv("{}")
+     sumstats <- read.csv("{}",sep="\t")

      R <- as.matrix(read.csv("{}",sep="\t",header=FALSE))
      {}
@@ -67,6 +80,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
      output <- susie_fitted_summary$vars
      output$SNPID <- sumstats$SNPID[susie_fitted_summary$vars$variable]
+     output$LOCUS <- "{}"
+     output$STUDY <- "{}"

      write.csv(output, "{}.pipcs", row.names = FALSE)
      '''.format(sumstats,
@@ -79,6 +94,8 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
                 refine,
                 L,
                 susie_args,
+                row["SNPID"],
+                row["STUDY"],
                 output_prefix)
      susier_line = "susie_rss({}, n = {}, R = R, max_iter = {}, min_abs_corr={}, refine = {}, L = {}{})".format("z= sumstats$Z," if mode=="z" else "bhat = sumstats$BETA,shat = sumstats$SE,",
                 n if n is not None else "n",
@@ -88,34 +105,48 @@ def _run_susie_rss(filepath, r="Rscript", mode="bs",max_iter=100000,min_abs_corr
                 L,
                 susie_args)
      log.write(" -SuSieR script: {}".format(susier_line))
-     with open("_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"]),"w") as file:
+
+     temp_r_path = "_{}_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"],id(sumstats))
+     log.write(" -Creating temp R script: {}".format(temp_r_path))
+     with open(temp_r_path,"w") as file:
          file.write(rscript)

-     script_run_r = "{} _{}_{}_gwaslab_susie_temp.R".format(r, study,row["SNPID"])
+     script_run_r = "{} {}".format(r, temp_r_path)

      try:
+         log.write(" -Running SuSieR from command line...")
          output = subprocess.check_output(script_run_r, stderr=subprocess.STDOUT, shell=True,text=True)
          #plink_process = subprocess.Popen("exec "+script_run_r, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True,text=True)
          #output1,output2 = plink_process.communicate()
          #output= output1 + output2+ "\n"
          #plink_process.kill()
-         log.write(" -Running SuSieR from command line...")
+
          r_log+= output + "\n"
          pip_cs = pd.read_csv("{}.pipcs".format(output_prefix))
          pip_cs["LOCUS"] = row["SNPID"]
          pip_cs["STUDY"] = row["STUDY"]
          locus_pip_cs = pd.concat([locus_pip_cs,pip_cs],ignore_index=True)

-         os.remove("_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"]))
+         os.remove(temp_r_path)
+         log.write(" -Removing temp R script: {}".format(temp_r_path))
+
          if delete == True:
              os.remove("{}.pipcs".format(output_prefix))
+             log.write(" -Removing output file: {}.pipcs".format(output_prefix))
          else:
              log.write(" -SuSieR result summary to: {}".format("{}.pipcs".format(output_prefix)))
      except subprocess.CalledProcessError as e:
          log.write(e.output)
-         os.remove("_{}_{}_gwaslab_susie_temp.R".format(study,row["SNPID"]))
+         os.remove(temp_r_path)
+         log.write(" -Removing temp R script: {}".format(temp_r_path))

  locus_pip_cs = locus_pip_cs.rename(columns={"variable":"N_SNP","variable_prob":"PIP","cs":"CREDIBLE_SET_INDEX"})
+ locus_pip_cs = pd.merge(locus_pip_cs, main_sumstats, on="SNPID",how="left")
+
  finished(log=log, verbose=verbose, end_line=_end_line)
  return locus_pip_cs

+ def _get_cs_lead(pipcs):
+     leads = pipcs.loc[pipcs["CREDIBLE_SET_INDEX"]>0,:]
+     leads = leads.sort_values(by="PIP",ascending=False).drop_duplicates(subset=["STUDY","LOCUS","CREDIBLE_SET_INDEX"])
+     return leads
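
The new _get_cs_lead helper keeps only the top-PIP variant per credible set (susieR marks variants outside any credible set with cs = -1, hence the > 0 filter). A minimal sketch of the same logic on a toy .pipcs table; all values below are made up for illustration:

import pandas as pd

# Toy .pipcs table (illustrative values only)
pipcs = pd.DataFrame({
    "SNPID": ["rs1", "rs2", "rs3", "rs4"],
    "STUDY": ["A"] * 4,
    "LOCUS": ["rs1"] * 4,
    "PIP":   [0.90, 0.05, 0.60, 0.30],
    "CREDIBLE_SET_INDEX": [1, 1, 2, -1],  # -1 = not in any credible set
})

leads = pipcs.loc[pipcs["CREDIBLE_SET_INDEX"] > 0, :]
leads = (leads.sort_values(by="PIP", ascending=False)
              .drop_duplicates(subset=["STUDY", "LOCUS", "CREDIBLE_SET_INDEX"]))
# keeps rs1 (lead of set 1) and rs3 (lead of set 2); rs4 is dropped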
gwaslab/util_in_estimate_ess.py ADDED

@@ -0,0 +1,18 @@
+ import numpy as np
+ from scipy.stats import norm
+ from gwaslab.g_Log import Log
+
+
+ def _get_ess(sumstats, method="metal",log=Log(),verbose=True):
+     log.write("Start to estimate effective sample size (N_EFF)...", verbose=verbose)
+     if type(method) is str:
+         if method =="metal":
+             log.write(" - Method: {} ".format(method), verbose=verbose)
+             log.write(" - Reference: {} ".format("Willer, C. J., Li, Y., & Abecasis, G. R. (2010)"), verbose=verbose)
+             log.write(" - Equation: {} ".format(" N_EFF = 4 * N_CASE * N_CONTROL / (N_CASE + N_CONTROL)"), verbose=verbose)
+             # Willer, C. J., Li, Y., & Abecasis, G. R. (2010). METAL: fast and efficient meta-analysis of genomewide association scans. Bioinformatics, 26(17), 2190-2191.
+             sumstats["N_EFF"] = 4 / (1/sumstats["N_CASE"] + 1/sumstats["N_CONTROL"])
+     else:
+         sumstats["N_EFF"] = method
+     log.write("Finished estimating effective sample size (N_EFF)...", verbose=verbose)
+     return sumstats
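
For a binary trait, N_EFF = 4 / (1/N_CASE + 1/N_CONTROL), which is algebraically identical to the METAL form 4 * N_CASE * N_CONTROL / (N_CASE + N_CONTROL). A quick worked check with illustrative counts:

n_case, n_control = 1000, 9000              # illustrative counts
n_eff = 4 / (1 / n_case + 1 / n_control)    # = 4 * 1000 * 9000 / 10000 = 3600.0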
gwaslab/util_in_fill_data.py CHANGED

@@ -355,4 +355,23 @@ def rank_based_int(series, c=3/8):
  #https://onlinelibrary.wiley.com/doi/10.1111/biom.13214
  n=sum(~series.isna())
  normalized_value = norm.ppf((series.rank()-c)/(n+1-2*c))
- return normalized_value
+ return normalized_value
+
+
+ ################################################################################################################################################################################
+
+ def _get_multi_min(sumstats_multi, col, nstudy,log=Log(), verbose=True):
+     cols =[]
+     for i in range(nstudy):
+         single_header = "{}_{}".format(col, i + 1)
+         if single_header in sumstats_multi.columns:
+             cols.append(single_header)
+
+     combined_header = "{}_{}".format(col, "MIN")
+     log.write(" -Filling {} using {}".format(combined_header,",".join(cols)), verbose=verbose)
+     sumstats_multi[combined_header] = sumstats_multi[cols].min(axis=1)
+
+     combined_header_index = "{}_{}_COL".format(col, "MIN")
+     sumstats_multi[combined_header_index] = sumstats_multi[cols].idxmin(axis=1)
+     return sumstats_multi
+
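
_get_multi_min collapses per-study columns named COL_1 ... COL_n into a row-wise minimum plus the column that supplied it. A toy sketch for col="P" across two studies (values are made up):

import pandas as pd

df = pd.DataFrame({"P_1": [1e-8, 0.5], "P_2": [1e-4, 1e-3]})  # toy p-values
cols = ["P_1", "P_2"]
df["P_MIN"] = df[cols].min(axis=1)          # [1e-8, 1e-3]
df["P_MIN_COL"] = df[cols].idxmin(axis=1)   # ["P_1", "P_2"]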
gwaslab/util_in_filter_value.py CHANGED

@@ -217,7 +217,10 @@ def filterregionout(sumstats, path=None, chrom="CHR",pos="POS", high_ld=False, b
  gc.collect()
  return sumstats

- def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
+ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS",
+                ea="EA", nea="NEA",build="19",
+                change_status=True,
+                verbose=True,log=Log()):
  ##start function with col checking##########################################################
  _start_line = "infer genome build version using hapmap3 SNPs"
  _end_line = "inferring genome build version using hapmap3 SNPs"
@@ -261,13 +264,15 @@ def inferbuild(sumstats,status="STATUS",chrom="CHR", pos="POS", ea="EA", nea="NE

  if match_count_for_19 > match_count_for_38:
      log.write(" -Since num_hg19 >> num_hg38, assigning genome build hg19...", verbose=verbose)
-     sumstats[status] = vchange_status(sumstats[status],1,"9","1")
-     sumstats[status] = vchange_status(sumstats[status],2,"9","9")
+     if change_status==True:
+         sumstats[status] = vchange_status(sumstats[status],1,"9","1")
+         sumstats[status] = vchange_status(sumstats[status],2,"9","9")
      inferred_build="19"
  elif match_count_for_19 < match_count_for_38:
      log.write(" -Since num_hg19 << num_hg38, assigning genome build hg38...", verbose=verbose)
-     sumstats[status] = vchange_status(sumstats[status],1,"9","3")
-     sumstats[status] = vchange_status(sumstats[status],2,"9","8")
+     if change_status==True:
+         sumstats[status] = vchange_status(sumstats[status],1,"9","3")
+         sumstats[status] = vchange_status(sumstats[status],2,"9","8")
      inferred_build="38"
  else:
      log.write(" -Since num_hg19 = num_hg38, unable to infer...", verbose=verbose)
gwaslab/util_in_get_sig.py CHANGED

@@ -17,6 +17,7 @@ from gwaslab.util_ex_gwascatalog import gwascatalog_trait
  from gwaslab.qc_fix_sumstats import check_dataframe_shape
  from gwaslab.qc_fix_sumstats import start_to
  from gwaslab.qc_fix_sumstats import finished
+ from gwaslab.qc_build import _check_build
  from gwaslab.util_in_correct_winnerscurse import wc_correct
  # getsig
  # closest_gene
@@ -372,6 +373,8 @@ def getnovel(insumstats,
                xymt=["X","Y","MT"],
                anno=False,
                wc_correction=False,
+               use_cache=True,
+               cache_dir="./",
                build="19",
                source="ensembl",
                gwascatalog_source="NCBI",
@@ -405,15 +408,26 @@ def getnovel(insumstats,
  ############################################################################################
  knownsig = pd.DataFrame()
  if efo != False:
+     # For the GWAS Catalog, check that the sumstats build is hg38
+     _check_build(target_build="38" ,build=build ,log=log,verbose=verbose)
+
      if type(efo) is not list:
          log.write("Start to retrieve data using EFO: {}...".format(efo), verbose=verbose)
-         known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
+         known_Sumstats = gwascatalog_trait(efo,source=gwascatalog_source,
+                                            sig_level=sig_level,
+                                            use_cache=use_cache,
+                                            cache_dir=cache_dir,
+                                            verbose=verbose,log=log)
          knownsig = known_Sumstats.data.copy()
-     else:
+     else:
          knownsig=pd.DataFrame()
          log.write("Start to retrieve data using {} EFOs: {}...".format(len(efo),efo), verbose=verbose)
+
          for single_efo in efo:
-             known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,sig_level=sig_level,verbose=verbose,log=log)
+             known_Sumstats = gwascatalog_trait(single_efo,source=gwascatalog_source,
+                                                use_cache=use_cache,
+                                                cache_dir=cache_dir,
+                                                sig_level=sig_level,verbose=verbose,log=log)
              known_Sumstats.data["EFOID"] = single_efo
              knownsig = pd.concat([known_Sumstats.data, knownsig],ignore_index=True)
  knownsig["CHR"] = knownsig["CHR"].astype("Int64")
@@ -832,44 +846,88 @@ def _check_novel_set(insumstats,
      else:
          reference_dict[row[group_key]] = {row[snpset]:set([row[snpid]])}
  ############################################################################################
-
+ # match group/trait
  try:
      no_reference_available = allsig.loc[~allsig[group_key].isin(reference_dict.keys()),group_key]
      if len(no_reference_available)>0:
          log.write(" -Groups not in reference: {}".format( ",".join(no_reference_available)), verbose=verbose)
  except:
      pass
+ ############################################################################################

  log.write(" -Checking if variants are in reference variant sets...", verbose=verbose)
- known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
+ #known_list = allsig.apply(lambda x: check_overlap(x,snpid, group_key,reference_dict), axis=1)
+ new_row_list = []
+ for index, row in allsig.iterrows():
+     row = check_overlap(row, snpset, snpid, group_key,reference_dict)
+     new_row_list = new_row_list+row
+ known_df = pd.DataFrame(new_row_list,
+                         columns=[snpid,group_key, snpset,"KNOWN_SET","OVERLAP_VARIANT","KNOWN_SET_VARIANT"])

- allsig["KNOWN_SET"] = known_list.str[0]
- allsig["KNOWN_VARIANT"] = known_list.str[1]
+ allsig = pd.merge(allsig,known_df, on=[snpid, group_key, snpset],how="left")
+
+ #allsig["KNOWN_SET"] = known_list.str[0]
+ #allsig["OVERLAP_VARIANT"] = known_list.str[1]
+ #allsig["KNOWN_SET_VARIANT"] = known_list.str[2]

+ ##
+ is_overlapped = ~allsig["KNOWN_SET"].isna()
+ allsig["KNOWN_SET_SIZE"] = 0
+ allsig.loc[is_overlapped, "KNOWN_SET_SIZE"] = allsig.loc[is_overlapped, "KNOWN_SET_VARIANT"].str.len()
+
+ # sumstats set dict
  back_dict={}
  for i in allsig[group_key].unique():
+     # for each trait in sumstats
      back_dict[i] ={}
      for j in allsig.loc[allsig[group_key]==i,snpset].unique():
+         # for each locus in each trait
          back_dict[i][j] =set()
-         for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j) & (~allsig["KNOWN_SET"].isna()),:].iterrows():
-             back_dict[i][j].add("{}-{}-{}".format(row[group_key], row["KNOWN_SET"],row["KNOWN_VARIANT"]))
+         for index, row in allsig.loc[(allsig[group_key]==i) & (allsig[snpset]==j),:].iterrows():
+             # for each variant in each locus
+             back_dict[i][j].add("{}".format(row["SNPID"]))

- allsig["KNOWN_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
+ allsig["SUMSTATS_SET_VARIANT"] = allsig.apply(lambda x: assign_set_variant(x,group_key,snpset,back_dict), axis=1)
+ allsig["SUMSTATS_SET_SIZE"] = 0
+ allsig["SUMSTATS_SET_SIZE"] = allsig[ "SUMSTATS_SET_VARIANT"].str.len()

  finished(log,verbose,_end_line)

  return allsig

- def check_overlap(x,snpid, group_key,reference_dict):
+ def check_overlap(x,snpset, snpid, group_key,reference_dict):
+     matched=[]
      if x[group_key] in reference_dict.keys():
+         # if the trait matches
          for key, value in reference_dict[x[group_key]].items():
+             # key = locus, value = snplist
              if x[snpid] in value:
-                 return key, x[snpid]
-     return pd.NA, pd.NA,
+                 # if the sumstats SNP is in the reference snplist for this locus,
+                 # record the locus and the sumstats SNP ID
+                 matched.append( (x[snpid], x[group_key], x[snpset], key, x[snpid], value))
+     if len(matched)==0:
+         matched = [(x[snpid], x[group_key], x[snpset], pd.NA, pd.NA, pd.NA)]
+     return matched
+
+ #def check_overlap(x,snpid, group_key,reference_dict):
+ #    if x[group_key] in reference_dict.keys():
+ #        # if the trait matches
+ #        for key, value in reference_dict[x[group_key]].items():
+ #            # key = locus, value = snplist
+ #            if x[snpid] in value:
+ #                # if the sumstats SNP is in the reference snplist for this locus,
+ #                # return the locus and the sumstats SNP ID
+ #                return key, x[snpid], value
+ #    return pd.NA, pd.NA, pd.NA

  def assign_set_variant(x,group_key,snpset,back_dict):
      if x[group_key] in back_dict.keys():
+         # if the trait matches
          if x[snpset] in back_dict[x[group_key]].keys():
+             # if the locus matches
              if len(back_dict[x[group_key]][x[snpset]]) >0:
+                 # return the sumstats snplist for this locus
                  return back_dict[x[group_key]][x[snpset]]
      return pd.NA
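
check_overlap now returns long-format rows, one per (variant, matched reference set), instead of a single (set, variant) pair, so a variant overlapping several reference sets is no longer truncated to its first hit. A toy sketch of the reference_dict layout it walks and the rows it emits (names below are illustrative):

# reference_dict maps trait -> known set (locus) -> set of known variant IDs
reference_dict = {
    "TRAIT_A": {
        "locus_1": {"rs1", "rs2"},
        "locus_2": {"rs9"},
    }
}
# For a row with SNPID "rs1", trait "TRAIT_A" and sumstats locus "rs1_locus",
# check_overlap emits ("rs1", "TRAIT_A", "rs1_locus", "locus_1", "rs1", {"rs1", "rs2"});
# with no match it emits one row padded with pd.NA.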
gwaslab/util_in_meta.py CHANGED
@@ -7,8 +7,12 @@ from gwaslab.g_Log import Log
  from gwaslab.io_to_pickle import load_data_from_pickle
  from gwaslab.g_Sumstats import Sumstats
  import gc
+ import statsmodels.api as sm

- def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log()):
+ def meta_analyze(sumstats_list,
+                  random_effects=False,
+                  match_allele=True,
+                  log=Log()):

  ###########################################################################
  columns=["SNPID","CHR","POS","EA","NEA"]
@@ -16,6 +20,7 @@ def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log(
  log.write("Start to perform meta-analysis...")
  log.write(" -Datasets:")
+
  for index,sumstats_path in enumerate(sumstats_list):
      if isinstance(sumstats_path, pd.DataFrame):
          log.write(" -Sumstats #{}: {} ".format(index, sumstats_path))
@@ -42,8 +47,6 @@ def meta_analyze(sumstats_list,random_effects=False, match_allele=True, log=Log(
  del new_rows
  gc.collect()

-
-
  ###########################################################################
  log.write(" -Initiating result DataFrame...")
  columns=["SNPID","CHR","POS","EA","NEA","_BETAW_SUM","_EA_N","_NEA_N","_BETA2W_SUM","_W_SUM","EAF","N","DIRECTION","BETA","SE","DOF"]
@@ -231,4 +234,165 @@ def get_sumstats(input_path,usekeys=None):
      sumstats = sumstats[usekeys]
  else:
      sumstats = Sumstats(path,fmt="auto",verbose=False,usekeys=usekeys,**path_args).data
- return sumstats
+ return sumstats
+
+
+ ############################################################################################################################################################################
+
+ def meta_analyze_multi(sumstats_multi,
+                        random_effects=False,
+                        nstudy=1,
+                        match_allele=True,
+                        log=Log()):
+     log.write("Start to perform meta-analysis...")
+     ###########################################################################
+     log.write(" -Initiating result DataFrame...")
+     sumstats_multi["_INDEX"] = range(len(sumstats_multi))
+     results_df = _init_result_df(sumstats_multi)
+     ##########################################################################
+
+     log.write(" -Iterating through {} datasets to compute statistics for fixed-effect model...".format(nstudy))
+     for i in range(nstudy):
+         n="N_{}".format(i+1)
+         beta="BETA_{}".format(i+1)
+         se="SE_{}".format(i+1)
+         eaf="EAF_{}".format(i+1)
+         single_study_cols=[n,beta,se,eaf,"SNPID","_INDEX"]
+         to_use_sumstats = sumstats_multi.loc[~sumstats_multi["BETA_{}".format(i+1)].isna(),single_study_cols].drop_duplicates(subset="_INDEX").set_index("_INDEX")
+
+         sumstats_index = to_use_sumstats.index
+
+         results_df_not_in_sumstat_index = results_df.index[~results_df.index.isin(to_use_sumstats.index)]
+
+         # N and DOF
+         results_df.loc[sumstats_index, "N"] += to_use_sumstats[n].fillna(0)
+         results_df.loc[sumstats_index, "DOF"] += 1
+
+         # BETA and SE
+         results_df.loc[sumstats_index,"_BETA2W_SUM"] += to_use_sumstats[beta]**2 *(1/(to_use_sumstats[se]**2))
+         results_df.loc[sumstats_index,"_BETAW_SUM"] += to_use_sumstats[beta]*(1/(to_use_sumstats[se]**2))
+         results_df.loc[sumstats_index,"_W_SUM"] += 1/(to_use_sumstats[se]**2)
+         results_df.loc[sumstats_index,"_W2_SUM"] += (1/(to_use_sumstats[se]**2))**2
+
+         # EAF
+         results_df.loc[sumstats_index,"_EA_N"] += to_use_sumstats[n]*to_use_sumstats[eaf]
+         results_df.loc[sumstats_index,"_NEA_N"] += to_use_sumstats[n]*(1 - to_use_sumstats[eaf])
+
+         # DIRECTION
+         beta_index = to_use_sumstats[to_use_sumstats[beta]>0].index
+         results_df.loc[beta_index, "DIRECTION"] += "+"
+         beta_index = to_use_sumstats[to_use_sumstats[beta]==0].index
+         results_df.loc[beta_index, "DIRECTION"] += "0"
+         beta_index = to_use_sumstats[to_use_sumstats[beta]<0].index
+         results_df.loc[beta_index, "DIRECTION"] += "-"
+         results_df.loc[results_df_not_in_sumstat_index, "DIRECTION"] += "?"
+
+         del to_use_sumstats
+         gc.collect()
+
+     ##############################################################################
+     # fixed-effect statistics
+     results_df["BETA"] = results_df["_BETAW_SUM"] / results_df["_W_SUM"]
+     results_df["EAF"] = results_df["_EA_N"] / (results_df["_EA_N"] + results_df["_NEA_N"])
+     results_df["SE"] = np.sqrt(1/results_df["_W_SUM"])
+     results_df["Z"] = results_df["BETA"] / results_df["SE"]
+     results_df["P"] = norm.sf(abs(results_df["Z"]))*2
+     results_df["Q"] = results_df["_BETA2W_SUM"] - (results_df["_BETAW_SUM"]**2 / results_df["_W_SUM"])
+
+     for dof in results_df["DOF"].unique():
+         results_df_dof_index = results_df["DOF"] == dof
+         results_df.loc[results_df_dof_index,"P_HET"] = chi2.sf(results_df.loc[results_df_dof_index, "Q"].values,dof)
+         gc.collect()
+
+     results_df["I2"] = (results_df["Q"] - results_df["DOF"])/results_df["Q"]
+     results_df.loc[results_df["I2"]<0, "I2"] = 0
+
+     results_df=results_df.drop(columns=["_EA_N","_NEA_N"])
+     gc.collect()
+
+     ###########################################################################
+     if random_effects==True:
+         log.write(" -Iterating through {} datasets to compute statistics for random-effects model...".format(nstudy))
+         results_df["_R2"] = (results_df["Q"] - results_df["DOF"])/(results_df["_W_SUM"] - (results_df["_W2_SUM"]/results_df["_W_SUM"]))
+         results_df.loc[results_df["_R2"]<0, "_R2"] = 0
+         variant_index_random = results_df[results_df["_R2"]>0].index
+
+         results_df["_BETAW_SUM_R"] = 0.0
+         results_df["_W_SUM_R"] = 0.0
+         results_df["BETA_RANDOM"] = results_df["BETA"]
+         results_df["SE_RANDOM"] = results_df["SE"]
+
+         for i in range(nstudy):
+             n="N_{}".format(i+1)
+             beta="BETA_{}".format(i+1)
+             se="SE_{}".format(i+1)
+             eaf="EAF_{}".format(i+1)
+             single_study_cols=[n,beta,se,eaf,"SNPID","_INDEX"]
+             to_use_sumstats = sumstats_multi.loc[~sumstats_multi["BETA_{}".format(i+1)].isna(),single_study_cols].drop_duplicates(subset="_INDEX").set_index("_INDEX")
+             sumstats_index = to_use_sumstats.index
+
+             # BETA and SE
+             results_df.loc[sumstats_index,"_BETAW_SUM_R"] += to_use_sumstats[beta]*(1/(to_use_sumstats[se]**2 + results_df.loc[sumstats_index,"_R2"]))
+             results_df.loc[sumstats_index,"_W_SUM_R"] += 1/(to_use_sumstats[se]**2 + results_df.loc[sumstats_index,"_R2"])
+
+             del to_use_sumstats
+             del sumstats_index
+             gc.collect()
+
+         results_df.loc[variant_index_random,"BETA_RANDOM"] = results_df.loc[variant_index_random,"_BETAW_SUM_R"] / results_df.loc[variant_index_random,"_W_SUM_R"]
+         results_df.loc[variant_index_random,"SE_RANDOM"] = np.sqrt(1/results_df.loc[variant_index_random,"_W_SUM_R"])
+         results_df["Z_RANDOM"] = results_df["BETA_RANDOM"] / results_df["SE_RANDOM"]
+         results_df["P_RANDOM"] = norm.sf(abs(results_df["Z_RANDOM"]))*2
+         results_df = results_df.drop(columns=["_BETAW_SUM_R","_W_SUM_R"])
+
+     gc.collect()
+     ###########################################################################
+     results_df = results_df.drop(columns=["_BETAW_SUM","_BETA2W_SUM","_W_SUM","_R2","_W2_SUM"]).sort_values(by=["CHR","POS"]).reset_index()
+     gc.collect()
+     log.write("Finished meta-analysis successfully!")
+
+     if random_effects==True:
+         other_cols = ["BETA_RANDOM","SE_RANDOM","Z_RANDOM","P_RANDOM"]
+     else:
+         other_cols = []
+
+     results_df = results_df.drop(columns=["_INDEX"])
+
+     results_df = Sumstats(results_df, fmt="gwaslab", other = other_cols)
+
+     return results_df
+
+ def _init_result_df(sumstats):
+
+     results_df = sumstats[["_INDEX","SNPID","CHR","POS","EA","NEA"]]
+     results_df = results_df.drop_duplicates(subset="_INDEX").set_index("_INDEX")
+
+     results_df["N"] = 0
+     results_df["_BETAW_SUM"] = 0.0
+     results_df["_BETA2W_SUM"] = 0.0
+     results_df["_W_SUM"] = 0.0
+     results_df["_W2_SUM"] = 0.0
+     results_df["_EA_N"] = 0.0
+     results_df["_NEA_N"] = 0.0
+     results_df["DIRECTION"] = ""
+     results_df["BETA"] = 0.0
+     results_df["SE"] = 0.0
+     results_df["DOF"] = -1
+     results_df["_R2"] = 0
+
+     dtype_dict ={
+         "_BETAW_SUM":"float64",
+         "_EA_N":"float64",
+         "_NEA_N":"float64",
+         "_BETA2W_SUM":"float64",
+         "_W_SUM":"float64",
+         "BETA":"float64",
+         "SE":"float64",
+         "N":"Int64",
+         "DOF":"Int64"
+     }
+     results_df=results_df.astype(dtype_dict)
+     return results_df
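
For reference, the accumulators above correspond to the standard fixed-effect and DerSimonian-Laird random-effects estimators, with w_i = 1/se_i^2 and dof = k - 1 for k contributing studies (DOF starts at -1 and is incremented once per study). In LaTeX:

\hat{\beta}_{FE} = \frac{\sum_i w_i \beta_i}{\sum_i w_i}, \qquad
SE_{FE} = \sqrt{\frac{1}{\sum_i w_i}}, \qquad
Q = \sum_i w_i \beta_i^2 - \frac{\left(\sum_i w_i \beta_i\right)^2}{\sum_i w_i}

I^2 = \max\!\left(0,\ \frac{Q - \mathrm{dof}}{Q}\right), \qquad
\tau^2 = \max\!\left(0,\ \frac{Q - \mathrm{dof}}{\sum_i w_i - \sum_i w_i^2 / \sum_i w_i}\right)

w_i^{*} = \frac{1}{se_i^2 + \tau^2}, \qquad
\hat{\beta}_{RE} = \frac{\sum_i w_i^{*} \beta_i}{\sum_i w_i^{*}}, \qquad
SE_{RE} = \sqrt{\frac{1}{\sum_i w_i^{*}}}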