PyPI - gwaslab - Versions diffs - 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl - Mend

gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (57) hide show

gwaslab/bd_common_data.py +6 -3
gwaslab/bd_download.py +9 -9
gwaslab/bd_get_hapmap3.py +43 -9
gwaslab/data/formatbook.json +722 -721
gwaslab/g_Log.py +22 -5
gwaslab/g_Sumstats.py +110 -163
gwaslab/g_SumstatsPair.py +76 -25
gwaslab/g_SumstatsT.py +2 -2
gwaslab/g_Sumstats_summary.py +3 -3
gwaslab/g_version.py +10 -10
gwaslab/hm_casting.py +36 -17
gwaslab/hm_harmonize_sumstats.py +354 -221
gwaslab/hm_rsid_to_chrpos.py +1 -1
gwaslab/io_preformat_input.py +49 -43
gwaslab/io_read_ldsc.py +49 -1
gwaslab/io_to_formats.py +428 -295
gwaslab/ldsc_irwls.py +198 -0
gwaslab/ldsc_jackknife.py +514 -0
gwaslab/ldsc_ldscore.py +417 -0
gwaslab/ldsc_parse.py +294 -0
gwaslab/ldsc_regressions.py +747 -0
gwaslab/ldsc_sumstats.py +629 -0
gwaslab/qc_check_datatype.py +3 -3
gwaslab/qc_fix_sumstats.py +891 -778
gwaslab/util_ex_calculate_ldmatrix.py +31 -13
gwaslab/util_ex_gwascatalog.py +25 -25
gwaslab/util_ex_ldproxyfinder.py +10 -10
gwaslab/util_ex_ldsc.py +189 -0
gwaslab/util_ex_process_ref.py +3 -3
gwaslab/util_ex_run_coloc.py +26 -4
gwaslab/util_in_calculate_gc.py +6 -6
gwaslab/util_in_calculate_power.py +42 -43
gwaslab/util_in_convert_h2.py +8 -8
gwaslab/util_in_fill_data.py +30 -30
gwaslab/util_in_filter_value.py +201 -74
gwaslab/util_in_get_density.py +10 -10
gwaslab/util_in_get_sig.py +445 -71
gwaslab/viz_aux_annotate_plot.py +12 -12
gwaslab/viz_aux_quickfix.py +42 -37
gwaslab/viz_aux_reposition_text.py +10 -7
gwaslab/viz_aux_save_figure.py +18 -8
gwaslab/viz_plot_compare_af.py +32 -33
gwaslab/viz_plot_compare_effect.py +63 -71
gwaslab/viz_plot_miamiplot2.py +34 -26
gwaslab/viz_plot_mqqplot.py +126 -75
gwaslab/viz_plot_qqplot.py +11 -8
gwaslab/viz_plot_regionalplot.py +36 -33
gwaslab/viz_plot_rg_heatmap.py +28 -26
gwaslab/viz_plot_stackedregional.py +40 -21
gwaslab/viz_plot_trumpetplot.py +65 -61
gwaslab-3.4.39.dist-info/LICENSE +674 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/METADATA +5 -4
gwaslab-3.4.39.dist-info/RECORD +80 -0
gwaslab-3.4.37.dist-info/RECORD +0 -72
/gwaslab-3.4.37.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0

gwaslab/g_Log.py CHANGED Viewed

@@ -1,17 +1,34 @@
 import time
 class Log():
     def __init__(self):
-        self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
+        self.log_text=str(time.strftime('%Y/%m/%d %H:%M:%S'))+ " " + "Sumstats Object created."+ "\n"
     def write(self,*message,end="\n",show_time=True, verbose=True):
         if show_time is True:
-            if verbose: print(str(time.ctime(time.time())),*message,end=end)
-            self.log_text = self.log_text + str(time.ctime(time.time())) + " " + " ".join(map(str,message)) + end
+            if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
+            self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
         else:
             if verbose: print(*message,end=end)
             self.log_text = self.log_text + " ".join(map(str,message)) + end
+    def warning(self,*message,end="\n",show_time=True, verbose=True):
+        self.write(" #WARNING! {}".format(" ".join(map(str,message))),
+                   end=end,
+                   show_time=show_time,
+                   verbose=verbose)
     def show(self):
         print(self.log_text)
     def save(self,path,verbose=True):
         with open(path,"w") as f:
-            if verbose: print(str(time.ctime(time.time())) + " " + " -Save log file to : ", path)
-            f.write(self.log_text)
+            if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " -Save log file to : ", path)
+            f.write(self.log_text)
+    def log(self,*message,end="\n",show_time=True, verbose=True):
+        if show_time is True:
+            if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
+            self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
+        else:
+            if verbose: print(*message,end=end)
+            self.log_text = self.log_text + " ".join(map(str,message)) + end

gwaslab/g_Sumstats.py CHANGED Viewed

@@ -5,7 +5,7 @@ import copy
 from gwaslab.g_Sumstats_summary import summarize
 from gwaslab.g_Sumstats_summary import lookupstatus
 from gwaslab.io_preformat_input import preformat
-from gwaslab.io_to_formats import tofmt
+from gwaslab.io_to_formats import _to_format
 from gwaslab.g_Log import Log
 from gwaslab.qc_fix_sumstats import fixID
 from gwaslab.qc_fix_sumstats import removedup
@@ -32,9 +32,14 @@ from gwaslab.util_in_filter_value import filterout
 from gwaslab.util_in_filter_value import filterin
 from gwaslab.util_in_filter_value import filterregionin
 from gwaslab.util_in_filter_value import filterregionout
+from gwaslab.util_in_filter_value import _filter_indel
+from gwaslab.util_in_filter_value import _filter_palindromic
+from gwaslab.util_in_filter_value import _filter_snp
 from gwaslab.util_in_filter_value import inferbuild
 from gwaslab.util_in_filter_value import sampling
 from gwaslab.util_in_filter_value import _get_flanking
+from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
+from gwaslab.util_in_filter_value import _get_flanking_by_id
 from gwaslab.util_in_calculate_gc import lambdaGC
 from gwaslab.util_in_convert_h2 import _get_per_snp_r2
 from gwaslab.util_in_get_sig import getsig
@@ -42,6 +47,8 @@ from gwaslab.util_in_get_density import getsignaldensity
 from gwaslab.util_in_get_density import assigndensity
 from gwaslab.util_in_get_sig import annogene
 from gwaslab.util_in_get_sig import getnovel
+from gwaslab.util_in_get_sig import _check_cis
+from gwaslab.util_in_get_sig import _check_novel_set
 from gwaslab.util_in_fill_data import filldata
 from gwaslab.bd_get_hapmap3 import gethapmap3
 from gwaslab.bd_common_data import get_chr_list
@@ -62,6 +69,9 @@ from gwaslab.viz_plot_trumpetplot import plottrumpet
 from gwaslab.viz_plot_compare_af import plotdaf
 from gwaslab.util_ex_run_susie import _run_susie_rss
 from gwaslab.qc_fix_sumstats import _check_data_consistency
+from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
+from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
+from gwaslab.bd_get_hapmap3 import gethapmap3
 import gc
 #20220309
@@ -119,7 +129,8 @@ class Sumstats():
         # basic attributes
         self.data = pd.DataFrame()
         self.log = Log()
+        self.ldsc_h2 = None
+        self.ldsc_rg = None
         # meta information
         self.meta = _init_meta()
         self.build = build
@@ -133,7 +144,7 @@ class Sumstats():
         self.pipcs = pd.DataFrame()
         # print gwaslab version information
-        if verbose: _show_version(self.log)
+        _show_version(self.log, verbose=verbose)
         #preformat the data
         self.data  = preformat(
@@ -403,19 +414,16 @@ class Sumstats():
         _check_data_consistency(self.data,log=self.log,**args)
     def check_id(self,**args):
         pass
     def check_ref(self,ref_seq,**args):
         self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
         self.data = checkref(self.data,ref_seq,log=self.log,**args)
     def infer_strand(self,ref_infer,**args):
         self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
         self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
     def flip_allele_stats(self,**args):
         self.data = flipallelestats(self.data,log=self.log,**args)
     def normalize_allele(self,**args):
         self.data = parallelnormalizeallele(self.data,log=self.log,**args)
     def assign_rsid(self,
                     ref_rsid_tsv=None,
                     ref_rsid_vcf=None,
@@ -426,14 +434,11 @@ class Sumstats():
         if ref_rsid_vcf is not None:
             self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
             self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
     def rsid_to_chrpos(self,**args):
         self.data = rsidtochrpos(self.data,log=self.log,**args)
     def rsid_to_chrpos2(self,**args):
         self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
     ############################################################################################################
     def sort_coordinate(self,**sort_args):
@@ -449,14 +454,27 @@ class Sumstats():
 # utilities ############################################################################################################
     # filter series ######################################################################
-    def get_flanking(self, inplace=False,**args):
+    def filter_flanking(self, inplace=False,**args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
             new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
             return new_Sumstats_object
         else:
             self.data = _get_flanking(self.data, **args)
+    def filter_flanking_by_chrpos(self, chrpos,  inplace=False,**args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
+            return new_Sumstats_object
+        else:
+            self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
+    def filter_flanking_by_id(self, snpid, inplace=False,**args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
+            return new_Sumstats_object
+        else:
+            self.data = _get_flanking_by_id(self.data, snpid, **args)
     def filter_value(self, expr, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -464,7 +482,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = filtervalues(self.data, expr,log=self.log,**args)
     def filter_out(self, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -472,7 +489,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = filterout(self.data,log=self.log,**args)
     def filter_in(self, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -494,7 +510,28 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = filterregionout(self.data,log=self.log,**args)
+    def filter_palindromic(self, inplace=False, **args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _filter_palindromic(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
+            return new_Sumstats_object
+        else:
+            self.data = _filter_palindromic(self.data,log=self.log,**args)
+    def filter_snp(self, inplace=False, **args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _filter_snp(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
+            return new_Sumstats_object
+        else:
+            self.data = _filter_snp(self.data,log=self.log,**args)
+    def filter_indel(self, inplace=False, **args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _filter_indel(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
+            return new_Sumstats_object
+        else:
+            self.data = _filter_indel(self.data,log=self.log,**args)
     def random_variants(self,inplace=False,n=1,p=None,**args):
         if inplace is True:
             self.data = sampling(self.data,n=n,p=p,log=self.log,**args)
@@ -502,18 +539,25 @@ class Sumstats():
             new_Sumstats_object = copy.deepcopy(self)
             new_Sumstats_object.data = sampling(new_Sumstats_object.data,n=n,p=p,log=new_Sumstats_object.log,**args)
             return new_Sumstats_object
+    def filter_hapmap3(self, inplace=False, build=None, **args ):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        if inplace is True:
+            self.data = gethapmap3(self.data, build=build,log=self.log, **args)
+        else:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = gethapmap3(new_Sumstats_object.data, build=build,log=self.log, **args)
+            return new_Sumstats_object
     ######################################################################
     def check_af(self,ref_infer,**args):
         self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
         self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
     def infer_af(self,ref_infer,**args):
         self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
         self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
         self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
     def plot_daf(self, **args):
         fig,outliers = plotdaf(self.data, **args)
         return fig, outliers
@@ -548,8 +592,10 @@ class Sumstats():
         return plot
-    def plot_trumpet(self, **args):
-        fig = plottrumpet(self.data, **args)
+    def plot_trumpet(self, build=None, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        fig = plottrumpet(self.data,build = build,  **args)
         return fig
     def get_lead(self, build=None, gls=False, **args):
@@ -617,7 +663,37 @@ class Sumstats():
                            **args)
         # return sumstats object
         return output
+    def check_cis(self, **args):
+        if "SNPID" in self.data.columns:
+            id_to_use = "SNPID"
+        else:
+            id_to_use = "rsID"
+        output = _check_cis(self.data,
+                           id=id_to_use,
+                           chrom="CHR",
+                           pos="POS",
+                           p="P",
+                           log=self.log,
+                           **args)
+        # return sumstats object
+        return output
+    def check_novel_set(self, **args):
+        if "SNPID" in self.data.columns:
+            id_to_use = "SNPID"
+        else:
+            id_to_use = "rsID"
+        output = _check_novel_set(self.data,
+                           id=id_to_use,
+                           chrom="CHR",
+                           pos="POS",
+                           p="P",
+                           log=self.log,
+                           **args)
+        # return sumstats object
+        return output
     def anno_gene(self, **args):
         if "SNPID" in self.data.columns:
             id_to_use = "SNPID"
@@ -653,6 +729,18 @@ class Sumstats():
             output = lambdaGC(self.data[["CHR",mode]],mode=mode,**args)
             self.meta["Genomic inflation factor"] = output
             return output
+    def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
+        self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
+    def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
+        self.ldsc_rg = _estimate_rg_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
 # external ################################################################################################
     def to_finemapping(self,**args):
@@ -670,148 +758,7 @@ class Sumstats():
 # to_format ###############################################################################################
-    def to_format(self,
-              path="./sumstats",
-              fmt="gwaslab",
-              extract=None,
-              exclude=None,
-              cols=None,
-              id_use="rsID",
-              hapmap3=False,
-              exclude_hla=False,
-              hla_range=(25,34),
-              build=None,
-              n=None,
-              verbose=True,
-              no_status=False,
-              output_log=True,
-              to_csvargs=None,
-              float_formats=None,
-              xymt_number=False,
-              xymt=None,
-              chr_prefix="",
-              ssfmeta=False,
-              md5sum=False,
-              bgzip=False,
-              tabix=False,
-              tabix_indexargs={}):
+    def to_format(self, path, build=None, **args):
         if build is None:
             build = self.meta["gwaslab"]["genome_build"]
-        onetime_log = copy.deepcopy(self.log)
-        if  to_csvargs is None:
-            to_csvargs = {}
-        if  float_formats is None:
-            float_formats={}
-        if cols is None:
-            cols=[]
-        if xymt is None:
-            xymt = ["X","Y","MT"]
-        formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
-        if fmt in formatlist:
-            if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
-        else:
-            raise ValueError("Please select a format to output")
-        #######################################################################################################
-        # filter
-        output = self.data.copy()
-        if extract is not None:
-            output = output.loc[output[id_use].isin(extract),:]
-        if exclude is not None:
-            output = output.loc[~output[id_use].isin(exclude),:]
-        #hla and hapmap3 #######################################################################################
-        suffix=fmt
-        #exclude hla
-        if exclude_hla is True:
-            if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
-            before = len(output)
-            is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
-            output = output.loc[~is_hla,:]
-            after = len(output)
-            if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
-            suffix = "noMHC."+suffix
-        #extract hapmap3 SNPs
-        if hapmap3 is True:
-            output = gethapmap3(output,build=build,verbose=True)
-            after = len(output)
-            if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
-            suffix = "hapmap3."+suffix
-        # add a n column
-        if n is not None:
-            output["N"] = n
-        #######################################################################################################
-        #formatting float statistics
-        if verbose: onetime_log.write(" -Formatting statistics ...")
-        formats = {'EAF': '{:.4g}',
-                'BETA': '{:.4f}',
-                'Z': '{:.4f}',
-                'CHISQ': '{:.4f}',
-                'SE': '{:.4f}',
-                'OR': '{:.4f}',
-                'OR_95U': '{:.4f}',
-                'OR_95L': '{:.4f}',
-                'INFO': '{:.4f}',
-                'P': '{:.4e}',
-                'MLOG10P': '{:.4f}',
-                'DAF': '{:.4f}'
-                  }
-        for col, f in float_formats.items():
-            if col in output.columns:
-                formats[col]=f
-        for col, f in formats.items():
-            if col in output.columns:
-                if output[col].dtype in ["float64","float32","float16","float"]:
-                    output[col] = output[col].map(f.format)
-        if verbose:
-            onetime_log.write(" - Float statistics formats:")
-            keys=[]
-            values=[]
-            for key,value in formats.items():
-                if key in output.columns:
-                    keys.append(key)
-                    values.append(value)
-            onetime_log.write("  - Columns:",keys)
-            onetime_log.write("  - Output formats:",values)
-        ##########################################################################################################
-        # output, mapping column names
-        if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
-            tofmt(output,
-                  path=path,
-                  fmt=fmt,
-                  cols=cols,
-                  suffix=suffix,
-                  build=build,
-                  verbose=True,
-                  no_status=no_status,
-                  log=onetime_log,
-                  to_csvargs=to_csvargs,
-                  chr_prefix=chr_prefix,
-                  meta = self.meta,
-                  ssfmeta=ssfmeta,
-                  bgzip=bgzip,
-                  tabix=tabix,
-                  tabix_indexargs=tabix_indexargs,
-                  md5sum=md5sum,
-                  xymt_number=xymt_number,
-                  xymt=xymt)
-        if output_log is True:
-            log_path = path + "."+ suffix + ".log"
-            if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
-            if verbose: onetime_log.write("Finished outputting successfully!")
-            try:
-                onetime_log.save(log_path, verbose=False)
-            except:
-                pass
+        _to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)

gwaslab/g_SumstatsPair.py CHANGED Viewed

@@ -6,33 +6,40 @@ from gwaslab.util_in_filter_value import filtervalues
 from gwaslab.g_Log import Log
 from math import floor
 from gwaslab.g_Sumstats import Sumstats
-from gwaslab.hm_casting import _merge_mold_with_sumstats
+from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
 from gwaslab.hm_casting import _align_with_mold
 from gwaslab.hm_casting import _fill_missing_columns
 from gwaslab.hm_casting import _check_daf
 from gwaslab.hm_casting import _assign_warning_code
 from gwaslab.qc_fix_sumstats import flipallelestats
+from gwaslab.qc_check_datatype import check_datatype
+from gwaslab.qc_check_datatype import check_dataframe_shape
 from gwaslab.hm_casting import _renaming_cols
 from gwaslab.hm_casting import _sort_pair_cols
 from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
 from gwaslab.util_ex_run_coloc import _run_coloc_susie
 from gwaslab.viz_plot_miamiplot2 import plot_miami2
+from gwaslab.viz_plot_compare_af import  plotdaf
 from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
 from gwaslab.util_ex_run_clumping import _clump
 from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
 class SumstatsPair( ):
-    def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ):
+    def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
         if not isinstance(sumstatsObject1, Sumstats):
             raise ValueError("Please provide GWASLab Sumstats Object #1.")
         if not isinstance(sumstatsObject2, Sumstats):
             raise ValueError("Please provide GWASLab Sumstats Object #2.")
-        self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
+        if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
+            self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
+        else:
+            self.study_name = "{}_{}".format("STUDY1", "STUDY2")
         self.snp_info_cols = []
         self.stats_cols =[]
-        self.other_cols=[]
+        self.stats_cols2 =[]
+        self.other_cols =[]
+        self.other_cols2 =[]
         self.log = Log()
         self.suffixes = suffixes
         self.colocalization=pd.DataFrame()
@@ -41,26 +48,53 @@ class SumstatsPair( ):
         self.mr = {}
         self.clumps ={}
         self.ns = None
+        self.to_finemapping_file_path = ""
+        self.plink_log = ""
+        self.log.write( "Start to create SumstatsPair object..." )
+        self.log.write( " -Checking sumstats 1..." , verbose=verbose)
+        check_datatype(sumstatsObject1.data, log=self.log, verbose=verbose)
+        check_dataframe_shape(sumstats=sumstatsObject1.data,
+                        log=self.log,
+                        verbose=verbose)
+        self.log.write( " -Checking sumstats 2..." , verbose=verbose)
+        check_datatype(sumstatsObject2.data, log=self.log, verbose=verbose)
+        check_dataframe_shape(sumstats=sumstatsObject2.data,
+                                log=self.log,
+                                verbose=verbose)
         for i in sumstatsObject1.data.columns:
             if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
                 self.snp_info_cols.append(i)
-            elif i in ["BETA","SE","P","MLOG10P","N","Z","OR","OR95L","OR95U","MAF","EAF"]:
+            elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
                 self.stats_cols.append(i)
             else:
                 self.other_cols.append(i)
-        self.data = sumstatsObject1.data.loc[:,self.snp_info_cols + self.stats_cols]
+        for i in sumstatsObject2.data.columns:
+            if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
+                continue
+            elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
+                self.stats_cols2.append(i)
+            else:
+                self.other_cols2.append(i)
+        self.log.write( " -Variant Info columns: {}".format(self.snp_info_cols) , verbose=verbose)
+        self.log.write( " -Variant statistics columns: {}".format(self.stats_cols) , verbose=verbose)
+        self.log.write( " -Sumstats1 other columns: {}".format(self.other_cols) , verbose=verbose)
+        self.log.write( " -Sumstats2 other columns: {}".format(self.other_cols2) , verbose=verbose)
+        # extract only info and stats cols
+        self.data = sumstatsObject1.data
+        #rename with _1
         self.data = self.data.rename(columns={"EA":"EA_1","NEA":"NEA_1"})
         self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.stats_cols})
+        self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.other_cols})
         self.data, self.sumstats1 = self._merge_two_sumstats(sumstatsObject2, suffixes=suffixes)
-        self.to_finemapping_file_path = ""
-        self.plink_log = ""
         if "N{}".format(self.suffixes[0]) in self.data.columns and "N{}".format(self.suffixes[1]) in self.data.columns:
             n1 = int(floor(self.data["N{}".format(self.suffixes[0])].mean()))
             n2 = int(floor(self.data["N{}".format(self.suffixes[1])].mean()))
@@ -70,8 +104,9 @@ class SumstatsPair( ):
     def _merge_two_sumstats(self, sumstatsObject2, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None,suffixes=("_1","_2")):
-        molded_sumstats, sumstats1 = _merge_mold_with_sumstats(self.data,
-                                                    sumstatsObject2.data,
+        # sumstats1 with suffix _1, sumstats2 with no suffix
+        molded_sumstats, sumstats1 = _merge_mold_with_sumstats_by_chrpos(mold=self.data,
+                                                    sumstats=sumstatsObject2.data,
                                                     log=self.log,
                                                     verbose=verbose,
                                                     suffixes=(suffixes[0],""),
@@ -79,16 +114,21 @@ class SumstatsPair( ):
         molded_sumstats = _align_with_mold(molded_sumstats, log=self.log, verbose=verbose,suffixes=(suffixes[0],""))
+        # flip sumstats2 statistics
         molded_sumstats = flipallelestats(molded_sumstats, log=self.log, verbose=verbose)
+        # drop sumstats2 EA NEA
         molded_sumstats = molded_sumstats.drop(columns=["EA","NEA"])
+        # rename sumstats1 EA NEA
         molded_sumstats = molded_sumstats.rename(columns={"EA_1":"EA","NEA_1":"NEA"})
-        if not len(set(self.stats_cols) & set (sumstatsObject2.data.columns)) == len(self.stats_cols):
-            cols_to_fill = set(self.stats_cols).difference(set(sumstatsObject2.data.columns))
+        if not set(self.stats_cols2) == set(self.stats_cols):
+            cols_to_fill = set(self.stats_cols).difference(set(self.stats_cols2))
             molded_sumstats = _fill_missing_columns(molded_sumstats, cols_to_fill, log=self.log, verbose=verbose)
-        molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols, log=self.log, verbose=verbose, suffixes=suffixes)
+        # rename sumstast2 with _2
+        molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols + self.other_cols2, log=self.log, verbose=verbose, suffixes=suffixes)
         molded_sumstats = _sort_pair_cols(molded_sumstats, verbose=verbose, log=self.log)
@@ -104,13 +144,7 @@ class SumstatsPair( ):
     def run_coloc_susie(self,**args):
         self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
-    def plot_miami(self,**args):
-        plot_miami2(merged_sumstats=self.data,
-                    suffixes=self.suffixes,
-                    **args)
     def run_two_sample_mr(self, clump=False, **args):
         exposure1 = self.study_name.split("_")[0]
         outcome2 = self.study_name.split("_")[1]
@@ -126,4 +160,21 @@ class SumstatsPair( ):
             return new_Sumstats_object
         else:
             self.data = filtervalues(self.data, expr,log=self.log,**args)
-        gc.collect()
+        gc.collect()
+    ## Visualization #############################################################################################################################################
+    def plot_miami(self,**args):
+        plot_miami2(merged_sumstats=self.data,
+                    suffixes=self.suffixes,
+                    **args)
+    def compare_af(self, **args):
+        return plotdaf( self.data,
+                     eaf="EAF_2",
+                     raf="EAF_1",
+                     xlabel="Effect Allele Frequency in Sumstats 1",
+                     ylabel="Effect Allele Frequency in Sumstats 2",
+                     **args)

gwaslab/g_SumstatsT.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from gwaslab.g_Sumstats import Sumstats
-from gwaslab.hm_casting import _merge_mold_with_sumstats
+from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
 from gwaslab.hm_casting import _align_with_mold
 from gwaslab.hm_casting import _fill_missing_columns
 from gwaslab.hm_casting import _check_daf
@@ -34,7 +34,7 @@ class SumstatsT( ):
     def cast(self, sumstatsObject, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None):
-        molded_sumstats = _merge_mold_with_sumstats(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
+        molded_sumstats = _merge_mold_with_sumstats_by_chrpos(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
         molded_sumstats = _align_with_mold(molded_sumstats, log=sumstatsObject.log, verbose=verbose)

gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.37__py3-none-any.whl → 3.4.39__py3-none-any.whl