PyPI - gwaslab - Versions diffs - 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl - Mend

gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (51) hide show

gwaslab/bd_common_data.py +6 -3
gwaslab/bd_download.py +9 -9
gwaslab/bd_get_hapmap3.py +43 -9
gwaslab/g_Log.py +14 -5
gwaslab/g_Sumstats.py +86 -18
gwaslab/g_SumstatsPair.py +70 -23
gwaslab/g_SumstatsT.py +2 -2
gwaslab/g_version.py +10 -10
gwaslab/hm_casting.py +9 -4
gwaslab/hm_harmonize_sumstats.py +88 -83
gwaslab/io_preformat_input.py +14 -14
gwaslab/io_read_ldsc.py +49 -1
gwaslab/ldsc_irwls.py +198 -0
gwaslab/ldsc_jackknife.py +514 -0
gwaslab/ldsc_ldscore.py +417 -0
gwaslab/ldsc_parse.py +294 -0
gwaslab/ldsc_regressions.py +747 -0
gwaslab/ldsc_sumstats.py +629 -0
gwaslab/qc_check_datatype.py +1 -1
gwaslab/qc_fix_sumstats.py +163 -161
gwaslab/util_ex_calculate_ldmatrix.py +2 -2
gwaslab/util_ex_gwascatalog.py +24 -24
gwaslab/util_ex_ldproxyfinder.py +9 -9
gwaslab/util_ex_ldsc.py +189 -0
gwaslab/util_in_calculate_gc.py +6 -6
gwaslab/util_in_calculate_power.py +42 -43
gwaslab/util_in_convert_h2.py +8 -8
gwaslab/util_in_fill_data.py +28 -28
gwaslab/util_in_filter_value.py +91 -52
gwaslab/util_in_get_density.py +8 -8
gwaslab/util_in_get_sig.py +407 -65
gwaslab/viz_aux_annotate_plot.py +12 -12
gwaslab/viz_aux_quickfix.py +18 -18
gwaslab/viz_aux_reposition_text.py +3 -3
gwaslab/viz_aux_save_figure.py +14 -5
gwaslab/viz_plot_compare_af.py +29 -30
gwaslab/viz_plot_compare_effect.py +63 -71
gwaslab/viz_plot_miamiplot2.py +6 -6
gwaslab/viz_plot_mqqplot.py +17 -3
gwaslab/viz_plot_qqplot.py +1 -1
gwaslab/viz_plot_regionalplot.py +33 -32
gwaslab/viz_plot_rg_heatmap.py +28 -26
gwaslab/viz_plot_stackedregional.py +40 -21
gwaslab/viz_plot_trumpetplot.py +50 -55
gwaslab-3.4.39.dist-info/LICENSE +674 -0
{gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/METADATA +4 -3
gwaslab-3.4.39.dist-info/RECORD +80 -0
gwaslab-3.4.38.dist-info/RECORD +0 -72
/gwaslab-3.4.38.dist-info/LICENSE → /gwaslab-3.4.39.dist-info/LICENSE_before_v3.4.39 +0 -0
{gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/WHEEL +0 -0
{gwaslab-3.4.38.dist-info → gwaslab-3.4.39.dist-info}/top_level.txt +0 -0

gwaslab/bd_common_data.py CHANGED Viewed

@@ -280,17 +280,20 @@ def gtf_to_protein_coding(gtfpath,log=Log(),verbose=True):
     protein_coding_path = gtfpath[:-6]+"protein_coding.gtf.gz"
     # if not existing, extract protein coding records and output to a new file
     if not path.isfile(protein_coding_path):
         # get gene list
-        if verbose: log.write(" - Extracting protein_coding genes from {}".format(gtfpath))
+        log.write(" - Extracting protein_coding genes from {}".format(gtfpath),verbose=verbose)
         gtf = read_gtf(gtfpath,usecols=["feature","gene_biotype","gene_id","gene_name"])
         gene_list = gtf.loc[(gtf["feature"]=="gene") & (gtf["gene_biotype"]=="protein_coding"),"gene_id"].values
-        if verbose: log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)))
+        log.write(" - Loaded {} protein_coding genes.".format(len(gene_list)),verbose=verbose)
         # extract entry using csv
         gtf_raw = pd.read_csv(gtfpath,sep="\t",header=None,comment="#",dtype="string")
         gtf_raw["_gene_id"] = gtf_raw[8].str.extract(r'gene_id "([\w\.-]+)"')
         gtf_raw = gtf_raw.loc[ gtf_raw["_gene_id"].isin(gene_list) ,:]
         gtf_raw = gtf_raw.drop("_gene_id",axis=1)
-        if verbose: log.write(" - Extracted records are saved to : {} ".format(protein_coding_path))
+        log.write(" - Extracted records are saved to : {} ".format(protein_coding_path),verbose=verbose)
         gtf_raw.to_csv(protein_coding_path, header=None, index=None, sep="\t")
     return protein_coding_path

gwaslab/bd_download.py CHANGED Viewed

@@ -106,7 +106,7 @@ def check_available_ref(log=Log(),verbose=True):
     Check available reference files for gwaslab.
     Return a dictionary of available reference files.
     '''
-    if verbose : log.write("Start to check available reference files...")
+    log.write("Start to check available reference files...", verbose=verbose)
     #ref_path = path.dirname(__file__) + '/data/reference.json'
     ref_path = options.paths["reference"]
     if not path.exists(ref_path):
@@ -115,11 +115,11 @@ def check_available_ref(log=Log(),verbose=True):
     dicts = json.load(open(ref_path))
     if dicts is not None:
         for key,value in dicts.items():
-            if verbose :log.write(" -",key," : ",value)
+            log.write(" -",key," : ",value, verbose=verbose)
         return dicts
     else:
-        if verbose :log.write(" -No available reference files.")
-    if verbose :log.write("Finished checking available reference files...")
+        log.write(" -No available reference files.", verbose=verbose)
+    log.write("Finished checking available reference files...", verbose=verbose)
     return {}
 def update_available_ref(log=Log()):
@@ -167,8 +167,8 @@ def get_path(name,log=Log(),verbose=True):
     #config_path =  path.dirname(__file__) + '/data/config.json'
     config_path = options.paths["config"]
     if not path.exists(config_path):
-        if verbose : log.write("Config file not exists...")
-        if verbose : log.write("Created new config file...")
+        log.write("Config file not exists...", verbose=verbose)
+        log.write("Created new config file...", verbose=verbose)
         initiate_config()
     else:
         try:
@@ -176,9 +176,9 @@ def get_path(name,log=Log(),verbose=True):
             if path.exists(dicts[name]):
                 return dicts[name]
             else:
-                if verbose : log.write("File not exist.")
+                log.write("File not exist.", verbose=verbose)
         except:
-            if verbose : log.write("No records in config file. Please download first.")
+            log.write("No records in config file. Please download first.", verbose=verbose)
     return False
 ##################################################################################
@@ -277,7 +277,7 @@ def check_file_integrity(local_path, md5sum,log):
         log.write(" -MD5 verified.")
         return 1
     else:
-        log.write("WARNING: -MD5 VERIFICATION FAILED !")
+        log.warning("-MD5 VERIFICATION FAILED!")
         return 0
 def remove_file(name,log=Log()):

gwaslab/bd_get_hapmap3.py CHANGED Viewed

@@ -1,6 +1,10 @@
 import pandas as pd
 from os import path
 from gwaslab.g_Log import Log
+from gwaslab.qc_fix_sumstats import start_to
+from gwaslab.qc_fix_sumstats import skipped
+from gwaslab.qc_fix_sumstats import finished
 #A unique identifier (e.g., the rs number)
 #Allele 1 (effect allele)
 #Allele 2 (non-effect allele)
@@ -8,30 +12,60 @@ from gwaslab.g_Log import Log
 #A P-value
 #A signed summary statistic (beta, OR, log odds, Z-score, etc)
-def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True,log=Log()):
-    if verbose:log.write(" -Processing "+str(len(sumstats))+" raw variants...")
+def gethapmap3(sumstats,rsid="rsID",chrom="CHR", pos="POS", ea="EA", nea="NEA",build="19", verbose=True, match_allele= True, log=Log()):
+    ##start function with col checking##########################################################
+    _start_line = "extract HapMap3 SNPs"
+    _end_line = "extracting HapMap3 SNPs"
+    _start_cols =[]
+    _start_function = ".gethapmap3"
+    _must_args ={}
+    is_enough_info = start_to(sumstats=sumstats,
+                            log=log,
+                            verbose=verbose,
+                            start_line=_start_line,
+                            end_line=_end_line,
+                            start_cols=_start_cols,
+                            start_function=_start_function,
+                            **_must_args)
+    if is_enough_info == False: return None
+    ############################################################################################
     if build=="19":
         data_path =  path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db150_hg19.snplist.gz'
     elif build=="38":
         data_path =  path.dirname(__file__) + '/data/hapmap3_SNPs/hapmap3_db151_hg38.snplist.gz'
-    if verbose:log.write(" -Loading Hapmap3 variants data...")
-    hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"],dtype={"#CHROM":"string","POS":"string"})
+    log.write(" -Loading Hapmap3 variants from built-in datasets...", verbose=verbose)
+    if match_allele:
+        additional_cols= ["A1","A2"]
+    else:
+        additional_cols=[]
+    hapmap3_ref = pd.read_csv(data_path,sep="\s+",usecols=["#CHROM","POS","rsid"]+additional_cols, dtype={"#CHROM":"string","POS":"string"})
     #rsid    A1      A2      #CHROM  POS
     #rs3094315       G       A       1       752566
     if rsid in sumstats.columns:
         output = sumstats.loc[sumstats[rsid].isin(hapmap3_ref["rsid"].values),:].copy()
         return output
     elif chrom in sumstats.columns and pos in sumstats.columns:
-        if verbose: log.write(" -Since rsID not in sumstats, chr:pos( build "+build+") will be used for matching...")
+        log.write(" -Since rsID not in sumstats, CHR:POS( build "+build+") will be used for matching...", verbose=verbose)
         sumstats   ["chr:pos"] = sumstats[chrom].astype("string")+":"+sumstats[pos].astype("string")
         hapmap3_ref["chr:pos"] = hapmap3_ref["#CHROM"]+":"+hapmap3_ref["POS"]
         hapmap3_ref = hapmap3_ref.rename(columns={"rsid":"rsID"})
-        output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
-        output = output.drop(columns="chr:pos")
-        if verbose: log.write(" -Raw input contains "+str(len(output))+" hapmaps variants based on chr:pos...")
+        output = pd.merge(sumstats,hapmap3_ref.loc[:,["chr:pos","rsID"]+additional_cols],left_on="chr:pos",right_on="chr:pos",how="inner",suffixes=('', '_hapmap3')).copy()
+        if match_allele:
+            log.write(" -Checking if alleles are same...")
+            is_matched = ((output[ea].astype("string") == output["A1"]) & (output[nea].astype("string") == output["A2"])) \
+                            | ((output[ea].astype("string") == output["A2"]) & (output[nea].astype("string") == output["A1"]))
+            log.write(" -Variants with macthed alleles: {}".format(sum(is_matched)))
+            output = output.loc[is_matched,:]
+        output = output.drop(columns=["chr:pos"]+additional_cols)
+        log.write(" -Raw input contains "+str(len(output))+" Hapmap3 variants based on CHR:POS...", verbose=verbose)
+        finished(log=log,verbose=verbose,end_line=_end_line)
         return output
     else:
         raise ValueError("Not enough information to match SNPs. Please check your sumstats...")

gwaslab/g_Log.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import time
 class Log():
     def __init__(self):
-        self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
+        self.log_text=str(time.strftime('%Y/%m/%d %H:%M:%S'))+ " " + "Sumstats Object created."+ "\n"
     def write(self,*message,end="\n",show_time=True, verbose=True):
         if show_time is True:
-            if verbose: print(str(time.ctime(time.time())),*message,end=end)
-            self.log_text = self.log_text + str(time.ctime(time.time())) + " " + " ".join(map(str,message)) + end
+            if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
+            self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
         else:
             if verbose: print(*message,end=end)
             self.log_text = self.log_text + " ".join(map(str,message)) + end
@@ -21,5 +21,14 @@ class Log():
         print(self.log_text)
     def save(self,path,verbose=True):
         with open(path,"w") as f:
-            if verbose: print(str(time.ctime(time.time())) + " " + " -Save log file to : ", path)
-            f.write(self.log_text)
+            if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " -Save log file to : ", path)
+            f.write(self.log_text)
+    def log(self,*message,end="\n",show_time=True, verbose=True):
+        if show_time is True:
+            if verbose: print(str(time.strftime('%Y/%m/%d %H:%M:%S')),*message,end=end)
+            self.log_text = self.log_text + str(time.strftime('%Y/%m/%d %H:%M:%S')) + " " + " ".join(map(str,message)) + end
+        else:
+            if verbose: print(*message,end=end)
+            self.log_text = self.log_text + " ".join(map(str,message)) + end

gwaslab/g_Sumstats.py CHANGED Viewed

@@ -32,6 +32,9 @@ from gwaslab.util_in_filter_value import filterout
 from gwaslab.util_in_filter_value import filterin
 from gwaslab.util_in_filter_value import filterregionin
 from gwaslab.util_in_filter_value import filterregionout
+from gwaslab.util_in_filter_value import _filter_indel
+from gwaslab.util_in_filter_value import _filter_palindromic
+from gwaslab.util_in_filter_value import _filter_snp
 from gwaslab.util_in_filter_value import inferbuild
 from gwaslab.util_in_filter_value import sampling
 from gwaslab.util_in_filter_value import _get_flanking
@@ -44,6 +47,8 @@ from gwaslab.util_in_get_density import getsignaldensity
 from gwaslab.util_in_get_density import assigndensity
 from gwaslab.util_in_get_sig import annogene
 from gwaslab.util_in_get_sig import getnovel
+from gwaslab.util_in_get_sig import _check_cis
+from gwaslab.util_in_get_sig import _check_novel_set
 from gwaslab.util_in_fill_data import filldata
 from gwaslab.bd_get_hapmap3 import gethapmap3
 from gwaslab.bd_common_data import get_chr_list
@@ -64,6 +69,9 @@ from gwaslab.viz_plot_trumpetplot import plottrumpet
 from gwaslab.viz_plot_compare_af import plotdaf
 from gwaslab.util_ex_run_susie import _run_susie_rss
 from gwaslab.qc_fix_sumstats import _check_data_consistency
+from gwaslab.util_ex_ldsc import _estimate_h2_by_ldsc
+from gwaslab.util_ex_ldsc import _estimate_rg_by_ldsc
+from gwaslab.bd_get_hapmap3 import gethapmap3
 import gc
 #20220309
@@ -121,7 +129,8 @@ class Sumstats():
         # basic attributes
         self.data = pd.DataFrame()
         self.log = Log()
+        self.ldsc_h2 = None
+        self.ldsc_rg = None
         # meta information
         self.meta = _init_meta()
         self.build = build
@@ -135,7 +144,7 @@ class Sumstats():
         self.pipcs = pd.DataFrame()
         # print gwaslab version information
-        if verbose: _show_version(self.log)
+        _show_version(self.log, verbose=verbose)
         #preformat the data
         self.data  = preformat(
@@ -405,19 +414,16 @@ class Sumstats():
         _check_data_consistency(self.data,log=self.log,**args)
     def check_id(self,**args):
         pass
     def check_ref(self,ref_seq,**args):
         self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
         self.data = checkref(self.data,ref_seq,log=self.log,**args)
     def infer_strand(self,ref_infer,**args):
         self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
         self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
     def flip_allele_stats(self,**args):
         self.data = flipallelestats(self.data,log=self.log,**args)
     def normalize_allele(self,**args):
         self.data = parallelnormalizeallele(self.data,log=self.log,**args)
     def assign_rsid(self,
                     ref_rsid_tsv=None,
                     ref_rsid_vcf=None,
@@ -428,14 +434,11 @@ class Sumstats():
         if ref_rsid_vcf is not None:
             self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
             self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
     def rsid_to_chrpos(self,**args):
         self.data = rsidtochrpos(self.data,log=self.log,**args)
     def rsid_to_chrpos2(self,**args):
         self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
     ############################################################################################################
     def sort_coordinate(self,**sort_args):
@@ -458,7 +461,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = _get_flanking(self.data, **args)
     def filter_flanking_by_chrpos(self, chrpos,  inplace=False,**args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -466,7 +468,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
     def filter_flanking_by_id(self, snpid, inplace=False,**args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -474,7 +475,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = _get_flanking_by_id(self.data, snpid, **args)
     def filter_value(self, expr, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -482,7 +482,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = filtervalues(self.data, expr,log=self.log,**args)
     def filter_out(self, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -490,7 +489,6 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = filterout(self.data,log=self.log,**args)
     def filter_in(self, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -512,7 +510,28 @@ class Sumstats():
             return new_Sumstats_object
         else:
             self.data = filterregionout(self.data,log=self.log,**args)
+    def filter_palindromic(self, inplace=False, **args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _filter_palindromic(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
+            return new_Sumstats_object
+        else:
+            self.data = _filter_palindromic(self.data,log=self.log,**args)
+    def filter_snp(self, inplace=False, **args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _filter_snp(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
+            return new_Sumstats_object
+        else:
+            self.data = _filter_snp(self.data,log=self.log,**args)
+    def filter_indel(self, inplace=False, **args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _filter_indel(new_Sumstats_object.data,log=new_Sumstats_object.log,**args)
+            return new_Sumstats_object
+        else:
+            self.data = _filter_indel(self.data,log=self.log,**args)
     def random_variants(self,inplace=False,n=1,p=None,**args):
         if inplace is True:
             self.data = sampling(self.data,n=n,p=p,log=self.log,**args)
@@ -520,18 +539,25 @@ class Sumstats():
             new_Sumstats_object = copy.deepcopy(self)
             new_Sumstats_object.data = sampling(new_Sumstats_object.data,n=n,p=p,log=new_Sumstats_object.log,**args)
             return new_Sumstats_object
+    def filter_hapmap3(self, inplace=False, build=None, **args ):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        if inplace is True:
+            self.data = gethapmap3(self.data, build=build,log=self.log, **args)
+        else:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = gethapmap3(new_Sumstats_object.data, build=build,log=self.log, **args)
+            return new_Sumstats_object
     ######################################################################
     def check_af(self,ref_infer,**args):
         self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
         self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
     def infer_af(self,ref_infer,**args):
         self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
         self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
         self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
     def plot_daf(self, **args):
         fig,outliers = plotdaf(self.data, **args)
         return fig, outliers
@@ -637,7 +663,37 @@ class Sumstats():
                            **args)
         # return sumstats object
         return output
+    def check_cis(self, **args):
+        if "SNPID" in self.data.columns:
+            id_to_use = "SNPID"
+        else:
+            id_to_use = "rsID"
+        output = _check_cis(self.data,
+                           id=id_to_use,
+                           chrom="CHR",
+                           pos="POS",
+                           p="P",
+                           log=self.log,
+                           **args)
+        # return sumstats object
+        return output
+    def check_novel_set(self, **args):
+        if "SNPID" in self.data.columns:
+            id_to_use = "SNPID"
+        else:
+            id_to_use = "rsID"
+        output = _check_novel_set(self.data,
+                           id=id_to_use,
+                           chrom="CHR",
+                           pos="POS",
+                           p="P",
+                           log=self.log,
+                           **args)
+        # return sumstats object
+        return output
     def anno_gene(self, **args):
         if "SNPID" in self.data.columns:
             id_to_use = "SNPID"
@@ -673,6 +729,18 @@ class Sumstats():
             output = lambdaGC(self.data[["CHR",mode]],mode=mode,**args)
             self.meta["Genomic inflation factor"] = output
             return output
+    def estimate_h2_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
+        self.ldsc_h2 = _estimate_h2_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
+    def estimate_rg_by_ldsc(self, build=None, verbose=True, match_allele=True, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        insumstats = gethapmap3(self.data.copy(), build=build, verbose=verbose , match_allele=True )
+        self.ldsc_rg = _estimate_rg_by_ldsc(insumstats=insumstats, log=self.log, verbose=verbose, **args)
 # external ################################################################################################
     def to_finemapping(self,**args):

gwaslab/g_SumstatsPair.py CHANGED Viewed

@@ -6,23 +6,26 @@ from gwaslab.util_in_filter_value import filtervalues
 from gwaslab.g_Log import Log
 from math import floor
 from gwaslab.g_Sumstats import Sumstats
-from gwaslab.hm_casting import _merge_mold_with_sumstats
+from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
 from gwaslab.hm_casting import _align_with_mold
 from gwaslab.hm_casting import _fill_missing_columns
 from gwaslab.hm_casting import _check_daf
 from gwaslab.hm_casting import _assign_warning_code
 from gwaslab.qc_fix_sumstats import flipallelestats
+from gwaslab.qc_check_datatype import check_datatype
+from gwaslab.qc_check_datatype import check_dataframe_shape
 from gwaslab.hm_casting import _renaming_cols
 from gwaslab.hm_casting import _sort_pair_cols
 from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
 from gwaslab.util_ex_run_coloc import _run_coloc_susie
 from gwaslab.viz_plot_miamiplot2 import plot_miami2
+from gwaslab.viz_plot_compare_af import  plotdaf
 from gwaslab.util_ex_run_2samplemr import _run_two_sample_mr
 from gwaslab.util_ex_run_clumping import _clump
 from gwaslab.util_ex_ldproxyfinder import _extract_with_ld_proxy
 class SumstatsPair( ):
-    def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ):
+    def __init__(self, sumstatsObject1, sumstatsObject2, study=None, suffixes = ("_1","_2") ,verbose=True ):
         if not isinstance(sumstatsObject1, Sumstats):
             raise ValueError("Please provide GWASLab Sumstats Object #1.")
@@ -34,7 +37,9 @@ class SumstatsPair( ):
             self.study_name = "{}_{}".format("STUDY1", "STUDY2")
         self.snp_info_cols = []
         self.stats_cols =[]
-        self.other_cols=[]
+        self.stats_cols2 =[]
+        self.other_cols =[]
+        self.other_cols2 =[]
         self.log = Log()
         self.suffixes = suffixes
         self.colocalization=pd.DataFrame()
@@ -43,28 +48,53 @@ class SumstatsPair( ):
         self.mr = {}
         self.clumps ={}
         self.ns = None
+        self.to_finemapping_file_path = ""
+        self.plink_log = ""
         self.log.write( "Start to create SumstatsPair object..." )
+        self.log.write( " -Checking sumstats 1..." , verbose=verbose)
+        check_datatype(sumstatsObject1.data, log=self.log, verbose=verbose)
+        check_dataframe_shape(sumstats=sumstatsObject1.data,
+                        log=self.log,
+                        verbose=verbose)
+        self.log.write( " -Checking sumstats 2..." , verbose=verbose)
+        check_datatype(sumstatsObject2.data, log=self.log, verbose=verbose)
+        check_dataframe_shape(sumstats=sumstatsObject2.data,
+                                log=self.log,
+                                verbose=verbose)
         for i in sumstatsObject1.data.columns:
             if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
                 self.snp_info_cols.append(i)
-            elif i in ["BETA","SE","P","MLOG10P","N","Z","OR","OR95L","OR95U","MAF","EAF"]:
+            elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
                 self.stats_cols.append(i)
             else:
                 self.other_cols.append(i)
-        self.data = sumstatsObject1.data.loc[:,self.snp_info_cols + self.stats_cols]
+        for i in sumstatsObject2.data.columns:
+            if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
+                continue
+            elif i in ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]:
+                self.stats_cols2.append(i)
+            else:
+                self.other_cols2.append(i)
+        self.log.write( " -Variant Info columns: {}".format(self.snp_info_cols) , verbose=verbose)
+        self.log.write( " -Variant statistics columns: {}".format(self.stats_cols) , verbose=verbose)
+        self.log.write( " -Sumstats1 other columns: {}".format(self.other_cols) , verbose=verbose)
+        self.log.write( " -Sumstats2 other columns: {}".format(self.other_cols2) , verbose=verbose)
+        # extract only info and stats cols
+        self.data = sumstatsObject1.data
+        #rename with _1
         self.data = self.data.rename(columns={"EA":"EA_1","NEA":"NEA_1"})
         self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.stats_cols})
+        self.data = self.data.rename(columns={i:i + suffixes[0] for i in self.other_cols})
         self.data, self.sumstats1 = self._merge_two_sumstats(sumstatsObject2, suffixes=suffixes)
-        self.to_finemapping_file_path = ""
-        self.plink_log = ""
         if "N{}".format(self.suffixes[0]) in self.data.columns and "N{}".format(self.suffixes[1]) in self.data.columns:
             n1 = int(floor(self.data["N{}".format(self.suffixes[0])].mean()))
             n2 = int(floor(self.data["N{}".format(self.suffixes[1])].mean()))
@@ -74,8 +104,9 @@ class SumstatsPair( ):
     def _merge_two_sumstats(self, sumstatsObject2, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None,suffixes=("_1","_2")):
-        molded_sumstats, sumstats1 = _merge_mold_with_sumstats(self.data,
-                                                    sumstatsObject2.data,
+        # sumstats1 with suffix _1, sumstats2 with no suffix
+        molded_sumstats, sumstats1 = _merge_mold_with_sumstats_by_chrpos(mold=self.data,
+                                                    sumstats=sumstatsObject2.data,
                                                     log=self.log,
                                                     verbose=verbose,
                                                     suffixes=(suffixes[0],""),
@@ -83,16 +114,21 @@ class SumstatsPair( ):
         molded_sumstats = _align_with_mold(molded_sumstats, log=self.log, verbose=verbose,suffixes=(suffixes[0],""))
+        # flip sumstats2 statistics
         molded_sumstats = flipallelestats(molded_sumstats, log=self.log, verbose=verbose)
+        # drop sumstats2 EA NEA
         molded_sumstats = molded_sumstats.drop(columns=["EA","NEA"])
+        # rename sumstats1 EA NEA
         molded_sumstats = molded_sumstats.rename(columns={"EA_1":"EA","NEA_1":"NEA"})
-        if not len(set(self.stats_cols) & set (sumstatsObject2.data.columns)) == len(self.stats_cols):
-            cols_to_fill = set(self.stats_cols).difference(set(sumstatsObject2.data.columns))
+        if not set(self.stats_cols2) == set(self.stats_cols):
+            cols_to_fill = set(self.stats_cols).difference(set(self.stats_cols2))
             molded_sumstats = _fill_missing_columns(molded_sumstats, cols_to_fill, log=self.log, verbose=verbose)
-        molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols, log=self.log, verbose=verbose, suffixes=suffixes)
+        # rename sumstast2 with _2
+        molded_sumstats = _renaming_cols(molded_sumstats, self.stats_cols + self.other_cols2, log=self.log, verbose=verbose, suffixes=suffixes)
         molded_sumstats = _sort_pair_cols(molded_sumstats, verbose=verbose, log=self.log)
@@ -108,13 +144,7 @@ class SumstatsPair( ):
     def run_coloc_susie(self,**args):
         self.colocalization = _run_coloc_susie(self.to_finemapping_file_path,log=self.log,ncols=self.ns,**args)
-    def plot_miami(self,**args):
-        plot_miami2(merged_sumstats=self.data,
-                    suffixes=self.suffixes,
-                    **args)
     def run_two_sample_mr(self, clump=False, **args):
         exposure1 = self.study_name.split("_")[0]
         outcome2 = self.study_name.split("_")[1]
@@ -130,4 +160,21 @@ class SumstatsPair( ):
             return new_Sumstats_object
         else:
             self.data = filtervalues(self.data, expr,log=self.log,**args)
-        gc.collect()
+        gc.collect()
+    ## Visualization #############################################################################################################################################
+    def plot_miami(self,**args):
+        plot_miami2(merged_sumstats=self.data,
+                    suffixes=self.suffixes,
+                    **args)
+    def compare_af(self, **args):
+        return plotdaf( self.data,
+                     eaf="EAF_2",
+                     raf="EAF_1",
+                     xlabel="Effect Allele Frequency in Sumstats 1",
+                     ylabel="Effect Allele Frequency in Sumstats 2",
+                     **args)

gwaslab/g_SumstatsT.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 from gwaslab.g_Sumstats import Sumstats
-from gwaslab.hm_casting import _merge_mold_with_sumstats
+from gwaslab.hm_casting import _merge_mold_with_sumstats_by_chrpos
 from gwaslab.hm_casting import _align_with_mold
 from gwaslab.hm_casting import _fill_missing_columns
 from gwaslab.hm_casting import _check_daf
@@ -34,7 +34,7 @@ class SumstatsT( ):
     def cast(self, sumstatsObject, threshold=0.2, verbose=True,windowsizeb=10, ref_path=None):
-        molded_sumstats = _merge_mold_with_sumstats(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
+        molded_sumstats = _merge_mold_with_sumstats_by_chrpos(self.snp_info, sumstatsObject.data, log=sumstatsObject.log, verbose=verbose, windowsizeb=windowsizeb,ref_path=ref_path)
         molded_sumstats = _align_with_mold(molded_sumstats, log=sumstatsObject.log, verbose=verbose)

gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.38__py3-none-any.whl → 3.4.39__py3-none-any.whl