PyPI - gwaslab - Versions diffs - 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl - Mend

gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (42)

gwaslab/__init__.py +1 -1
gwaslab/data/formatbook.json +722 -721
gwaslab/g_Log.py +8 -0
gwaslab/g_Sumstats.py +80 -178
gwaslab/g_SumstatsPair.py +6 -2
gwaslab/g_Sumstats_summary.py +3 -3
gwaslab/g_meta.py +13 -3
gwaslab/g_version.py +2 -2
gwaslab/hm_casting.py +29 -15
gwaslab/hm_harmonize_sumstats.py +312 -159
gwaslab/hm_rsid_to_chrpos.py +1 -1
gwaslab/io_preformat_input.py +46 -37
gwaslab/io_to_formats.py +428 -295
gwaslab/qc_check_datatype.py +15 -1
gwaslab/qc_fix_sumstats.py +956 -719
gwaslab/util_ex_calculate_ldmatrix.py +29 -11
gwaslab/util_ex_gwascatalog.py +1 -1
gwaslab/util_ex_ldproxyfinder.py +1 -1
gwaslab/util_ex_process_h5.py +26 -17
gwaslab/util_ex_process_ref.py +3 -3
gwaslab/util_ex_run_coloc.py +26 -4
gwaslab/util_in_convert_h2.py +1 -1
gwaslab/util_in_fill_data.py +44 -5
gwaslab/util_in_filter_value.py +122 -34
gwaslab/util_in_get_density.py +2 -2
gwaslab/util_in_get_sig.py +41 -9
gwaslab/viz_aux_quickfix.py +26 -21
gwaslab/viz_aux_reposition_text.py +7 -4
gwaslab/viz_aux_save_figure.py +6 -5
gwaslab/viz_plot_compare_af.py +5 -5
gwaslab/viz_plot_compare_effect.py +22 -5
gwaslab/viz_plot_miamiplot2.py +28 -20
gwaslab/viz_plot_mqqplot.py +214 -98
gwaslab/viz_plot_qqplot.py +11 -8
gwaslab/viz_plot_regionalplot.py +16 -9
gwaslab/viz_plot_trumpetplot.py +15 -6
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
gwaslab-3.4.38.dist-info/RECORD +72 -0
gwaslab-3.4.36.dist-info/RECORD +0 -72
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
{gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0

gwaslab/g_Log.py CHANGED Viewed

@@ -2,6 +2,7 @@ import time
 class Log():
     def __init__(self):
         self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
     def write(self,*message,end="\n",show_time=True, verbose=True):
         if show_time is True:
             if verbose: print(str(time.ctime(time.time())),*message,end=end)
@@ -9,6 +10,13 @@ class Log():
         else:
             if verbose: print(*message,end=end)
             self.log_text = self.log_text + " ".join(map(str,message)) + end
+    def warning(self,*message,end="\n",show_time=True, verbose=True):
+        self.write(" #WARNING! {}".format(" ".join(map(str,message))),
+                   end=end,
+                   show_time=show_time,
+                   verbose=verbose)
     def show(self):
         print(self.log_text)
     def save(self,path,verbose=True):

gwaslab/g_Sumstats.py CHANGED Viewed

@@ -5,7 +5,7 @@ import copy
 from gwaslab.g_Sumstats_summary import summarize
 from gwaslab.g_Sumstats_summary import lookupstatus
 from gwaslab.io_preformat_input import preformat
-from gwaslab.io_to_formats import tofmt
+from gwaslab.io_to_formats import _to_format
 from gwaslab.g_Log import Log
 from gwaslab.qc_fix_sumstats import fixID
 from gwaslab.qc_fix_sumstats import removedup
@@ -35,6 +35,8 @@ from gwaslab.util_in_filter_value import filterregionout
 from gwaslab.util_in_filter_value import inferbuild
 from gwaslab.util_in_filter_value import sampling
 from gwaslab.util_in_filter_value import _get_flanking
+from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
+from gwaslab.util_in_filter_value import _get_flanking_by_id
 from gwaslab.util_in_calculate_gc import lambdaGC
 from gwaslab.util_in_convert_h2 import _get_per_snp_r2
 from gwaslab.util_in_get_sig import getsig
@@ -52,7 +54,8 @@ from gwaslab.bd_common_data import get_format_dict
 from gwaslab.bd_common_data import get_formats_list
 from gwaslab.g_version import _show_version
 from gwaslab.g_version import gwaslab_info
-from gwaslab.g_meta import init_meta
+from gwaslab.g_meta import _init_meta
+from gwaslab.g_meta import _append_meta_record
 from gwaslab.util_ex_run_clumping import _clump
 from gwaslab.util_ex_calculate_ldmatrix import tofinemapping
 from gwaslab.util_ex_calculate_prs import _calculate_prs
@@ -60,6 +63,7 @@ from gwaslab.viz_plot_mqqplot import mqqplot
 from gwaslab.viz_plot_trumpetplot import plottrumpet
 from gwaslab.viz_plot_compare_af import plotdaf
 from gwaslab.util_ex_run_susie import _run_susie_rss
+from gwaslab.qc_fix_sumstats import _check_data_consistency
 import gc
 #20220309
@@ -119,10 +123,9 @@ class Sumstats():
         self.log = Log()
         # meta information
-        self.meta = init_meta()
+        self.meta = _init_meta()
         self.build = build
-        self.meta["gwaslab"]["study_name"] = study
-        #self.meta["gwaslab"]["genome_build"] = build
+        self.meta["gwaslab"]["study_name"] =  study
         self.meta["gwaslab"]["species"] = species
         # initialize attributes for clumping and finmapping
@@ -217,8 +220,22 @@ class Sumstats():
         return lookupstatus(self.data[status])
     def set_build(self, build, verbose=True):
-        self.data = _set_build(self.data, build=build, log=self.log,verbose=verbose)
+        self.data, self.meta["gwaslab"]["genome_build"] = _set_build(self.data, build=build, log=self.log,verbose=verbose)
         gc.collect()
+    def infer_build(self,**args):
+        self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
+    def liftover(self,to_build, from_build=None,**args):
+        if from_build is None:
+            if self.meta["gwaslab"]["genome_build"]=="99":
+                self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
+            from_build = self.meta["gwaslab"]["genome_build"]
+        self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
+        self.meta["is_sorted"] = False
+        self.meta["is_harmonised"] = False
+        self.meta["gwaslab"]["genome_build"]=to_build
 # QC ######################################################################################
     #clean the sumstats with one line
     def basic_check(self,
@@ -231,6 +248,7 @@ class Sumstats():
                     fixpos_args={},
                     fixallele_args={},
                     sanitycheckstats_args={},
+                    consistencycheck_args={},
                     normalize=True,
                     normalizeallele_args={},
                     verbose=True):
@@ -241,6 +259,8 @@ class Sumstats():
         self.data = fixpos(self.data,log=self.log,remove=remove,verbose=verbose,**fixpos_args)
         self.data = fixallele(self.data,log=self.log,remove=remove,verbose=verbose,**fixallele_args)
         self.data = sanitycheckstats(self.data,log=self.log,verbose=verbose,**sanitycheckstats_args)
+        _check_data_consistency(self.data,log=self.log,verbose=verbose,**consistencycheck_args)
         if normalize is True:
             self.data = parallelnormalizeallele(self.data,n_cores=n_cores,verbose=verbose,log=self.log,**normalizeallele_args)
         if remove_dup is True:
@@ -329,9 +349,9 @@ class Sumstats():
             self.data= parallelinferstrand(self.data,ref_infer = ref_infer,ref_alt_freq=ref_alt_freq,maf_threshold=maf_threshold,
                                               n_cores=n_cores,log=self.log,**inferstrand_args)
-            self.meta["gwaslab"]["references"]["ref_infer"] = ref_infer
+            self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
             self.data =flipallelestats(self.data,log=self.log,**flipallelestats_args)
             gc.collect()
@@ -341,13 +361,18 @@ class Sumstats():
             self.data = parallelizeassignrsid(self.data,path=ref_rsid_tsv,ref_mode="tsv",
                                                  n_cores=n_cores,log=self.log,**assignrsid_args)
             self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
             gc.collect()
         if ref_rsid_vcf is not None:
             self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",
                                                  n_cores=n_cores,log=self.log,**assignrsid_args)
-            self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
+            self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
             gc.collect()
         ######################################################
         if remove is True:
@@ -376,17 +401,23 @@ class Sumstats():
         self.data = removedup(self.data,log=self.log,**args)
     def check_sanity(self,**args):
         self.data = sanitycheckstats(self.data,log=self.log,**args)
-    #
+    def check_data_consistency(self, **args):
+        _check_data_consistency(self.data,log=self.log,**args)
     def check_id(self,**args):
         pass
-    def check_ref(self,**args):
-        self.data = checkref(self.data,log=self.log,**args)
-    def infer_strand(self,**args):
-        self.data = parallelinferstrand(self.data,log=self.log,**args)
+    def check_ref(self,ref_seq,**args):
+        self.meta["gwaslab"]["references"]["ref_seq"] = ref_seq
+        self.data = checkref(self.data,ref_seq,log=self.log,**args)
+    def infer_strand(self,ref_infer,**args):
+        self.meta["gwaslab"]["references"]["ref_infer"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer"] , ref_infer)
+        self.data = parallelinferstrand(self.data,ref_infer=ref_infer,log=self.log,**args)
     def flip_allele_stats(self,**args):
         self.data = flipallelestats(self.data,log=self.log,**args)
     def normalize_allele(self,**args):
         self.data = parallelnormalizeallele(self.data,log=self.log,**args)
     def assign_rsid(self,
                     ref_rsid_tsv=None,
                     ref_rsid_vcf=None,
@@ -396,21 +427,15 @@ class Sumstats():
             self.meta["gwaslab"]["references"]["ref_rsid_tsv"] = ref_rsid_tsv
         if ref_rsid_vcf is not None:
             self.data = parallelizeassignrsid(self.data,path=ref_rsid_vcf,ref_mode="vcf",log=self.log,**args)
-            self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = ref_rsid_vcf
+            self.meta["gwaslab"]["references"]["ref_rsid_vcf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_rsid_vcf"] , ref_rsid_vcf)
     def rsid_to_chrpos(self,**args):
         self.data = rsidtochrpos(self.data,log=self.log,**args)
     def rsid_to_chrpos2(self,**args):
         self.data = parallelrsidtochrpos(self.data,log=self.log,**args)
-    def liftover(self,to_build, from_build=None,**args):
-        if from_build is None:
-            if self.meta["gwaslab"]["genome_build"]=="99":
-                self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
-            from_build = self.meta["gwaslab"]["genome_build"]
-        self.data = parallelizeliftovervariant(self.data,from_build=from_build, to_build=to_build, log=self.log,**args)
-        self.meta["is_sorted"] = False
-        self.meta["is_harmonised"] = False
-        self.meta["gwaslab"]["genome_build"]=to_build
     ############################################################################################################
     def sort_coordinate(self,**sort_args):
@@ -420,14 +445,13 @@ class Sumstats():
         self.data = sortcolumn(self.data,log=self.log,**args)
     ############################################################################################################
-    def fill_data(self, **args):
-        self.data = filldata(self.data,**args)
-    def infer_build(self,**args):
-        self.data, self.meta["gwaslab"]["genome_build"] = inferbuild(self.data,**args)
+    def fill_data(self, verbose=True, **args):
+        self.data = filldata(self.data, verbose=verbose, **args)
+        self.data = sortcolumn(self.data, verbose=verbose, log=self.log)
 # utilities ############################################################################################################
     # filter series ######################################################################
-    def get_flanking(self, inplace=False,**args):
+    def filter_flanking(self, inplace=False,**args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
             new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
@@ -435,6 +459,22 @@ class Sumstats():
         else:
             self.data = _get_flanking(self.data, **args)
+    def filter_flanking_by_chrpos(self, chrpos,  inplace=False,**args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
+            return new_Sumstats_object
+        else:
+            self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
+    def filter_flanking_by_id(self, snpid, inplace=False,**args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
+            return new_Sumstats_object
+        else:
+            self.data = _get_flanking_by_id(self.data, snpid, **args)
     def filter_value(self, expr, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -485,11 +525,12 @@ class Sumstats():
     def check_af(self,ref_infer,**args):
         self.data = parallelecheckaf(self.data,ref_infer=ref_infer,log=self.log,**args)
-        self.meta["gwaslab"]["references"]["ref_infer_daf"] = ref_infer
+        self.meta["gwaslab"]["references"]["ref_infer_daf"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_daf"] , ref_infer)
     def infer_af(self,ref_infer,**args):
         self.data = paralleleinferaf(self.data,ref_infer=ref_infer,log=self.log,**args)
         self.meta["gwaslab"]["references"]["ref_infer_af"] = ref_infer
+        self.meta["gwaslab"]["references"]["ref_infer_af"] = _append_meta_record(self.meta["gwaslab"]["references"]["ref_infer_af"] , ref_infer)
     def plot_daf(self, **args):
         fig,outliers = plotdaf(self.data, **args)
@@ -525,8 +566,10 @@ class Sumstats():
         return plot
-    def plot_trumpet(self, **args):
-        fig = plottrumpet(self.data, **args)
+    def plot_trumpet(self, build=None, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        fig = plottrumpet(self.data,build = build,  **args)
         return fig
     def get_lead(self, build=None, gls=False, **args):
@@ -647,148 +690,7 @@ class Sumstats():
 # to_format ###############################################################################################
-    def to_format(self,
-              path="./sumstats",
-              fmt="gwaslab",
-              extract=None,
-              exclude=None,
-              cols=None,
-              id_use="rsID",
-              hapmap3=False,
-              exclude_hla=False,
-              hla_range=(25,34),
-              build=None,
-              n=None,
-              verbose=True,
-              no_status=False,
-              output_log=True,
-              to_csvargs=None,
-              float_formats=None,
-              xymt_number=False,
-              xymt=None,
-              chr_prefix="",
-              ssfmeta=False,
-              md5sum=False,
-              bgzip=False,
-              tabix=False,
-              tabix_indexargs={}):
+    def to_format(self, path, build=None, **args):
         if build is None:
             build = self.meta["gwaslab"]["genome_build"]
-        onetime_log = copy.deepcopy(self.log)
-        if  to_csvargs is None:
-            to_csvargs = {}
-        if  float_formats is None:
-            float_formats={}
-        if cols is None:
-            cols=[]
-        if xymt is None:
-            xymt = ["X","Y","MT"]
-        formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
-        if fmt in formatlist:
-            if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
-        else:
-            raise ValueError("Please select a format to output")
-        #######################################################################################################
-        # filter
-        output = self.data.copy()
-        if extract is not None:
-            output = output.loc[output[id_use].isin(extract),:]
-        if exclude is not None:
-            output = output.loc[~output[id_use].isin(exclude),:]
-        #hla and hapmap3 #######################################################################################
-        suffix=fmt
-        #exclude hla
-        if exclude_hla is True:
-            if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
-            before = len(output)
-            is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
-            output = output.loc[~is_hla,:]
-            after = len(output)
-            if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
-            suffix = "noMHC."+suffix
-        #extract hapmap3 SNPs
-        if hapmap3 is True:
-            output = gethapmap3(output,build=build,verbose=True)
-            after = len(output)
-            if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
-            suffix = "hapmap3."+suffix
-        # add a n column
-        if n is not None:
-            output["N"] = n
-        #######################################################################################################
-        #formatting float statistics
-        if verbose: onetime_log.write(" -Formatting statistics ...")
-        formats = {'EAF': '{:.4g}',
-                'BETA': '{:.4f}',
-                'Z': '{:.4f}',
-                'CHISQ': '{:.4f}',
-                'SE': '{:.4f}',
-                'OR': '{:.4f}',
-                'OR_95U': '{:.4f}',
-                'OR_95L': '{:.4f}',
-                'INFO': '{:.4f}',
-                'P': '{:.4e}',
-                'MLOG10P': '{:.4f}',
-                'DAF': '{:.4f}'
-                  }
-        for col, f in float_formats.items():
-            if col in output.columns:
-                formats[col]=f
-        for col, f in formats.items():
-            if col in output.columns:
-                if output[col].dtype in ["float64","float32","float16","float"]:
-                    output[col] = output[col].map(f.format)
-        if verbose:
-            onetime_log.write(" - Float statistics formats:")
-            keys=[]
-            values=[]
-            for key,value in formats.items():
-                if key in output.columns:
-                    keys.append(key)
-                    values.append(value)
-            onetime_log.write("  - Columns:",keys)
-            onetime_log.write("  - Output formats:",values)
-        ##########################################################################################################
-        # output, mapping column names
-        if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
-            tofmt(output,
-                  path=path,
-                  fmt=fmt,
-                  cols=cols,
-                  suffix=suffix,
-                  build=build,
-                  verbose=True,
-                  no_status=no_status,
-                  log=onetime_log,
-                  to_csvargs=to_csvargs,
-                  chr_prefix=chr_prefix,
-                  meta = self.meta,
-                  ssfmeta=ssfmeta,
-                  bgzip=bgzip,
-                  tabix=tabix,
-                  tabix_indexargs=tabix_indexargs,
-                  md5sum=md5sum,
-                  xymt_number=xymt_number,
-                  xymt=xymt)
-        if output_log is True:
-            log_path = path + "."+ suffix + ".log"
-            if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
-            if verbose: onetime_log.write("Finished outputting successfully!")
-            try:
-                onetime_log.save(log_path, verbose=False)
-            except:
-                pass
+        _to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)

gwaslab/g_SumstatsPair.py CHANGED Viewed

@@ -28,8 +28,10 @@ class SumstatsPair( ):
             raise ValueError("Please provide GWASLab Sumstats Object #1.")
         if not isinstance(sumstatsObject2, Sumstats):
             raise ValueError("Please provide GWASLab Sumstats Object #2.")
-        self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
+        if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
+            self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
+        else:
+            self.study_name = "{}_{}".format("STUDY1", "STUDY2")
         self.snp_info_cols = []
         self.stats_cols =[]
         self.other_cols=[]
@@ -42,6 +44,8 @@ class SumstatsPair( ):
         self.clumps ={}
         self.ns = None
+        self.log.write( "Start to create SumstatsPair object..." )
         for i in sumstatsObject1.data.columns:
             if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
                 self.snp_info_cols.append(i)

gwaslab/g_Sumstats_summary.py CHANGED Viewed

@@ -15,7 +15,7 @@ def summarize(insumstats,
     for i in [snpid,rsid,eaf,p,n,status]:
         if i in insumstats.columns:
             cols.append(i)
-    sumstats= insumstats.loc[:,cols].copy()
+    sumstats= insumstats[cols].copy()
     ###############################################################################
     numeric_cols=[]
     output = {}
@@ -68,7 +68,7 @@ def summarize(insumstats,
         sumstats.drop(columns='uniq_index',inplace=True)
         status_dic = {}
         for index,row in status_summary.iterrows():
-            status_dic[str(index)]=row[0]
+            status_dic[str(index)]=row.iloc[0]
         output["STATUS"]=status_dic
         numeric_cols.append("STATUS")
     df = pd.DataFrame.from_dict({(i,j): output[i][j]
@@ -84,7 +84,7 @@ def summarize(insumstats,
     return df
 def sum_status(id_to_use, sumstats):
-        results = sumstats.groupby("STATUS").count()
+        results = sumstats.groupby("STATUS",observed=True).count()
         results = results.loc[results[id_to_use]>0,:].sort_values(id_to_use,ascending=False)
         return results

gwaslab/g_meta.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from gwaslab.g_version import gwaslab_info
-def init_meta():
+def _init_meta():
     metadata = {"gwaslab":{
                         "gwaslab_version": gwaslab_info()["version"],
                         "study_name":"Sumstats_1",
@@ -23,7 +23,11 @@ def init_meta():
                             "ref_rsid_tsv":"Unknown",
                             "ref_rsid_vcf":"Unknown",
                             "ref_seq":"Unknown",
-                            "ref_infer":"Unknown"
+                            "ref_infer":"Unknown",
+                            "ref_infer_af":"Unknown",
+                            "ref_infer_daf":"Unknown",
+                            "ref_rsid_to_chrpos_tsv":"Unknown",
+                            "ref_rsid_to_chrpos_vcf":"Unknown"
                         }
                     },
                      "genotyping_technology":"Unknown",
@@ -45,4 +49,10 @@ def init_meta():
                      "coordinate_system":"1-based",
                      "sex": "M|F|combined"
                      }
-    return metadata.copy()
+    return metadata.copy()
+def _append_meta_record(old, new):
+    if old == "Unknown" or old== "Unchecked":
+        return new
+    else:
+        return "{}, {}".format(old, new)

gwaslab/g_version.py CHANGED Viewed

@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.36",
-   "release_date":"20240123"
+   "version":"3.4.38",
+   "release_date":"20240203"
     }
     return dic

gwaslab/hm_casting.py CHANGED Viewed

@@ -14,9 +14,11 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
     for i in sumstats.columns:
         if i in ["SNPID","rsID"]:
             cols_to_drop.append(i)
+    log.write("Start to merge sumstats...", verbose=verbose)
     if len(cols_to_drop)>0:
-        log.write("Dropping old IDs:{}".format(cols_to_drop))
+        log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
         sumstats = sumstats.drop(columns=cols_to_drop)
     if ref_path is not None :
@@ -30,17 +32,18 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
         mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
     mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
-    log.write("After merging by CHR and POS:{}".format(len(mold_sumstats)))
+    log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
     mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
-    log.write("Matched variants:{}".format(len(mold_sumstats)))
+    log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)
-    if ref_path is not None:
-        # match removed sumstats
-        mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
-        iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
-        _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
-        mold_sumstats.drop(columns=["_INDEX",""])
+    #if ref_path is not None:
+    #    # match removed sumstats
+    #    mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
+    #    iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
+    #    _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
+    #    mold_sumstats.drop(columns=["_INDEX",""])
     if return_not_matched_mold == True:
         sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
@@ -59,14 +62,17 @@ def _keep_variants_with_same_allele_set(sumstats, log=Log(),verbose=True,suffixe
     all_alleles = set(list(sumstats[ea1].unique())+list(sumstats[nea1].unique())+list(sumstats[ea2].unique())+list(sumstats[nea2].unique()))
     allele_type = CategoricalDtype(categories=all_alleles, ordered=False)
-    sumstats.loc[:, [nea1,ea1,nea2,ea2]] = sumstats.loc[:, [nea1,ea1,nea2,ea2]].astype(allele_type)
+    sumstats[[nea1,ea1,nea2,ea2]] = sumstats[[nea1,ea1,nea2,ea2]].astype(allele_type)
     is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
     is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
     is_allele_set_match = is_flipped_match | is_perfect_match
-    sumstats.loc[~is_allele_set_match,:]
+    log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
+    log.write("  -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
+    log.write("  -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
+    log.write("  -Unmatched : {}".format(sum(~is_allele_set_match)), verbose=verbose)
     return sumstats.loc[is_allele_set_match,:]
 def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
@@ -77,10 +83,18 @@ def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
     nea2="NEA"+suffixes[1]
     status1="STATUS"+suffixes[0]
     status2="STATUS"+suffixes[1]
     is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
     is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
+    log.write(" -Aligning alleles with reference: ", verbose=verbose)
+    log.write("  -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
+    log.write("  -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
+    log.write("  -For perfect match: copy STATUS from reference...", verbose=verbose)
     sumstats.loc[is_perfect_match,status2] = copy_status(sumstats.loc[is_perfect_match,status1], sumstats.loc[is_perfect_match,status2],6)
+    log.write("  -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
     sumstats.loc[is_flipped_match,status2] = vchange_status(sumstats.loc[is_flipped_match,status2],6,"456789","333333")
     return sumstats
@@ -119,9 +133,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
         if i not in order:
             output_columns.append(i)
-    if verbose: log.write(" -Reordering columns to    :", ",".join(output_columns))
-    molded_sumstats = molded_sumstats.loc[:, output_columns]
-    if verbose: log.write("Finished sorting columns successfully!")
+    if verbose: log.write(" -Reordering columns to    :", ",".join(output_columns), verbose=verbose)
+    molded_sumstats = molded_sumstats[ output_columns]
+    if verbose: log.write("Finished sorting columns successfully!", verbose=verbose)
     return molded_sumstats

gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl