PyPI - gwaslab - Versions diffs - 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl - Mend

gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of gwaslab might be problematic. Click here for more details.

Files changed (37) hide show

gwaslab/data/formatbook.json +722 -721
gwaslab/g_Log.py +8 -0
gwaslab/g_Sumstats.py +26 -147
gwaslab/g_SumstatsPair.py +6 -2
gwaslab/g_Sumstats_summary.py +3 -3
gwaslab/g_version.py +2 -2
gwaslab/hm_casting.py +29 -15
gwaslab/hm_harmonize_sumstats.py +291 -163
gwaslab/hm_rsid_to_chrpos.py +1 -1
gwaslab/io_preformat_input.py +43 -37
gwaslab/io_to_formats.py +428 -295
gwaslab/qc_check_datatype.py +3 -3
gwaslab/qc_fix_sumstats.py +793 -682
gwaslab/util_ex_calculate_ldmatrix.py +29 -11
gwaslab/util_ex_gwascatalog.py +1 -1
gwaslab/util_ex_ldproxyfinder.py +1 -1
gwaslab/util_ex_process_ref.py +3 -3
gwaslab/util_ex_run_coloc.py +26 -4
gwaslab/util_in_convert_h2.py +1 -1
gwaslab/util_in_fill_data.py +2 -2
gwaslab/util_in_filter_value.py +122 -34
gwaslab/util_in_get_density.py +2 -2
gwaslab/util_in_get_sig.py +41 -9
gwaslab/viz_aux_quickfix.py +24 -19
gwaslab/viz_aux_reposition_text.py +7 -4
gwaslab/viz_aux_save_figure.py +6 -5
gwaslab/viz_plot_compare_af.py +5 -5
gwaslab/viz_plot_miamiplot2.py +28 -20
gwaslab/viz_plot_mqqplot.py +109 -72
gwaslab/viz_plot_qqplot.py +11 -8
gwaslab/viz_plot_regionalplot.py +3 -1
gwaslab/viz_plot_trumpetplot.py +15 -6
{gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/METADATA +2 -2
{gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/RECORD +37 -37
{gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
{gwaslab-3.4.37.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0

gwaslab/g_Log.py CHANGED Viewed

@@ -2,6 +2,7 @@ import time
 class Log():
     def __init__(self):
         self.log_text=str(time.ctime(time.time()))+ " " + "Sumstats Object created."+ "\n"
     def write(self,*message,end="\n",show_time=True, verbose=True):
         if show_time is True:
             if verbose: print(str(time.ctime(time.time())),*message,end=end)
@@ -9,6 +10,13 @@ class Log():
         else:
             if verbose: print(*message,end=end)
             self.log_text = self.log_text + " ".join(map(str,message)) + end
+    def warning(self,*message,end="\n",show_time=True, verbose=True):
+        self.write(" #WARNING! {}".format(" ".join(map(str,message))),
+                   end=end,
+                   show_time=show_time,
+                   verbose=verbose)
     def show(self):
         print(self.log_text)
     def save(self,path,verbose=True):

gwaslab/g_Sumstats.py CHANGED Viewed

@@ -5,7 +5,7 @@ import copy
 from gwaslab.g_Sumstats_summary import summarize
 from gwaslab.g_Sumstats_summary import lookupstatus
 from gwaslab.io_preformat_input import preformat
-from gwaslab.io_to_formats import tofmt
+from gwaslab.io_to_formats import _to_format
 from gwaslab.g_Log import Log
 from gwaslab.qc_fix_sumstats import fixID
 from gwaslab.qc_fix_sumstats import removedup
@@ -35,6 +35,8 @@ from gwaslab.util_in_filter_value import filterregionout
 from gwaslab.util_in_filter_value import inferbuild
 from gwaslab.util_in_filter_value import sampling
 from gwaslab.util_in_filter_value import _get_flanking
+from gwaslab.util_in_filter_value import _get_flanking_by_chrpos
+from gwaslab.util_in_filter_value import _get_flanking_by_id
 from gwaslab.util_in_calculate_gc import lambdaGC
 from gwaslab.util_in_convert_h2 import _get_per_snp_r2
 from gwaslab.util_in_get_sig import getsig
@@ -449,7 +451,7 @@ class Sumstats():
 # utilities ############################################################################################################
     # filter series ######################################################################
-    def get_flanking(self, inplace=False,**args):
+    def filter_flanking(self, inplace=False,**args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
             new_Sumstats_object.data = _get_flanking(new_Sumstats_object.data, **args)
@@ -457,6 +459,22 @@ class Sumstats():
         else:
             self.data = _get_flanking(self.data, **args)
+    def filter_flanking_by_chrpos(self, chrpos,  inplace=False,**args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _get_flanking_by_chrpos(new_Sumstats_object.data, chrpos, **args)
+            return new_Sumstats_object
+        else:
+            self.data = _get_flanking_by_chrpos(self.data, chrpos,**args)
+    def filter_flanking_by_id(self, snpid, inplace=False,**args):
+        if inplace is False:
+            new_Sumstats_object = copy.deepcopy(self)
+            new_Sumstats_object.data = _get_flanking_by_id(new_Sumstats_object.data, snpid, **args)
+            return new_Sumstats_object
+        else:
+            self.data = _get_flanking_by_id(self.data, snpid, **args)
     def filter_value(self, expr, inplace=False, **args):
         if inplace is False:
             new_Sumstats_object = copy.deepcopy(self)
@@ -548,8 +566,10 @@ class Sumstats():
         return plot
-    def plot_trumpet(self, **args):
-        fig = plottrumpet(self.data, **args)
+    def plot_trumpet(self, build=None, **args):
+        if build is None:
+            build = self.meta["gwaslab"]["genome_build"]
+        fig = plottrumpet(self.data,build = build,  **args)
         return fig
     def get_lead(self, build=None, gls=False, **args):
@@ -670,148 +690,7 @@ class Sumstats():
 # to_format ###############################################################################################
-    def to_format(self,
-              path="./sumstats",
-              fmt="gwaslab",
-              extract=None,
-              exclude=None,
-              cols=None,
-              id_use="rsID",
-              hapmap3=False,
-              exclude_hla=False,
-              hla_range=(25,34),
-              build=None,
-              n=None,
-              verbose=True,
-              no_status=False,
-              output_log=True,
-              to_csvargs=None,
-              float_formats=None,
-              xymt_number=False,
-              xymt=None,
-              chr_prefix="",
-              ssfmeta=False,
-              md5sum=False,
-              bgzip=False,
-              tabix=False,
-              tabix_indexargs={}):
+    def to_format(self, path, build=None, **args):
         if build is None:
             build = self.meta["gwaslab"]["genome_build"]
-        onetime_log = copy.deepcopy(self.log)
-        if  to_csvargs is None:
-            to_csvargs = {}
-        if  float_formats is None:
-            float_formats={}
-        if cols is None:
-            cols=[]
-        if xymt is None:
-            xymt = ["X","Y","MT"]
-        formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
-        if fmt in formatlist:
-            if verbose: onetime_log.write("Start to format the output sumstats in: ",fmt, " format")
-        else:
-            raise ValueError("Please select a format to output")
-        #######################################################################################################
-        # filter
-        output = self.data.copy()
-        if extract is not None:
-            output = output.loc[output[id_use].isin(extract),:]
-        if exclude is not None:
-            output = output.loc[~output[id_use].isin(exclude),:]
-        #hla and hapmap3 #######################################################################################
-        suffix=fmt
-        #exclude hla
-        if exclude_hla is True:
-            if verbose: onetime_log.write(" -Excluding variants in MHC (HLA) region ...")
-            before = len(output)
-            is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
-            output = output.loc[~is_hla,:]
-            after = len(output)
-            if verbose: onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]))
-            suffix = "noMHC."+suffix
-        #extract hapmap3 SNPs
-        if hapmap3 is True:
-            output = gethapmap3(output,build=build,verbose=True)
-            after = len(output)
-            if verbose: onetime_log.write(" -Extract "+ str(after) + " variants in Hapmap3 datasets for build "+build+".")
-            suffix = "hapmap3."+suffix
-        # add a n column
-        if n is not None:
-            output["N"] = n
-        #######################################################################################################
-        #formatting float statistics
-        if verbose: onetime_log.write(" -Formatting statistics ...")
-        formats = {'EAF': '{:.4g}',
-                'BETA': '{:.4f}',
-                'Z': '{:.4f}',
-                'CHISQ': '{:.4f}',
-                'SE': '{:.4f}',
-                'OR': '{:.4f}',
-                'OR_95U': '{:.4f}',
-                'OR_95L': '{:.4f}',
-                'INFO': '{:.4f}',
-                'P': '{:.4e}',
-                'MLOG10P': '{:.4f}',
-                'DAF': '{:.4f}'
-                  }
-        for col, f in float_formats.items():
-            if col in output.columns:
-                formats[col]=f
-        for col, f in formats.items():
-            if col in output.columns:
-                if output[col].dtype in ["float64","float32","float16","float"]:
-                    output[col] = output[col].map(f.format)
-        if verbose:
-            onetime_log.write(" - Float statistics formats:")
-            keys=[]
-            values=[]
-            for key,value in formats.items():
-                if key in output.columns:
-                    keys.append(key)
-                    values.append(value)
-            onetime_log.write("  - Columns:",keys)
-            onetime_log.write("  - Output formats:",values)
-        ##########################################################################################################
-        # output, mapping column names
-        if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
-            tofmt(output,
-                  path=path,
-                  fmt=fmt,
-                  cols=cols,
-                  suffix=suffix,
-                  build=build,
-                  verbose=True,
-                  no_status=no_status,
-                  log=onetime_log,
-                  to_csvargs=to_csvargs,
-                  chr_prefix=chr_prefix,
-                  meta = self.meta,
-                  ssfmeta=ssfmeta,
-                  bgzip=bgzip,
-                  tabix=tabix,
-                  tabix_indexargs=tabix_indexargs,
-                  md5sum=md5sum,
-                  xymt_number=xymt_number,
-                  xymt=xymt)
-        if output_log is True:
-            log_path = path + "."+ suffix + ".log"
-            if verbose: onetime_log.write(" -Saving log file to: {}".format(log_path))
-            if verbose: onetime_log.write("Finished outputting successfully!")
-            try:
-                onetime_log.save(log_path, verbose=False)
-            except:
-                pass
+        _to_format(self.data, path, log=self.log, meta=self.meta, build=build, **args)

gwaslab/g_SumstatsPair.py CHANGED Viewed

@@ -28,8 +28,10 @@ class SumstatsPair( ):
             raise ValueError("Please provide GWASLab Sumstats Object #1.")
         if not isinstance(sumstatsObject2, Sumstats):
             raise ValueError("Please provide GWASLab Sumstats Object #2.")
-        self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
+        if sumstatsObject1.meta["gwaslab"]["study_name"]!=sumstatsObject2.meta["gwaslab"]["study_name"]:
+            self.study_name = "{}_{}".format(sumstatsObject1.meta["gwaslab"]["study_name"], sumstatsObject2.meta["gwaslab"]["study_name"])
+        else:
+            self.study_name = "{}_{}".format("STUDY1", "STUDY2")
         self.snp_info_cols = []
         self.stats_cols =[]
         self.other_cols=[]
@@ -42,6 +44,8 @@ class SumstatsPair( ):
         self.clumps ={}
         self.ns = None
+        self.log.write( "Start to create SumstatsPair object..." )
         for i in sumstatsObject1.data.columns:
             if i in ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]:
                 self.snp_info_cols.append(i)

gwaslab/g_Sumstats_summary.py CHANGED Viewed

@@ -15,7 +15,7 @@ def summarize(insumstats,
     for i in [snpid,rsid,eaf,p,n,status]:
         if i in insumstats.columns:
             cols.append(i)
-    sumstats= insumstats.loc[:,cols].copy()
+    sumstats= insumstats[cols].copy()
     ###############################################################################
     numeric_cols=[]
     output = {}
@@ -68,7 +68,7 @@ def summarize(insumstats,
         sumstats.drop(columns='uniq_index',inplace=True)
         status_dic = {}
         for index,row in status_summary.iterrows():
-            status_dic[str(index)]=row[0]
+            status_dic[str(index)]=row.iloc[0]
         output["STATUS"]=status_dic
         numeric_cols.append("STATUS")
     df = pd.DataFrame.from_dict({(i,j): output[i][j]
@@ -84,7 +84,7 @@ def summarize(insumstats,
     return df
 def sum_status(id_to_use, sumstats):
-        results = sumstats.groupby("STATUS").count()
+        results = sumstats.groupby("STATUS",observed=True).count()
         results = results.loc[results[id_to_use]>0,:].sort_values(id_to_use,ascending=False)
         return results

gwaslab/g_version.py CHANGED Viewed

@@ -15,8 +15,8 @@ def _get_version():
 def gwaslab_info():
     # version meta information
     dic={
-   "version":"3.4.37",
-   "release_date":"20240129"
+   "version":"3.4.38",
+   "release_date":"20240203"
     }
     return dic

gwaslab/hm_casting.py CHANGED Viewed

@@ -14,9 +14,11 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
     for i in sumstats.columns:
         if i in ["SNPID","rsID"]:
             cols_to_drop.append(i)
+    log.write("Start to merge sumstats...", verbose=verbose)
     if len(cols_to_drop)>0:
-        log.write("Dropping old IDs:{}".format(cols_to_drop))
+        log.write(" -Dropping old IDs:{}".format(cols_to_drop), verbose=verbose)
         sumstats = sumstats.drop(columns=cols_to_drop)
     if ref_path is not None :
@@ -30,17 +32,18 @@ def _merge_mold_with_sumstats(mold, sumstats, ref_path=None, windowsizeb=10, log
         mold["_IDENTIFIER_FOR_VARIANT"] = range(len(mold))
     mold_sumstats = pd.merge(mold, sumstats, on=["CHR","POS"], how="inner",suffixes=suffixes)
-    log.write("After merging by CHR and POS:{}".format(len(mold_sumstats)))
+    log.write(" -After merging by CHR and POS:{}".format(len(mold_sumstats)), verbose=verbose)
     mold_sumstats = _keep_variants_with_same_allele_set(mold_sumstats,suffixes=suffixes)
-    log.write("Matched variants:{}".format(len(mold_sumstats)))
+    log.write(" -Matched variants:{}".format(len(mold_sumstats)), verbose=verbose)
-    if ref_path is not None:
-        # match removed sumstats
-        mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
-        iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
-        _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
-        mold_sumstats.drop(columns=["_INDEX",""])
+    #if ref_path is not None:
+    #    # match removed sumstats
+    #    mold_removed = mold.loc[~mold[index1].isin(mold_sumstats[index1]),:]
+    #    iron_removed = sumstats.loc[~sumstats[index2].isin(mold_sumstats[index2]),:]
+    #    _match_two_sumstats(mold_removed,iron_removed,ref_path,windowsizeb=windowsizeb)
+    #    mold_sumstats.drop(columns=["_INDEX",""])
     if return_not_matched_mold == True:
         sumstats1 = mold.loc[~mold["_IDENTIFIER_FOR_VARIANT"].isin(mold_sumstats["_IDENTIFIER_FOR_VARIANT"]),:]
@@ -59,14 +62,17 @@ def _keep_variants_with_same_allele_set(sumstats, log=Log(),verbose=True,suffixe
     all_alleles = set(list(sumstats[ea1].unique())+list(sumstats[nea1].unique())+list(sumstats[ea2].unique())+list(sumstats[nea2].unique()))
     allele_type = CategoricalDtype(categories=all_alleles, ordered=False)
-    sumstats.loc[:, [nea1,ea1,nea2,ea2]] = sumstats.loc[:, [nea1,ea1,nea2,ea2]].astype(allele_type)
+    sumstats[[nea1,ea1,nea2,ea2]] = sumstats[[nea1,ea1,nea2,ea2]].astype(allele_type)
     is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
     is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
     is_allele_set_match = is_flipped_match | is_perfect_match
-    sumstats.loc[~is_allele_set_match,:]
+    log.write(" -Matching alleles and keeping only variants with same allele set: ", verbose=verbose)
+    log.write("  -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
+    log.write("  -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
+    log.write("  -Unmatched : {}".format(sum(~is_allele_set_match)), verbose=verbose)
     return sumstats.loc[is_allele_set_match,:]
 def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
@@ -77,10 +83,18 @@ def _align_with_mold(sumstats, log=Log(),verbose=True, suffixes=("_MOLD","")):
     nea2="NEA"+suffixes[1]
     status1="STATUS"+suffixes[0]
     status2="STATUS"+suffixes[1]
     is_perfect_match = (sumstats[ea2] == sumstats[ea1]) & (sumstats[nea2] == sumstats[nea1])
     is_flipped_match = (sumstats[ea2] == sumstats[nea1]) & (sumstats[nea2] == sumstats[ea1])
+    log.write(" -Aligning alleles with reference: ", verbose=verbose)
+    log.write("  -Perfect match: {}".format(sum(is_perfect_match)), verbose=verbose)
+    log.write("  -Flipped match: {}".format(sum(is_flipped_match)), verbose=verbose)
+    log.write("  -For perfect match: copy STATUS from reference...", verbose=verbose)
     sumstats.loc[is_perfect_match,status2] = copy_status(sumstats.loc[is_perfect_match,status1], sumstats.loc[is_perfect_match,status2],6)
+    log.write("  -For Flipped match: convert STATUS xxxxx[456789]x to xxxxx3x...", verbose=verbose)
     sumstats.loc[is_flipped_match,status2] = vchange_status(sumstats.loc[is_flipped_match,status2],6,"456789","333333")
     return sumstats
@@ -119,9 +133,9 @@ def _sort_pair_cols(molded_sumstats, verbose=True, log=Log(), order=None, stats_
         if i not in order:
             output_columns.append(i)
-    if verbose: log.write(" -Reordering columns to    :", ",".join(output_columns))
-    molded_sumstats = molded_sumstats.loc[:, output_columns]
-    if verbose: log.write("Finished sorting columns successfully!")
+    if verbose: log.write(" -Reordering columns to    :", ",".join(output_columns), verbose=verbose)
+    molded_sumstats = molded_sumstats[ output_columns]
+    if verbose: log.write("Finished sorting columns successfully!", verbose=verbose)
     return molded_sumstats

gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl

Potentially problematic release.

gwaslab 3.4.37__py3-none-any.whl → 3.4.38__py3-none-any.whl