gwaslab 3.5.4__py3-none-any.whl → 3.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic. Click here for more details.
- gwaslab/__init__.py +3 -1
- gwaslab/g_Sumstats.py +56 -9
- gwaslab/g_SumstatsPair.py +16 -12
- gwaslab/g_SumstatsSet.py +663 -0
- gwaslab/g_headers.py +131 -0
- gwaslab/g_meta.py +2 -1
- gwaslab/g_version.py +3 -3
- gwaslab/hm_harmonize_sumstats.py +91 -1
- gwaslab/io_preformat_input.py +29 -7
- gwaslab/io_read_pipcs.py +23 -0
- gwaslab/io_to_formats.py +45 -44
- gwaslab/qc_check_datatype.py +65 -42
- gwaslab/qc_fix_sumstats.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +162 -3
- gwaslab/util_ex_ldsc.py +9 -0
- gwaslab/util_ex_run_2samplemr.py +34 -0
- gwaslab/util_ex_run_clumping.py +4 -2
- gwaslab/util_in_fill_data.py +28 -3
- gwaslab/util_in_filter_value.py +66 -1
- gwaslab/util_in_merge.py +51 -0
- gwaslab/viz_aux_save_figure.py +2 -1
- gwaslab/viz_plot_credible_sets.py +99 -0
- gwaslab/viz_plot_effect.py +283 -0
- gwaslab/viz_plot_miamiplot2.py +1 -1
- gwaslab/viz_plot_mqqplot.py +31 -11
- gwaslab/viz_plot_regional2.py +133 -32
- gwaslab/viz_plot_stackedregional.py +64 -34
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/METADATA +4 -4
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/RECORD +33 -28
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/WHEEL +1 -1
- gwaslab/vis_plot_credible sets.py +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/LICENSE +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/LICENSE_before_v3.4.39 +0 -0
- {gwaslab-3.5.4.dist-info → gwaslab-3.5.6.dist-info}/top_level.txt +0 -0
gwaslab/g_headers.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# Canonical pandas dtype for each GWASLab reserved column header.
# Used when loading/standardizing sumstats so that every column gets a
# predictable, nullable-aware dtype (Int64 for counts/positions, category
# for alleles/status, float64 for statistics).
dtype_dic={
    'SNPID'             : 'string' ,
    'rsID'              : 'string' ,
    'CHR'               : 'Int64' ,
    'POS'               : 'Int64' ,
    'EA'                : 'category' ,
    'NEA'               : 'category' ,
    'STATUS'            : 'category' ,
    'REF'               : 'category' ,
    'ALT'               : 'category' ,
    'EAF'               : 'float64' ,
    'NEAF'              : 'float64' ,
    'MAF'               : 'float64' ,
    'INFO'              : 'float32' ,
    'BETA'              : 'float64' ,
    'SE'                : 'float64' ,
    'BETA_95U'          : 'float64' ,
    'BETA_95L'          : 'float64' ,
    'OR'                : 'float64' ,
    'OR_95U'            : 'float64' ,
    'OR_95L'            : 'float64' ,
    'HR'                : 'float64' ,
    'HR_95U'            : 'float64' ,
    'HR_95L'            : 'float64' ,
    'CHISQ'             : 'float64' ,
    'Z'                 : 'float64' ,
    'T'                 : 'float64' ,
    'F'                 : 'float64' ,
    'P'                 : 'float64' ,
    'P_MANTISSA'        : 'float64' ,
    'P_EXPONENT'        : 'float64' ,
    'MLOG10P'           : 'float64' ,
    'SNPR2'             : 'float64' ,
    'DOF'               : 'Int64' ,
    'P_HET'             : 'float64' ,
    'I2_HET'            : 'float64' ,
    'DENSITY'           : 'Int64' ,
    'N'                 : 'Int64' ,
    'N_CASE'            : 'Int64' ,
    'N_CONTROL'         : 'Int64' ,
    'GENENAME'          : 'string' ,
    'CIS/TRANS'         : 'string' ,
    'DISTANCE_TO_KNOWN' : 'Int64' ,
    'LOCATION_OF_KNOWN' : 'string' ,
    'KNOWN_ID'          : 'string' ,
    'KNOWN_PUBMED_ID'   : 'string' ,
    'KNOWN_AUTHOR'      : 'string' ,
    'KNOWN_SET_VARIANT' : 'string' ,
    'KNOWN_VARIANT'     : 'string' ,
    'KNOWN_SET'         : 'string' ,
    'NOVEL'             : 'string' ,
    # BUGFIX: value was ' float64 ' (with surrounding spaces), which is not a
    # valid pandas/numpy dtype string and is inconsistent with every other entry.
    'PIP'               : 'float64' ,
    'CREDIBLE_SET_INDEX': 'Int64' ,
    'N_SNP'             : 'Int64' ,
    'LOCUS'             : 'string' ,
    'STUDY'             : 'string' ,
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Human-readable description for each GWASLab reserved column header.
# NOTE(review): 'NOVEL' exists in dtype_dic but has no entry here, so
# _get_headers("all") (which returns these keys) omits it — confirm intent.
description_dic={
    'SNPID'             : 'variant ID (CHR:POS:NEA:EA)',
    'rsID'              : 'dbSNP rsID',
    'CHR'               : 'chromosome number (X 23, Y 24, MT 25)',
    'POS'               : 'base pair position',
    'EA'                : 'effect allele',
    'NEA'               : 'non-effect allele',
    'STATUS'            : 'variant standardization & harmonization status',
    'REF'               : 'reference allele in reference genome',
    'ALT'               : 'alternative allele',
    'EAF'               : 'effect allele frequency',
    'NEAF'              : 'non-effect allele frequency',
    'MAF'               : 'minor allele frequency',
    'INFO'              : 'imputation INFO/RSQ',
    'BETA'              : 'effect size beta',
    'SE'                : 'standard error of beta',
    # TYPO FIX: "condidence" -> "confidence" in the six CI descriptions below
    'BETA_95U'          : 'upper bound of beta 95% confidence interval',
    'BETA_95L'          : 'lower bound of beta 95% confidence interval',
    'OR'                : 'odds ratio',
    'OR_95U'            : 'upper bound of OR 95% confidence interval',
    'OR_95L'            : 'lower bound of OR 95% confidence interval',
    'HR'                : 'hazard ratio',
    'HR_95U'            : 'upper bound of HR 95% confidence interval',
    'HR_95L'            : 'lower bound of HR 95% confidence interval',
    'CHISQ'             : 'chi square',
    'Z'                 : 'z score',
    'T'                 : 't statistics',
    'F'                 : 'F statistics',
    'P'                 : 'P value',
    'P_MANTISSA'        : 'P mantissa',
    'P_EXPONENT'        : 'P exponent',
    'MLOG10P'           : '$-log_{10}(P)$',
    'SNPR2'             : 'per variant R2',
    'DOF'               : 'degree of freedom',
    'P_HET'             : 'heterogeneity test P value',
    'I2_HET'            : 'heterogeneity I2',
    'DENSITY'           : 'signal density',
    'N'                 : 'total sample size',
    'N_CASE'            : 'number of cases',
    'N_CONTROL'         : 'number of controls',
    'GENENAME'          : 'nearest gene symbol',
    'CIS/TRANS'         : 'whether the variant is in cis or trans region',
    'DISTANCE_TO_KNOWN' : 'distance to nearest known variants',
    'LOCATION_OF_KNOWN' : 'relative location to nearest known variants',
    'KNOWN_ID'          : 'nearest known variant ID',
    'KNOWN_PUBMED_ID'   : 'pubmed ID of the known variant',
    'KNOWN_AUTHOR'      : 'author of the study',
    'KNOWN_SET_VARIANT' : 'known set and overlapping variant',
    'KNOWN_VARIANT'     : 'known variant overlapping with the variant',
    'KNOWN_SET'         : 'variant set of the known variant',
    'PIP'               : 'Posterior Inclusion Probability',
    'CREDIBLE_SET_INDEX': 'credible sets index',
    'N_SNP'             : 'number of variants included in this locus for finemapping',
    'LOCUS'             : 'locus name, usually the lead variant of the locus',
    'STUDY'             : 'study name'}
|
|
115
|
+
|
|
116
|
+
def _get_headers(mode="all"):
|
|
117
|
+
if mode=="info":
|
|
118
|
+
return ["SNPID","rsID","CHR","POS","EA","NEA","STATUS"]
|
|
119
|
+
elif mode=="stats":
|
|
120
|
+
return ["BETA","SE","P","MLOG10P","N","N_CASE","N_CONTROL","Z","T","F","OR","OR_95L","OR_95U","HR","HR_95L","HR_95U","MAF","EAF","BETA_95L","BETA_95U"]
|
|
121
|
+
else:
|
|
122
|
+
return description_dic.keys()
|
|
123
|
+
|
|
124
|
+
def _check_overlap_with_reserved_keys(other):
    """Return the items of *other* that collide with GWASLab reserved headers.

    Parameters
    ----------
    other : iterable of str
        User-supplied column names to check.

    Returns
    -------
    list of str
        The subset of *other* that matches a reserved header, in input order.
    """
    # PERF: hoist the reserved-header lookup out of the loop (previously
    # _get_headers() was re-evaluated for every item) and use a set for O(1)
    # membership tests.
    reserved = set(_get_headers())
    return [i for i in other if i in reserved]
|
|
130
|
+
|
|
131
|
+
|
gwaslab/g_meta.py
CHANGED
gwaslab/g_version.py
CHANGED
|
@@ -6,7 +6,7 @@ import numpy as np
|
|
|
6
6
|
def _show_version(log=Log(), verbose=True):
|
|
7
7
|
# show version when loading sumstats
|
|
8
8
|
log.write("GWASLab v{} https://cloufield.github.io/gwaslab/".format(gwaslab_info()["version"]),verbose=verbose)
|
|
9
|
-
log.write("(C) 2022-
|
|
9
|
+
log.write("(C) 2022-2025, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com",verbose=verbose)
|
|
10
10
|
|
|
11
11
|
def _get_version():
|
|
12
12
|
# return short version string like v3.4.33
|
|
@@ -15,8 +15,8 @@ def _get_version():
|
|
|
15
15
|
def gwaslab_info():
|
|
16
16
|
# version meta information
|
|
17
17
|
dic={
|
|
18
|
-
"version":"3.5.
|
|
19
|
-
"release_date":"
|
|
18
|
+
"version":"3.5.6",
|
|
19
|
+
"release_date":"20250306"
|
|
20
20
|
}
|
|
21
21
|
return dic
|
|
22
22
|
|
gwaslab/hm_harmonize_sumstats.py
CHANGED
|
@@ -1490,10 +1490,100 @@ def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
|
|
|
1490
1490
|
elif record.ref==alt and (ref in record.alts):
|
|
1491
1491
|
return 1 - record.info[alt_freq][0]
|
|
1492
1492
|
return np.nan
|
|
1493
|
+
##############################################################################################################################################################################################
|
|
1493
1494
|
|
|
1495
|
+
################################################################################################################
|
|
1494
1496
|
|
|
1497
|
+
def _paralleleinferafwithmaf(sumstats,ref_infer,ref_alt_freq=None,n_cores=1, chr="CHR",pos="POS",ref="NEA",alt="EA",
                             eaf="EAF",maf="MAF",ref_eaf="_REF_EAF",status="STATUS",chr_dict=None,force=False, verbose=True,log=Log()):
    """Infer sumstats EAF from sumstats MAF using a reference VCF ALT frequency.

    For each (checked) variant the ALT allele frequency is fetched from
    *ref_infer* (in parallel when n_cores > 1) into a temporary column
    *ref_eaf*; EAF is then set to MAF, flipped to 1-MAF where the reference
    frequency indicates the effect allele is the major allele. The temporary
    column is dropped before returning.

    Parameters mirror the other harmonization helpers in this module;
    ``force=True`` skips the STATUS-based variant filter and checks all rows.
    """
    ##start function with col checking##########################################################
    _start_line = "infer sumstats EAF from sumstats MAF using reference VCF ALT frequency"
    _end_line = "inferring sumstats EAF from sumstats MAF using reference VCF ALT frequency"
    _start_cols = [chr,pos,ref,alt,status]
    _start_function = ".infer_af()"
    _must_args ={"ref_alt_freq":ref_alt_freq}

    is_enough_info = start_to(sumstats=sumstats,
                              log=log,
                              verbose=verbose,
                              start_line=_start_line,
                              end_line=_end_line,
                              start_cols=_start_cols,
                              start_function=_start_function,
                              n_cores=n_cores,
                              ref_vcf=ref_infer,
                              **_must_args)
    if is_enough_info == False: return sumstats
    ############################################################################################
    chr_dict = auto_check_vcf_chr_dict(ref_infer, chr_dict, verbose, log)

    if eaf not in sumstats.columns:
        sumstats[eaf]=np.nan
    if ref_eaf not in sumstats.columns:
        sumstats[ref_eaf]=np.nan

    prenumber = sum(sumstats[eaf].isna())

    # ref_alt_freq INFO in vcf was provided
    if ref_alt_freq is not None:
        log.write(" -Field for alternative allele frequency in VCF INFO: {}".format(ref_alt_freq), verbose=verbose)

    # BUGFIX: good_chrpos was only assigned when force=False but is used
    # unconditionally below, raising a NameError when force=True; with force,
    # check every variant instead.
    if not force:
        good_chrpos = sumstats[status].str.match(r'\w\w\w[0]\w\w\w', case=False, flags=0, na=False)
    else:
        good_chrpos = pd.Series(True, index=sumstats.index)
    log.write(" -Checking variants:", sum(good_chrpos),verbose=verbose)

    ########################
    #extract ref af
    if sum(sumstats[eaf].isna())<10000:
        # too few rows to justify multiprocessing overhead
        n_cores=1
    df_split = _df_split(sumstats.loc[good_chrpos,[chr,pos,ref,alt]], n_cores)
    pool = Pool(n_cores)
    map_func = partial(inferaf,chr=chr,pos=pos,ref=ref,alt=alt,eaf=ref_eaf,ref_infer=ref_infer,ref_alt_freq=ref_alt_freq,chr_dict=chr_dict)
    sumstats.loc[good_chrpos,[ref_eaf]] = pd.concat(pool.map(map_func,df_split))
    pool.close()
    pool.join()

    ###########################
    # infer sumstats EAF from sumstats MAF and reference EAF:
    # flip when MAF and the reference ALT frequency fall on opposite sides of 0.5
    is_flipped = ((sumstats[ref_eaf]>=0.5)&(sumstats[maf]<=0.5)) |((sumstats[ref_eaf]<0.5)&(sumstats[maf]>0.5))
    sumstats[eaf] = sumstats[maf]
    log.write(" -Flipping MAF to obtain EAF for {} variants".format(sum(is_flipped)),verbose=verbose)
    sumstats.loc[is_flipped,eaf] = 1 - sumstats.loc[is_flipped,maf]

    ###########################
    afternumber = sum(sumstats[eaf].isna())
    log.write(" -Inferred EAF for {} variants.".format(prenumber - afternumber),verbose=verbose)
    log.write(" -EAF is still missing for {} variants.".format(afternumber),verbose=verbose)
    sumstats = sumstats.drop(columns=[ref_eaf])

    finished(log,verbose,_end_line)
    return sumstats
|
|
1562
|
+
|
|
1563
|
+
def inferaf(sumstats,ref_infer,ref_alt_freq=None,chr="CHR",pos="POS",ref="NEA",alt="EA",eaf="EAF",chr_dict=None):
    """Worker: fill *eaf* for each row of *sumstats* from a reference VCF.

    Opens *ref_infer* once and looks up the ALT frequency (field
    *ref_alt_freq* in the VCF INFO) per row via infer_af. Intended to be
    called through multiprocessing on a chunk of the full sumstats.
    """
    vcf_reader = VariantFile(ref_infer)

    def afapply(x,vcf,alt_freq,chr_dict):
        # BUGFIX: previously ignored its vcf/alt_freq parameters and reached
        # into the enclosing scope; use the arguments that are actually passed.
        # Row layout follows the caller's column selection: [chr, pos, ref, alt].
        return infer_af(x.iloc[0],x.iloc[1]-1,x.iloc[1],x.iloc[2],x.iloc[3],vcf,alt_freq,chr_dict)

    map_func = partial(afapply,vcf=vcf_reader,alt_freq=ref_alt_freq,chr_dict=chr_dict)
    status_inferred = sumstats.apply(map_func,axis=1)
    sumstats[eaf] = status_inferred.values
    sumstats[eaf]=sumstats[eaf].astype("float")
    return sumstats
|
|
1573
|
+
|
|
1574
|
+
def infer_af(chr,start,end,ref,alt,vcf_reader,alt_freq,chr_dict=None):
    """Look up the effect-allele frequency of one variant in a reference VCF.

    Fetches records in [start, end) on *chr* (optionally translated via
    *chr_dict*) and, for the record at position *end*, returns the first
    value of INFO field *alt_freq* — flipped to 1-f when the sumstats
    alleles are swapped relative to the VCF record. Returns np.nan when no
    matching record is found.
    """
    if chr_dict is not None:
        chr = chr_dict[chr]

    for record in vcf_reader.fetch(chr, start, end):
        # only the record exactly at the query position is relevant
        if record.pos != end:
            continue
        if record.ref == ref and (alt in record.alts):
            # orientation matches: report ALT frequency directly
            return record.info[alt_freq][0]
        if record.ref == alt and (ref in record.alts):
            # alleles swapped relative to the VCF: report complement
            return 1 - record.info[alt_freq][0]
    return np.nan
|
|
1585
|
+
|
|
1586
|
+
##############################################################################################################################################################################################
|
|
1497
1587
|
def auto_check_vcf_chr_dict(vcf_path, vcf_chr_dict, verbose, log):
|
|
1498
1588
|
if vcf_path is not None:
|
|
1499
1589
|
if vcf_chr_dict is None:
|
gwaslab/io_preformat_input.py
CHANGED
|
@@ -8,11 +8,13 @@ from gwaslab.bd_common_data import get_format_dict
|
|
|
8
8
|
from gwaslab.qc_fix_sumstats import sortcolumn
|
|
9
9
|
from gwaslab.qc_fix_sumstats import _process_build
|
|
10
10
|
from gwaslab.qc_check_datatype import check_datatype
|
|
11
|
+
from gwaslab.qc_check_datatype import quick_convert_datatype
|
|
11
12
|
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
12
|
-
|
|
13
|
+
from gwaslab.g_headers import _check_overlap_with_reserved_keys
|
|
13
14
|
#20221030
|
|
14
15
|
def preformat(sumstats,
|
|
15
16
|
fmt=None,
|
|
17
|
+
tab_fmt="tsv",
|
|
16
18
|
snpid=None,
|
|
17
19
|
rsid=None,
|
|
18
20
|
chrom=None,
|
|
@@ -66,12 +68,21 @@ def preformat(sumstats,
|
|
|
66
68
|
rename_dictionary = {}
|
|
67
69
|
usecols = []
|
|
68
70
|
dtype_dictionary ={}
|
|
69
|
-
|
|
71
|
+
if readargs is None:
|
|
72
|
+
readargs={}
|
|
70
73
|
#######################################################################################################################################################
|
|
71
74
|
# workflow:
|
|
72
75
|
# 1. formatbook
|
|
73
76
|
# 2. user specified header
|
|
74
77
|
# 3. usekeys
|
|
78
|
+
if tab_fmt=="parquet":
|
|
79
|
+
if type(sumstats) is str:
|
|
80
|
+
log.write("Start to load data from parquet file....",verbose=verbose)
|
|
81
|
+
log.write(" -path: {}".format(sumstats),verbose=verbose)
|
|
82
|
+
sumstats = pd.read_parquet(sumstats,**readargs)
|
|
83
|
+
log.write("Finished loading parquet file into pd.DataFrame....",verbose=verbose)
|
|
84
|
+
else:
|
|
85
|
+
raise ValueError("Please input a path for parquet file.")
|
|
75
86
|
|
|
76
87
|
if fmt is not None:
|
|
77
88
|
# loading format parameters
|
|
@@ -145,9 +156,11 @@ def preformat(sumstats,
|
|
|
145
156
|
if key in raw_cols:
|
|
146
157
|
usecols.append(key)
|
|
147
158
|
if value in ["EA","NEA"]:
|
|
148
|
-
dtype_dictionary[
|
|
149
|
-
if value in ["
|
|
150
|
-
dtype_dictionary[
|
|
159
|
+
dtype_dictionary[key]="category"
|
|
160
|
+
if value in ["STATUS"]:
|
|
161
|
+
dtype_dictionary[key]="string"
|
|
162
|
+
if value in ["CHR"]:
|
|
163
|
+
dtype_dictionary[key]="string"
|
|
151
164
|
|
|
152
165
|
except ValueError:
|
|
153
166
|
raise ValueError("Please input a path or a pd.DataFrame, and make sure the separator is correct and the columns you specified are in the file.")
|
|
@@ -276,6 +289,8 @@ def preformat(sumstats,
|
|
|
276
289
|
rename_dictionary[status]="STATUS"
|
|
277
290
|
dtype_dictionary[status]="string"
|
|
278
291
|
if other:
|
|
292
|
+
overlapped = _check_overlap_with_reserved_keys(other)
|
|
293
|
+
log.warning("Columns with headers overlapping with GWASLab reserved keywords:{}".format(overlapped),verbose=verbose)
|
|
279
294
|
usecols = usecols + other
|
|
280
295
|
for i in other:
|
|
281
296
|
rename_dictionary[i] = i
|
|
@@ -359,8 +374,13 @@ def preformat(sumstats,
|
|
|
359
374
|
sumstats = sumstats[usecols].copy()
|
|
360
375
|
for key,value in dtype_dictionary.items():
|
|
361
376
|
if key in usecols:
|
|
362
|
-
|
|
363
|
-
|
|
377
|
+
astype = value
|
|
378
|
+
if rename_dictionary[key]=="CHR":
|
|
379
|
+
astype ="Int64"
|
|
380
|
+
try:
|
|
381
|
+
sumstats[key] = sumstats[key].astype(astype)
|
|
382
|
+
except:
|
|
383
|
+
sumstats[key] = sumstats[key].astype("string")
|
|
364
384
|
except ValueError:
|
|
365
385
|
raise ValueError("Please input a path or a pd.DataFrame, and make sure it contain the columns.")
|
|
366
386
|
|
|
@@ -400,6 +420,8 @@ def preformat(sumstats,
|
|
|
400
420
|
|
|
401
421
|
## reodering ###################################################################################################
|
|
402
422
|
sumstats = sortcolumn(sumstats=sumstats,log=log,verbose=verbose)
|
|
423
|
+
sumstats = quick_convert_datatype(sumstats,log=log,verbose=verbose)
|
|
424
|
+
|
|
403
425
|
check_datatype(sumstats,log=log,verbose=verbose)
|
|
404
426
|
gc.collect()
|
|
405
427
|
check_dataframe_memory_usage(sumstats,log=log,verbose=verbose)
|
gwaslab/io_read_pipcs.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from gwaslab.g_Log import Log
|
|
3
|
+
from gwaslab.qc_check_datatype import check_datatype
|
|
4
|
+
from gwaslab.qc_check_datatype import check_dataframe_memory_usage
|
|
5
|
+
|
|
6
|
+
def _read_pipcs(data, output_prefix, log=Log(),verbose=True):
    """Load fine-mapping PIP / CREDIBLE_SET_INDEX results from '<output_prefix>.pipcs'.

    The loaded table is annotated with CHR/POS pulled from *data* (the main
    sumstats DataFrame) by SNPID, then checked for dtype and memory usage.
    Returns the merged pipcs DataFrame.
    """
    log.write("Start to load PIP and CREDIBLE_SET_INDEX from file...",verbose=verbose)
    log.write(" -File:{}.pipcs".format(output_prefix),verbose=verbose)

    pipcs_path = "{}.pipcs".format(output_prefix)
    pipcs = pd.read_csv(pipcs_path)

    log.write(" -Merging CHR and POS from main dataframe...",verbose=verbose)
    pipcs = _merge_chrpos(data,pipcs)

    n_rows, n_cols = len(pipcs), len(pipcs.columns)
    log.write(" -Current pipcs Dataframe shape :",n_rows," x ", n_cols,verbose=verbose)

    check_datatype(pipcs,log=log,verbose=verbose)
    check_dataframe_memory_usage(pipcs,log=log,verbose=verbose)
    log.write("Finished loading PIP and CREDIBLE_SET_INDEX from file!",verbose=verbose)
    return pipcs
|
|
20
|
+
|
|
21
|
+
def _merge_chrpos(data,pipcs):
|
|
22
|
+
df = pd.merge(pipcs, data,on="SNPID",how="left")
|
|
23
|
+
return df
|
gwaslab/io_to_formats.py
CHANGED
|
@@ -114,48 +114,49 @@ def _to_format(sumstats,
|
|
|
114
114
|
|
|
115
115
|
#######################################################################################################
|
|
116
116
|
#formatting float statistics
|
|
117
|
-
onetime_log.write(" -Formatting statistics ...",verbose=verbose)
|
|
118
117
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
118
|
+
if tab_fmt!="parquet":
|
|
119
|
+
onetime_log.write(" -Formatting statistics ...",verbose=verbose)
|
|
120
|
+
formats = {
|
|
121
|
+
'EAF': '{:.4g}',
|
|
122
|
+
'MAF': '{:.4g}',
|
|
123
|
+
'BETA': '{:.4f}',
|
|
124
|
+
'SE': '{:.4f}',
|
|
125
|
+
'BETA_95U': '{:.4f}',
|
|
126
|
+
'BETA_95L': '{:.4f}',
|
|
127
|
+
'Z': '{:.4f}',
|
|
128
|
+
'CHISQ': '{:.4f}',
|
|
129
|
+
'F': '{:.4f}',
|
|
130
|
+
'OR': '{:.4f}',
|
|
131
|
+
'OR_95U': '{:.4f}',
|
|
132
|
+
'OR_95L': '{:.4f}',
|
|
133
|
+
'HR': '{:.4f}',
|
|
134
|
+
'HR_95U': '{:.4f}',
|
|
135
|
+
'HR_95L': '{:.4f}',
|
|
136
|
+
'INFO': '{:.4f}',
|
|
137
|
+
'P': '{:.4e}',
|
|
138
|
+
'MLOG10P': '{:.4f}',
|
|
139
|
+
'DAF': '{:.4f}'}
|
|
140
|
+
|
|
141
|
+
for col, f in float_formats.items():
|
|
142
|
+
if col in output.columns:
|
|
143
|
+
formats[col]=f
|
|
144
|
+
|
|
145
|
+
for col, f in formats.items():
|
|
146
|
+
if col in output.columns:
|
|
147
|
+
if str(output[col].dtype) in ["Float32","Float64","float64","float32","float16","float"]:
|
|
148
|
+
output[col] = output[col].map(f.format)
|
|
149
|
+
|
|
150
|
+
onetime_log.write(" -Float statistics formats:",verbose=verbose)
|
|
151
|
+
keys=[]
|
|
152
|
+
values=[]
|
|
153
|
+
for key,value in formats.items():
|
|
154
|
+
if key in output.columns:
|
|
155
|
+
keys.append(key)
|
|
156
|
+
values.append(value)
|
|
157
|
+
|
|
158
|
+
onetime_log.write(" - Columns :",keys,verbose=verbose)
|
|
159
|
+
onetime_log.write(" - Output formats:",values,verbose=verbose)
|
|
159
160
|
|
|
160
161
|
##########################################################################################################
|
|
161
162
|
# output, mapping column names
|
|
@@ -233,7 +234,7 @@ def tofmt(sumstats,
|
|
|
233
234
|
if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
|
|
234
235
|
sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
|
|
235
236
|
# add prefix to CHR
|
|
236
|
-
elif chr_prefix
|
|
237
|
+
elif len(chr_prefix)>0:
|
|
237
238
|
sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
|
|
238
239
|
|
|
239
240
|
####################################################################################################################
|
|
@@ -409,7 +410,7 @@ def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tab
|
|
|
409
410
|
log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
|
|
410
411
|
log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
|
|
411
412
|
for single_chr in list(sumstats["CHR"].unique()):
|
|
412
|
-
single_path = path.replace("@",single_chr)
|
|
413
|
+
single_path = path.replace("@","{}".format(single_chr))
|
|
413
414
|
|
|
414
415
|
fast_to_csv(sumstats.loc[sumstats[chr_header]==single_chr,:],
|
|
415
416
|
single_path,
|
|
@@ -422,7 +423,7 @@ def _write_tabular(sumstats,rename_dictionary, path, tab_fmt, to_csvargs, to_tab
|
|
|
422
423
|
log.write(f" -@ detected: writing each chromosome to a single file...",verbose=verbose)
|
|
423
424
|
log.write(" -Chromosomes:{}...".format(list(sumstats["CHR"].unique())),verbose=verbose)
|
|
424
425
|
for single_chr in list(sumstats["CHR"].unique()):
|
|
425
|
-
single_path = path.replace("@",single_chr)
|
|
426
|
+
single_path = path.replace("@","{}".format(single_chr))
|
|
426
427
|
|
|
427
428
|
sumstats.loc[sumstats[chr_header]==single_chr,:].to_csv(path, index=None, **to_csvargs)
|
|
428
429
|
else:
|
gwaslab/qc_check_datatype.py
CHANGED
|
@@ -5,7 +5,54 @@ from gwaslab.g_Log import Log
|
|
|
5
5
|
# pandas.api.types.is_int64_dtype
|
|
6
6
|
# pandas.api.types.is_categorical_dtype
|
|
7
7
|
|
|
8
|
+
# Accepted pandas dtypes for each GWASLab reserved column header.
# The FIRST entry of each list is the canonical dtype that
# quick_convert_datatype falls back to when a column's dtype is not listed.
# NOTE(review): key sets differ slightly from g_headers.dtype_dic
# (e.g. I2/PHET here vs I2_HET/P_HET there) — confirm intended.
dtype_dict ={
    "SNPID":["string","object"],
    "rsID":["string","object"],
    "CHR":["Int64","int64","int32","Int32","int"],
    "POS":["int64","Int64"],
    "EA":["category"],
    "NEA":["category"],
    "REF":["category"],
    "ALT":["category"],
    "BETA":["float64"],
    "BETA_95L":["float64"],
    "BETA_95U":["float64"],
    "SE":["float64"],
    "N":["Int64","int64","int32","Int32","int"],
    "N_CASE":["Int64","int64","int32","Int32","int"],
    "N_CONTROL":["Int64","int64","int32","Int32","int"],
    "OR":["float64"],
    "OR_95L":["float64"],
    "OR_95U":["float64"],
    "HR":["float64"],
    "HR_95L":["float64"],
    "HR_95U":["float64"],
    "P":["float64"],
    "MLOG10P":["float64"],
    "Z":["float64"],
    "F":["float64"],
    "T":["float64"],
    "TEST":["string","object","category"],
    "CHISQ":["float64"],
    "I2":["float64"],
    "PHET":["float64"],
    "SNPR2":["float64"],
    "EAF":["float64","float","float32"],
    "NEAF":["float64","float","float32"],
    "MAF":["float64","float","float32"],
    "INFO":["float64","float","float32"],
    "DOF":["Int64","int64","int32","Int32","int"],
    "STATUS":["category"],
    "DIRECTION":["string","object"],
    'PIP' :["float64","float","float32"],
    'CREDIBLE_SET_INDEX':["Int64","int64","int32","Int32","int"],
    'N_SNP' :["Int64","int64","int32","Int32","int"],
    'LOCUS' :["string","object","category"],
    'STUDY' :["string","object","category"]
    }
|
|
53
|
+
|
|
8
54
|
def check_datatype(sumstats, verbose=True, log=Log()):
|
|
55
|
+
|
|
9
56
|
try:
|
|
10
57
|
headers = []
|
|
11
58
|
dtypes = []
|
|
@@ -39,47 +86,6 @@ def check_datatype(sumstats, verbose=True, log=Log()):
|
|
|
39
86
|
|
|
40
87
|
def verify_datatype(header, dtype):
|
|
41
88
|
|
|
42
|
-
dtype_dict ={
|
|
43
|
-
"SNPID":["object","string"],
|
|
44
|
-
"rsID":["object","string"],
|
|
45
|
-
"CHR":["int32","Int32","int64","Int64"],
|
|
46
|
-
"POS":["int64","Int64"],
|
|
47
|
-
"EA":"category",
|
|
48
|
-
"NEA":"category",
|
|
49
|
-
"REF":"category",
|
|
50
|
-
"ALT":"category",
|
|
51
|
-
"BETA":"float64",
|
|
52
|
-
"BETA_95L":"float64",
|
|
53
|
-
"BETA_95U":"float64",
|
|
54
|
-
"SE":"float64",
|
|
55
|
-
"N":["int","Int32","Int64","int32","int64"],
|
|
56
|
-
"N_CASE":["int","Int32","Int64","int32","int64"],
|
|
57
|
-
"N_CONTROL":["int","Int32","Int64","int32","int64"],
|
|
58
|
-
"OR":"float64",
|
|
59
|
-
"OR_95L":"float64",
|
|
60
|
-
"OR_95U":"float64",
|
|
61
|
-
"HR":"float64",
|
|
62
|
-
"HR_95L":"float64",
|
|
63
|
-
"HR_95U":"float64",
|
|
64
|
-
"P":"float64",
|
|
65
|
-
"MLOG10P":"float64",
|
|
66
|
-
"Z":"float64",
|
|
67
|
-
"F":"float64",
|
|
68
|
-
"T":"float64",
|
|
69
|
-
"TEST":["object","string","category"],
|
|
70
|
-
"CHISQ":"float64",
|
|
71
|
-
"I2":"float64",
|
|
72
|
-
"PHET":"float64",
|
|
73
|
-
"SNPR2":"float64",
|
|
74
|
-
"EAF":["float","float32","float64"],
|
|
75
|
-
"NEAF":["float","float32","float64"],
|
|
76
|
-
"MAF":["float","float32","float64"],
|
|
77
|
-
"INFO":["float32","float64"],
|
|
78
|
-
"DOF":["int","Int32","Int64","int32","int64"],
|
|
79
|
-
"STATUS":"category",
|
|
80
|
-
"DIRECTION":["object","string"],
|
|
81
|
-
}
|
|
82
|
-
|
|
83
89
|
if header in dtype_dict.keys():
|
|
84
90
|
if str(dtype) in dtype_dict[header]:
|
|
85
91
|
return "T"
|
|
@@ -88,6 +94,22 @@ def verify_datatype(header, dtype):
|
|
|
88
94
|
else:
|
|
89
95
|
return "NA"
|
|
90
96
|
|
|
97
|
+
def quick_convert_datatype(sumstats, log, verbose):
    """Best-effort conversion of known columns to their canonical dtype.

    For every column whose header appears in dtype_dict but whose current
    dtype is not in the accepted list, try casting to the first (canonical)
    dtype. Failures are logged and the original dtype is kept — this is
    deliberately best-effort and never raises.

    Returns the (possibly modified) sumstats DataFrame.
    """
    for col in sumstats.columns:
        # guard clauses: skip unknown headers and already-acceptable dtypes
        if col not in dtype_dict:
            continue
        if str(sumstats[col].dtypes) in dtype_dict[col]:
            continue
        datatype = dtype_dict[col][0]
        log.write(" -Trying to convert datatype for {}: {} -> {}...".format(col, str(sumstats[col].dtypes), datatype), end="" ,verbose=verbose)
        try:
            sumstats[col] = sumstats[col].astype(datatype)
            log.write("{}".format(datatype),show_time=False, verbose=verbose)
        # ROBUSTNESS: was a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; narrow to Exception.
        except Exception:
            log.write("Failed...",show_time=False,verbose=verbose)
    return sumstats
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
|
|
91
113
|
def check_dataframe_shape(sumstats, log, verbose):
|
|
92
114
|
memory_in_mb = sumstats.memory_usage().sum()/1024/1024
|
|
93
115
|
try:
|
|
@@ -100,4 +122,5 @@ def check_dataframe_memory_usage(sumstats, log, verbose):
|
|
|
100
122
|
try:
|
|
101
123
|
log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
|
|
102
124
|
except:
|
|
103
|
-
log.warning("Error: cannot get Memory usage...")
|
|
125
|
+
log.warning("Error: cannot get Memory usage...")
|
|
126
|
+
|