gwaslab 3.4.36__py3-none-any.whl → 3.4.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gwaslab might be problematic.
- gwaslab/__init__.py +1 -1
- gwaslab/data/formatbook.json +722 -721
- gwaslab/g_Log.py +8 -0
- gwaslab/g_Sumstats.py +80 -178
- gwaslab/g_SumstatsPair.py +6 -2
- gwaslab/g_Sumstats_summary.py +3 -3
- gwaslab/g_meta.py +13 -3
- gwaslab/g_version.py +2 -2
- gwaslab/hm_casting.py +29 -15
- gwaslab/hm_harmonize_sumstats.py +312 -159
- gwaslab/hm_rsid_to_chrpos.py +1 -1
- gwaslab/io_preformat_input.py +46 -37
- gwaslab/io_to_formats.py +428 -295
- gwaslab/qc_check_datatype.py +15 -1
- gwaslab/qc_fix_sumstats.py +956 -719
- gwaslab/util_ex_calculate_ldmatrix.py +29 -11
- gwaslab/util_ex_gwascatalog.py +1 -1
- gwaslab/util_ex_ldproxyfinder.py +1 -1
- gwaslab/util_ex_process_h5.py +26 -17
- gwaslab/util_ex_process_ref.py +3 -3
- gwaslab/util_ex_run_coloc.py +26 -4
- gwaslab/util_in_convert_h2.py +1 -1
- gwaslab/util_in_fill_data.py +44 -5
- gwaslab/util_in_filter_value.py +122 -34
- gwaslab/util_in_get_density.py +2 -2
- gwaslab/util_in_get_sig.py +41 -9
- gwaslab/viz_aux_quickfix.py +26 -21
- gwaslab/viz_aux_reposition_text.py +7 -4
- gwaslab/viz_aux_save_figure.py +6 -5
- gwaslab/viz_plot_compare_af.py +5 -5
- gwaslab/viz_plot_compare_effect.py +22 -5
- gwaslab/viz_plot_miamiplot2.py +28 -20
- gwaslab/viz_plot_mqqplot.py +214 -98
- gwaslab/viz_plot_qqplot.py +11 -8
- gwaslab/viz_plot_regionalplot.py +16 -9
- gwaslab/viz_plot_trumpetplot.py +15 -6
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/METADATA +3 -3
- gwaslab-3.4.38.dist-info/RECORD +72 -0
- gwaslab-3.4.36.dist-info/RECORD +0 -72
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/LICENSE +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/WHEEL +0 -0
- {gwaslab-3.4.36.dist-info → gwaslab-3.4.38.dist-info}/top_level.txt +0 -0
gwaslab/io_to_formats.py
CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd
 import yaml
 import hashlib
+import copy
 from pysam import tabix_compress
 from pysam import tabix_index
 from datetime import datetime
@@ -11,6 +12,8 @@ from gwaslab.g_Log import Log
 from gwaslab.bd_common_data import get_format_dict
 from gwaslab.bd_common_data import get_number_to_chr
 from gwaslab.g_version import gwaslab_info
+from gwaslab.bd_get_hapmap3 import gethapmap3
+
 # to vcf
 # to fmt
 ## vcf
@@ -19,7 +22,168 @@ from gwaslab.g_version import gwaslab_info
 ## annovar
 ## general : ldsc, plink, plink2, saige, regenie
 ###################################################################################################################################################
+def _to_format(sumstats,
+               path="./sumstats",
+               fmt="gwaslab",
+               extract=None,
+               exclude=None,
+               cols=None,
+               id_use="rsID",
+               hapmap3=False,
+               exclude_hla=False,
+               hla_range=(25,34),
+               build=None,
+               n=None,
+               no_status=False,
+               output_log=True,
+               to_csvargs=None,
+               float_formats=None,
+               xymt_number=False,
+               xymt=None,
+               chr_prefix="",
+               meta=None,
+               ssfmeta=False,
+               md5sum=False,
+               bgzip=False,
+               tabix=False,
+               tabix_indexargs={},
+               log=Log(),
+               verbose=True):
+
+    if to_csvargs is None:
+        to_csvargs = {}
+    if float_formats is None:
+        float_formats={}
+    if cols is None:
+        cols=[]
+    if xymt is None:
+        xymt = ["X","Y","MT"]
+    onetime_log = copy.deepcopy(log)
+
+    #######################################################################################################
+
+    formatlist= get_formats_list() + ["vep","bed","annovar","vcf"]
+    if fmt in formatlist:
+        onetime_log.write("Start to convert the output sumstats in: ",fmt, " format",verbose=verbose)
+    else:
+        raise ValueError("Please select a format to output")
+    suffix=fmt
+
+    #######################################################################################################
+    # filter
+    output = sumstats.copy()
+
+    if extract is not None:
+        onetime_log.write(" -Extracting {} variants from the main DataFrame...".format(len(extract)),verbose=verbose)
+        output = output.loc[output[id_use].isin(extract),:]
+        onetime_log.write(" -Extracted {} variants from the main DataFrame...".format(len(output)),verbose=verbose)
+
+    if exclude is not None:
+        onetime_log.write(" -Excluding {} variants from the main DataFrame...".format(len(exclude)),verbose=verbose)
+        output = output.loc[~output[id_use].isin(exclude),:]
+        onetime_log.write(" -Excluded {} variants from the main DataFrame...".format(len(output)),verbose=verbose)
+
+    #hla and hapmap3 #######################################################################################
+
+    #exclude hla
+    if exclude_hla==True:
+        onetime_log.write(" -Excluding variants in MHC (HLA) region ...",verbose=verbose)
+        before = len(output)
+        is_hla = (output["CHR"].astype("string") == "6") & (output["POS"].astype("Int64") > hla_range[0]*1000000) & (output["POS"].astype("Int64") < hla_range[1]*1000000)
+        output = output.loc[~is_hla,:]
+        after = len(output)
+        onetime_log.write(" -Exclude "+ str(before - after) + " variants in MHC (HLA) region : {}Mb - {}Mb.".format(hla_range[0], hla_range[1]),verbose=verbose)
+        suffix = "noMHC."+suffix
+
+    #extract hapmap3 SNPs
+    if hapmap3==True:
+        output = gethapmap3(output,build=build,verbose=verbose)
+        after = len(output)
+        onetime_log.write(" -Extract {} variants in Hapmap3 datasets for build {}.".format(after, build ),verbose=verbose)
+        suffix = "hapmap3."+suffix
+
+    # add a n column
+    if n is not None:
+        output["N"] = n
+
+    #######################################################################################################
+    #formatting float statistics
+    onetime_log.write(" -Formatting statistics ...",verbose=verbose)
+
+    formats = {
+        'EAF': '{:.4g}',
+        'MAF': '{:.4g}',
+        'BETA': '{:.4f}',
+        'SE': '{:.4f}',
+        'BETA_95U': '{:.4f}',
+        'BETA_95L': '{:.4f}',
+        'Z': '{:.4f}',
+        'CHISQ': '{:.4f}',
+        'F': '{:.4f}',
+        'OR': '{:.4f}',
+        'OR_95U': '{:.4f}',
+        'OR_95L': '{:.4f}',
+        'HR': '{:.4f}',
+        'HR_95U': '{:.4f}',
+        'HR_95L': '{:.4f}',
+        'INFO': '{:.4f}',
+        'P': '{:.4e}',
+        'MLOG10P': '{:.4f}',
+        'DAF': '{:.4f}'}
+
+    for col, f in float_formats.items():
+        if col in output.columns:
+            formats[col]=f
+
+    for col, f in formats.items():
+        if col in output.columns:
+            if str(output[col].dtype) in ["Float32","Float64","float64","float32","float16","float"]:
+                output[col] = output[col].map(f.format)
 
+    onetime_log.write(" -Float statistics formats:",verbose=verbose)
+    keys=[]
+    values=[]
+    for key,value in formats.items():
+        if key in output.columns:
+            keys.append(key)
+            values.append(value)
+
+    onetime_log.write(" - Columns :",keys,verbose=verbose)
+    onetime_log.write(" - Output formats:",values,verbose=verbose)
+
+    ##########################################################################################################
+    # output, mapping column names
+
+    if fmt in get_formats_list() + ["vep","bed","annovar","vcf"]:
+        tofmt(output,
+              path=path,
+              fmt=fmt,
+              cols=cols,
+              suffix=suffix,
+              build=build,
+              verbose=verbose,
+              no_status=no_status,
+              log=onetime_log,
+              to_csvargs=to_csvargs,
+              chr_prefix=chr_prefix,
+              meta=meta,
+              ssfmeta=ssfmeta,
+              bgzip=bgzip,
+              tabix=tabix,
+              tabix_indexargs=tabix_indexargs,
+              md5sum=md5sum,
+              xymt_number=xymt_number,
+              xymt=xymt)
+
+    if output_log is True:
+        log_path = path + "."+ suffix + ".log"
+        onetime_log.write(" -Saving log file to: {}".format(log_path),verbose=verbose)
+        onetime_log.write("Finished outputting successfully!",verbose=verbose)
+        try:
+            onetime_log.save(log_path, verbose=False)
+        except:
+            pass
+
 ###################################################################################################################################################
 def tofmt(sumstats,
           meta,
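The new _to_format wrapper gathers the filtering options (extract/exclude lists, MHC exclusion, HapMap3 subsetting) in one place before dispatching to tofmt. For reference, a minimal sketch, on toy data and outside gwaslab, of the chr6 25-34 Mb mask it builds when exclude_hla=True:

    # Illustration only (toy data, not gwaslab's API): the chr6 25-34 Mb mask that
    # _to_format builds above when exclude_hla=True.
    import pandas as pd

    toy = pd.DataFrame({"CHR": [1, 6, 6], "POS": [100000, 30000000, 40000000]})
    hla_range = (25, 34)
    is_hla = (
        (toy["CHR"].astype("string") == "6")
        & (toy["POS"].astype("Int64") > hla_range[0] * 1000000)
        & (toy["POS"].astype("Int64") < hla_range[1] * 1000000)
    )
    print(toy.loc[~is_hla, :])  # keeps chr1:100000 and chr6:40000000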
@@ -47,229 +211,74 @@ def tofmt(sumstats,
     if fmt in ["ssf"]:
         xymt_number=True
     if "SNPID" in sumstats.columns:
-
+        log.write(' -Replacing SNPID separator from ":" to "_"...')
         sumstats["SNPID"] = sumstats["SNPID"].str.replace(":","_")
-
+    log.write(" -Start outputting sumstats in "+fmt+" format...")
 
     if "CHR" in sumstats.columns:
         if xymt_number is False and pd.api.types.is_integer_dtype(sumstats["CHR"]):
             sumstats["CHR"]= sumstats["CHR"].map(get_number_to_chr(xymt=xymt,prefix=chr_prefix))
         elif chr_prefix is not None:
             sumstats["CHR"]= chr_prefix + sumstats["CHR"].astype("string")
-
-    ### calculate meta data
-    if "EAF" in sumstats.columns:
-        min_maf = sumstats["EAF"].min()
-    else:
-        min_maf = "Unknown"
-
-    if "N" in sumstats.columns:
-        n_median = sumstats["N"].median()
-        n_max = sumstats["N"].max()
-        n_min = sumstats["N"].min()
-    else:
-        n_median = "Unknown"
-        n_max = "Unknown"
-        n_min = "Unknown"
-
 
+    ####################################################################################################################
     if fmt=="bed":
         # bed-like format, 0-based,
         # first 3 columns : chromosome, start, end
         # https://genome.ucsc.edu/FAQ/FAQformat.html#format1
-        is_snp = (sumstats
-
-
+        is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
+        log.write(" -formatting to 0-based bed-like file...")
+        log.write(" -format description: {}".format("https://genome.ucsc.edu/FAQ/FAQformat.html#format1"))
 
-
-        if verbose: log.write(" -Number of Insertions :",sum(is_insert))
-        if verbose: log.write(" -Number of Deletions :",sum(is_delete))
-
-        if verbose: log.write(" -formatting to 0-based bed-like file...")
-        # for snp
-        # start = pos - 1 ; end = pos
-        # A/G
-        # AT/CG
-        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]-1
-        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
-        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
-
-        # for insertion
-        # start = pos : end = pos
-        # A/ATC -> -/TC
-        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]
-        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
-        sumstats.loc[is_insert,"NEA/EA"] = "-/"+sumstats.loc[is_insert,"EA"].str.slice(start=1)
-
-        # for deletion
-        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
-        # ATC/A -> TC/-
-        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
-        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + sumstats.loc[is_delete,"NEA"].str.len() - 1
-        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
-
-        sumstats["STRAND"]="+"
+        sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete, log, verbose )
 
-        sumstats["START"] = sumstats["START"].astype("Int64")
-        sumstats["END"] = sumstats["END"].astype("Int64")
         ouput_cols=["CHR","START","END","NEA/EA","STRAND","SNPID"] + cols
 
-        sumstats
-        path = path + "."+suffix
-        if verbose: log.write(" -Output columns:",sumstats.columns)
-        if verbose: log.write(" -Output path:",path)
-
-        sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
-        #tabix_compress
-        #tabix_index
-        if bgzip is True:
-            if verbose: log.write(" -bgzip compressing ...")
-            tabix_compress(path, path+".gz",force=True)
-        if tabix is True:
-            if verbose: log.write(" -tabix indexing...")
-            if "preset" not in tabix_indexargs:
-                tabix_indexargs["preset"] = "bed"
-            if "force" not in tabix_indexargs:
-                tabix_indexargs["force"] = True
-
-            tabix_index(path+".gz", **tabix_indexargs)
+        _output_bed_like(sumstats, path, "bed", suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
     ####################################################################################################################
     elif fmt=="vep":
         # bed-like format, 1-based
         # first 6 columns : chromosome, start, end, allele, strand, identifier
         # https://asia.ensembl.org/info/docs/tools/vep/vep_formats.html
 
-        is_snp = (sumstats
-        is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
-        is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
-
-        if verbose: log.write(" -Number of SNPs :",sum(is_snp))
-        if verbose: log.write(" -Number of Insertions :",sum(is_insert))
-        if verbose: log.write(" -Number of Deletions :",sum(is_delete))
+        is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
 
-
-
-
-        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
-        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
-        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
-
-        # for insertion
-        # start = pos+1 ; end = pos
-        # A/ATC -> -/TC
-        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"] + 1
-        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
-        sumstats.loc[is_insert,"NEA/EA"] = "-/" + sumstats.loc[is_insert,"EA"].str.slice(start=1)
-
-        # for deletion
-        # start = pos ; end = pos + len(Ref) -1
-        # ATC/A -> TC/-
-        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"] + 1
-        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + (sumstats.loc[is_delete,"NEA"].str.len() -1)
-        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
-
-        sumstats["STRAND"]="+"
-
-        sumstats["START"] = sumstats["START"].astype("Int64")
-        sumstats["END"] = sumstats["END"].astype("Int64")
+        log.write(" -formatting to 1-based bed-like file (for vep)...")
+        log.write(" -format description: {}".format("http://asia.ensembl.org/info/docs/tools/vep/vep_formats.html"))
+        sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete , log, verbose)
 
         ouput_cols=["CHR","START","END","NEA/EA","STRAND","SNPID"]+ cols
-        sumstats = sumstats.loc[:,ouput_cols]
-        path = path + "."+suffix+".gz"
-        if verbose: log.write(" -Output columns:",sumstats.columns)
-        if verbose: log.write(" -Output path:",path)
 
-        sumstats
-
+        _output_bed_like(sumstats, path,"vep", suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
+
     ####################################################################################################################
     elif fmt=="annovar":
         # bed-like format, 1-based,
         # first 3 columns : Chromosome ("chr" prefix is optional), Start, End, Reference Allelel, Alternative Allele
         # https://annovar.openbioinformatics.org/en/latest/user-guide/input/
-        is_snp = (sumstats
-
-
-
-        if verbose: log.write(" -Number of SNPs :",sum(is_snp))
-        if verbose: log.write(" -Number of Insertions :",sum(is_insert))
-        if verbose: log.write(" -Number of Deletions :",sum(is_delete))
-
-        if verbose: log.write(" -formatting to 1-based bed-like file...")
-        # for snp
-        # start = pos ; end = pos
-        # A/G
-        # AT/CG
-        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]
-        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
-        sumstats.loc[is_snp,"NEA_out"] = sumstats.loc[is_snp,"NEA"].astype("string")
-        sumstats.loc[is_snp,"EA_out"] = sumstats.loc[is_snp,"EA"].astype("string")
-
-        # for insertion
-        # start = pos : end = pos
-        # A/ATC -> -/TC
-        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]+1
-        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]+1
-        sumstats.loc[is_insert,"NEA_out"] = "-"
-        sumstats.loc[is_insert,"EA_out"] = sumstats.loc[is_insert,"EA"].str.slice(start=1)
-
-        # for deletion
-        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
-        # ATC/A -> TC/-
-        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
-        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"]- 1 + sumstats.loc[is_delete,"NEA"].str.len()
-        sumstats.loc[is_delete,"NEA_out"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)
-        sumstats.loc[is_delete,"EA_out"] = "-"
+        is_snp,is_insert,is_delete = _check_indel(sumstats,log,verbose)
+
+        log.write(" -formatting to 1-based bed-like file...")
+        log.write(" -format description: {}".format("https://annovar.openbioinformatics.org/en/latest/user-guide/input/"))
 
-        sumstats
-        sumstats["END"] = sumstats["END"].astype("Int64")
+        sumstats = _adjust_position(sumstats, fmt, is_snp, is_insert, is_delete, log, verbose )
 
         ouput_cols=["CHR","START","END","NEA_out","EA_out","SNPID"]+ cols
-        sumstats = sumstats.loc[:,ouput_cols]
-        path = path + "."+suffix
-        if verbose: log.write(" -Output columns:",sumstats.columns)
-        if verbose: log.write(" -Output path:",path)
 
-        sumstats
-
-        #tabix_index
-        if bgzip is True:
-            if verbose: log.write(" -bgzip compressing ...")
-            tabix_compress(path, path+".gz",force=True)
-        if md5sum is True: md5sum_file(path+".gz",log,verbose)
-        if tabix is True:
-            if verbose: log.write(" -tabix indexing...")
-            if "preset" not in tabix_indexargs:
-                tabix_indexargs["preset"] = "bed"
-            if "force" not in tabix_indexargs:
-                tabix_indexargs["force"] = True
-            tabix_index(path+".gz", **tabix_indexargs)
+        _output_bed_like(sumstats, path, fmt, suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose)
+
     ####################################################################################################################
     elif fmt=="vcf":
-
+        # GWAS-VCF
+        log.write(" -"+fmt+" format will be loaded...",verbose=verbose)
         meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
-
-        if verbose:
-            log.write(" -"+fmt+" format meta info:")
-            for key,value in meta_data.items():
-                if key not in ["format_fixed_header","format_contig_19","format_contig_38"]:
-                    log.write(" -",key," : ",value)
+        print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True, skip_meta_records=["format_fixed_header","format_contig_19","format_contig_38"])
 
         # determine which ID to use
         if "rsID" in sumstats.columns:
             rename_dictionary["rsID"]="ID"
         else:
             rename_dictionary["SNPID"]="ID"
-
-        # logging
-        if verbose:
-            log.write(" -gwaslab to "+fmt+" format dictionary:")
-            keys=[]
-            values=[]
-            for key,value in rename_dictionary.items():
-                keys.append(key)
-                values.append(value)
-            log.write(" - gwaslab keys:",','.join(keys))
-            log.write(" - "+fmt+" values:",','.join(values))
 
         # get the columns to output
         ouput_cols=[]
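The per-format branches above now delegate SNP/indel classification to the new _check_indel helper instead of repeating the allele-length comparisons inline. A rough standalone illustration of that classification (toy data, column names as in the diff):

    # Illustration only: the EA/NEA length comparison performed by the new
    # _check_indel helper, on a made-up three-variant DataFrame.
    import pandas as pd

    toy = pd.DataFrame({"EA": ["A", "ATC", "A"], "NEA": ["G", "A", "ATC"]})
    is_snp = toy["EA"].str.len() == toy["NEA"].str.len()
    is_insert = (toy["EA"].str.len() > 1) & (toy["NEA"].str.len() == 1)
    is_delete = (toy["EA"].str.len() == 1) & (toy["NEA"].str.len() > 1)
    print(sum(is_snp), sum(is_insert), sum(is_delete))  # 1 1 1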
@@ -277,12 +286,10 @@ def tofmt(sumstats,
             if i in rename_dictionary.keys():
                 ouput_cols.append(i)
         ouput_cols = ouput_cols +["STATUS"]+ cols
-        sumstats = sumstats
+        sumstats = sumstats[ouput_cols]
         sumstats = sumstats.rename(columns=rename_dictionary)
 
-        #
-        harmonised = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][012][01234]", case=False, flags=0, na=False ) )
-        switchedalleles = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][12][24]", case=False, flags=0, na=False ) )
+        # replace : with _
         sumstats["ID"] = sumstats["ID"].str.replace(":","_")
 
         # process Allele frequency data
@@ -297,35 +304,21 @@ def tofmt(sumstats,
             if i in meta_data["format_format"]:
                 output_format.append(i)
 
-        # Create vcf header
-        vcf_header= meta_data["format_fixed_header"] +"\n"+ meta_data["format_contig_"+str(build)]+"\n"
-        # Create sample header
-        vcf_header+="##SAMPLE=<ID={},TotalVariants={},VariantsNotRead=0,HarmonisedVariants={},VariantsNotHarmonised={},SwitchedAlleles={},StudyType={}>\n".format(meta["gwaslab"]["study_name"],len(sumstats),harmonised,len(sumstats)-harmonised,switchedalleles,meta["gwaslab"]["study_type"])
-        vcf_header+="##gwaslab_version="+gwaslab_info()["version"]+"\n"
-
-
-        #StudyID=meta["Name"]
-        #otalVariants = len(sumstats)
-        #HarmonisedVariants =
-        #VariantsNotHarmonised =
-        #StudyType=
-        ##SAMPLE=<ID=IEU-b-1,TotalVariants=9851866,VariantsNotRead=0,HarmonisedVariants=9851866,VariantsNotHarmonised=0,SwitchedAlleles=9851866,StudyType=Continuous>
-
-
         # determine path
         path = path + "."+suffix
-        if verbose: log.write(" -Output path:",path)
-        if verbose: log.write(" -vcf header contig build:"+str(build))
 
+
+        vcf_header = _process_vcf_header(sumstats, meta, meta_data, build, log, verbose)
+
+        log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
         # output header
         with open(path,"w") as file:
             file.write(vcf_header)
 
         with open(path,"a") as file:
-
+            log.write(" -Output columns:"," ".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]]))
             file.write("\t".join(meta_data["format_fixed"]+[meta["gwaslab"]["study_name"]])+"\n")
-
-            counter=0
+            log.write(" -Outputing data...")
             QUAL="."
             FILTER="PASS"
             for index,row in sumstats.iterrows():
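The VCF branch keeps the same two-step write pattern: the header is written with mode "w", then the column line and data rows are appended with mode "a"; header construction itself moves into the new _process_vcf_header helper (see the last hunk). A minimal, hypothetical example of the pattern, independent of gwaslab:

    # Hypothetical minimal example of the same write pattern (not gwaslab code):
    # header with mode "w", then column line and rows appended with mode "a".
    path = "mysumstats.vcf"
    vcf_header = "##fileformat=VCFv4.2\n"
    with open(path, "w") as file:
        file.write(vcf_header)
    with open(path, "a") as file:
        file.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                              "FILTER", "INFO", "FORMAT", "STUDY1"]) + "\n")
        file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            "1", 100, "rs123", "A", "G", ".", "PASS", ".", "ES:SE", "0.1:0.01"))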
@@ -337,112 +330,153 @@ def tofmt(sumstats,
                 INFO=str(row["INFO"])
                 FORMAT=":".join(output_format)
                 DATA=":".join(row[output_format].astype("string"))
-                file.write(
-
-
-        if verbose: log.write(" -bgzip compressing ...")
-        tabix_compress(path, path+".gz",force=True)
-        if md5sum is True: md5sum_file(path+".gz",log,verbose)
-        if tabix==True:
-            if verbose: log.write(" -tabix indexing...")
-            if "preset" not in tabix_indexargs:
-                tabix_indexargs["preset"] = "vcf"
-            if "force" not in tabix_indexargs:
-                tabix_indexargs["force"] = True
-            tabix_index(path+".gz", **tabix_indexargs)
+                file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, DATA))
+        _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
+
     ####################################################################################################################
-    elif fmt in get_formats_list():
-
+    elif fmt in get_formats_list():
+        # tabular
+        log.write(" -"+fmt+" format will be loaded...",verbose=verbose)
         meta_data,rename_dictionary = get_format_dict(fmt,inverse=True)
         print_format_info(fmt=fmt, meta_data=meta_data,rename_dictionary=rename_dictionary,verbose=verbose, log=log, output=True)
-        #if verbose:
-        #    log.write(" -"+fmt+" format meta info:")
-        #    for key,value in meta_data.items():
-        #        if type(value) is list:
-        #            log.write(" -",key," : ",','.join(value))
-        #        else:
-        #            log.write(" -",key," : ",value)
-        #if verbose:
-        #    log.write(" -gwaslab to "+fmt+" format dictionary:",)
-        #    keys=[]
-        #    values=[]
-        #    for key,value in rename_dictionary.items():
-        #        keys.append(key)
-        #        values.append(value)
-        #    log.write(" - gwaslab keys:", ','.join(keys))
-        #    log.write(" - "+fmt+" values:" , ','.join(values))
-
-        # grab format cols that exist in sumstats
-        ouput_cols=[]
-        for i in sumstats.columns:
-            if i in rename_dictionary.keys():
-                ouput_cols.append(i)
-        # + additional cols
-        ouput_cols = ouput_cols + cols
-        try:
-            if no_status == True:
-                ouput_cols.remove("STATUS")
-        except:
-            pass
-        sumstats = sumstats.loc[:,ouput_cols]
-        sumstats = sumstats.rename(columns=rename_dictionary)
 
-
         ymal_path = path + "."+suffix+".tsv-meta.ymal"
         path = path + "."+suffix+".tsv.gz"
+        log.write(" -Output path:",path, verbose=verbose)
 
-
+        sumstats,to_csvargs = _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose)
 
-
-
-
-
-        to_csvargs["sep"]="\t"
-        if "format_na" in meta_data.keys():
-            to_csvargs["na_rep"] = meta_data["format_na"]
-        if "format_col_order" in meta_data.keys():
-            fixed_col =[]
-            other_col=[]
-            for i in meta_data["format_col_order"]:
-                if i in sumstats.columns:
-                    fixed_col.append(i)
-            for i in sumstats.columns:
-                if i not in meta_data["format_col_order"]:
-                    other_col.append(i)
-
-            sumstats = sumstats.loc[:,fixed_col + other_col]
-            if verbose: log.write(" -Reordering columns...")
-
-        if verbose: log.write(" -Output columns:",','.join(sumstats.columns))
-        sumstats.to_csv(path, index=None,**to_csvargs)
-
-        if md5sum is True:
+        log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
+        sumstats.to_csv(path, index=None,**to_csvargs)
+
+        if md5sum == True:
             md5_value = md5sum_file(path,log,verbose)
         else:
             md5_value = calculate_md5sum_file(path)
 
         ## update ssf-style meta data and export to yaml file
-
-
-        if "format_cite_name" in meta_data.keys():
-            meta_copy["file_type"] = meta_data["format_cite_name"]
-        else:
-            meta_copy["file_type"] = fmt
-        meta_copy["minor_allele_freq_lower_limit"] = min_maf
-        meta_copy["data_file_name"] = path
-        meta_copy["data_file_md5sum"] = md5_value
-        meta_copy["date_last_modified"] = get_format_date_and_time()
-        meta_copy["samples"]["sample_size"] = n_max
-        meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
-        meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
-        meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
-        if verbose: log.write(" -Exporting SSF-style meta data to {}".format(ymal_path))
-        with open(ymal_path, 'w') as outfile:
-            yaml.dump(meta_copy, outfile)
+        _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose)
+
     return sumstats
+####################################################################################################################
+def _configure_output_cols_and_args(sumstats, rename_dictionary, cols, no_status, path, meta_data, to_csvargs, log, verbose):
+    # grab format cols that exist in sumstats
+    ouput_cols=[]
+    for i in sumstats.columns:
+        if i in rename_dictionary.keys():
+            ouput_cols.append(i)
+
+    # + additional cols and remove duplicated
+    ouput_cols = list(set(ouput_cols + cols))
 
+    # remove STATUS
+    try:
+        if no_status == True:
+            ouput_cols.remove("STATUS")
+    except:
+        pass
+
+    #filter and rename to target fromat headers
+    sumstats = sumstats[ouput_cols]
+    sumstats = sumstats.rename(columns=rename_dictionary)
+
+    # configure target format args and reorder columns
+    if "format_separator" in meta_data.keys():
+        to_csvargs["sep"] = meta_data["format_separator"]
+    else:
+        to_csvargs["sep"]="\t"
+    if "format_na" in meta_data.keys():
+        to_csvargs["na_rep"] = meta_data["format_na"]
+    if "format_col_order" in meta_data.keys():
+        fixed_col =[]
+        other_col=[]
+        for i in meta_data["format_col_order"]:
+            if i in sumstats.columns:
+                fixed_col.append(i)
+        for i in sumstats.columns:
+            if i not in meta_data["format_col_order"]:
+                other_col.append(i)
+        sumstats = sumstats[fixed_col + other_col]
+    log.write(" -Output columns: {}".format(",".join(sumstats.columns)),verbose=verbose)
+    return sumstats, to_csvargs
+
+
+def _configure_ssf_meta(sumstats, fmt, ssfmeta, meta, meta_data, path, md5_value, ymal_path, log, verbose):
+    ### calculate meta data
+    if "EAF" in sumstats.columns:
+        min_maf = sumstats["EAF"].min()
+    else:
+        min_maf = "Unknown"
+
+    if "N" in sumstats.columns:
+        n_median = sumstats["N"].median()
+        n_max = sumstats["N"].max()
+        n_min = sumstats["N"].min()
+    else:
+        n_median = "Unknown"
+        n_max = "Unknown"
+        n_min = "Unknown"
+
+    if ssfmeta==True:
+        sumstats_meta_copy = meta.copy()
+        if "format_cite_name" in meta_data.keys():
+            sumstats_meta_copy["file_type"] = meta_data["format_cite_name"]
+        else:
+            sumstats_meta_copy["file_type"] = fmt
+        sumstats_meta_copy["minor_allele_freq_lower_limit"] = min_maf
+        sumstats_meta_copy["data_file_name"] = path
+        sumstats_meta_copy["data_file_md5sum"] = md5_value
+        sumstats_meta_copy["date_last_modified"] = get_format_date_and_time()
+        sumstats_meta_copy["samples"]["sample_size"] = n_max
+        sumstats_meta_copy["gwaslab"]["samples"]["sample_size_min"] = n_min
+        sumstats_meta_copy["gwaslab"]["samples"]["sample_size_median"] = n_median
+        sumstats_meta_copy["gwaslab"]["variants"]["variant_number"] = len(sumstats)
+        log.write(" -Exporting SSF-style meta data to {}".format(ymal_path),verbose=verbose)
+        with open(ymal_path, 'w') as outfile:
+            yaml.dump(sumstats_meta_copy, outfile)
+
+
+
+def _output_bed_like(sumstats, path, fmt, suffix, ouput_cols,to_csvargs,bgzip, tabix, tabix_indexargs, md5sum, log, verbose):
+    sumstats = sumstats[ouput_cols]
+    path = path + "."+suffix
+    log.write(" -Output columns: {}".format(",".join(sumstats.columns)),verbose=verbose)
+    log.write(" -Writing sumstats to: {}...".format(path),verbose=verbose)
+    sumstats.to_csv(path,sep="\t",index=None,header=None,**to_csvargs)
+    _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose)
+
+
+def _bgzip_tabix_md5sum(path, fmt, bgzip, md5sum, tabix, tabix_indexargs, log, verbose):
+    if bgzip == True:
+        log.write(" -bgzip compressing : {}...".format(path+".gz"),verbose=verbose)
+        tabix_compress(path, path+".gz",force=True)
+    if md5sum == True:
+        if bgzip == True:
+            md5sum_file(path+".gz",log,verbose)
+        else:
+            md5sum_file(path,log,verbose)
+    if tabix == True and bgzip == True:
+        log.write(" -tabix indexing : : {}...".format(path+".gz.tbi"),verbose=verbose)
+        if "preset" not in tabix_indexargs:
+            tabix_indexargs["preset"] = fmt
+        if "force" not in tabix_indexargs:
+            tabix_indexargs["force"] = True
+        tabix_index(path+".gz", **tabix_indexargs)
+
+
+def _check_indel(sumstats,log,verbose):
+    is_snp = (sumstats["EA"].str.len() == sumstats["NEA"].str.len())
+    is_insert = (sumstats["EA"].str.len()>1) &(sumstats["NEA"].str.len()==1)
+    is_delete = (sumstats["EA"].str.len()==1) &(sumstats["NEA"].str.len()>1)
+
+    log.write(" -Number of SNPs :",sum(is_snp))
+    log.write(" -Number of Insertions :",sum(is_insert))
+    log.write(" -Number of Deletions :",sum(is_delete))
+    return is_snp,is_insert,is_delete
+
+
 def md5sum_file(filename,log,verbose):
-
+    log.write(" -md5sum hashing for the file:",filename,verbose=verbose)
     md5_hash = hashlib.md5()
     with open(filename,"rb") as f:
         # Read and update hash in chunks
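The new _bgzip_tabix_md5sum helper centralises the pysam compression and indexing calls that were previously repeated per format. A small sketch of those two pysam calls (the path and preset below are made up for illustration):

    # Sketch of the two pysam calls wrapped by _bgzip_tabix_md5sum; the path and
    # preset here are illustrative, not taken from the diff.
    from pysam import tabix_compress, tabix_index

    path = "sumstats.bed"                                # plain-text file written earlier
    tabix_compress(path, path + ".gz", force=True)       # bgzip-compress to sumstats.bed.gz
    tabix_index(path + ".gz", preset="bed", force=True)  # write sumstats.bed.gz.tbi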
@@ -451,7 +485,8 @@ def md5sum_file(filename,log,verbose):
     with open(filename+".md5sum","w") as f:
         out = str(md5_hash.hexdigest())
         f.write(out+"\n")
-
+    log.write(" -md5sum path:",filename+".md5sum",verbose=verbose)
+    log.write(" -md5sum: {}".format(out),verbose=verbose)
     return out
 
 def calculate_md5sum_file(filename):
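md5sum_file now also logs the .md5sum path and the digest itself. The underlying chunked-hash pattern it relies on is the standard hashlib idiom, sketched below with a hypothetical file name and chunk size:

    # Standard hashlib chunked-digest idiom behind md5sum_file; the file name and
    # 4 KB chunk size are illustrative.
    import hashlib

    md5_hash = hashlib.md5()
    with open("sumstats.ssf.tsv.gz", "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            md5_hash.update(chunk)
    print(md5_hash.hexdigest())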
@@ -466,4 +501,102 @@ def calculate_md5sum_file(filename):
 def get_format_date_and_time():
     now = datetime.now()
     dt_string = now.strftime("%Y-%m-%d-%H:%M:%S")
-    return dt_string
+    return dt_string
+
+
+def _adjust_position(sumstats, fmt,is_snp, is_insert, is_delete, log, verbose):
+    log.write(" -Adjusting positions in format-specific manner..",verbose=verbose)
+    if fmt=="bed":
+        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]-1
+        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
+        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
+
+        # for insertion
+        # start = pos : end = pos
+        # A/ATC -> -/TC
+        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]
+        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
+        sumstats.loc[is_insert,"NEA/EA"] = "-/"+sumstats.loc[is_insert,"EA"].str.slice(start=1)
+
+        # for deletion
+        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
+        # ATC/A -> TC/-
+        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
+        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + sumstats.loc[is_delete,"NEA"].str.len() - 1
+        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
+        sumstats["STRAND"]="+"
+    elif fmt=="vep":
+        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
+        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"] + (sumstats.loc[is_snp,"NEA"].str.len() - 1 )
+        sumstats.loc[is_snp,"NEA/EA"] = sumstats.loc[is_snp,"NEA"].astype("string")+"/"+sumstats.loc[is_snp,"EA"].astype("string")
+
+        # for insertion
+        # start = pos+1 ; end = pos
+        # A/ATC -> -/TC
+        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"] + 1
+        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]
+        sumstats.loc[is_insert,"NEA/EA"] = "-/" + sumstats.loc[is_insert,"EA"].str.slice(start=1)
+
+        # for deletion
+        # start = pos ; end = pos + len(Ref) -1
+        # ATC/A -> TC/-
+        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"] + 1
+        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"] + (sumstats.loc[is_delete,"NEA"].str.len() -1)
+        sumstats.loc[is_delete,"NEA/EA"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)+"/-"
+        sumstats["STRAND"]="+"
+    elif fmt=="annovar":
+        # for snp
+        # start = pos ; end = pos
+        # A/G
+        # AT/CG
+        sumstats.loc[is_snp,"START"] = sumstats.loc[is_snp,"POS"]
+        sumstats.loc[is_snp,"END"] = sumstats.loc[is_snp,"POS"]-1 + sumstats.loc[is_snp,"NEA"].str.len()
+        sumstats.loc[is_snp,"NEA_out"] = sumstats.loc[is_snp,"NEA"].astype("string")
+        sumstats.loc[is_snp,"EA_out"] = sumstats.loc[is_snp,"EA"].astype("string")
+        # for insertion
+        # start = pos : end = pos
+        # A/ATC -> -/TC
+        sumstats.loc[is_insert,"START"] = sumstats.loc[is_insert,"POS"]+1
+        sumstats.loc[is_insert,"END"] = sumstats.loc[is_insert,"POS"]+1
+        sumstats.loc[is_insert,"NEA_out"] = "-"
+        sumstats.loc[is_insert,"EA_out"] = sumstats.loc[is_insert,"EA"].str.slice(start=1)
+
+        # for deletion
+        # start = pos - 1 +1; end = pos -1 +1+ len(Ref)
+        # ATC/A -> TC/-
+        sumstats.loc[is_delete,"START"] = sumstats.loc[is_delete,"POS"]
+        sumstats.loc[is_delete,"END"] = sumstats.loc[is_delete,"POS"]- 1 + sumstats.loc[is_delete,"NEA"].str.len()
+        sumstats.loc[is_delete,"NEA_out"] = sumstats.loc[is_delete,"NEA"].str.slice(start=1)
+        sumstats.loc[is_delete,"EA_out"] = "-"
+
+
+    sumstats["START"] = sumstats["START"].astype("Int64")
+    sumstats["END"] = sumstats["END"].astype("Int64")
+    return sumstats
+
+def _process_vcf_header(sumstats, meta, meta_data, build, log, verbose):
+
+    log.write(" -Creating VCF file header...",verbose=verbose)
+    log.write(" -VCF header contig build:"+str(build),verbose=verbose)
+
+    # calculate meta data
+    harmonised = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][012][01234]", case=False, flags=0, na=False ) )
+    switchedalleles = sum(sumstats["STATUS"].str.match( r"\w\w\w[0][0123][12][24]", case=False, flags=0, na=False ) )
+
+    # Create vcf header
+    vcf_header = meta_data["format_fixed_header"] +"\n"+ meta_data["format_contig_"+str(build)]+"\n"
+
+    # Create sample header
+    vcf_header+="##SAMPLE=<ID={},TotalVariants={},VariantsNotRead=0,HarmonisedVariants={},VariantsNotHarmonised={},SwitchedAlleles={},StudyType={}>\n".format(
+        meta["gwaslab"]["study_name"], len(sumstats), harmonised, len(sumstats)-harmonised, switchedalleles, meta["gwaslab"]["study_type"])
+    vcf_header+="##gwaslab_version="+gwaslab_info()["version"]+"\n"
+
+    log.write(" -ID:{}".format( meta["gwaslab"]["study_name"]),verbose=verbose)
+    log.write(" -StudyType:{}".format(meta["gwaslab"]["study_type"]),verbose=verbose)
+    log.write(" -TotalVariants:{}".format(len(sumstats)),verbose=verbose)
+    log.write(" -HarmonisedVariants:{}".format(harmonised),verbose=verbose)
+    log.write(" -VariantsNotHarmonised:{}".format(len(sumstats)-harmonised),verbose=verbose)
+    log.write(" -SwitchedAlleles:{}".format(switchedalleles),verbose=verbose)
+
+    return vcf_header
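The coordinate rules applied by the new _adjust_position helper differ by target format (bed is 0-based; vep and annovar are 1-based). As a quick sanity check only, the deletion case NEA="ATC", EA="A" at POS=100 works out as follows under the formulas above:

    # Toy check (not from the diff) of the deletion NEA="ATC", EA="A" at POS=100,
    # using the per-format formulas in _adjust_position.
    pos, nea = 100, "ATC"
    print("bed:", pos, pos + len(nea) - 1)      # START=100, END=102, alleles "TC/-"
    print("vep:", pos + 1, pos + len(nea) - 1)  # START=101, END=102, alleles "TC/-"
    print("annovar:", pos, pos - 1 + len(nea))  # START=100, END=102, NEA_out="TC", EA_out="-"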