gwaslab 3.5.7__py3-none-any.whl → 3.5.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of gwaslab might be problematic. Click here for more details.

Files changed (63)
  1. gwaslab/__init__.py +2 -0
  2. gwaslab/bd_common_data.py +1 -0
  3. gwaslab/bd_get_hapmap3.py +0 -1
  4. gwaslab/data/formatbook.json +78 -0
  5. gwaslab/g_Sumstats.py +98 -24
  6. gwaslab/g_SumstatsMulti.py +287 -0
  7. gwaslab/g_SumstatsPair.py +101 -16
  8. gwaslab/g_Sumstats_polars.py +245 -0
  9. gwaslab/g_headers.py +12 -3
  10. gwaslab/g_meta.py +123 -47
  11. gwaslab/g_meta_update.py +48 -0
  12. gwaslab/g_vchange_status_polars.py +44 -0
  13. gwaslab/g_version.py +2 -2
  14. gwaslab/hm_casting.py +169 -110
  15. gwaslab/hm_casting_polars.py +202 -0
  16. gwaslab/hm_harmonize_sumstats.py +19 -8
  17. gwaslab/io_load_ld.py +529 -0
  18. gwaslab/io_preformat_input.py +11 -0
  19. gwaslab/io_preformat_input_polars.py +632 -0
  20. gwaslab/io_process_args.py +25 -1
  21. gwaslab/io_read_ldsc.py +34 -3
  22. gwaslab/io_read_pipcs.py +62 -6
  23. gwaslab/prscs_gigrnd.py +122 -0
  24. gwaslab/prscs_mcmc_gtb.py +136 -0
  25. gwaslab/prscs_parse_genet.py +98 -0
  26. gwaslab/qc_build.py +53 -0
  27. gwaslab/qc_check_datatype.py +10 -8
  28. gwaslab/qc_check_datatype_polars.py +128 -0
  29. gwaslab/qc_fix_sumstats.py +25 -23
  30. gwaslab/qc_fix_sumstats_polars.py +193 -0
  31. gwaslab/util_ex_calculate_ldmatrix.py +49 -19
  32. gwaslab/util_ex_gwascatalog.py +71 -28
  33. gwaslab/util_ex_ldsc.py +67 -21
  34. gwaslab/util_ex_match_ldmatrix.py +396 -0
  35. gwaslab/util_ex_run_2samplemr.py +0 -2
  36. gwaslab/util_ex_run_ccgwas.py +155 -0
  37. gwaslab/util_ex_run_coloc.py +1 -1
  38. gwaslab/util_ex_run_hyprcoloc.py +117 -0
  39. gwaslab/util_ex_run_mesusie.py +155 -0
  40. gwaslab/util_ex_run_mtag.py +92 -0
  41. gwaslab/util_ex_run_prscs.py +85 -0
  42. gwaslab/util_ex_run_susie.py +40 -9
  43. gwaslab/util_in_estimate_ess.py +18 -0
  44. gwaslab/util_in_fill_data.py +20 -1
  45. gwaslab/util_in_filter_value.py +10 -5
  46. gwaslab/util_in_get_sig.py +71 -13
  47. gwaslab/util_in_meta.py +168 -4
  48. gwaslab/util_in_meta_polars.py +174 -0
  49. gwaslab/viz_plot_compare_effect.py +87 -23
  50. gwaslab/viz_plot_credible_sets.py +55 -11
  51. gwaslab/viz_plot_effect.py +22 -12
  52. gwaslab/viz_plot_miamiplot2.py +3 -2
  53. gwaslab/viz_plot_mqqplot.py +84 -81
  54. gwaslab/viz_plot_qqplot.py +6 -6
  55. gwaslab/viz_plot_regional2.py +2 -1
  56. gwaslab/viz_plot_stackedregional.py +4 -1
  57. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/METADATA +8 -6
  58. gwaslab-3.5.8.dist-info/RECORD +117 -0
  59. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/WHEEL +1 -1
  60. gwaslab-3.5.7.dist-info/RECORD +0 -96
  61. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE +0 -0
  62. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info/licenses}/LICENSE_before_v3.4.39 +0 -0
  63. {gwaslab-3.5.7.dist-info → gwaslab-3.5.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,128 @@
1
+ import gc
2
+ import pandas as pd
3
+ import polars as pl
4
+ import numpy as np
5
+ from gwaslab.g_Log import Log
6
+ # pandas.api.types.is_int64_dtype
7
+ # pandas.api.types.is_categorical_dtype
8
+
9
+ dtype_dict ={
10
+ "SNPID":[pl.String()],
11
+ "rsID": [pl.String()],
12
+ "CHR": [pl.Int64()],
13
+ "POS": [pl.Int64()],
14
+ "EA": [pl.String()],
15
+ "NEA":[pl.String()],
16
+ "REF":[pl.String()],
17
+ "ALT":[pl.String()],
18
+ "BETA":[pl.Float64()],
19
+ "BETA_95L":[pl.Float64()],
20
+ "BETA_95U":[pl.Float64()],
21
+ "SE":[pl.Float64()],
22
+ "N":[pl.Int64()],
23
+ "N_CASE":[pl.Int64()],
24
+ "N_CONTROL":[pl.Int64()],
25
+ "OR":[pl.Float64()],
26
+ "OR_95L":[pl.Float64()],
27
+ "OR_95U":[pl.Float64()],
28
+ "HR":[pl.Float64()],
29
+ "HR_95L":[pl.Float64()],
30
+ "HR_95U":[pl.Float64()],
31
+ "P":[pl.Float64()],
32
+ "MLOG10P":[pl.Float64()],
33
+ "Z":[pl.Float64()],
34
+ "F":[pl.Float64()],
35
+ "T":[pl.Float64()],
36
+ "TEST":[pl.String()],
37
+ "CHISQ":[pl.Float64()],
38
+ "I2":[pl.Float64()],
39
+ "P_HET":[pl.Float64()],
40
+ "SNPR2":[pl.Float64()],
41
+ "EAF":[pl.Float64()],
42
+ "NEAF":[pl.Float64()],
43
+ "MAF":[pl.Float64()],
44
+ "INFO":[pl.Float64()],
45
+ "DOF":[pl.Int64()],
46
+ "STATUS":[pl.String()],
47
+ "DIRECTION":[pl.String()],
48
+ 'PIP' :[pl.Float64()],
49
+ 'CREDIBLE_SET_INDEX':[pl.Int64()],
50
+ 'N_SNP' :[pl.Int64()],
51
+ 'LOCUS' :[pl.String()],
52
+ 'STUDY' :[pl.String()],
53
+ 'BETA_RANDOM' :[pl.Float64()],
54
+ 'SE_RANDOM' :[pl.Float64()],
55
+ 'Z_RANDOM' :[pl.Float64()],
56
+ 'P_RANDOM' :[pl.Float64()]
57
+ }
58
+
59
def check_datatype(sumstats, verbose=True, log=Log()):
    """Log an aligned three-row summary of a polars sumstats frame's schema.

    For every column, print its name, its polars dtype, and a verification
    flag from verify_datatype() ("T" = expected dtype, "F" = known column
    with an incompatible dtype, "NA" = unrecognized column).  A warning is
    emitted listing all columns flagged "F".

    Parameters
    ----------
    sumstats : pl.DataFrame
        Polars dataframe whose .schema is inspected.
    verbose : bool
        Passed through to log.write()/log.warning().
    log : Log
        GWASLab logger.
    """
    headers = []
    dtypes = []
    verified = []
    raw_verified = []
    for header, dtype in sumstats.schema.items():
        # Pad all three fields to the same width so the log rows line up.
        width = max(len(header), len(str(dtype)))
        verified_str = verify_datatype(header, dtype)
        headers.append(header.ljust(width))
        dtypes.append(str(dtype).ljust(width))
        verified.append(verified_str.ljust(width))
        if verified_str == "F":
            raw_verified.append(header)

    log.write(" -Column :", " ".join(headers), verbose=verbose)
    log.write(" -DType :", " ".join(dtypes), verbose=verbose)
    log.write(" -Verified:", " ".join(verified), verbose=verbose)

    if len(raw_verified) > 0:
        log.warning("Columns with possibly incompatible dtypes: {}".format(",".join(raw_verified)), verbose=verbose)
90
+
91
def verify_datatype(header, dtype):
    """Check a column's polars dtype against the expected dtype table.

    Returns
    -------
    str
        "T"  — dtype is one of the expected dtypes for this column,
        "F"  — column is recognized but the dtype is incompatible,
        "NA" — column name is not a recognized GWASLab header.
    """
    # Guard clause: unknown headers cannot be verified.
    if header not in dtype_dict:
        return "NA"
    return "T" if dtype in dtype_dict[header] else "F"
100
+
101
def quick_convert_datatype(sumstats, log, verbose):
    """Best-effort cast of recognized columns to their expected polars dtype.

    Each recognized column whose current dtype is not in dtype_dict is cast
    to the first (preferred) expected dtype.  Failures are logged and the
    original column is kept — conversion is deliberately non-fatal.

    Returns
    -------
    pl.DataFrame
        The (possibly partially converted) dataframe.
    """
    for col in sumstats.columns:
        if col not in dtype_dict:
            continue
        if sumstats[col].dtype in dtype_dict[col]:
            continue
        datatype = dtype_dict[col][0]
        log.write(" -Trying to convert datatype for {}: {} -> {}...".format(col, str(sumstats[col].dtype), datatype), end="", verbose=verbose)
        try:
            sumstats = sumstats.cast({col: datatype})
            log.write("{}".format(datatype), show_time=False, verbose=verbose)
        except Exception:
            # Best-effort: keep the original column when the cast fails
            # (e.g. non-numeric strings in a column expected to be numeric).
            log.write("Failed...", show_time=False, verbose=verbose)
    return sumstats
114
+
115
def check_dataframe_shape(sumstats, log, verbose):
    """Log the dataframe's shape (rows x columns) and estimated memory usage.

    The estimated_size() call is inside the try block so that any failure
    (missing method, polars error) is caught and reported as a warning
    instead of propagating — previously it ran before the try and could
    raise uncaught.
    """
    try:
        memory_in_mb = sumstats.estimated_size(unit="mb")
        log.write(" -Current Dataframe shape : {} x {} ; Memory usage: {:.2f} MB".format(len(sumstats), len(sumstats.columns), memory_in_mb), verbose=verbose)
    except Exception:
        log.warning("Error: cannot get Dataframe shape...")
121
+
122
def check_dataframe_memory_usage(sumstats, log, verbose):
    """Log the dataframe's estimated memory usage in MB.

    As in check_dataframe_shape, estimated_size() is moved inside the try
    block so a failing call produces the warning instead of an uncaught
    exception.
    """
    try:
        memory_in_mb = sumstats.estimated_size(unit="mb")
        log.write(" -Current Dataframe memory usage: {:.2f} MB".format(memory_in_mb), verbose=verbose)
    except Exception:
        log.warning("Error: cannot get Memory usage...")
128
+
@@ -16,6 +16,8 @@ from gwaslab.bd_common_data import get_number_to_chr
16
16
  from gwaslab.bd_common_data import get_chr_list
17
17
  from gwaslab.qc_check_datatype import check_datatype
18
18
  from gwaslab.qc_check_datatype import check_dataframe_shape
19
+ from gwaslab.qc_build import _process_build
20
+ from gwaslab.qc_build import _set_build
19
21
  from gwaslab.g_version import _get_version
20
22
  from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
21
23
  from gwaslab.util_in_fill_data import _convert_betase_to_p
@@ -41,29 +43,29 @@ from gwaslab.bd_common_data import get_chain
41
43
 
42
44
  ###############################################################################################################
43
45
  # 20220514
44
- def _process_build(build,log,verbose):
45
- if str(build).lower() in ["hg19","19","37","b37","grch37"]:
46
- log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
47
- final_build = "19"
48
- elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
49
- log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
50
- final_build = "18"
51
- elif str(build).lower() in ["hg38","38","b38","grch38"]:
52
- log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
53
- final_build = "38"
54
- elif str(build).lower() in ["t2t","hs1","chm13","13"]:
55
- log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
56
- final_build = "13"
57
- else:
58
- log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
59
- final_build = "99"
60
- return final_build
61
-
62
- def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
63
- build = _process_build(build,log=log,verbose=verbose)
64
- sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
65
- sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
66
- return sumstats, build
46
+ #def _process_build(build,log,verbose):
47
+ # if str(build).lower() in ["hg19","19","37","b37","grch37"]:
48
+ # log.write(" -Genomic coordinates are based on GRCh37/hg19...", verbose=verbose)
49
+ # final_build = "19"
50
+ # elif str(build).lower() in ["hg18","18","36","b36","grch36"]:
51
+ # log.write(" -Genomic coordinates are based on GRCh36/hg18...", verbose=verbose)
52
+ # final_build = "18"
53
+ # elif str(build).lower() in ["hg38","38","b38","grch38"]:
54
+ # log.write(" -Genomic coordinates are based on GRCh38/hg38...", verbose=verbose)
55
+ # final_build = "38"
56
+ # elif str(build).lower() in ["t2t","hs1","chm13","13"]:
57
+ # log.write(" -Genomic coordinates are based on T2T-CHM13...", verbose=verbose)
58
+ # final_build = "13"
59
+ # else:
60
+ # log.warning("Version of genomic coordinates is unknown...", verbose=verbose)
61
+ # final_build = "99"
62
+ # return final_build
63
+ #
64
+ #def _set_build(sumstats, build="99", status="STATUS",verbose=True,log=Log()):
65
+ # build = _process_build(build,log=log,verbose=verbose)
66
+ # sumstats[status] = vchange_status(sumstats[status], 1, "139",build[0]*3)
67
+ # sumstats[status] = vchange_status(sumstats[status], 2, "89",build[1]*3)
68
+ # return sumstats, build
67
69
 
68
70
  def fixID(sumstats,
69
71
  snpid="SNPID",rsid="rsID",chrom="CHR",pos="POS",nea="NEA",ea="EA",status="STATUS",fixprefix=False,
@@ -0,0 +1,193 @@
1
+ import re
2
+ import gc
3
+ import pandas as pd
4
+ import numpy as np
5
+ from itertools import repeat
6
+ from multiprocessing import Pool
7
+ from liftover import get_lifter
8
+ from liftover import ChainFile
9
+ from functools import partial
10
+ from gwaslab.g_vchange_status_polars import vchange_statusp
11
+ from gwaslab.g_vchange_status import status_match
12
+ from gwaslab.g_vchange_status import change_status
13
+ from gwaslab.g_Log import Log
14
+ from gwaslab.bd_common_data import get_chr_to_number
15
+ from gwaslab.bd_common_data import get_number_to_chr
16
+ from gwaslab.bd_common_data import get_chr_list
17
+ from gwaslab.qc_check_datatype import check_datatype
18
+ from gwaslab.qc_check_datatype import check_dataframe_shape
19
+ from gwaslab.qc_build import _process_build
20
+ from gwaslab.qc_build import _set_build
21
+ from gwaslab.g_version import _get_version
22
+ from gwaslab.util_in_fill_data import _convert_betase_to_mlog10p
23
+ from gwaslab.util_in_fill_data import _convert_betase_to_p
24
+ from gwaslab.util_in_fill_data import _convert_mlog10p_to_p
25
+ from gwaslab.bd_common_data import get_chain
26
+ import polars as pl
27
+ ###############################################################################################################
28
+ # 20220426
29
def get_reverse_complementary_allele(a):
    """Return the reverse complement of allele string *a*.

    Bases are swapped A<->T and C<->G; any other character is left
    unchanged by str.translate.
    """
    complement_table = str.maketrans("ATCG", "TAGC")
    return a[::-1].translate(complement_table)
36
+
37
def flip_direction(string):
    """Return *string* with every '+' and '-' swapped.

    '?' and any other character (e.g. '0' for a missing study) pass
    through unchanged.
    """
    swap = {"+": "-", "-": "+"}
    return "".join(swap.get(char, char) for char in string)
49
+
50
def flip_by_swap(sumstats, matched_index, log, verbose):
    """Swap the EA and NEA columns for rows selected by *matched_index*.

    *matched_index* is a polars boolean expression; rows where it is False
    keep their original alleles.  The dataframe is returned unchanged when
    either allele column is missing.
    """
    # Guard clause: both allele columns must exist to swap them.
    if "NEA" not in sumstats.columns or "EA" not in sumstats.columns:
        return sumstats

    log.write(" -Swapping column: NEA <=> EA...", verbose=verbose)

    new_nea = (
        pl.when(matched_index)
        .then(pl.col("EA"))
        .otherwise(pl.col("NEA"))
        .alias("NEA")
    )
    new_ea = (
        pl.when(matched_index)
        .then(pl.col("NEA"))
        .otherwise(pl.col("EA"))
        .alias("EA")
    )
    # Both expressions are evaluated against the original columns, so the
    # swap is atomic.
    return sumstats.with_columns(new_nea, new_ea)
67
+
68
def flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Invert ratio statistics (x -> 1/x) for rows selected by *matched_index*.

    Applies to OR/HR point estimates and their 95% CI bounds when present.
    Rows not matching keep their original values.
    """
    ratio_headers = ("OR", "OR_95L", "OR_95U", "HR", "HR_95L", "HR_95U")
    for header in ratio_headers:
        if header not in sumstats.columns:
            continue
        log.write(" -Flipping column: {header} = 1 / {header}...".format(header=header), verbose=verbose)
        inverted = (
            pl.when(matched_index)
            .then(1 / pl.col(header))
            .otherwise(pl.col(header))
            .alias(header)
        )
        sumstats = sumstats.with_columns(inverted)
    return sumstats
79
+
80
def flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1):
    """Flip the effect-allele frequency (EAF -> 1 - EAF) for matched rows.

    Rows not selected by *matched_index* keep their original EAF.  No-op
    when the EAF column is absent.
    """
    if "EAF" not in sumstats.columns:
        return sumstats

    log.write(" -Flipping column: EAF = 1 - EAF...", verbose=verbose)
    flipped_eaf = (
        pl.when(matched_index)
        .then(1 - pl.col("EAF"))
        .otherwise(pl.col("EAF"))
        .alias("EAF")
    )
    return sumstats.with_columns(flipped_eaf)
91
+
92
def flip_by_sign(sumstats, matched_index, log, verbose, cols=None):
    """Negate signed statistics and flip DIRECTION for matched rows.

    For every signed column present (BETA, its CI bounds, T, Z) the value is
    negated where *matched_index* (a polars boolean expression) is True; other
    rows are left unchanged.  The DIRECTION string column, if present, has its
    '+'/'-' characters swapped via flip_direction().

    NOTE(review): *cols* is accepted but never used — presumably reserved for
    selecting a subset of columns; confirm intent.
    """
    for header in ["BETA","BETA_95L","BETA_95U","T","Z"]:
        if header in sumstats.columns:
            log.write(" -Flipping column: {header} = - {header}...".format(header = header), verbose=verbose)
            # Negate only the matched rows; others keep their value.
            sumstats = sumstats.with_columns(
                pl.when( matched_index )
                .then( - pl.col(header) )
                .otherwise( pl.col(header) )
                .alias(header)
            )

    if "DIRECTION" in sumstats.columns:
        # NOTE(review): map_batches hands flip_direction a whole pl.Series,
        # while flip_direction iterates its argument character-by-character
        # (it is written for a single str). Iterating a Series yields the
        # per-row strings, so multi-character direction strings would be
        # passed through swap logic whole rather than per character — verify
        # this behaves as intended on real data.
        sumstats = sumstats.with_columns(
            pl.when( matched_index )
            .then( pl.col("DIRECTION").map_batches(lambda x: pl.Series(flip_direction(x))) )
            .otherwise( pl.col("DIRECTION") )
            .alias("DIRECTION")
        )
    return sumstats
111
+
112
+ def flipallelestatsp(sumstats,status="STATUS",verbose=True,log=Log()):
113
+ ##start function with col checking#########################################################
114
+
115
+ if_stats_flipped = False
116
+ ###################get reverse complementary####################
117
+ pattern = r"\w\w\w\w\w[45]\w"
118
+ #matched_index = status_match(sumstats[status],6,[4,5]) #
119
+ #matched_index = sumstats[status].str[5].str.match(r"4|5")
120
+
121
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w\w[45]\w")
122
+
123
+ if len(sumstats.filter(matched_index))>0:
124
+ log.write("Start to convert alleles to reverse complement for SNPs with status xxxxx[45]x...{}".format(_get_version()), verbose=verbose)
125
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
126
+ if ("NEA" in sumstats.columns) and ("EA" in sumstats.columns) :
127
+ log.write(" -Converting to reverse complement : EA and NEA...", verbose=verbose)
128
+
129
+ sumstats = sumstats.filter(matched_index).with_columns(
130
+ NEA = pl.col("NEA").map_batches(lambda x: pl.Series(get_reverse_complementary_allele(x))),
131
+ EA = pl.col("EA").map_batches(lambda x: pl.Series(get_reverse_complementary_allele(x)))
132
+ )
133
+
134
+ sumstats = vchange_statusp(sumstats, matched_index, status,6, "4","2")
135
+ log.write(" -Changed the status for flipped variants : xxxxx4x -> xxxxx2x", verbose=verbose)
136
+ if_stats_flipped = True
137
+
138
+ ###################flip ref####################
139
+ pattern = r"\w\w\w\w\w[35]\w"
140
+ #matched_index = status_match(sumstats[status],6,[3,5]) #sumstats[status].str.match(pattern)
141
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w\w[35]\w")
142
+ if len(sumstats.filter(matched_index))>0:
143
+ log.write("Start to flip allele-specific stats for SNPs with status xxxxx[35]x: ALT->EA , REF->NEA ...{}".format(_get_version()), verbose=verbose)
144
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
145
+
146
+ sumstats = flip_by_swap(sumstats, matched_index, log, verbose)
147
+ sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
148
+ sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
149
+ sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
150
+
151
+ #change status
152
+ log.write(" -Changed the status for flipped variants : xxxxx[35]x -> xxxxx[12]x", verbose=verbose)
153
+ sumstats = vchange_statusp(sumstats, matched_index,status,6, "35","12")
154
+ if_stats_flipped = True
155
+
156
+ ###################flip ref for undistingushable indels####################
157
+ pattern = r"\w\w\w\w[123][67]6"
158
+ #matched_index = status_match(sumstats[status],6,[1,2,3])|status_match(sumstats[status],6,[6,7])|status_match(sumstats[status],7,6) #sumstats[status].str.match(pattern)
159
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w[123][67]6")
160
+ if len(sumstats.filter(matched_index))>0:
161
+ log.write("Start to flip allele-specific stats for standardized indels with status xxxx[123][67][6]: ALT->EA , REF->NEA...{}".format(_get_version()), verbose=verbose)
162
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
163
+
164
+ sumstats = flip_by_swap(sumstats, matched_index, log, verbose)
165
+ sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
166
+ sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
167
+ sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
168
+
169
+ #change status
170
+ log.write(" -Changed the status for flipped variants xxxx[123][67]6 -> xxxx[123][67]4", verbose=verbose)
171
+ sumstats = vchange_statusp(sumstats, matched_index,status, 7, "6","4")
172
+ if_stats_flipped = True
173
+ # flip ref
174
+ ###################flip statistics for reverse strand panlindromic variants####################
175
+ pattern = r"\w\w\w\w\w[012]5"
176
+ #matched_index = status_match(sumstats[status],6,[0,1,2]) | status_match(sumstats[status],7,[5])#sumstats[status].str.match(pattern)
177
+ matched_index = pl.col(status).cast(pl.String).str.contains("^\w\w\w\w\w[012]5")
178
+ if len(sumstats.filter(matched_index))>0:
179
+ log.write("Start to flip allele-specific stats for palindromic SNPs with status xxxxx[12]5: (-)strand <=> (+)strand...{}".format(_get_version()), verbose=verbose)
180
+ log.write(" -Flipping "+ str(len(sumstats.filter(matched_index))) +" variants...", verbose=verbose)
181
+
182
+ sumstats = flip_by_sign(sumstats, matched_index, log, verbose, cols=None)
183
+ sumstats = flip_by_subtract(sumstats, matched_index, log, verbose, cols=None, factor=1)
184
+ sumstats = flip_by_inverse(sumstats, matched_index, log, verbose, cols=None, factor=1)
185
+
186
+ #change status
187
+ log.write(" -Changed the status for flipped variants: xxxxx[012]5: -> xxxxx[012]2", verbose=verbose)
188
+ sumstats = vchange_statusp(sumstats, matched_index,status,7, "5","2")
189
+ if_stats_flipped = True
190
+
191
+ if if_stats_flipped != True:
192
+ log.write(" -No statistics have been changed.")
193
+ return sumstats
@@ -16,6 +16,7 @@ def tofinemapping(sumstats,
16
16
  bfile=None,
17
17
  vcf=None,
18
18
  loci=None,
19
+ loci_chrpos=None,
19
20
  out="./",
20
21
  plink="plink",
21
22
  plink2="plink2",
@@ -28,8 +29,10 @@ def tofinemapping(sumstats,
28
29
  overwrite=False,
29
30
  log=Log(),
30
31
  suffixes=None,
32
+ extra_plink_option="",
31
33
  verbose=True,
32
34
  **kwargs):
35
+
33
36
  ##start function with col checking##########################################################
34
37
  _start_line = "calculate LD matrix"
35
38
  _end_line = "calculating LD matrix"
@@ -52,11 +55,21 @@ def tofinemapping(sumstats,
52
55
  if getlead_args is None:
53
56
  getlead_args={"windowsizekb":1000}
54
57
 
55
- if loci is None:
56
- log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
57
- sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
58
+ if loci_chrpos is None:
59
+ if loci is None:
60
+ log.write(" -Loci were not provided. All significant loci will be automatically extracted...",verbose=verbose)
61
+ sig_df = getsig(sumstats,id="SNPID",chrom="CHR",pos="POS",p="P"+suffixes[0],**getlead_args)
62
+ else:
63
+ sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
58
64
  else:
59
- sig_df = sumstats.loc[sumstats["SNPID"].isin(loci),:]
65
+ sig_df = pd.DataFrame()
66
+ for chrpos in loci_chrpos:
67
+ chrpos_row_dict={}
68
+ chrpos_row_dict["SNPID"]="{}:{}".format(chrpos[0], chrpos[1])
69
+ chrpos_row_dict["CHR"] = chrpos[0]
70
+ chrpos_row_dict["POS"] = chrpos[1]
71
+ chrpos_row = pd.Series(chrpos_row_dict).to_frame().T
72
+ sig_df = pd.concat([sig_df, chrpos_row],ignore_index=True)
60
73
 
61
74
  log.write(" -plink1.9 path: {}".format(plink),verbose=verbose)
62
75
  log.write(" -plink2 path: {}".format(plink2),verbose=verbose)
@@ -128,6 +141,8 @@ def tofinemapping(sumstats,
128
141
  filetype=filetype,
129
142
  plink=plink,
130
143
  plink2=plink2,
144
+ extra_plink_option=extra_plink_option,
145
+ ref_allele_path = matched_sumstats_path,
131
146
  verbose=verbose)
132
147
 
133
148
 
@@ -136,7 +151,7 @@ def tofinemapping(sumstats,
136
151
  row_dict["SNPID"]=row["SNPID"]
137
152
  row_dict["SNPID_LIST"] = matched_snp_list_path
138
153
  row_dict["LD_R_MATRIX"] = matched_ld_matrix_path
139
- row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path
154
+ row_dict["LOCUS_SUMSTATS"] = matched_sumstats_path+".gz"
140
155
  file_row = pd.Series(row_dict).to_frame().T
141
156
  output_file_list = pd.concat([output_file_list, file_row],ignore_index=True)
142
157
 
@@ -156,7 +171,7 @@ def tofinemapping(sumstats,
156
171
 
157
172
 
158
173
 
159
- def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,verbose=True):
174
+ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, windowsizekb,out,plink_log,log,memory,mode,filetype,plink,plink2,ref_allele_path, extra_plink_option="",verbose=True):
160
175
  '''
161
176
  Calculate LD r matrix by calling PLINK; return file name and log
162
177
  '''
@@ -177,18 +192,32 @@ def _calculate_ld_r(study, matched_sumstats_snpid, row, bfile_prefix, n_cores, w
177
192
  if filetype=="pfile":
178
193
  raise ValueError("Please use bfile instead of pfile for PLINK1.")
179
194
 
195
+ #log.write(" -Flipping plink file ref allele to match...",verbose=verbose)
196
+ #script_vcf_to_bfile = """
197
+ #{} \
198
+ # --bfile {} \
199
+ # --extract {} \
200
+ # --chr {} \
201
+ # --ref-allele 'force' {} 4 1 \
202
+ # --threads {} {} \
203
+ # --make-bed \
204
+ # --out {}
205
+
206
+ #""".format(plink2, bfile_to_use, snplist_path, row["CHR"],ref_allele_path, n_cores, memory_flag if memory is not None else "", output_prefix+"_gwaslab_tmp")
207
+
208
+ log.write(" -Calculating r matrix...",verbose=verbose)
180
209
  script_vcf_to_bfile = """
181
210
  {} \
182
211
  --bfile {} \
183
- --keep-allele-order \
212
+ --a2-allele {} 4 1 \
184
213
  --extract {} \
185
214
  --chr {} \
186
215
  --{} square gz \
187
216
  --allow-no-sex \
188
217
  --threads {} {}\
189
218
  --write-snplist \
190
- --out {}
191
- """.format(plink, bfile_to_use, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix)
219
+ --out {} {}
220
+ """.format(plink, bfile_to_use, ref_allele_path, snplist_path , row["CHR"], mode, n_cores, memory_flag if memory is not None else "", output_prefix, extra_plink_option)
192
221
 
193
222
  try:
194
223
  output = subprocess.check_output(script_vcf_to_bfile, stderr=subprocess.STDOUT, shell=True,text=True)
@@ -236,20 +265,20 @@ def _align_sumstats_with_bim(row, locus_sumstats, ref_bim, log=Log(),suffixes=No
236
265
  log.warning("Lead variant was not available in reference!")
237
266
 
238
267
  # adjust statistics
239
- output_columns=["SNPID","CHR","POS","EA_bim","NEA_bim"]
268
+ output_columns=["SNPID","CHR","POS","EA","NEA"]
240
269
  for suffix in suffixes:
241
270
  if ("BETA"+suffix in locus_sumstats.columns) and ("SE"+suffix in locus_sumstats.columns):
242
- log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
243
- combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
271
+ #log.write(" -Flipping BETA{} for variants with flipped alleles...".format(suffix))
272
+ #combined_df.loc[flipped_match,"BETA"+suffix] = - combined_df.loc[flipped_match,"BETA"+suffix]
244
273
  output_columns.append("BETA"+suffix)
245
274
  output_columns.append("SE"+suffix)
246
275
  if "Z" in locus_sumstats.columns:
247
- log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
248
- combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
276
+ #log.write(" -Flipping Z{} for variants with flipped alleles...".format(suffix))
277
+ #combined_df.loc[flipped_match,"Z"+suffix] = - combined_df.loc[flipped_match,"Z"+suffix]
249
278
  output_columns.append("Z"+suffix)
250
279
  if "EAF" in locus_sumstats.columns:
251
- log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
252
- combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
280
+ #log.write(" -Flipping EAF{} for variants with flipped alleles...".format(suffix))
281
+ #combined_df.loc[flipped_match,"EAF"+suffix] = 1 - combined_df.loc[flipped_match,"EAF"+suffix]
253
282
  output_columns.append("EAF"+suffix)
254
283
  if "N" in locus_sumstats.columns:
255
284
  output_columns.append("N"+suffix)
@@ -266,9 +295,9 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
266
295
  log.write(" -Exporting SNP list of {} to: {}...".format(len(matched_sumstats) ,matched_snp_list_path))
267
296
 
268
297
  # create locus-sumstats EA, NEA, (BETA, SE), Z
269
- matched_sumstats_path = "{}/{}_{}_{}.sumstats.gz".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
298
+ matched_sumstats_path = "{}/{}_{}_{}.sumstats".format(out.rstrip("/"), study, row["SNPID"] ,windowsizekb)
270
299
 
271
- to_export_columns=["CHR","POS","EA_bim","NEA_bim"]
300
+ to_export_columns=["CHR","POS","EA","NEA"]
272
301
  for suffix in suffixes:
273
302
  if "Z"+suffix in matched_sumstats.columns :
274
303
  to_export_columns.append("Z"+suffix)
@@ -282,7 +311,8 @@ def _export_snplist_and_locus_sumstats(matched_sumstats, out, study, row, window
282
311
 
283
312
  log.write(" -Exporting locus sumstats to: {}...".format(matched_sumstats_path))
284
313
  log.write(" -Exported columns: {}...".format(["SNPID"]+to_export_columns))
285
- matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, index=None)
314
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path, sep="\t",index=None)
315
+ matched_sumstats[ ["SNPID"]+to_export_columns].to_csv(matched_sumstats_path+".gz", sep="\t",index=None)
286
316
  return matched_snp_list_path, matched_sumstats_path
287
317
 
288
318
  def _check_snpid_order(snplist_path, matched_sumstats_snpid,log):
@@ -3,43 +3,86 @@ import json
3
3
  import pandas as pd
4
4
  import gwaslab as gl
5
5
  from gwaslab.g_Log import Log
6
+ from datetime import datetime
7
+ import os
6
8
 
7
- def gwascatalog_trait(efo,source="NCBI",sig_level=5e-8,verbose=True,log=Log()):
9
def find_efo_cache(efo, path):
    """Search *path* recursively for a cached file whose name contains *efo*.

    Returns the full path of the first matching file (in os.walk order),
    or False when no cache file is found.
    """
    hit = next(
        (
            os.path.join(root, filename)
            for root, _dirs, filenames in os.walk(path)
            for filename in filenames
            if efo in filename
        ),
        None,
    )
    return hit if hit is not None else False
15
+
16
+ def gwascatalog_trait(efo,
17
+ source="NCBI",
18
+ sig_level=5e-8,
19
+ use_cache=True,
20
+ cache_dir="./",
21
+ verbose=True,
22
+ log=Log()):
8
23
 
9
24
  #https://www.ebi.ac.uk/gwas/rest/docs/api
10
25
 
11
26
  base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo
12
27
  log.write("Start to retrieve data from GWASCatalog...", verbose=verbose)
13
- log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
14
- log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
15
- log.write(" -EFO trait api: "+ base_url, verbose=verbose)
16
- text = requests.get(base_url)
28
+
29
+
30
+ if use_cache==True:
31
+ log.write("searching cache in : {}".format(cache_dir))
32
+ cache = find_efo_cache(efo, cache_dir)
33
+ if cache==False:
34
+ log.write(" -Cache not found for {}... Downloading from GWASCatalog...".format(cache), verbose=verbose)
35
+ else:
36
+ cache = False
37
+
38
+ if cache==False:
39
+ #log.write(" -Please make sure your sumstats is based on GRCh38...", verbose=verbose)
40
+ log.write(" -Requesting (GET) trait information through the GWASCatalog API...", verbose=verbose)
41
+ log.write(" -EFO trait api: "+ base_url, verbose=verbose)
42
+ text = requests.get(base_url)
17
43
 
18
- log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
19
- if text.status_code!=200:
20
- log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
21
- log.write(" -Message:{}".format(text.text), verbose=verbose)
22
- return 0
44
+ log.write(" -Status code: {}".format(text.status_code), verbose=verbose)
45
+ if text.status_code!=200:
46
+ log.write(" -Status code is not 200. Access failed. Please check your internet or the GWAS Catalog sever status.", verbose=verbose)
47
+ log.write(" -Message:{}".format(text.text), verbose=verbose)
48
+ return 0
23
49
 
24
- api_response = json.loads(text.text)
25
- log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
26
- log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
50
+ api_response = json.loads(text.text)
51
+ log.write(" -Trait Name:",api_response["trait"], verbose=verbose)
52
+ log.write(" -Trait URL:",api_response["uri"], verbose=verbose)
53
+
54
+ base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
55
+ log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
56
+ log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
57
+ log.write(" -Note: this step might take a while...", verbose=verbose)
58
+
59
+ # get request and check status code of response
60
+ raw_data = requests.get(base_url)
61
+
62
+ # whether to proceed based on status code
63
+ is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
64
+ if is_proceed is False: return False
65
+
66
+
67
+ log.write(" -Loading json ...", verbose=verbose)
68
+ # Transform API response from JSON into Python dictionary
69
+ api_response = json.loads(raw_data.text)
70
+
71
+ now = datetime.now() # current date and time
72
+ datestring = now.strftime("%Y%m%d")
73
+ json_path = cache_dir + "GWASCatalog_{}_associationsByTraitSummary_text_{}.json".format(efo, datestring)
74
+
75
+ try:
76
+ log.write(" -Saving json to: {} ...".format(json_path), verbose=verbose)
77
+ with open(json_path, 'w', encoding='utf-8') as f:
78
+ json.dump(api_response, f, ensure_ascii=False, indent=4)
79
+ except:
80
+ pass
81
+ else:
82
+ log.write(" -Loading cache for {}: {} ...".format(efo, cache), verbose=verbose)
83
+ with open(cache) as f:
84
+ api_response = json.load(f)
27
85
 
28
- base_url = "https://www.ebi.ac.uk/gwas/rest/api/efoTraits/"+efo+"/associations?projection=associationByEfoTrait"
29
- log.write(" -Requesting (GET) GWAS associations through the GWASCatalog API...", verbose=verbose)
30
- log.write(" -associationsByTraitSummary API: "+ base_url, verbose=verbose)
31
- log.write(" -Note: this step might take a while...", verbose=verbose)
32
-
33
- # get request and check status code of response
34
- raw_data = requests.get(base_url)
35
-
36
- # whether to proceed based on status code
37
- is_proceed = check_request_status_code(raw_data.status_code,verbose=verbose,log=log)
38
- if is_proceed is False: return False
39
-
40
- log.write(" -Loading json ...", verbose=verbose)
41
- # Transform API response from JSON into Python dictionary
42
- api_response = json.loads(raw_data.text)
43
86
  log.write(" -Parsing json ...", verbose=verbose)
44
87
  # An
45
88
  records=list()